Re: [PATCH net-next 0/9] devlink: Add support for region access

2018-03-29 Thread Alex Vesker



On 3/29/2018 10:51 PM, Andrew Lunn wrote:

Show all of the exposed regions with region sizes:
$ devlink region show
pci/:00:05.0/cr-space: size 1048576 snapshot [1 2]

So you have 2Mbytes of snapshot data. Is this held in the device, or
kernel memory?

This is allocated in devlink, the maximum number of snapshots is set by the
driver.

And it seems to want contiguous pages. How well does that work after
the system has been running for a while and memory is fragmented?


The allocation can be changed, there is no real need for contiguous pages.
It is important to note that the amount of snapshots is limited by 
the driver

this can be based on the dump size or expected frequency of collection.
I also prefer not to pre-allocate this memory.

Dump a snapshot:
$ devlink region dump pci/:00:05.0/fw-health snapshot 1
 0014 95dc 0014 9514 0035 1670 0034 db30
0010    ff04 0029 8c00 0028 8cc8
0020 0016 0bb8 0016 1720   c00f 3ffc
0030 bada cce5 bada cce5 bada cce5 bada cce5

Read a specific part of a snapshot:
$ devlink region read pci/:00:05.0/fw-health snapshot 1 address 0
length 16
 0014 95dc 0014 9514 0035 1670 0034 db30

Why a separate command? It seems to be just a subset of dump.

This is useful when debugging values on specific addresses, this also
brings the API one step closer to a read and write API.

The functionality is useful, yes. But why two commands? Why not one
command, dump, which takes optional parameters?


Dump in devlink means provide all the data, saying dump address x length 
y sounds

confusing.  Do you see this as a critical issue?


Also, i doubt write support will be accepted. That sounds like the
start of an API to allow a user space driver.


If this will be an issue we will stay with read access only.



   Andrew




[PATCH net-next 3/6] inet: frags: add a pointer to struct netns_frags

2018-03-29 Thread Eric Dumazet
In order to simplify the API, add a pointer to struct inet_frags.
This will allow us to make things less complex.

These functions no longer have a struct inet_frags parameter :

inet_frag_destroy(struct inet_frag_queue *q  /*, struct inet_frags *f */)
inet_frag_put(struct inet_frag_queue *q /*, struct inet_frags *f */)
inet_frag_kill(struct inet_frag_queue *q /*, struct inet_frags *f */)
inet_frags_exit_net(struct netns_frags *nf /*, struct inet_frags *f */);

Signed-off-by: Eric Dumazet 
---
 include/net/inet_frag.h | 11 ++-
 net/ieee802154/6lowpan/reassembly.c | 13 +++--
 net/ipv4/inet_fragment.c| 17 ++---
 net/ipv4/ip_fragment.c  |  9 +
 net/ipv6/netfilter/nf_conntrack_reasm.c | 14 --
 net/ipv6/reassembly.c   | 15 ---
 6 files changed, 44 insertions(+), 35 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 
b1d62176f3b4fcf100bd263e8eae0db656a3d9b6..69e531ed81894393e07cac9e953825fcb55ef42a
 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -10,6 +10,7 @@ struct netns_frags {
int high_thresh;
int low_thresh;
int max_dist;
+   struct inet_frags   *f;
 };
 
 /**
@@ -109,20 +110,20 @@ static inline int inet_frags_init_net(struct netns_frags 
*nf)
atomic_set(>mem, 0);
return 0;
 }
-void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
+void inet_frags_exit_net(struct netns_frags *nf);
 
-void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f);
-void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f);
+void inet_frag_kill(struct inet_frag_queue *q);
+void inet_frag_destroy(struct inet_frag_queue *q);
 struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
struct inet_frags *f, void *key, unsigned int hash);
 
 void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
   const char *prefix);
 
-static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags 
*f)
+static inline void inet_frag_put(struct inet_frag_queue *q)
 {
if (refcount_dec_and_test(>refcnt))
-   inet_frag_destroy(q, f);
+   inet_frag_destroy(q);
 }
 
 static inline bool inet_frag_evicting(struct inet_frag_queue *q)
diff --git a/net/ieee802154/6lowpan/reassembly.c 
b/net/ieee802154/6lowpan/reassembly.c
index 
2aaab4bba42961647a4d3d1c0b8497917d5065ce..6badc05b7baedac2051a1aaea15f9e9b180c
 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -94,10 +94,10 @@ static void lowpan_frag_expire(struct timer_list *t)
if (fq->q.flags & INET_FRAG_COMPLETE)
goto out;
 
-   inet_frag_kill(>q, _frags);
+   inet_frag_kill(>q);
 out:
spin_unlock(>q.lock);
-   inet_frag_put(>q, _frags);
+   inet_frag_put(>q);
 }
 
 static inline struct lowpan_frag_queue *
@@ -230,7 +230,7 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, 
struct sk_buff *prev,
struct sk_buff *fp, *head = fq->q.fragments;
int sum_truesize;
 
-   inet_frag_kill(>q, _frags);
+   inet_frag_kill(>q);
 
/* Make the one we just received the head. */
if (prev) {
@@ -438,7 +438,7 @@ int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type)
ret = lowpan_frag_queue(fq, skb, frag_type);
spin_unlock(>q.lock);
 
-   inet_frag_put(>q, _frags);
+   inet_frag_put(>q);
return ret;
}
 
@@ -586,13 +586,14 @@ static int __net_init lowpan_frags_init_net(struct net 
*net)
ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH;
ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT;
+   ieee802154_lowpan->frags.f = _frags;
 
res = inet_frags_init_net(_lowpan->frags);
if (res < 0)
return res;
res = lowpan_frags_ns_sysctl_register(net);
if (res < 0)
-   inet_frags_exit_net(_lowpan->frags, _frags);
+   inet_frags_exit_net(_lowpan->frags);
return res;
 }
 
@@ -602,7 +603,7 @@ static void __net_exit lowpan_frags_exit_net(struct net 
*net)
net_ieee802154_lowpan(net);
 
lowpan_frags_ns_sysctl_unregister(net);
-   inet_frags_exit_net(_lowpan->frags, _frags);
+   inet_frags_exit_net(_lowpan->frags);
 }
 
 static struct pernet_operations lowpan_frags_ops = {
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 
e8ec28999f5ce0c5d496e9a97ca1748b18db0cf0..1ac69f65d0dee600d0ab4db20ff5942952932c40
 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -219,8 +219,9 @@ void inet_frags_fini(struct inet_frags *f)
 }
 

[PATCH net-next 5/6] inet: frags: remove some helpers

2018-03-29 Thread Eric Dumazet
Remove sum_frag_mem_limit(), ip_frag_mem() & ip6_frag_mem()

Also since we use rhashtable we can bring back the number of fragments
in "grep FRAG /proc/net/sockstat /proc/net/sockstat6" that was
removed in commit 434d305405ab ("inet: frag: don't account number
of fragment queues")

Signed-off-by: Eric Dumazet 
---
 include/net/inet_frag.h | 5 -
 include/net/ip.h| 1 -
 include/net/ipv6.h  | 7 ---
 net/ipv4/ip_fragment.c  | 5 -
 net/ipv4/proc.c | 6 +++---
 net/ipv6/proc.c | 5 +++--
 6 files changed, 6 insertions(+), 23 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 
05099f9f980e2384c0c8cd7e74659656b585cd22..fae84c10679c012bddc4367bcd0d44e34bd51372
 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -135,11 +135,6 @@ static inline void add_frag_mem_limit(struct netns_frags 
*nf, int i)
atomic_add(i, >mem);
 }
 
-static inline int sum_frag_mem_limit(struct netns_frags *nf)
-{
-   return atomic_read(>mem);
-}
-
 /* RFC 3168 support :
  * We want to check ECN values of all fragments, do detect invalid 
combinations.
  * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value.
diff --git a/include/net/ip.h b/include/net/ip.h
index 
36f8f7811093c37de06194dc7410b7596f8bf9fa..ecffd843e7b896a83416847fdaa452be6223f3dc
 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -588,7 +588,6 @@ static inline struct sk_buff *ip_check_defrag(struct net 
*net, struct sk_buff *s
return skb;
 }
 #endif
-int ip_frag_mem(struct net *net);
 
 /*
  * Functions provided by ip_forward.c
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 
76f84d4be91b92761fb9a26e7f52e2101ee34c0a..abbed2687fbd61cb47e2b6d0164ab6cf4d40a618
 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -379,13 +379,6 @@ static inline bool ipv6_accept_ra(struct inet6_dev *idev)
idev->cnf.accept_ra;
 }
 
-#if IS_ENABLED(CONFIG_IPV6)
-static inline int ip6_frag_mem(struct net *net)
-{
-   return sum_frag_mem_limit(>ipv6.frags);
-}
-#endif
-
 #define IPV6_FRAG_HIGH_THRESH  (4 * 1024*1024) /* 4194304 */
 #define IPV6_FRAG_LOW_THRESH   (3 * 1024*1024) /* 3145728 */
 #define IPV6_FRAG_TIMEOUT  (60 * HZ)   /* 60 seconds */
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 
1a7423e8ec0a8f88782ad8c945dc0cd6046f79f0..ef864db73613b491fb430ff3b594c7286705a1b3
 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -83,11 +83,6 @@ static u8 ip4_frag_ecn(u8 tos)
 
 static struct inet_frags ip4_frags;
 
-int ip_frag_mem(struct net *net)
-{
-   return sum_frag_mem_limit(>ipv4.frags);
-}
-
 static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 struct net_device *dev);
 
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 
adfb75340275d240487574257c10feb295df44fe..aacfce0d7d82cf59269a69ef4d6ac8d9955b0bdc
 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -54,7 +54,6 @@
 static int sockstat_seq_show(struct seq_file *seq, void *v)
 {
struct net *net = seq->private;
-   unsigned int frag_mem;
int orphans, sockets;
 
orphans = percpu_counter_sum_positive(_orphan_count);
@@ -72,8 +71,9 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
   sock_prot_inuse_get(net, _prot));
seq_printf(seq, "RAW: inuse %d\n",
   sock_prot_inuse_get(net, _prot));
-   frag_mem = ip_frag_mem(net);
-   seq_printf(seq,  "FRAG: inuse %u memory %u\n", !!frag_mem, frag_mem);
+   seq_printf(seq,  "FRAG: inuse %u memory %u\n",
+  atomic_read(>ipv4.frags.rhashtable.nelems),
+  frag_mem_limit(>ipv4.frags));
return 0;
 }
 
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 
6e57028d2e9160be264d07f9312658fcb677a568..8befeb91e0712ecc4d05c4c0a6ecca1808dcbcac
 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -38,7 +38,6 @@
 static int sockstat6_seq_show(struct seq_file *seq, void *v)
 {
struct net *net = seq->private;
-   unsigned int frag_mem = ip6_frag_mem(net);
 
seq_printf(seq, "TCP6: inuse %d\n",
   sock_prot_inuse_get(net, _prot));
@@ -48,7 +47,9 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v)
sock_prot_inuse_get(net, _prot));
seq_printf(seq, "RAW6: inuse %d\n",
   sock_prot_inuse_get(net, _prot));
-   seq_printf(seq, "FRAG6: inuse %u memory %u\n", !!frag_mem, frag_mem);
+   seq_printf(seq, "FRAG6: inuse %u memory %u\n",
+  atomic_read(>ipv6.frags.rhashtable.nelems),
+  frag_mem_limit(>ipv6.frags));
return 0;
 }
 
-- 
2.17.0.rc1.321.gba9d0f2565-goog



[PATCH net-next 6/6] inet: frags: break the 2GB limit for frags storage

2018-03-29 Thread Eric Dumazet
Some users are willing to provision huge amounts of memory to be able
to perform reassembly reasonably well under pressure.

Current memory tracking is using one atomic_t and integers.

Switch to atomic_long_t so that 64bit arches can use more than 2GB,
without any cost for 32bit arches.

Tested:

$ echo 160 >/proc/sys/net/ipv4/ipfrag_high_thresh



$ grep FRAG /proc/net/sockstat
FRAG: inuse 14705885 memory 1602880

$ nstat -n ; sleep 1 ; nstat | grep Reas
IpReasmReqds33171500.0
IpReasmFails33171120.0

Signed-off-by: Eric Dumazet 
---
 Documentation/networking/ip-sysctl.txt  |  4 ++--
 include/net/inet_frag.h | 20 ++--
 net/ieee802154/6lowpan/reassembly.c | 10 +-
 net/ipv4/ip_fragment.c  | 10 +-
 net/ipv4/proc.c |  2 +-
 net/ipv6/netfilter/nf_conntrack_reasm.c | 10 +-
 net/ipv6/proc.c |  2 +-
 net/ipv6/reassembly.c   |  6 +++---
 8 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt 
b/Documentation/networking/ip-sysctl.txt
index 
c3b65f24e58aa72b720861d816fb76f9956800f0..1c18aa6606b771fb6afb95e55fb880b2484775dd
 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -133,10 +133,10 @@ min_adv_mss - INTEGER
 
 IP Fragmentation:
 
-ipfrag_high_thresh - INTEGER
+ipfrag_high_thresh - LONG INTEGER
Maximum memory used to reassemble IP fragments.
 
-ipfrag_low_thresh - INTEGER
+ipfrag_low_thresh - LONG INTEGER
(Obsolete since linux-4.17)
Maximum memory used to reassemble IP fragments before the kernel
begins to remove incomplete fragment queues to free up resources.
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 
fae84c10679c012bddc4367bcd0d44e34bd51372..b68f5d4338b63701587deb5ba0e48a324920c332
 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -6,14 +6,14 @@
 
 struct netns_frags {
/* sysctls */
+   longhigh_thresh;
+   longlow_thresh;
int timeout;
-   int high_thresh;
-   int low_thresh;
int max_dist;
struct inet_frags   *f;
 
/* Keep atomic mem on separate cachelines in structs that include it */
-   atomic_tmem cacheline_aligned_in_smp;
+   atomic_long_t   mem cacheline_aligned_in_smp;
 
struct rhashtable   rhashtable cacheline_aligned_in_smp;
 };
@@ -103,7 +103,7 @@ void inet_frags_fini(struct inet_frags *);
 
 static inline int inet_frags_init_net(struct netns_frags *nf)
 {
-   atomic_set(>mem, 0);
+   atomic_long_set(>mem, 0);
return rhashtable_init(>rhashtable, >f->rhash_params);
 }
 void inet_frags_exit_net(struct netns_frags *nf);
@@ -120,19 +120,19 @@ static inline void inet_frag_put(struct inet_frag_queue 
*q)
 
 /* Memory Tracking Functions. */
 
-static inline int frag_mem_limit(struct netns_frags *nf)
+static inline long frag_mem_limit(const struct netns_frags *nf)
 {
-   return atomic_read(>mem);
+   return atomic_long_read(>mem);
 }
 
-static inline void sub_frag_mem_limit(struct netns_frags *nf, int i)
+static inline void sub_frag_mem_limit(struct netns_frags *nf, long val)
 {
-   atomic_sub(i, >mem);
+   atomic_long_sub(val, >mem);
 }
 
-static inline void add_frag_mem_limit(struct netns_frags *nf, int i)
+static inline void add_frag_mem_limit(struct netns_frags *nf, long val)
 {
-   atomic_add(i, >mem);
+   atomic_long_add(val, >mem);
 }
 
 /* RFC 3168 support :
diff --git a/net/ieee802154/6lowpan/reassembly.c 
b/net/ieee802154/6lowpan/reassembly.c
index 
9ee4d22666c26d6d9796d0f484bb4beb265dea42..d7125507c065cd23501f29664892d24c36050ea7
 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -411,23 +411,23 @@ int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type)
 }
 
 #ifdef CONFIG_SYSCTL
-static int zero;
+static long zero;
 
 static struct ctl_table lowpan_frags_ns_ctl_table[] = {
{
.procname   = "6lowpanfrag_high_thresh",
.data   = _net.ieee802154_lowpan.frags.high_thresh,
-   .maxlen = sizeof(int),
+   .maxlen = sizeof(unsigned long),
.mode   = 0644,
-   .proc_handler   = proc_dointvec_minmax,
+   .proc_handler   = proc_doulongvec_minmax,
.extra1 = _net.ieee802154_lowpan.frags.low_thresh
},
{
.procname   = "6lowpanfrag_low_thresh",
.data   = _net.ieee802154_lowpan.frags.low_thresh,
-   .maxlen = sizeof(int),
+   .maxlen

[PATCH net-next 4/6] inet: frags: use rhashtables for reassembly units

2018-03-29 Thread Eric Dumazet
Some applications still rely on IP fragmentation, and to be fair linux
reassembly unit is not working under any serious load.

It uses static hash tables of 1024 buckets, and up to 128 items per bucket (!!!)

A work queue is supposed to garbage collect items when host is under memory
pressure, and doing a hash rebuild, changing seed used in hash computations.

This work queue blocks softirqs for up to 25 ms when doing a hash rebuild,
occurring every 5 seconds if host is under fire.

Then there is the problem of sharing this hash table for all netns.

It is time to switch to rhashtables, and allocate one of them per netns
to speedup netns dismantle, since this is a critical metric these days.

Lookup is now using RCU. A followup patch will even remove
the refcount hold/release left from prior implementation and save
a couple of atomic operations.

Before this patch, 16 cpus (16 RX queue NIC) could not handle more
than 1 Mpps frags DDOS.

After the patch, I reach 7 Mpps without any tuning, and can use up to 2GB
of storage for the fragments.

$ grep FRAG /proc/net/sockstat
FRAG: inuse 1966916 memory 2140004608

A followup patch will change the limits for 64bit arches.

Signed-off-by: Eric Dumazet 
Cc: Florian Westphal 
Cc: Nikolay Aleksandrov 
Cc: Jesper Dangaard Brouer 
Cc: Alexander Aring 
Cc: Stefan Schmidt 
---
 Documentation/networking/ip-sysctl.txt  |   7 +-
 include/net/inet_frag.h |  99 +++---
 include/net/ipv6.h  |  20 +-
 net/ieee802154/6lowpan/6lowpan_i.h  |  26 +-
 net/ieee802154/6lowpan/reassembly.c | 108 +++
 net/ipv4/inet_fragment.c| 399 +---
 net/ipv4/ip_fragment.c  | 165 +-
 net/ipv6/netfilter/nf_conntrack_reasm.c |  62 ++--
 net/ipv6/reassembly.c   | 152 +
 9 files changed, 344 insertions(+), 694 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt 
b/Documentation/networking/ip-sysctl.txt
index 
1d1120753ae82d0aee3e934a3d9c074b70dcbca6..c3b65f24e58aa72b720861d816fb76f9956800f0
 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -134,13 +134,10 @@ min_adv_mss - INTEGER
 IP Fragmentation:
 
 ipfrag_high_thresh - INTEGER
-   Maximum memory used to reassemble IP fragments. When
-   ipfrag_high_thresh bytes of memory is allocated for this purpose,
-   the fragment handler will toss packets until ipfrag_low_thresh
-   is reached. This also serves as a maximum limit to namespaces
-   different from the initial one.
+   Maximum memory used to reassemble IP fragments.
 
 ipfrag_low_thresh - INTEGER
+   (Obsolete since linux-4.17)
Maximum memory used to reassemble IP fragments before the kernel
begins to remove incomplete fragment queues to free up resources.
The kernel still accepts new fragments for defragmentation.
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 
69e531ed81894393e07cac9e953825fcb55ef42a..05099f9f980e2384c0c8cd7e74659656b585cd22
 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -2,15 +2,20 @@
 #ifndef __NET_FRAG_H__
 #define __NET_FRAG_H__
 
+#include 
+
 struct netns_frags {
-   /* Keep atomic mem on separate cachelines in structs that include it */
-   atomic_tmem cacheline_aligned_in_smp;
/* sysctls */
int timeout;
int high_thresh;
int low_thresh;
int max_dist;
struct inet_frags   *f;
+
+   /* Keep atomic mem on separate cachelines in structs that include it */
+   atomic_tmem cacheline_aligned_in_smp;
+
+   struct rhashtable   rhashtable cacheline_aligned_in_smp;
 };
 
 /**
@@ -26,12 +31,31 @@ enum {
INET_FRAG_COMPLETE  = BIT(2),
 };
 
+struct frag_v4_compare_key {
+   __be32  saddr;
+   __be32  daddr;
+   u32 user;
+   u32 vif;
+   __be16  id;
+   u16 protocol;
+};
+
+struct frag_v6_compare_key {
+   struct in6_addr saddr;
+   struct in6_addr daddr;
+   u32 user;
+   __be32  id;
+   u32 iif;
+};
+
 /**
  * struct inet_frag_queue - fragment queue
  *
- * @lock: spinlock protecting the queue
+ * @node: rhash node
+ * @key: keys identifying this frag.
  * @timer: queue expiration timer
- * @list: hash bucket list
+ * @net: namespace that this frag belongs to
+ * @lock: spinlock protecting this frag
  * @refcnt: reference count of the queue
  * @fragments: received fragments head
  * @fragments_tail: received fragments tail
@@ -40,66 +64,38 @@ enum {
  * @meat: length of received fragments so far
  * @flags: fragment queue flags
  * 

[PATCH net-next 2/6] inet: frags: change inet_frags_init_net() return value

2018-03-29 Thread Eric Dumazet
We will soon initialize one rhashtable per struct netns_frags
in inet_frags_init_net().

This patch changes the return value to eventually propagate an
error.

Signed-off-by: Eric Dumazet 
---
 include/net/inet_frag.h |  3 ++-
 net/ieee802154/6lowpan/reassembly.c | 11 ---
 net/ipv4/ip_fragment.c  | 12 +---
 net/ipv6/netfilter/nf_conntrack_reasm.c | 12 +---
 net/ipv6/reassembly.c   | 11 +--
 5 files changed, 37 insertions(+), 12 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 
351f0c3cdcd9df16078a40f07963fb605eeaa882..b1d62176f3b4fcf100bd263e8eae0db656a3d9b6
 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -104,9 +104,10 @@ struct inet_frags {
 int inet_frags_init(struct inet_frags *);
 void inet_frags_fini(struct inet_frags *);
 
-static inline void inet_frags_init_net(struct netns_frags *nf)
+static inline int inet_frags_init_net(struct netns_frags *nf)
 {
atomic_set(>mem, 0);
+   return 0;
 }
 void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
 
diff --git a/net/ieee802154/6lowpan/reassembly.c 
b/net/ieee802154/6lowpan/reassembly.c
index 
85bf86ad6b1801066a4252af18b5b511070a9e08..2aaab4bba42961647a4d3d1c0b8497917d5065ce
 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -581,14 +581,19 @@ static int __net_init lowpan_frags_init_net(struct net 
*net)
 {
struct netns_ieee802154_lowpan *ieee802154_lowpan =
net_ieee802154_lowpan(net);
+   int res;
 
ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH;
ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT;
 
-   inet_frags_init_net(_lowpan->frags);
-
-   return lowpan_frags_ns_sysctl_register(net);
+   res = inet_frags_init_net(_lowpan->frags);
+   if (res < 0)
+   return res;
+   res = lowpan_frags_ns_sysctl_register(net);
+   if (res < 0)
+   inet_frags_exit_net(_lowpan->frags, _frags);
+   return res;
 }
 
 static void __net_exit lowpan_frags_exit_net(struct net *net)
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 
bbf1b94942c0ed53d1ddb87e4ee63833c08f2684..e0b39d4ecbd411ff4bb72d1ed973e45bd6da9ef1
 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -846,6 +846,8 @@ static void __init ip4_frags_ctl_register(void)
 
 static int __net_init ipv4_frags_init_net(struct net *net)
 {
+   int res;
+
/* Fragment cache limits.
 *
 * The fragment memory accounting code, (tries to) account for
@@ -871,9 +873,13 @@ static int __net_init ipv4_frags_init_net(struct net *net)
 
net->ipv4.frags.max_dist = 64;
 
-   inet_frags_init_net(>ipv4.frags);
-
-   return ip4_frags_ns_ctl_register(net);
+   res = inet_frags_init_net(>ipv4.frags);
+   if (res < 0)
+   return res;
+   res = ip4_frags_ns_ctl_register(net);
+   if (res < 0)
+   inet_frags_exit_net(>ipv4.frags, _frags);
+   return res;
 }
 
 static void __net_exit ipv4_frags_exit_net(struct net *net)
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c 
b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 
b84ce3e6d728a5b8af65b91faf42ec640ff03910..6ff41569134ae36809a8b42d8e46d50d19ffde53
 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -629,12 +629,18 @@ EXPORT_SYMBOL_GPL(nf_ct_frag6_gather);
 
 static int nf_ct_net_init(struct net *net)
 {
+   int res;
+
net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT;
-   inet_frags_init_net(>nf_frag.frags);
-
-   return nf_ct_frag6_sysctl_register(net);
+   res = inet_frags_init_net(>nf_frag.frags);
+   if (res < 0)
+   return res;
+   res = nf_ct_frag6_sysctl_register(net);
+   if (res < 0)
+   inet_frags_exit_net(>nf_frag.frags, _frags);
+   return res;
 }
 
 static void nf_ct_net_exit(struct net *net)
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 
08a139f14d0f6fa8ca326088cce1144411e09bf5..a8f7a5f0251a7af0b14cc6de5006b924d9d05672
 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -711,13 +711,20 @@ static void ip6_frags_sysctl_unregister(void)
 
 static int __net_init ipv6_frags_init_net(struct net *net)
 {
+   int res;
+
net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
 
-   inet_frags_init_net(>ipv6.frags);
+   res = inet_frags_init_net(>ipv6.frags);
+   if (res < 0)
+   return res;
 
-   return ip6_frags_ns_sysctl_register(net);
+  

[PATCH net-next 1/6] ipv6: frag: remove unused field

2018-03-29 Thread Eric Dumazet
csum field in struct frag_queue is not used, remove it.

Signed-off-by: Eric Dumazet 
---
 include/net/ipv6.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 
50a6f0ddb8780f6c9169f4ae0b3b35af2d66cd4b..5c18836672e9d1c560cdce15f5b34928c337abfd
 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -603,7 +603,6 @@ struct frag_queue {
struct in6_addr daddr;
 
int iif;
-   unsigned intcsum;
__u16   nhoffset;
u8  ecn;
 };
-- 
2.17.0.rc1.321.gba9d0f2565-goog



[PATCH net-next 0/6] inet: frags: bring rhashtables to IP defrag

2018-03-29 Thread Eric Dumazet
IP defrag processing is one of the remaining problematic layer in linux.

It uses static hash tables of 1024 buckets, and up to 128 items per bucket.

A work queue is supposed to garbage collect items when host is under memory
pressure, and doing a hash rebuild, changing seed used in hash computations.

This work queue blocks softirqs for up to 25 ms when doing a hash rebuild,
occurring every 5 seconds if host is under fire.

Then there is the problem of sharing this hash table for all netns.

It is time to switch to rhashtables, and allocate one of them per netns
to speedup netns dismantle, since this is a critical metric these days.

Lookup is now using RCU, and 64bit hosts can now provision whatever amount
of memory needed to handle the expected workloads.

Eric Dumazet (6):
  ipv6: frag: remove unused field
  inet: frags: change inet_frags_init_net() return value
  inet: frags: add a pointer to struct netns_frags
  inet: frags: use rhashtables for reassembly units
  inet: frags: remove some helpers
  inet: frags: break the 2GB limit for frags storage

 Documentation/networking/ip-sysctl.txt  |  13 +-
 include/net/inet_frag.h | 134 
 include/net/ip.h|   1 -
 include/net/ipv6.h  |  28 +-
 net/ieee802154/6lowpan/6lowpan_i.h  |  26 +-
 net/ieee802154/6lowpan/reassembly.c | 140 
 net/ipv4/inet_fragment.c| 404 +---
 net/ipv4/ip_fragment.c  | 199 ++--
 net/ipv4/proc.c |   6 +-
 net/ipv6/netfilter/nf_conntrack_reasm.c |  96 +++---
 net/ipv6/proc.c |   5 +-
 net/ipv6/reassembly.c   | 182 ++-
 12 files changed, 450 insertions(+), 784 deletions(-)

-- 
2.17.0.rc1.321.gba9d0f2565-goog



Re: [RFC PATCH ghak32 V2 01/13] audit: add container id

2018-03-29 Thread Richard Guy Briggs
On 2018-03-29 07:03, Jonathan Corbet wrote:
> On Thu, 29 Mar 2018 05:01:32 -0400
> Richard Guy Briggs  wrote:
> 
> > > A little detail, but still...  
> > 
> > I am understanding that you would prefer more context (as opposed to
> > operational detail) in the description, laying out the use case for this
> > patch(set)?
> 
> No, sorry, "a little detail" was referring to my comment.  The use case,
> I believe, has been well described.

Ah!  "A minor nit".  :-)

> jon

- RGB

--
Richard Guy Briggs 
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635


Re: [PATCH v2 bpf-next 1/2] lib/scatterlist: add sg_init_marker() helper

2018-03-29 Thread John Fastabend
On 03/29/2018 05:20 PM, Prashant Bhole wrote:
> sg_init_marker initializes sg_magic in the sg table and calls
> sg_mark_end() on the last entry of the table. This can be useful to
> avoid memset in sg_init_table() when scatterlist is already zeroed out
> 
> For example: when scatterlist is embedded inside other struct and that
> container struct is zeroed out
> 
> Suggested-by: Daniel Borkmann 
> Signed-off-by: Prashant Bhole 
> ---

Acked-by: John Fastabend 


Re: [PATCH v2 bpf-next 0/2] sockmap: fix sg api usage

2018-03-29 Thread John Fastabend
On 03/29/2018 05:20 PM, Prashant Bhole wrote:
> These patches fix sg api usage in sockmap. Previously sockmap didn't
> use sg_init_table(), which caused hitting BUG_ON in sg api, when
> CONFIG_DEBUG_SG is enabled
> 
> v1: added sg_init_table() calls wherever needed.
> 
> v2:
> - Patch1 adds new helper function in sg api. sg_init_marker()
> - Patch2 sg_init_marker() and sg_init_table() in appropriate places
> 
> Backgroud:
> While reviewing v1, John Fastabend raised a valid point about
> unnecessary memset in sg_init_table() because sockmap uses sg table
> which embedded in a struct. As enclosing struct is zeroed out, there
> is unnecessary memset in sg_init_table.
> 
> So Daniel Borkmann suggested to define another static inline function
> in scatterlist.h which only initializes sg_magic. Also this function 
> will be called from sg_init_table. From this suggestion I defined a
> function sg_init_marker() which sets sg_magic and calls sg_mark_end()
> 

Series looks good to me thanks for finding and fixing this!


Re: [PATCH v2 bpf-next 2/2] bpf: sockmap: initialize sg table entries properly

2018-03-29 Thread John Fastabend
On 03/29/2018 05:21 PM, Prashant Bhole wrote:
> When CONFIG_DEBUG_SG is set, sg->sg_magic is initialized in
> sg_init_table() and it is verified in sg api while navigating. We hit
> BUG_ON when magic check is failed.
> 
> In functions sg_tcp_sendpage and sg_tcp_sendmsg, the struct containing
> the scatterlist is already zeroed out. So to avoid extra memset, we
> use sg_init_marker() to initialize sg_magic.
> 
> Fixed following things:
> - In bpf_tcp_sendpage: initialize sg using sg_init_marker
> - In bpf_tcp_sendmsg: Replace sg_init_table with sg_init_marker
> - In bpf_tcp_push: Replace memset with sg_init_table where consumed
>   sg entry needs to be re-initialized.
> 
> Signed-off-by: Prashant Bhole 
> ---
>  kernel/bpf/sockmap.c | 13 -
>  1 file changed, 8 insertions(+), 5 deletions(-)
> 

Acked-by: John Fastabend 



Re: [PATCH net-next] net/mlx4_en: CHECKSUM_COMPLETE support for fragments

2018-03-29 Thread Eric Dumazet
On Thu, Mar 29, 2018 at 5:44 PM Saeed Mahameed 
wrote:

> On Thu, Mar 29, 2018 at 11:07 AM, David Miller 
wrote:
> > From: Eric Dumazet 
> > Date: Tue, 27 Mar 2018 14:21:14 -0700
> >
> >> Refine the RX check summing handling to propagate the
> >> hardware provided checksum so that we do not have to
> >> compute it later in software.
> >>
> >> Signed-off-by: Eric Dumazet 
> >
> > Tariq, please review.

> Hi Dave, Eric.

> The patch looks ok but i would let tariq review it and decide if he
> wants to run full regression coverage on it
> since it changes the default behavior of the driver's checksum reporting.

> It is already weekend for him and for the team in Israel, and i don't
> think this can be handled before next week :).
> So it is really up to you guys.

> Thanks,
> Saeed.


Hi Saeed

This definitely can wait, nothing urgent really.

Thanks.


Re: [RFC PATCH V2 8/8] vhost: event suppression for packed ring

2018-03-29 Thread Jason Wang



On 2018年03月30日 10:05, Tiwei Bie wrote:

On Mon, Mar 26, 2018 at 11:38:53AM +0800, Jason Wang wrote:

This patch introduces basic support for event suppression aka driver
and device area. Compile tested only.

Signed-off-by: Jason Wang 
---

[...]

+
+static bool vhost_notify_packed(struct vhost_dev *dev,
+   struct vhost_virtqueue *vq)
+{
+   __virtio16 event_off_wrap, event_flags;
+   __u16 old, new;
+   bool v, wrap;
+   int off;
+
+   /* Flush out used descriptors updates. This is paired
+* with the barrier that the Guest executes when enabling
+* interrupts.
+*/
+   smp_mb();
+
+   if (vhost_get_avail(vq, event_flags,
+  >driver_event->desc_event_flags) < 0) {
+   vq_err(vq, "Failed to get driver desc_event_flags");
+   return true;
+   }
+
+   if (!(event_flags & cpu_to_vhost16(vq, RING_EVENT_FLAGS_DESC)))
+   return event_flags ==
+  cpu_to_vhost16(vq, RING_EVENT_FLAGS_ENABLE);

Maybe it would be better to not use '&' here. Because these flags
are not defined as bits which can be ORed or ANDed. Instead, they
are defined as values:

0x0  enable
0x1  disable
0x2  desc
0x3  reserved


Yes the code seems tricky. Let me fix it in next version.




+
+   /* Read desc event flags before event_off and event_wrap */
+   smp_rmb();
+
+   if (vhost_get_avail(vq, event_off_wrap,
+   >driver_event->desc_event_off_warp) < 0) {
+   vq_err(vq, "Failed to get driver desc_event_off/wrap");
+   return true;
+   }
+
+   off = vhost16_to_cpu(vq, event_off_wrap);
+
+   wrap = off & 0x1;
+   off >>= 1;

Based on the below definitions in spec, wrap counter is
the most significant bit.

struct pvirtq_event_suppress {
le16 {
desc_event_off : 15; /* Descriptor Ring Change Event Offset */
desc_event_wrap : 1; /* Descriptor Ring Change Event Wrap 
Counter */
} desc; /* If desc_event_flags set to RING_EVENT_FLAGS_DESC */
le16 {
desc_event_flags : 2, /* Descriptor Ring Change Event Flags */
reserved : 14; /* Reserved, set to 0 */
} flags;
};


Will fix this in next version.




+
+
+   old = vq->signalled_used;
+   v = vq->signalled_used_valid;
+   new = vq->signalled_used = vq->last_used_idx;
+   vq->signalled_used_valid = true;
+
+   if (unlikely(!v))
+   return true;
+
+   return vhost_vring_packed_need_event(vq, new, old, off) &&
+  wrap == vq->used_wrap_counter;
+}
+
+static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+{
+   if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+   return vhost_notify_packed(dev, vq);
+   else
+   return vhost_notify_split(dev, vq);
+}
+
  /* This actually signals the guest, using eventfd. */
  void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
  {
@@ -2789,7 +2911,17 @@ static bool vhost_enable_notify_packed(struct vhost_dev 
*dev,
__virtio16 flags;
int ret;
  
-	/* FIXME: disable notification through device area */

+   if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY))
+   return false;
+   vq->used_flags &= ~VRING_USED_F_NO_NOTIFY;
+
+   flags = cpu_to_vhost16(vq, RING_EVENT_FLAGS_ENABLE);
+   ret = vhost_update_device_flags(vq, flags);
+   if (ret) {
+   vq_err(vq, "Failed to enable notification at %p: %d\n",
+  &vq->device_event->desc_event_flags, ret);
+   return false;
+   }
  
  	/* They could have slipped one in as we were doing that: make

 * sure it's written, then check again. */
@@ -2855,7 +2987,18 @@ EXPORT_SYMBOL_GPL(vhost_enable_notify);
  static void vhost_disable_notify_packed(struct vhost_dev *dev,
struct vhost_virtqueue *vq)
  {
-   /* FIXME: disable notification through device area */
+   __virtio16 flags;
+   int r;
+
+   if (vq->used_flags & VRING_USED_F_NO_NOTIFY)
+   return;
+   vq->used_flags |= VRING_USED_F_NO_NOTIFY;
+
+   flags = cpu_to_vhost16(vq, RING_EVENT_FLAGS_DISABLE);
+   r = vhost_update_device_flags(vq, flags);
+   if (r)
+   vq_err(vq, "Failed to enable notification at %p: %d\n",
+  &vq->device_event->desc_event_flags, r);
  }
  
  static void vhost_disable_notify_split(struct vhost_dev *dev,

diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 8a9df4f..02d7a36 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -96,8 +96,14 @@ struct vhost_virtqueue {
struct vring_desc __user *desc;
struct vring_desc_packed __user *desc_packed;

Do you think it'd be better to name the desc type as
struct vring_packed_desc?


Ok. 

Re: [PATCH net] vhost: validate log when IOTLB is enabled

2018-03-29 Thread Jason Wang



On 2018年03月29日 22:44, Michael S. Tsirkin wrote:

On Thu, Mar 29, 2018 at 04:00:04PM +0800, Jason Wang wrote:

Vq log_base is the userspace address of bitmap which has nothing to do
with IOTLB. So it needs to be validated unconditionally otherwise we
may try use 0 as log_base which may lead to pin pages that will lead
unexpected result (e.g trigger BUG_ON() in set_bit_to_user()).

Fixes: 6b1e6cc7855b0 ("vhost: new device IOTLB API")
Reported-by:syzbot+6304bf97ef436580f...@syzkaller.appspotmail.com
Signed-off-by: Jason Wang

One follow-up question:

We still observe that get user pages returns 0 sometimes. While I agree
we should not pass in unvalidated addresses, isn't this worth
documenting?




Looking at get_user_pages_fast(), it has:

    if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
                    (void __user *)start, len)))
        return 0;

So this is expected I think.

Thanks


[PATCH net-next V2] net: hns3: remove unnecessary pci_set_drvdata() and devm_kfree()

2018-03-29 Thread Wei Yongjun
There is no need for explicit calls of devm_kfree(), as the allocated
memory will be freed during driver's detach.

The driver core clears the driver data to NULL after device_release.
Thus, it is not needed to manually clear the device driver data to NULL.

So remove the unnecessary pci_set_drvdata() and devm_kfree().

Signed-off-by: Wei Yongjun 
---
v1 -> v2: change commit log
---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index a31b4ad..8c55965 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -1614,10 +1614,6 @@ static void hns3_remove(struct pci_dev *pdev)
struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev);
 
hnae3_unregister_ae_dev(ae_dev);
-
-   devm_kfree(&pdev->dev, ae_dev);
-
-   pci_set_drvdata(pdev, NULL);
 }
 
 static struct pci_driver hns3_driver = {



[PATCH] net: sched: do not emit messages while holding spinlock

2018-03-29 Thread Li RongQing
move messages emitting out of sch_tree_lock to avoid holding
this lock too long.

Signed-off-by: Li RongQing 
---
 net/sched/sch_htb.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 1ea9846cc6ce..2a4ab7caf553 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1337,6 +1337,7 @@ static int htb_change_class(struct Qdisc *sch, u32 
classid,
struct nlattr *tb[TCA_HTB_MAX + 1];
struct tc_htb_opt *hopt;
u64 rate64, ceil64;
+   int warn = 0;
 
/* extract all subattrs from opt attr */
if (!opt)
@@ -1499,13 +1500,11 @@ static int htb_change_class(struct Qdisc *sch, u32 
classid,
cl->quantum = min_t(u64, quantum, INT_MAX);
 
if (!hopt->quantum && cl->quantum < 1000) {
-   pr_warn("HTB: quantum of class %X is small. Consider 
r2q change.\n",
-   cl->common.classid);
+   warn = -1;
cl->quantum = 1000;
}
if (!hopt->quantum && cl->quantum > 20) {
-   pr_warn("HTB: quantum of class %X is big. Consider r2q 
change.\n",
-   cl->common.classid);
+   warn = 1;
cl->quantum = 20;
}
if (hopt->quantum)
@@ -1519,6 +1518,10 @@ static int htb_change_class(struct Qdisc *sch, u32 
classid,
 
sch_tree_unlock(sch);
 
+   if (warn)
+   pr_warn("HTB: quantum of class %X is %s. Consider r2q 
change.\n",
+   cl->common.classid, (warn == -1 ? "small" : "big"));
+
	qdisc_class_hash_grow(sch, &q->clhash);
 
*arg = (unsigned long)cl;
-- 
2.11.0



Re: [RFC PATCH V2 8/8] vhost: event suppression for packed ring

2018-03-29 Thread Tiwei Bie
On Mon, Mar 26, 2018 at 11:38:53AM +0800, Jason Wang wrote:
> This patch introduces basic support for event suppression aka driver
> and device area. Compile tested only.
> 
> Signed-off-by: Jason Wang 
> ---
[...]
> +
> +static bool vhost_notify_packed(struct vhost_dev *dev,
> + struct vhost_virtqueue *vq)
> +{
> + __virtio16 event_off_wrap, event_flags;
> + __u16 old, new;
> + bool v, wrap;
> + int off;
> +
> + /* Flush out used descriptors updates. This is paired
> +  * with the barrier that the Guest executes when enabling
> +  * interrupts.
> +  */
> + smp_mb();
> +
> + if (vhost_get_avail(vq, event_flags,
> +		&vq->driver_event->desc_event_flags) < 0) {
> + vq_err(vq, "Failed to get driver desc_event_flags");
> + return true;
> + }
> +
> + if (!(event_flags & cpu_to_vhost16(vq, RING_EVENT_FLAGS_DESC)))
> + return event_flags ==
> +cpu_to_vhost16(vq, RING_EVENT_FLAGS_ENABLE);

Maybe it would be better to not use '&' here. Because these flags
are not defined as bits which can be ORed or ANDed. Instead, they
are defined as values:

0x0  enable
0x1  disable
0x2  desc
0x3  reserved

> +
> + /* Read desc event flags before event_off and event_wrap */
> + smp_rmb();
> +
> + if (vhost_get_avail(vq, event_off_wrap,
> + 		&vq->driver_event->desc_event_off_warp) < 0) {
> + vq_err(vq, "Failed to get driver desc_event_off/wrap");
> + return true;
> + }
> +
> + off = vhost16_to_cpu(vq, event_off_wrap);
> +
> + wrap = off & 0x1;
> + off >>= 1;

Based on the below definitions in spec, wrap counter is
the most significant bit.

struct pvirtq_event_suppress {
le16 {
desc_event_off : 15; /* Descriptor Ring Change Event Offset */
desc_event_wrap : 1; /* Descriptor Ring Change Event Wrap 
Counter */
} desc; /* If desc_event_flags set to RING_EVENT_FLAGS_DESC */
le16 {
desc_event_flags : 2, /* Descriptor Ring Change Event Flags */
reserved : 14; /* Reserved, set to 0 */
} flags;
};

> +
> +
> + old = vq->signalled_used;
> + v = vq->signalled_used_valid;
> + new = vq->signalled_used = vq->last_used_idx;
> + vq->signalled_used_valid = true;
> +
> + if (unlikely(!v))
> + return true;
> +
> + return vhost_vring_packed_need_event(vq, new, old, off) &&
> +wrap == vq->used_wrap_counter;
> +}
> +
> +static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
> +{
> + if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
> + return vhost_notify_packed(dev, vq);
> + else
> + return vhost_notify_split(dev, vq);
> +}
> +
>  /* This actually signals the guest, using eventfd. */
>  void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
>  {
> @@ -2789,7 +2911,17 @@ static bool vhost_enable_notify_packed(struct 
> vhost_dev *dev,
>   __virtio16 flags;
>   int ret;
>  
> - /* FIXME: disable notification through device area */
> + if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY))
> + return false;
> + vq->used_flags &= ~VRING_USED_F_NO_NOTIFY;
> +
> + flags = cpu_to_vhost16(vq, RING_EVENT_FLAGS_ENABLE);
> + ret = vhost_update_device_flags(vq, flags);
> + if (ret) {
> + vq_err(vq, "Failed to enable notification at %p: %d\n",
> +		&vq->device_event->desc_event_flags, ret);
> + return false;
> + }
>  
>   /* They could have slipped one in as we were doing that: make
>* sure it's written, then check again. */
> @@ -2855,7 +2987,18 @@ EXPORT_SYMBOL_GPL(vhost_enable_notify);
>  static void vhost_disable_notify_packed(struct vhost_dev *dev,
>   struct vhost_virtqueue *vq)
>  {
> - /* FIXME: disable notification through device area */
> + __virtio16 flags;
> + int r;
> +
> + if (vq->used_flags & VRING_USED_F_NO_NOTIFY)
> + return;
> + vq->used_flags |= VRING_USED_F_NO_NOTIFY;
> +
> + flags = cpu_to_vhost16(vq, RING_EVENT_FLAGS_DISABLE);
> + r = vhost_update_device_flags(vq, flags);
> + if (r)
> + vq_err(vq, "Failed to enable notification at %p: %d\n",
> +		&vq->device_event->desc_event_flags, r);
>  }
>  
>  static void vhost_disable_notify_split(struct vhost_dev *dev,
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 8a9df4f..02d7a36 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -96,8 +96,14 @@ struct vhost_virtqueue {
>   struct vring_desc __user *desc;
>   struct vring_desc_packed __user *desc_packed;

Do you think it'd be better to name the desc type as
struct vring_packed_desc? And it will be consistent
with other names, like:

struct 

Re: [PATCH net-next 0/6] rxrpc: Fixes

2018-03-29 Thread David Miller
From: David Howells 
Date: Thu, 29 Mar 2018 23:25:46 +0100

> David Miller  wrote:
> 
>> David, this GIT URL has tons of unrelated changes.  It seems to bring in
>> the parts of Linus's tree that haven't proagated to 'net' yet.
> 
> Sorry about that, I rebased on the wrong branch by accident.
> 
> I've got some more fixes.  Should I just give the lot to you to pull into your
> net-next tree, given that the merge window may well open Sunday?

That's up to you.


[PATCH net] vlan: also check phy_driver ts_info for vlan's real device

2018-03-29 Thread Hangbin Liu
Just like function ethtool_get_ts_info(), we should also consider the
phy_driver ts_info call back. For example, driver dp83640.

Fixes: 37dd9255b2f6 ("vlan: Pass ethtool get_ts_info queries to real device.")
Acked-by: Richard Cochran 
Signed-off-by: Hangbin Liu 
---
 net/8021q/vlan_dev.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index f7e83f6..236452e 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include <linux/phy.h>
 #include 
 #include 
 
@@ -665,8 +666,11 @@ static int vlan_ethtool_get_ts_info(struct net_device *dev,
 {
const struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
const struct ethtool_ops *ops = vlan->real_dev->ethtool_ops;
+   struct phy_device *phydev = vlan->real_dev->phydev;
 
-   if (ops->get_ts_info) {
+   if (phydev && phydev->drv && phydev->drv->ts_info) {
+return phydev->drv->ts_info(phydev, info);
+   } else if (ops->get_ts_info) {
return ops->get_ts_info(vlan->real_dev, info);
} else {
info->so_timestamping = SOF_TIMESTAMPING_RX_SOFTWARE |
-- 
2.5.5



Re: RFC on writel and writel_relaxed

2018-03-29 Thread Benjamin Herrenschmidt
On Thu, 2018-03-29 at 09:56 -0400, Sinan Kaya wrote:
> On 3/28/2018 11:55 AM, David Miller wrote:
> > From: Benjamin Herrenschmidt 
> > Date: Thu, 29 Mar 2018 02:13:16 +1100
> > 
> > > Let's fix all archs, it's way easier than fixing all drivers. Half of
> > > the archs are unused or dead anyway.
> > 
> > Agreed.
> > 
> 
> I pinged most of the maintainers yesterday.
> Which arches do we care about these days?
> I have not been paying attention any other architecture besides arm64.

Thanks for going through that exercise !

Once sparc, s390, microblaze and mips reply, I think we'll have a good
coverage, maybe riscv is to put in that lot too.
 
Cheers,
Ben.

> 
> arch  status  detail
> ---   
> 
> alpha question sent
> arc   question sent   ys...@users.sourceforge.jp will fix it.
> arm   no issues
> arm64 no issues
> blackfin  question sent   about to be removed
> c6x   question sent
> cris  question sent
> frv
> h8300 question sent
> hexagon   question sent
> ia64  no issues   confirmed by Tony Luck
> m32r
> m68k  question sent
> metag
> microblazequestion sent
> mips  question sent
> mn10300   question sent
> nios2 question sent
> openrisc  no issues   sho...@gmail.com says should no issues
> pariscno issues   grantgrund...@gmail.com says 
> most probably no problem but still looking
> powerpc   no issues
> riscv question sent
> s390  question sent
> score question sent
> shquestion sent
> sparc question sent
> tile  question sent
> unicore32 question sent
> x86   no issues
> xtensaquestion sent
> 
> 


Re: [PATCH net] net/ipv6: Fix route leaking between VRFs

2018-03-29 Thread David Ahern
On 3/29/18 6:44 PM, David Ahern wrote:
> Donald reported that IPv6 route leaking between VRFs is not working.
> The root cause is the strict argument in the call to rt6_lookup when
> validating the nexthop spec.
> 
> ip6_route_check_nh validates the gateway and device (if given) of a
> route spec. It in turn could call rt6_lookup (e.g., lookup in a given
> table did not succeed so it falls back to a full lookup) and if so
> sets the strict argument to 1. That means if the egress device is given,
> the route lookup needs to return a result with the same device. This
> strict requirement does not work with VRFs (IPv4 or IPv6) because the
> oif in the flow struct is overridden with the index of the VRF device
> to trigger a match on the l3mdev rule and force the lookup to its table.
> 
> The right long term solution is to add an l3mdev index to the flow
> struct such that the oif is not overridden. That solution will not
> backport well, so this patch aims for a simpler solution to relax the
> strict argument if the route spec device is an l3mdev slave. As done
> in other places, use the FLOWI_FLAG_SKIP_NH_OIF to know that the
> RT6_LOOKUP_F_IFACE flag needs to be removed.
> 

Forgot the fixes tag:
Fixes: ca254490c8df ("net: Add VRF support to IPv6 stack")

Dave: I can resend if needed. Key backports are to 4.14 and 4.9. Those
are the only LTS releases affected.

> Reported-by: Donald Sharp 
> Signed-off-by: David Ahern 
> ---
>  net/ipv6/route.c | 3 +++
>  1 file changed, 3 insertions(+)
> 
> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> index b33d057ac5eb..fc74352fac12 100644
> --- a/net/ipv6/route.c
> +++ b/net/ipv6/route.c
> @@ -919,6 +919,9 @@ static struct rt6_info *ip6_pol_route_lookup(struct net 
> *net,
>   struct rt6_info *rt, *rt_cache;
>   struct fib6_node *fn;
>  
> + if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
> + flags &= ~RT6_LOOKUP_F_IFACE;
> +
>   rcu_read_lock();
>   fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
>  restart:
> 



Re: [PATCH net-next] bridge: Allow max MTU when multiple VLANs present

2018-03-29 Thread Toshiaki Makita
On 2018/03/30 1:49, Roopa Prabhu wrote:
> On Thu, Mar 22, 2018 at 9:53 PM, Roopa Prabhu  
> wrote:
>> On Thu, Mar 22, 2018 at 8:34 AM, Chas Williams <3ch...@gmail.com> wrote:
>>> If the bridge is allowing multiple VLANs, some VLANs may have
>>> different MTUs.  Instead of choosing the minimum MTU for the
>>> bridge interface, choose the maximum MTU of the bridge members.
>>> With this the user only needs to set a larger MTU on the member
>>> ports that are participating in the large MTU VLANS.
>>>
>>> Signed-off-by: Chas Williams <3ch...@gmail.com>
>>> ---
>>
>> Acked-by: Roopa Prabhu 
>>
>> This or an equivalent fix is necessary: as stated above, today the
>> bridge mtu capped at min port mtu limits all
>> vlan devices on top of the vlan filtering bridge to min port mtu.
> 
> 
> On further thought, since this patch changes default behavior, it may
> upset people. ie with this patch, a vlan device
> on the bridge by default will now use the  bridge max mtu and that
> could cause unexpected drops in the bridge driver
> if the xmit port had a lower mtu. This may surprise users.
> 
> The other equivalent fix i was thinking about is to keep the default
> behavior as is, and allow a max mtu to be
> configured on the bridge. This will allow a sys admin to fix the
> current mtu limitations if
> deployments require it.
> 
> we will submit an incremental patch to re-work this patch to restore
> default behavior.

+1

This makes sense to me.

-- 
Toshiaki Makita



Re: [PATCH net-next] net/mlx4_en: CHECKSUM_COMPLETE support for fragments

2018-03-29 Thread Saeed Mahameed
On Thu, Mar 29, 2018 at 11:07 AM, David Miller  wrote:
> From: Eric Dumazet 
> Date: Tue, 27 Mar 2018 14:21:14 -0700
>
>> Refine the RX check summing handling to propagate the
>> hardware provided checksum so that we do not have to
>> compute it later in software.
>>
>> Signed-off-by: Eric Dumazet 
>
> Tariq, please review.

Hi Dave, Eric.

The patch looks ok but i would let tariq review it and decide if he
wants to run full regression coverage on it
since it changes the default behavior of the driver's checksum reporting.

It is already weekend for him and for the team in Israel, and i don't
think this can be handled before next week :).
So it is really up to you guys.

Thanks,
Saeed.


[PATCH net] net/ipv6: Fix route leaking between VRFs

2018-03-29 Thread David Ahern
Donald reported that IPv6 route leaking between VRFs is not working.
The root cause is the strict argument in the call to rt6_lookup when
validating the nexthop spec.

ip6_route_check_nh validates the gateway and device (if given) of a
route spec. It in turn could call rt6_lookup (e.g., lookup in a given
table did not succeed so it falls back to a full lookup) and if so
sets the strict argument to 1. That means if the egress device is given,
the route lookup needs to return a result with the same device. This
strict requirement does not work with VRFs (IPv4 or IPv6) because the
oif in the flow struct is overridden with the index of the VRF device
to trigger a match on the l3mdev rule and force the lookup to its table.

The right long term solution is to add an l3mdev index to the flow
struct such that the oif is not overridden. That solution will not
backport well, so this patch aims for a simpler solution to relax the
strict argument if the route spec device is an l3mdev slave. As done
in other places, use the FLOWI_FLAG_SKIP_NH_OIF to know that the
RT6_LOOKUP_F_IFACE flag needs to be removed.

Reported-by: Donald Sharp 
Signed-off-by: David Ahern 
---
 net/ipv6/route.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index b33d057ac5eb..fc74352fac12 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -919,6 +919,9 @@ static struct rt6_info *ip6_pol_route_lookup(struct net 
*net,
struct rt6_info *rt, *rt_cache;
struct fib6_node *fn;
 
+   if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
+   flags &= ~RT6_LOOKUP_F_IFACE;
+
rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 restart:
-- 
2.11.0



Re: [PATCH net-next] net/mlx4_en: CHECKSUM_COMPLETE support for fragments

2018-03-29 Thread Saeed Mahameed
On Tue, Mar 27, 2018 at 2:21 PM, Eric Dumazet  wrote:
> Refine the RX check summing handling to propagate the
> hardware provided checksum so that we do not have to
> compute it later in software.
>
> Signed-off-by: Eric Dumazet 
> Cc: Willem de Bruijn 
> Cc: Tariq Toukan 
> ---
>  drivers/net/ethernet/mellanox/mlx4/en_rx.c | 10 --
>  1 file changed, 4 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c 
> b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> index 
> 05787efef492b1c0c6ce540ef73647fad91ce282..5c613c6663da51a4ae792eeb4d8956b54655786b
>  100644
> --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> @@ -821,14 +821,12 @@ int mlx4_en_process_rx_cq(struct net_device *dev, 
> struct mlx4_en_cq *cq, int bud
> skb_record_rx_queue(skb, cq_ring);
>
> if (likely(dev->features & NETIF_F_RXCSUM)) {
> -   if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_TCP |
> - MLX4_CQE_STATUS_UDP)) {
> +   if ((cqe->status & cpu_to_be16(MLX4_CQE_STATUS_TCP |
> +  MLX4_CQE_STATUS_UDP)) 
> &&
> +   (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPOK)) 
> &&
> +   cqe->checksum == cpu_to_be16(0x)) {
> bool l2_tunnel;
>

LGTM, this code even aligns better with the mlx4 HW documentation:

"When L4_CSUM field is not supported, L4 checksum for TCP/UDP packets
can be validated by: (IP_OK && (TCP || UDP)) && (checksum ==
0x))."

in the code we don't even consider L4_CSUM at the moment, As a future
patch, it could be a nice acceleration for the above 3 steps
condition.

Small comment, if we expect that  cqe->checksum is NOT likely to be
0x for UDP/TCP packets, maybe it is better performance wise to
move (cqe->checksum == cpu_to_be16(0x)) to be evaluated first in
the condition.

> -   if (!((cqe->status & 
> cpu_to_be16(MLX4_CQE_STATUS_IPOK)) &&
> - cqe->checksum == cpu_to_be16(0x)))
> -   goto csum_none;
> -
> l2_tunnel = (dev->hw_enc_features & 
> NETIF_F_RXCSUM) &&
> (cqe->vlan_my_qpn & 
> cpu_to_be32(MLX4_CQE_L2_TUNNEL));
> ip_summed = CHECKSUM_UNNECESSARY;
> --
> 2.17.0.rc1.321.gba9d0f2565-goog
>


[PATCH v2 bpf-next 1/2] lib/scatterlist: add sg_init_marker() helper

2018-03-29 Thread Prashant Bhole
sg_init_marker initializes sg_magic in the sg table and calls
sg_mark_end() on the last entry of the table. This can be useful to
avoid memset in sg_init_table() when scatterlist is already zeroed out

For example: when scatterlist is embedded inside other struct and that
container struct is zeroed out

Suggested-by: Daniel Borkmann 
Signed-off-by: Prashant Bhole 
---
 include/linux/scatterlist.h | 18 ++
 lib/scatterlist.c   |  9 +
 2 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index 22b2131bcdcd..aa5d4eb725f5 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -248,6 +248,24 @@ static inline void *sg_virt(struct scatterlist *sg)
return page_address(sg_page(sg)) + sg->offset;
 }
 
+/**
+ * sg_init_marker - Initialize markers in sg table
+ * @sgl:  The SG table
+ * @nents:Number of entries in table
+ *
+ **/
+static inline void sg_init_marker(struct scatterlist *sgl,
+ unsigned int nents)
+{
+#ifdef CONFIG_DEBUG_SG
+   unsigned int i;
+
+   for (i = 0; i < nents; i++)
+   sgl[i].sg_magic = SG_MAGIC;
+#endif
+   sg_mark_end(&sgl[nents - 1]);
+}
+
 int sg_nents(struct scatterlist *sg);
 int sg_nents_for_len(struct scatterlist *sg, u64 len);
 struct scatterlist *sg_next(struct scatterlist *);
diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index 53728d391d3a..06dad7a072fd 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -132,14 +132,7 @@ EXPORT_SYMBOL(sg_last);
 void sg_init_table(struct scatterlist *sgl, unsigned int nents)
 {
memset(sgl, 0, sizeof(*sgl) * nents);
-#ifdef CONFIG_DEBUG_SG
-   {
-   unsigned int i;
-   for (i = 0; i < nents; i++)
-   sgl[i].sg_magic = SG_MAGIC;
-   }
-#endif
-   sg_mark_end(&sgl[nents - 1]);
+   sg_init_marker(sgl, nents);
 }
 EXPORT_SYMBOL(sg_init_table);
 
-- 
2.14.3




[PATCH v2 bpf-next 0/2] sockmap: fix sg api usage

2018-03-29 Thread Prashant Bhole
These patches fix sg api usage in sockmap. Previously sockmap didn't
use sg_init_table(), which caused hitting BUG_ON in sg api, when
CONFIG_DEBUG_SG is enabled

v1: added sg_init_table() calls wherever needed.

v2:
- Patch1 adds new helper function in sg api. sg_init_marker()
- Patch2 sg_init_marker() and sg_init_table() in appropriate places

Backgroud:
While reviewing v1, John Fastabend raised a valid point about
unnecessary memset in sg_init_table() because sockmap uses sg table
which embedded in a struct. As enclosing struct is zeroed out, there
is unnecessary memset in sg_init_table.

So Daniel Borkmann suggested to define another static inline function
in scatterlist.h which only initializes sg_magic. Also this function 
will be called from sg_init_table. From this suggestion I defined a
function sg_init_marker() which sets sg_magic and calls sg_mark_end()

Prashant Bhole (2):
  lib/scatterlist: add sg_init_marker() helper
  bpf: sockmap: initialize sg table entries properly

 include/linux/scatterlist.h | 18 ++
 kernel/bpf/sockmap.c| 13 -
 lib/scatterlist.c   |  9 +
 3 files changed, 27 insertions(+), 13 deletions(-)

-- 
2.14.3




[PATCH v2 bpf-next 2/2] bpf: sockmap: initialize sg table entries properly

2018-03-29 Thread Prashant Bhole
When CONFIG_DEBUG_SG is set, sg->sg_magic is initialized in
sg_init_table() and it is verified in sg api while navigating. We hit
BUG_ON when magic check is failed.

In functions sg_tcp_sendpage and sg_tcp_sendmsg, the struct containing
the scatterlist is already zeroed out. So to avoid extra memset, we
use sg_init_marker() to initialize sg_magic.

Fixed following things:
- In bpf_tcp_sendpage: initialize sg using sg_init_marker
- In bpf_tcp_sendmsg: Replace sg_init_table with sg_init_marker
- In bpf_tcp_push: Replace memset with sg_init_table where consumed
  sg entry needs to be re-initialized.

Signed-off-by: Prashant Bhole 
---
 kernel/bpf/sockmap.c | 13 -
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 69c5bccabd22..b4f01656c452 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -312,7 +312,7 @@ static int bpf_tcp_push(struct sock *sk, int apply_bytes,
md->sg_start++;
if (md->sg_start == MAX_SKB_FRAGS)
md->sg_start = 0;
-   memset(sg, 0, sizeof(*sg));
+   sg_init_table(sg, 1);
 
if (md->sg_start == md->sg_end)
break;
@@ -656,7 +656,7 @@ static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr 
*msg, size_t size)
}
 
sg = md.sg_data;
-   sg_init_table(sg, MAX_SKB_FRAGS);
+   sg_init_marker(sg, MAX_SKB_FRAGS);
rcu_read_unlock();
 
lock_sock(sk);
@@ -763,10 +763,14 @@ static int bpf_tcp_sendpage(struct sock *sk, struct page 
*page,
 
lock_sock(sk);
 
-   if (psock->cork_bytes)
+   if (psock->cork_bytes) {
m = psock->cork;
-   else
+    sg = &m->sg_data[m->sg_end];
+   } else {
    m = &md;
+   sg = m->sg_data;
+   sg_init_marker(sg, MAX_SKB_FRAGS);
+   }
 
/* Catch case where ring is full and sendpage is stalled. */
if (unlikely(m->sg_end == m->sg_start &&
@@ -774,7 +778,6 @@ static int bpf_tcp_sendpage(struct sock *sk, struct page 
*page,
goto out_err;
 
psock->sg_size += size;
-   sg = &m->sg_data[m->sg_end];
sg_set_page(sg, page, size, offset);
get_page(page);
m->sg_copy[m->sg_end] = true;
-- 
2.14.3




Re: Regression in 4.16-rc7 - ipsec vpn broken

2018-03-29 Thread Derek Robson
Thanks, that patch has solved issue.


On Thu, Mar 29, 2018 at 7:51 PM, Steffen Klassert
 wrote:
> Please always make sure to Cc netdev@vger.kernel.org
> on networking problems.
>
> On Wed, Mar 28, 2018 at 10:21:32PM +, Derek Robson wrote:
>> The ipsec VPN is broken in 4.16-rc7 and seem to have been broken in all of
>> 4.15
>>
>> connecting from an iphone seems to give a timeout.
>>
>>
>> A bisect brings me to this commit as the one that is the issue.
>>
>> commit: acf568ee859f098279eadf551612f103afdacb4e  (xfrm: Reinject
>> transport-mode packets through tasklet)
>
> I have a fix queued for this commit in the ipsec tree.
>
> Can you please try if the patch below fixes your problems?
>
> Thanks!
>
> Subject: [PATCH] xfrm: Fix transport mode skb control buffer usage.
>
> A recent commit introduced a new struct xfrm_trans_cb
> that is used with the sk_buff control buffer. Unfortunately
> it placed the structure in front of the control buffer and
> overlooked that the IPv4/IPv6 control buffer is still needed
> for some layer 4 protocols. As a result the IPv4/IPv6 control
> buffer is overwritten with this structure. Fix this by setting
> an appropriate header in front of the structure.
>
> Fixes: acf568ee859f ("xfrm: Reinject transport-mode packets ...")
> Signed-off-by: Steffen Klassert 
> ---
>  net/xfrm/xfrm_input.c | 6 ++
>  1 file changed, 6 insertions(+)
>
> diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
> index 1472c0857975..81788105c164 100644
> --- a/net/xfrm/xfrm_input.c
> +++ b/net/xfrm/xfrm_input.c
> @@ -26,6 +26,12 @@ struct xfrm_trans_tasklet {
>  };
>
>  struct xfrm_trans_cb {
> +   union {
> +   struct inet_skb_parm    h4;
> +#if IS_ENABLED(CONFIG_IPV6)
> +   struct inet6_skb_parm   h6;
> +#endif
> +   } header;
> int (*finish)(struct net *net, struct sock *sk, struct sk_buff *skb);
>  };
>
> --
> 2.14.1
>


Re: [PATCH bpf-next] bpf: sockmap: initialize sg table entries properly

2018-03-29 Thread Prashant Bhole



On 3/28/2018 5:51 PM, Daniel Borkmann wrote:

On 03/28/2018 08:18 AM, Prashant Bhole wrote:

On 3/27/2018 6:05 PM, Daniel Borkmann wrote:

On 03/27/2018 10:41 AM, Prashant Bhole wrote:

On 3/27/2018 12:15 PM, John Fastabend wrote:

On 03/25/2018 11:54 PM, Prashant Bhole wrote:

When CONFIG_DEBUG_SG is set, sg->sg_magic is initialized to SG_MAGIC,
when sg table is initialized using sg_init_table(). Magic is checked
while navigating the scatterlist. We hit BUG_ON when magic check is
failed.

Fixed following things:
- Initialization of sg table in bpf_tcp_sendpage() was missing,
     initialized it using sg_init_table()

- bpf_tcp_sendmsg() initializes sg table using sg_init_table() before
     entering the loop, but further consumed sg entries are initialized
     using memset. Fixed it by replacing memset with sg_init_table() in
     function bpf_tcp_push()

Signed-off-by: Prashant Bhole 
---
    kernel/bpf/sockmap.c | 11 +++
    1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 69c5bccabd22..8a848a99d768 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -312,7 +312,7 @@ static int bpf_tcp_push(struct sock *sk, int apply_bytes,
    md->sg_start++;
    if (md->sg_start == MAX_SKB_FRAGS)
    md->sg_start = 0;
-    memset(sg, 0, sizeof(*sg));
+    sg_init_table(sg, 1);


Looks OK here.


      if (md->sg_start == md->sg_end)
    break;
@@ -763,10 +763,14 @@ static int bpf_tcp_sendpage(struct sock *sk, struct page 
*page,
      lock_sock(sk);
    -    if (psock->cork_bytes)
+    if (psock->cork_bytes) {
    m = psock->cork;
-    else
+    sg = >sg_data[m->sg_end];
+    } else {
    m = &md;
+    sg = m->sg_data;
+    sg_init_table(sg, MAX_SKB_FRAGS);


sg_init_table() does an unnecessary memset() though. We
probably either want a new scatterlist API or just open
code this,

#ifdef CONFIG_DEBUG_SG
{
  unsigned int i;
  for (i = 0; i < nents; i++)
  sgl[i].sg_magic = SG_MAGIC;
}


Similar sg_init_table() is present in bpf_tcp_sendmsg().
I agree that it causes unnecessary memset, but I don't agree with open coded 
fix.


But then lets fix is properly and add a static inline helper to the
include/linux/scatterlist.h header like ...

static inline void sg_init_debug_marker(struct scatterlist *sgl,
     unsigned int nents)
{
#ifdef CONFIG_DEBUG_SG
 unsigned int i;

 for (i = 0; i < nents; i++)
     sgl[i].sg_magic = SG_MAGIC;
#endif
}

... and reuse it in all the places that would otherwise open-code this,
as well as sg_init_table():

void sg_init_table(struct scatterlist *sgl, unsigned int nents)
{
  memset(sgl, 0, sizeof(*sgl) * nents);
 sg_init_debug_marker(sgl, nents);
  sg_mark_end([nents - 1]);
}

This would be a lot cleaner than having this duplicated in various places.


Daniel, This is a good suggestion. Is it ok if I submit both changes in
a patch series?


Sure, that's fine.


How scatterlist related changes will be picked up by other subsystems?


Once this gets applied into bpf-next, this will be pushed to net-next tree,
and during the merge window net-next will be pulled into Linus' tree if this
is what you are asking. Then also other subsystems outside of bpf/networking
can make use of the sg_init_debug_marker() helper if suitable for their
situation.


Thanks. I am submitting V2 soon.

-Prashant



Re: [PATCH v2 bpf-next 3/9] bpf: Hooks for sys_bind

2018-03-29 Thread Alexei Starovoitov

On 3/29/18 4:06 PM, Daniel Borkmann wrote:

On 03/28/2018 05:41 AM, Alexei Starovoitov wrote:
[...]

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index e8c7fad8c329..2dec266507dc 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -450,6 +450,13 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, 
int addr_len)
if (addr_len < sizeof(struct sockaddr_in))
goto out;

+   /* BPF prog is run before any checks are done so that if the prog
+* changes context in a wrong way it will be caught.
+*/
+   err = BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr);
+   if (err)
+   goto out;
+


Should the hook not come at the very beginning?

/* If the socket has its own bind function then use it. (RAW) */
if (sk->sk_prot->bind) {
err = sk->sk_prot->bind(sk, uaddr, addr_len);
goto out;
}
err = -EINVAL;
if (addr_len < sizeof(struct sockaddr_in))
goto out;

/* BPF prog is run before any checks are done so that if the prog
 * changes context in a wrong way it will be caught.
 */
err = BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr);
if (err)
goto out;

E.g. when you have v4/v6 ping or raw sockets used from language runtimes
or apps, then they provide their own bind handler here in kernel, thus any
bind rewrite won't be caught for them. Shouldn't this be covered as well
and the BPF_CGROUP_RUN_PROG_INET4_BIND() come first?


the reason for hook to be called after
'if (addr_len < sizeof(struct sockaddr_in))' check is that
'struct bpf_sock_addr' rewrite assumes either sockaddr_in
or sockaddr_in6 when accessing fields.

For example, raw_bind(s) have a variety of sockaddr_* types that
we cannot recognize from bpf side without introducing special
ctx rewriter for each possible protocol and different bpf ctx for each.

That's why the hooks are called INET4_BIND and INET6_BIND and
later in __cgroup_bpf_run_filter_sock_addr() we do:
if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
return 0;

I don't think it's possible to have one generic bind hook
for all sk_proto. What do we pass into bpf prog as context?
They all have different sockaddr*.
Consider sockaddr_sco vs sockaddr_can, etc.
In the future this feature can be extend with per-protocol bind hooks
(if really necessary), but the hooks probably will be inside specific
raw_bind() functions instead of here.

The crazy alternative approach would be to pass blob of bytes into
bpf prog as ctx and let program parse it differently depending
on protocol, but then we'd need to make 'struct bpf_sock_addr'
variable length or size it up to the largest possible sockaddr_*.
Sanitizing fields becomes complex and so on. That won't be clean.



Re: [PATCH v2 bpf-next 3/9] bpf: Hooks for sys_bind

2018-03-29 Thread Daniel Borkmann
On 03/28/2018 05:41 AM, Alexei Starovoitov wrote:
[...]
> diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
> index e8c7fad8c329..2dec266507dc 100644
> --- a/net/ipv4/af_inet.c
> +++ b/net/ipv4/af_inet.c
> @@ -450,6 +450,13 @@ int inet_bind(struct socket *sock, struct sockaddr 
> *uaddr, int addr_len)
>   if (addr_len < sizeof(struct sockaddr_in))
>   goto out;
>  
> + /* BPF prog is run before any checks are done so that if the prog
> +  * changes context in a wrong way it will be caught.
> +  */
> + err = BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr);
> + if (err)
> + goto out;
> +

Should the hook not come at the very beginning?

/* If the socket has its own bind function then use it. (RAW) */
if (sk->sk_prot->bind) {
err = sk->sk_prot->bind(sk, uaddr, addr_len);
goto out;
}
err = -EINVAL;
if (addr_len < sizeof(struct sockaddr_in))
goto out;

/* BPF prog is run before any checks are done so that if the prog
 * changes context in a wrong way it will be caught.
 */
err = BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr);
if (err)
goto out;

E.g. when you have v4/v6 ping or raw sockets used from language runtimes
or apps, then they provide their own bind handler here in kernel, thus any
bind rewrite won't be caught for them. Shouldn't this be covered as well
and the BPF_CGROUP_RUN_PROG_INET4_BIND() come first?

>   if (addr->sin_family != AF_INET) {
>   /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET)
>* only if s_addr is INADDR_ANY.
> diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
> index dbbe04018813..fa24e3f06ac6 100644
> --- a/net/ipv6/af_inet6.c
> +++ b/net/ipv6/af_inet6.c
> @@ -295,6 +295,13 @@ int inet6_bind(struct socket *sock, struct sockaddr 
> *uaddr, int addr_len)
>   if (addr_len < SIN6_LEN_RFC2133)
>   return -EINVAL;
>  
> + /* BPF prog is run before any checks are done so that if the prog
> +  * changes context in a wrong way it will be caught.
> +  */
> + err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr);
> + if (err)
> + return err;
> +
>   if (addr->sin6_family != AF_INET6)
>   return -EAFNOSUPPORT;

(Same here?)


[PATCH net-next v1 1/2] dt-bindings: net: meson-dwmac: add support for the Meson8m2 SoC

2018-03-29 Thread Martin Blumenstingl
The Meson8m2 SoC uses a similar (potentially even identical) register
layout for the dwmac glue as Meson8b and GXBB. Unfortunately there is no
documentation available.
Testing shows that both, RMII and RGMII PHYs are working if they are
configured as on Meson8b. Add a new compatible string to the
documentation so differences (if there are any) between Meson8m2 and the
other SoCs can be taken care of within the driver.

Signed-off-by: Martin Blumenstingl 
---
 Documentation/devicetree/bindings/net/meson-dwmac.txt | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/meson-dwmac.txt 
b/Documentation/devicetree/bindings/net/meson-dwmac.txt
index 354dd9896bb5..61cada22ae6c 100644
--- a/Documentation/devicetree/bindings/net/meson-dwmac.txt
+++ b/Documentation/devicetree/bindings/net/meson-dwmac.txt
@@ -9,6 +9,7 @@ Required properties on all platforms:
 - compatible:  Depending on the platform this should be one of:
- "amlogic,meson6-dwmac"
- "amlogic,meson8b-dwmac"
+   - "amlogic,meson8m2-dwmac"
- "amlogic,meson-gxbb-dwmac"
Additionally "snps,dwmac" and any applicable more
detailed version number described in net/stmmac.txt
@@ -19,13 +20,13 @@ Required properties on all platforms:
configuration (for example the PRG_ETHERNET register range
on Meson8b and newer)
 
-Required properties on Meson8b and newer:
+Required properties on Meson8b, Meson8m2, GXBB and newer:
 - clock-names: Should contain the following:
- "stmmaceth" - see stmmac.txt
- "clkin0" - first parent clock of the internal mux
- "clkin1" - second parent clock of the internal mux
 
-Optional properties on Meson8b and newer:
+Optional properties on Meson8b, Meson8m2, GXBB and newer:
 - amlogic,tx-delay-ns: The internal RGMII TX clock delay (provided
by this driver) in nanoseconds. Allowed values
are: 0ns, 2ns, 4ns, 6ns.
-- 
2.16.3



[PATCH net-next v1 0/2] Meson8m2 support for dwmac-meson8b

2018-03-29 Thread Martin Blumenstingl
The Meson8m2 SoC is an updated version of the Meson8 SoC. Some of the
peripherals are shared with Meson8b (for example the watchdog registers
and the internal temperature sensor calibration procedure).
Meson8m2 also seems to include the same Gigabit MAC register layout as
Meson8b.

The registers in the Amlogic dwmac "glue" seem identical between Meson8b
and Meson8m2. Manual testing seems to confirm this.

To be extra-safe a new compatible string is added because there's no
(public) documentation on the Meson8m2 SoC. This will allow us to
implement any SoC-specific variations later on (if needed).


Martin Blumenstingl (2):
  dt-bindings: net: meson-dwmac: add support for the Meson8m2 SoC
  net: stmmac: dwmac-meson8b: Add support for the Meson8m2 SoC

 Documentation/devicetree/bindings/net/meson-dwmac.txt | 5 +++--
 drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c   | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

-- 
2.16.3



[PATCH net-next v1 2/2] net: stmmac: dwmac-meson8b: Add support for the Meson8m2 SoC

2018-03-29 Thread Martin Blumenstingl
The Meson8m2 SoC uses a similar (potentially even identical) register
layout as the Meson8b and GXBB SoCs for the dwmac glue.
Add a new compatible string and update the module description to
indicate support for these SoCs.

Signed-off-by: Martin Blumenstingl 
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c 
b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c
index 2d5d4aea3bcb..7cb794094a70 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c
@@ -1,5 +1,5 @@
 /*
- * Amlogic Meson8b and GXBB DWMAC glue layer
+ * Amlogic Meson8b, Meson8m2 and GXBB DWMAC glue layer
  *
  * Copyright (C) 2016 Martin Blumenstingl 
  *
@@ -318,6 +318,7 @@ static int meson8b_dwmac_probe(struct platform_device *pdev)
 
 static const struct of_device_id meson8b_dwmac_match[] = {
{ .compatible = "amlogic,meson8b-dwmac" },
+   { .compatible = "amlogic,meson8m2-dwmac" },
{ .compatible = "amlogic,meson-gxbb-dwmac" },
{ }
 };
@@ -335,5 +336,5 @@ static struct platform_driver meson8b_dwmac_driver = {
 module_platform_driver(meson8b_dwmac_driver);
 
 MODULE_AUTHOR("Martin Blumenstingl ");
-MODULE_DESCRIPTION("Amlogic Meson8b and GXBB DWMAC glue layer");
+MODULE_DESCRIPTION("Amlogic Meson8b, Meson8m2 and GXBB DWMAC glue layer");
 MODULE_LICENSE("GPL v2");
-- 
2.16.3



Re: [PATCH v2 bpf-next 0/3] bpf/verifier: subprog/func_call simplifications

2018-03-29 Thread Edward Cree
On 29/03/18 23:44, Edward Cree wrote:
> By storing subprog boundaries as a subprogno mark on each insn, rather than
>  a start (and implicit end) for each subprog, we collect a number of gains:
> * More efficient determination of which subprog contains a given insn, and
>   thus of find_subprog (which subprog begins at a given insn).
> * Number of verifier "full recursive walk" passes is reduced, since most of
>   the work is done in the main insn walk (do_check()).  Leftover work in
>   other passes is mostly linear scans (O(insn_cnt)) or, in the case of
>   check_max_stack_depth(), a topological sort (O(subprog_cnt)).
>
> Some other changes were also included to support this:
> * Per-subprog info is stored in env->subprog_info, an array of structs,
>   rather than several arrays with a common index.
> * Call graph is now stored in the new bpf_subprog_info struct; used here
>   for check_max_stack_depth() but may have other uses too.
>
> Along with this, patch #3 puts parent pointers (used by liveness analysis)
>  in the registers instead of the func_state or verifier_state, so that we
>  don't need skip_callee() machinery.  This also does the right thing for
>  stack slots, so they don't need their own special handling for liveness
>  marking either.
Whoops, forgot to add:
Changes from v1:
* No longer allows non-contiguous subprogs.
* No longer allows LD_ABS|IND and pseudo-calls in the same prog.

> Edward Cree (3):
>   bpf/verifier: validate func_calls by marking at do_check() time
>   bpf/verifier: update selftests
>   bpf/verifier: per-register parent pointers
>
>  include/linux/bpf_verifier.h|  32 +-
>  kernel/bpf/verifier.c   | 631 
> +---
>  tools/testing/selftests/bpf/test_verifier.c |  51 ++-
>  3 files changed, 344 insertions(+), 370 deletions(-)
>



RE: [iproute2-next 0/2] tipc: changes to addressing structure

2018-03-29 Thread Jon Maloy

> -Original Message-
> From: netdev-ow...@vger.kernel.org [mailto:netdev-
> ow...@vger.kernel.org] On Behalf Of David Ahern
> Sent: Thursday, March 29, 2018 13:59
> To: Jon Maloy ; da...@davemloft.net;
> netdev@vger.kernel.org
> Cc: Mohan Krishna Ghanta Krishnamurthy
[..]
bit node addresses as an integer in hex format,
> >>i.e., we remove the assumption about an internal structure.
> >>
> >
> > Applied to iproute2-next. Thanks,
> >
> 
> BTW, please consider adding json support to tipc. It will make tipc command
> more robust to changes in output format.

Yes, we will do that.

///jon



[PATCH v2 bpf-next 2/3] bpf/verifier: update selftests

2018-03-29 Thread Edward Cree
Error messages for some bad programs have changed, partly because we now
 check for loops / out-of-bounds jumps before checking subprogs.
Also added a test ("calls: interleaved functions") to ensure that subprogs
 are required to be contiguous.
It wasn't entirely clear to me what "calls: wrong recursive calls" was
 meant to test for, since all of the JMP|CALL insns are unreachable.  I've
 changed it so that they are now reachable, which causes static back-edges
 to be detected (since that, like insn reachability, is now tested before
 subprog boundaries are determined).

Signed-off-by: Edward Cree 
---
 tools/testing/selftests/bpf/test_verifier.c | 51 ++---
 1 file changed, 32 insertions(+), 19 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_verifier.c 
b/tools/testing/selftests/bpf/test_verifier.c
index 3e7718b1a9ae..cc45a0b52439 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -646,7 +646,7 @@ static struct bpf_test tests[] = {
.insns = {
BPF_ALU64_REG(BPF_MOV, BPF_REG_0, BPF_REG_2),
},
-   .errstr = "not an exit",
+   .errstr = "jump out of range",
.result = REJECT,
},
{
@@ -9442,13 +9442,13 @@ static struct bpf_test tests[] = {
BPF_EXIT_INSN(),
},
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
-   .errstr = "last insn is not an exit or jmp",
+   .errstr = "insn 1 was in subprog 1, now 0",
.result = REJECT,
},
{
"calls: wrong recursive calls",
.insns = {
-   BPF_JMP_IMM(BPF_JA, 0, 0, 4),
+   BPF_JMP_IMM(BPF_JA, 0, 0, 3),
BPF_JMP_IMM(BPF_JA, 0, 0, 4),
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -2),
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -2),
@@ -9457,7 +9457,7 @@ static struct bpf_test tests[] = {
BPF_EXIT_INSN(),
},
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
-   .errstr = "jump out of range",
+   .errstr = "back-edge from insn",
.result = REJECT,
},
{
@@ -9508,7 +9508,7 @@ static struct bpf_test tests[] = {
BPF_EXIT_INSN(),
},
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
-   .errstr = "jump out of range",
+   .errstr = "insn 5 was in subprog 1, now 0",
.result = REJECT,
},
{
@@ -9787,7 +9787,7 @@ static struct bpf_test tests[] = {
BPF_EXIT_INSN(),
},
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
-   .errstr = "jump out of range from insn 1 to 4",
+   .errstr = "insn 5 was in subprog 1, now 0",
.result = REJECT,
},
{
@@ -9803,13 +9803,12 @@ static struct bpf_test tests[] = {
BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0),
BPF_MOV64_REG(BPF_REG_0, BPF_REG_7),
BPF_EXIT_INSN(),
-   BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
-   offsetof(struct __sk_buff, len)),
+   BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, 8),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -3),
BPF_EXIT_INSN(),
},
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
-   .errstr = "jump out of range from insn 11 to 9",
+   .errstr = "insn 9 was in subprog 1, now 2",
.result = REJECT,
},
{
@@ -9861,7 +9860,7 @@ static struct bpf_test tests[] = {
BPF_EXIT_INSN(),
},
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
-   .errstr = "invalid destination",
+   .errstr = "jump out of range from insn 2 to -1",
.result = REJECT,
},
{
@@ -9873,7 +9872,7 @@ static struct bpf_test tests[] = {
BPF_EXIT_INSN(),
},
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
-   .errstr = "invalid destination",
+   .errstr = "jump out of range from insn 2 to -2147483646",
.result = REJECT,
},
{
@@ -9886,7 +9885,7 @@ static struct bpf_test tests[] = {
BPF_EXIT_INSN(),
},
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
-   .errstr = "jump out of range",
+   .errstr = "insn 1 was in subprog 0, now 1",
.result = REJECT,
},
{
@@ -9899,7 +9898,7 @@ static struct bpf_test tests[] = {
BPF_EXIT_INSN(),
},

[PATCH v2 bpf-next 3/3] bpf/verifier: per-register parent pointers

2018-03-29 Thread Edward Cree
By giving each register its own liveness chain, we elide the skip_callee()
 logic.  Instead, each register's parent is the state it inherits from;
 both check_func_call() and prepare_func_exit() automatically connect
 reg states to the correct chain since when they copy the reg state across
 (r1-r5 into the callee as args, and r0 out as the return value) they also
 copy the parent pointer.

Signed-off-by: Edward Cree 
---
 include/linux/bpf_verifier.h |   8 +-
 kernel/bpf/verifier.c| 180 ++-
 2 files changed, 45 insertions(+), 143 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 3af3f9cceede..2ec31b388dd6 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -41,6 +41,7 @@ enum bpf_reg_liveness {
 };
 
 struct bpf_reg_state {
+   /* Ordering of fields matters.  See states_equal() */
enum bpf_reg_type type;
union {
/* valid when type == PTR_TO_PACKET */
@@ -59,7 +60,6 @@ struct bpf_reg_state {
 * came from, when one is tested for != NULL.
 */
u32 id;
-   /* Ordering of fields matters.  See states_equal() */
/* For scalar types (SCALAR_VALUE), this represents our knowledge of
 * the actual value.
 * For pointer types, this represents the variable part of the offset
@@ -76,15 +76,15 @@ struct bpf_reg_state {
s64 smax_value; /* maximum possible (s64)value */
u64 umin_value; /* minimum possible (u64)value */
u64 umax_value; /* maximum possible (u64)value */
+   /* parentage chain for liveness checking */
+   struct bpf_reg_state *parent;
/* Inside the callee two registers can be both PTR_TO_STACK like
 * R1=fp-8 and R2=fp-8, but one of them points to this function stack
 * while another to the caller's stack. To differentiate them 'frameno'
 * is used which is an index in bpf_verifier_state->frame[] array
 * pointing to bpf_func_state.
-* This field must be second to last, for states_equal() reasons.
 */
u32 frameno;
-   /* This field must be last, for states_equal() reasons. */
enum bpf_reg_liveness live;
 };
 
@@ -107,7 +107,6 @@ struct bpf_stack_state {
  */
 struct bpf_func_state {
struct bpf_reg_state regs[MAX_BPF_REG];
-   struct bpf_verifier_state *parent;
/* index of call instruction that called into this func */
int callsite;
/* stack frame number of this function state from pov of
@@ -129,7 +128,6 @@ struct bpf_func_state {
 struct bpf_verifier_state {
/* call stack tracking */
struct bpf_func_state *frame[MAX_CALL_FRAMES];
-   struct bpf_verifier_state *parent;
u32 curframe;
 };
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 33963357a7ef..edb2ec0da95c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -355,9 +355,9 @@ static int copy_stack_state(struct bpf_func_state *dst,
 /* do_check() starts with zero-sized stack in struct bpf_verifier_state to
  * make it consume minimal amount of memory. check_stack_write() access from
  * the program calls into realloc_func_state() to grow the stack size.
- * Note there is a non-zero 'parent' pointer inside bpf_verifier_state
- * which this function copies over. It points to previous bpf_verifier_state
- * which is never reallocated
+ * Note there is a non-zero parent pointer inside each reg of 
bpf_verifier_state
+ * which this function copies over. It points to corresponding reg in previous
+ * bpf_verifier_state which is never reallocated
  */
 static int realloc_func_state(struct bpf_func_state *state, int size,
  bool copy_old)
@@ -441,7 +441,6 @@ static int copy_verifier_state(struct bpf_verifier_state 
*dst_state,
dst_state->frame[i] = NULL;
}
dst_state->curframe = src->curframe;
-   dst_state->parent = src->parent;
for (i = 0; i <= src->curframe; i++) {
dst = dst_state->frame[i];
if (!dst) {
@@ -707,6 +706,7 @@ static void init_reg_state(struct bpf_verifier_env *env,
for (i = 0; i < MAX_BPF_REG; i++) {
mark_reg_not_init(env, regs, i);
regs[i].live = REG_LIVE_NONE;
+   regs[i].parent = NULL;
}
 
/* frame pointer */
@@ -781,74 +781,21 @@ static int add_subprog(struct bpf_verifier_env *env, int 
off)
return ret;
 }
 
-static
-struct bpf_verifier_state *skip_callee(struct bpf_verifier_env *env,
-  const struct bpf_verifier_state *state,
-  struct bpf_verifier_state *parent,
-  u32 regno)
-{
-   struct bpf_verifier_state *tmp = NULL;
-
-   /* 'parent' could be a state of caller and
-* 'state' could be a state of callee. In such case

[PATCH v2 bpf-next 1/3] bpf/verifier: validate func_calls by marking at do_check() time

2018-03-29 Thread Edward Cree
Removes a couple of passes from the verifier, one to check subprogs don't
 overlap etc., and one to compute max stack depth (which now is done by
 topologically sorting the call graph).  This improves the asymptotic
 complexity of a number of operations, for instance the max stack depth
 check is now O(n) in the number of subprogs, rather than having to walk
 every insn of every possible call chain.

Signed-off-by: Edward Cree 
---
 include/linux/bpf_verifier.h |  24 ++-
 kernel/bpf/verifier.c| 451 ---
 2 files changed, 267 insertions(+), 208 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 7e61c395fddf..3af3f9cceede 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -146,6 +146,7 @@ struct bpf_insn_aux_data {
s32 call_imm;   /* saved imm field of call insn 
*/
};
int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
+   u16 subprogno; /* subprog in which this insn resides, valid iff @seen */
bool seen; /* this insn was processed by the verifier */
 };
 
@@ -173,6 +174,15 @@ static inline bool bpf_verifier_log_needed(const struct 
bpf_verifier_log *log)
 
 #define BPF_MAX_SUBPROGS 256
 
+struct bpf_subprog_info {
+   /* which other subprogs does this one directly call? */
+   DECLARE_BITMAP(callees, BPF_MAX_SUBPROGS);
+   u32 start; /* insn idx of function entry point */
+   u16 stack_depth; /* max. stack depth used by this function */
+   u16 total_stack_depth; /* max. stack depth used by entire call chain */
+   u16 len; /* #insns in this subprog */
+};
+
 /* single container for all structs
  * one verifier_env per bpf_check() call
  */
@@ -189,11 +199,10 @@ struct bpf_verifier_env {
u32 id_gen; /* used to generate unique reg IDs */
bool allow_ptr_leaks;
bool seen_direct_write;
+   bool seen_pseudo_call;  /* populated at check_cfg() time */
struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
struct bpf_verifier_log log;
-   u32 subprog_starts[BPF_MAX_SUBPROGS];
-   /* computes the stack depth of each bpf function */
-   u16 subprog_stack_depth[BPF_MAX_SUBPROGS + 1];
+   struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS];
u32 subprog_cnt;
 };
 
@@ -202,11 +211,16 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, 
const char *fmt,
 __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
   const char *fmt, ...);
 
-static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env)
+static inline struct bpf_func_state *cur_frame(struct bpf_verifier_env *env)
 {
struct bpf_verifier_state *cur = env->cur_state;
 
-   return cur->frame[cur->curframe]->regs;
+   return cur->frame[cur->curframe];
+}
+
+static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env)
+{
+   return cur_frame(env)->regs;
 }
 
 int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8acd2207e412..33963357a7ef 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -736,111 +736,49 @@ enum reg_arg_type {
DST_OP_NO_MARK  /* same as above, check only, don't mark */
 };
 
-static int cmp_subprogs(const void *a, const void *b)
+static int find_subprog(struct bpf_verifier_env *env, int insn_idx)
 {
-   return *(int *)a - *(int *)b;
-}
-
-static int find_subprog(struct bpf_verifier_env *env, int off)
-{
-   u32 *p;
+   struct bpf_insn_aux_data *aux;
+   int insn_cnt = env->prog->len;
+   u32 subprogno;
 
-   p = bsearch(, env->subprog_starts, env->subprog_cnt,
-   sizeof(env->subprog_starts[0]), cmp_subprogs);
-   if (!p)
+   if (insn_idx >= insn_cnt || insn_idx < 0) {
+   verbose(env, "find_subprog of invalid insn_idx %d\n", insn_idx);
+   return -EINVAL;
+   }
+   aux = >insn_aux_data[insn_idx];
+   if (!aux->seen) /* haven't visited this line yet */
return -ENOENT;
-   return p - env->subprog_starts;
-
+   subprogno = aux->subprogno;
+   /* validate that we are at start of subprog */
+   if (env->subprog_info[subprogno].start != insn_idx) {
+   verbose(env, "insn_idx %d is in subprog %u but that starts at 
%d\n",
+   insn_idx, subprogno, 
env->subprog_info[subprogno].start);
+   return -EINVAL;
+   }
+   return subprogno;
 }
 
 static int add_subprog(struct bpf_verifier_env *env, int off)
 {
int insn_cnt = env->prog->len;
+   struct bpf_subprog_info *info;
int ret;
 
if (off >= insn_cnt || off < 0) {
verbose(env, "call to invalid destination\n");
return -EINVAL;
  

[PATCH v2 bpf-next 0/3] bpf/verifier: subprog/func_call simplifications

2018-03-29 Thread Edward Cree
By storing subprog boundaries as a subprogno mark on each insn, rather than
 a start (and implicit end) for each subprog, we collect a number of gains:
* More efficient determination of which subprog contains a given insn, and
  thus of find_subprog (which subprog begins at a given insn).
* Number of verifier "full recursive walk" passes is reduced, since most of
  the work is done in the main insn walk (do_check()).  Leftover work in
  other passes is mostly linear scans (O(insn_cnt)) or, in the case of
  check_max_stack_depth(), a topological sort (O(subprog_cnt)).

Some other changes were also included to support this:
* Per-subprog info is stored in env->subprog_info, an array of structs,
  rather than several arrays with a common index.
* Call graph is now stored in the new bpf_subprog_info struct; used here
  for check_max_stack_depth() but may have other uses too.

Along with this, patch #3 puts parent pointers (used by liveness analysis)
 in the registers instead of the func_state or verifier_state, so that we
 don't need skip_callee() machinery.  This also does the right thing for
 stack slots, so they don't need their own special handling for liveness
 marking either.

Edward Cree (3):
  bpf/verifier: validate func_calls by marking at do_check() time
  bpf/verifier: update selftests
  bpf/verifier: per-register parent pointers

 include/linux/bpf_verifier.h|  32 +-
 kernel/bpf/verifier.c   | 631 +---
 tools/testing/selftests/bpf/test_verifier.c |  51 ++-
 3 files changed, 344 insertions(+), 370 deletions(-)



Re: [PATCH 07/30] aio: add delayed cancel support

2018-03-29 Thread Al Viro
On Thu, Mar 29, 2018 at 10:33:05PM +0200, Christoph Hellwig wrote:
> The upcoming aio poll support would like to be able to complete the
> iocb inline from the cancellation context, but that would cause a
> double lock of ctx_lock with the current locking scheme.  Move the
> cancelation outside the context lock to avoid this reversal, which
> suits the existing usb gadgets users just fine as well (in fact
> both unconditionally disable irqs and thus seem broken without
> this change).
> 
> To make this safe aio_complete needs to check if this call should
> complete the iocb.  If it didn't the callers must not release any
> other resources.

Uh-oh...  What happens to existing users of kiocb_set_cancel_fn() now?
AFAICS, those guys will *not* get aio_kiocb freed at all in case of
io_cancel(2).  Look: we mark them with AIO_IOCB_CANCELLED and
call whatever ->ki_cancel() the driver has set.  Later the damn
thing calls ->ki_complete() (i.e. aio_complete_rw()), which calls
aio_complete(iocb, res, res2, 0) and gets false.  Nothing's freed,
struct file is leaked.

Frankly, the more I look at that, the less I like what you've done
with ->ki_cancel() overloading.  In regular case it's just accelerating
the call of ->ki_complete(), which will do freeing.  Here you have
->ki_cancel() free the damn thing, with the resulting need to play
silly buggers with locking, freeing logics in aio_complete(), etc.


Re: [PATCH net-next 0/6] rxrpc: Fixes

2018-03-29 Thread David Howells
David Miller  wrote:

> David, this GIT URL has tons of unrelated changes.  It seems to bring in
> the parts of Linus's tree that haven't proagated to 'net' yet.

Sorry about that, I rebased on the wrong branch by accident.

I've got some more fixes.  Should I just give the lot to you to pull into your
net-next tree, given that the merge window may well open Sunday?

David


[PATCH iproute2-next 1/1] tc: add oneline mode

2018-03-29 Thread Roman Mashak
Add initial support for oneline mode in tc; actions, filters and qdiscs
will be gradually updated in the follow-up patches.

Signed-off-by: Roman Mashak 
---
 man/man8/tc.8 | 15 ++-
 tc/tc.c   |  8 +++-
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/man/man8/tc.8 b/man/man8/tc.8
index 3dc30ee489e5..840880fbdba6 100644
--- a/man/man8/tc.8
+++ b/man/man8/tc.8
@@ -95,7 +95,8 @@ tc \- show / manipulate traffic control settings
 \fB[ \fB-n\fR[\fIetns\fR] name \fB] \fR|
 \fB[ \fB-nm \fR| \fB-nam\fR[\fIes\fR] \fB] \fR|
 \fB[ \fR{ \fB-cf \fR| \fB-c\fR[\fIonf\fR] \fR} \fB[ filename ] \fB] \fR
-\fB[ -t\fR[imestamp\fR] \fB\] \fR| \fB[ -t\fR[short\fR] \fB]\fR }
+\fB[ -t\fR[imestamp\fR] \fB\] \fR| \fB[ -t\fR[short\fR] \fR| \fB[
+-o\fR[neline\fR] \fB]\fR }
 
 .ti 8
 .IR FORMAT " := {"
@@ -649,6 +650,18 @@ don't terminate tc on errors in batch mode.
 If there were any errors during execution of the commands, the application 
return code will be non zero.
 
 .TP
+.BR "\-o" , " \-oneline"
+output each record on a single line, replacing line feeds
+with the
+.B '\e'
+character. This is convenient when you want to count records
+with
+.BR wc (1)
+or to
+.BR grep (1)
+the output.
+
+.TP
 .BR "\-n" , " \-net" , " \-netns " 
 switches
 .B tc
diff --git a/tc/tc.c b/tc/tc.c
index a31f075d1ffe..68475c156057 100644
--- a/tc/tc.c
+++ b/tc/tc.c
@@ -42,6 +42,8 @@ int force;
 bool use_names;
 int json;
 int color;
+int oneline;
+const char *_SL_;
 
 static char *conf_file;
 
@@ -191,7 +193,7 @@ static void usage(void)
"where  OBJECT := { qdisc | class | filter | action | monitor | 
exec }\n"
"   OPTIONS := { -s[tatistics] | -d[etails] | -r[aw] | 
-b[atch] [filename] | -n[etns] name |\n"
"-nm | -nam[es] | { -cf | -conf } path } 
|\n"
-   "-j[son] -p[retty] -c[olor]\n");
+   "-o[neline] -j[son] -p[retty] -c[olor]\n");
 }
 
 static int do_cmd(int argc, char **argv, void *buf, size_t buflen)
@@ -487,6 +489,8 @@ int main(int argc, char **argv)
++timestamp_short;
} else if (matches(argv[1], "-json") == 0) {
++json;
+   } else if (matches(argv[1], "-oneline") == 0) {
+   ++oneline;
} else {
fprintf(stderr, "Option \"%s\" is unknown, try \"tc 
-help\".\n", argv[1]);
return -1;
@@ -494,6 +498,8 @@ int main(int argc, char **argv)
argc--; argv++;
}
 
+   _SL_ = oneline ? "\\" : "\n";
+
if (color & !json)
enable_color();
 
-- 
2.7.4



WARNING in refcount_sub_and_test (2)

2018-03-29 Thread syzbot

Hello,

syzbot hit the following crash on bpf-next commit
22527437e0a0c96ee3153e9d0382942b0fd4f9dd (Thu Mar 29 02:36:15 2018 +)
Merge branch 'nfp-bpf-updates'
syzbot dashboard link:  
https://syzkaller.appspot.com/bug?extid=c7b0dde061c523bc4b0f


C reproducer: https://syzkaller.appspot.com/x/repro.c?id=5996614741131264
syzkaller reproducer:  
https://syzkaller.appspot.com/x/repro.syz?id=5947747274326016
Raw console output:  
https://syzkaller.appspot.com/x/log.txt?id=6215237837520896
Kernel config:  
https://syzkaller.appspot.com/x/.config?id=-1280663959502969741

compiler: gcc (GCC) 7.1.1 20170620

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+c7b0dde061c523bc4...@syzkaller.appspotmail.com
It will help syzbot understand when the bug is fixed. See footer for  
details.

If you forward the report, please keep this part and the footer.

R13: 0030656c69662f2e R14: 0005 R15: 2f30656c69662f2e
[ cut here ]
[ cut here ]
refcount_t: increment on 0; use-after-free.
refcount_t: underflow; use-after-free.
WARNING: CPU: 0 PID: 4450 at lib/refcount.c:187  
refcount_sub_and_test+0x167/0x1b0 lib/refcount.c:187
WARNING: CPU: 1 PID: 4460 at lib/refcount.c:153 refcount_inc+0x47/0x50  
lib/refcount.c:153

Kernel panic - not syncing: panic_on_warn set ...

Modules linked in:
CPU: 0 PID: 4450 Comm: syzkaller428798 Not tainted 4.16.0-rc6+ #40
CPU: 1 PID: 4460 Comm: syzkaller428798 Not tainted 4.16.0-rc6+ #40
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011

Call Trace:
RIP: 0010:refcount_inc+0x47/0x50 lib/refcount.c:153
 __dump_stack lib/dump_stack.c:17 [inline]
 dump_stack+0x194/0x24d lib/dump_stack.c:53
RSP: 0018:8801b534f860 EFLAGS: 00010286
RAX: dc08 RBX: 8801b1b8c184 RCX: 815ba4be
 panic+0x1e4/0x41c kernel/panic.c:183
RDX:  RSI: 110036a69ebc RDI: 110036a69e91
RBP: 8801b534f868 R08:  R09: 
R10:  R11:  R12: 8801b534faf8
R13: 8801b04db513 R14: 8801b1b8c180 R15: 8801b04db501
FS:  008e6880() GS:8801db30() knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
 __warn+0x1dc/0x200 kernel/panic.c:547
CR2: 006ea510 CR3: 0001b106f005 CR4: 001606e0
DR0:  DR1:  DR2: 
 report_bug+0x1f4/0x2b0 lib/bug.c:186
DR3:  DR6: fffe0ff0 DR7: 0400
 fixup_bug.part.11+0x37/0x80 arch/x86/kernel/traps.c:178
Call Trace:
 fixup_bug arch/x86/kernel/traps.c:247 [inline]
 do_error_trap+0x2d7/0x3e0 arch/x86/kernel/traps.c:296
 get_net include/net/net_namespace.h:204 [inline]
 sk_alloc+0x3f9/0x1440 net/core/sock.c:1540
 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:315
 invalid_op+0x1b/0x40 arch/x86/entry/entry_64.S:986
RIP: 0010:refcount_sub_and_test+0x167/0x1b0 lib/refcount.c:187
RSP: 0018:8801b0e87728 EFLAGS: 00010286
RAX: dc08 RBX:  RCX: 815ba4be
RDX:  RSI: 1100361d0e95 RDI: 0293
RBP: 8801b0e877b8 R08:  R09: 
R10: 8801b0e87850 R11:  R12: 1100361d0ee6
 inet_create+0x47c/0xf50 net/ipv4/af_inet.c:320
R13:  R14: 0001 R15: 8801b0816204
 __sock_create+0x4d4/0x850 net/socket.c:1285
 sock_create net/socket.c:1325 [inline]
 SYSC_socket net/socket.c:1355 [inline]
 SyS_socket+0xeb/0x1d0 net/socket.c:1335
 refcount_dec_and_test+0x1a/0x20 lib/refcount.c:212
 put_net include/net/net_namespace.h:222 [inline]
 __sk_destruct+0x560/0x920 net/core/sock.c:1592
 do_syscall_64+0x281/0x940 arch/x86/entry/common.c:287
 sk_destruct+0x47/0x80 net/core/sock.c:1601
 entry_SYSCALL_64_after_hwframe+0x42/0xb7
 __sk_free+0xf1/0x2b0 net/core/sock.c:1612
RIP: 0033:0x44ac67
 sk_free+0x2a/0x40 net/core/sock.c:1623
RSP: 002b:7ffcd4f45588 EFLAGS: 0202
 sock_put include/net/sock.h:1660 [inline]
 tcp_close+0x967/0x1190 net/ipv4/tcp.c:2321
 ORIG_RAX: 0029
RAX: ffda RBX:  RCX: 0044ac67
RDX: 0006 RSI: 0001 RDI: 0002
RBP: 7ffcd4f456b0 R08:  R09: 0001
R10: 0006 R11: 0202 R12: 0002
 inet_release+0xed/0x1c0 net/ipv4/af_inet.c:427
R13: 0002 R14: b38f R15: 7ffcd4f456d8
 sock_release+0x8d/0x1e0 net/socket.c:594
Code:
be
 sock_close+0x16/0x20 net/socket.c:1149
fe
 __fput+0x327/0x7e0 fs/file_table.c:209
5b
5d
c3
 fput+0x15/0x20 fs/file_table.c:243
e8
 task_work_run+0x199/0x270 kernel/task_work.c:113
5a
3c
be
fe
 tracehook_notify_resume include/linux/tracehook.h:191 [inline]
 exit_to_usermode_loop+0x275/0x2f0 

[PATCH v4 iproute2-next 7/7] rdma: Add PD resource tracking information

2018-03-29 Thread Steve Wise
Sample output:

Without CAP_NET_ADMIN capability:

dev mlx4_0 users 0 pid 0 comm [ib_srpt]
dev mlx4_0 users 0 pid 0 comm [ib_srp]
dev mlx4_0 users 1 pid 0 comm [ib_core]
dev cxgb4_0 users 0 pid 0 comm [ib_srp]

With CAP_NET_ADMIN capability:
dev mlx4_0 local_dma_lkey 0x8000 users 0 pid 0 comm [ib_srpt]
dev mlx4_0 local_dma_lkey 0x8000 users 0 pid 0 comm [ib_srp]
dev mlx4_0 local_dma_lkey 0x8000 users 1 pid 0 comm [ib_core]
dev cxgb4_0 local_dma_lkey 0x0 users 0 pid 0 comm [ib_srp]

Signed-off-by: Steve Wise 
Reviewed-by: Leon Romanovsky 
---
 rdma/res.c | 95 ++
 1 file changed, 95 insertions(+)

diff --git a/rdma/res.c b/rdma/res.c
index 9c1f736..1a0aab6 100644
--- a/rdma/res.c
+++ b/rdma/res.c
@@ -927,6 +927,91 @@ static int res_mr_parse_cb(const struct nlmsghdr *nlh, 
void *data)
return MNL_CB_OK;
 }
 
+static int res_pd_parse_cb(const struct nlmsghdr *nlh, void *data)
+{
+   struct nlattr *tb[RDMA_NLDEV_ATTR_MAX] = {};
+   struct nlattr *nla_table, *nla_entry;
+   struct rd *rd = data;
+   const char *name;
+   uint32_t idx;
+
+   mnl_attr_parse(nlh, 0, rd_attr_cb, tb);
+   if (!tb[RDMA_NLDEV_ATTR_DEV_INDEX] ||
+   !tb[RDMA_NLDEV_ATTR_DEV_NAME] ||
+   !tb[RDMA_NLDEV_ATTR_RES_PD])
+   return MNL_CB_ERROR;
+
+   name = mnl_attr_get_str(tb[RDMA_NLDEV_ATTR_DEV_NAME]);
+   idx =  mnl_attr_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+   nla_table = tb[RDMA_NLDEV_ATTR_RES_PD];
+
+   mnl_attr_for_each_nested(nla_entry, nla_table) {
+   uint32_t local_dma_lkey = 0, unsafe_global_rkey = 0;
+   struct nlattr *nla_line[RDMA_NLDEV_ATTR_MAX] = {};
+   char *comm = NULL;
+   uint32_t pid = 0;
+   uint64_t users;
+   int err;
+
+   err = mnl_attr_parse_nested(nla_entry, rd_attr_cb, nla_line);
+   if (err != MNL_CB_OK)
+   return MNL_CB_ERROR;
+
+   if (!nla_line[RDMA_NLDEV_ATTR_RES_USECNT] ||
+   (!nla_line[RDMA_NLDEV_ATTR_RES_PID] &&
+!nla_line[RDMA_NLDEV_ATTR_RES_KERN_NAME])) {
+   return MNL_CB_ERROR;
+   }
+
+   if (nla_line[RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY])
+   local_dma_lkey = mnl_attr_get_u32(
+   nla_line[RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY]);
+
+   users = mnl_attr_get_u64(nla_line[RDMA_NLDEV_ATTR_RES_USECNT]);
+   if (rd_check_is_filtered(rd, "users", users))
+   continue;
+
+   if (nla_line[RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY])
+   unsafe_global_rkey = mnl_attr_get_u32(
+ nla_line[RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY]);
+
+   if (nla_line[RDMA_NLDEV_ATTR_RES_PID]) {
+   pid = mnl_attr_get_u32(
+   nla_line[RDMA_NLDEV_ATTR_RES_PID]);
+   comm = get_task_name(pid);
+   }
+
+   if (rd_check_is_filtered(rd, "pid", pid))
+   continue;
+
+   if (nla_line[RDMA_NLDEV_ATTR_RES_KERN_NAME])
+   /* discard const from mnl_attr_get_str */
+   comm = (char *)mnl_attr_get_str(
+   nla_line[RDMA_NLDEV_ATTR_RES_KERN_NAME]);
+
+   if (rd->json_output)
+   jsonw_start_array(rd->jw);
+
+   print_dev(rd, idx, name);
+   if (nla_line[RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY])
+   print_key(rd, "local_dma_lkey", local_dma_lkey);
+   print_users(rd, users);
+   if (nla_line[RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY])
+   print_key(rd, "unsafe_global_rkey", unsafe_global_rkey);
+   print_pid(rd, pid);
+   print_comm(rd, comm, nla_line);
+
+   if (nla_line[RDMA_NLDEV_ATTR_RES_PID])
+   free(comm);
+
+   if (rd->json_output)
+   jsonw_end_array(rd->jw);
+   else
+   pr_out("\n");
+   }
+   return MNL_CB_OK;
+}
+
 RES_FUNC(res_no_args,  RDMA_NLDEV_CMD_RES_GET, NULL, true);
 
 static const struct
@@ -990,6 +1075,15 @@ struct filters mr_valid_filters[MAX_NUMBER_OF_FILTERS] = {
 
 RES_FUNC(res_mr, RDMA_NLDEV_CMD_RES_MR_GET, mr_valid_filters, true);
 
+static const
+struct filters pd_valid_filters[MAX_NUMBER_OF_FILTERS] = {
+   { .name = "dev", .is_number = false },
+   { .name = "users", .is_number = true },
+   { .name = "pid", .is_number = true }
+};
+
+RES_FUNC(res_pd, RDMA_NLDEV_CMD_RES_PD_GET, pd_valid_filters, true);
+
 static int res_show(struct rd *rd)
 {
const struct rd_cmd cmds[] = {
@@ -998,6 +1092,7 @@ static int res_show(struct 

[PATCH v4 iproute2-next 6/7] rdma: Add MR resource tracking information

2018-03-29 Thread Steve Wise
Sample output:

Without CAP_NET_ADMIN:

$ rdma resource show mr mrlen 65536
dev mlx4_0 mrlen 65536 pid 0 comm [nvme_rdma]
dev cxgb4_0 mrlen 65536 pid 0 comm [nvme_rdma]

With CAP_NET_ADMIN:

# rdma resource show mr mrlen 65536
dev mlx4_0 rkey 0x12702 lkey 0x12702 iova 0x85724a000 mrlen 65536 pid 0 comm 
[nvme_rdma]
dev cxgb4_0 rkey 0x68fe4e9 lkey 0x68fe4e9 iova 0x835b91000 mrlen 65536 pid 0 
comm [nvme_rdma]

Signed-off-by: Steve Wise 
Reviewed-by: Leon Romanovsky 
---
 include/json_writer.h |   2 +
 lib/json_writer.c |  11 +
 rdma/res.c| 127 ++
 rdma/utils.c  |   6 +++
 4 files changed, 146 insertions(+)

diff --git a/include/json_writer.h b/include/json_writer.h
index 45459fa..4b4dec2 100644
--- a/include/json_writer.h
+++ b/include/json_writer.h
@@ -35,6 +35,7 @@ void jsonw_bool(json_writer_t *self, bool value);
 void jsonw_float(json_writer_t *self, double number);
 void jsonw_float_fmt(json_writer_t *self, const char *fmt, double num);
 void jsonw_uint(json_writer_t *self, uint64_t number);
+void jsonw_xint(json_writer_t *self, uint64_t number);
 void jsonw_hu(json_writer_t *self, unsigned short number);
 void jsonw_int(json_writer_t *self, int64_t number);
 void jsonw_null(json_writer_t *self);
@@ -45,6 +46,7 @@ void jsonw_string_field(json_writer_t *self, const char 
*prop, const char *val);
 void jsonw_bool_field(json_writer_t *self, const char *prop, bool value);
 void jsonw_float_field(json_writer_t *self, const char *prop, double num);
 void jsonw_uint_field(json_writer_t *self, const char *prop, uint64_t num);
+void jsonw_xint_field(json_writer_t *self, const char *prop, uint64_t num);
 void jsonw_hu_field(json_writer_t *self, const char *prop, unsigned short num);
 void jsonw_int_field(json_writer_t *self, const char *prop, int64_t num);
 void jsonw_null_field(json_writer_t *self, const char *prop);
diff --git a/lib/json_writer.c b/lib/json_writer.c
index 68401ae..0ad0421 100644
--- a/lib/json_writer.c
+++ b/lib/json_writer.c
@@ -225,6 +225,11 @@ void jsonw_uint(json_writer_t *self, uint64_t num)
jsonw_printf(self, "%"PRIu64, num);
 }
 
+void jsonw_xint(json_writer_t *self, uint64_t num)
+{
+   jsonw_printf(self, "%"PRIx64, num);
+}
+
 void jsonw_lluint(json_writer_t *self, unsigned long long int num)
 {
jsonw_printf(self, "%llu", num);
@@ -269,6 +274,12 @@ void jsonw_uint_field(json_writer_t *self, const char 
*prop, uint64_t num)
jsonw_uint(self, num);
 }
 
+void jsonw_xint_field(json_writer_t *self, const char *prop, uint64_t num)
+{
+   jsonw_name(self, prop);
+   jsonw_xint(self, num);
+}
+
 void jsonw_hu_field(json_writer_t *self, const char *prop, unsigned short num)
 {
jsonw_name(self, prop);
diff --git a/rdma/res.c b/rdma/res.c
index bb5f3dd..9c1f736 100644
--- a/rdma/res.c
+++ b/rdma/res.c
@@ -812,6 +812,121 @@ static int res_cq_parse_cb(const struct nlmsghdr *nlh, 
void *data)
return MNL_CB_OK;
 }
 
+static void print_key(struct rd *rd, const char *name, uint32_t val)
+{
+   if (rd->json_output)
+   jsonw_xint_field(rd->jw, name, val);
+   else
+   pr_out("%s 0x%x ", name, val);
+}
+
+static void print_iova(struct rd *rd, uint64_t val)
+{
+   if (rd->json_output)
+   jsonw_xint_field(rd->jw, "iova", val);
+   else
+   pr_out("iova 0x%" PRIx64 " ", val);
+}
+
+static void print_mrlen(struct rd *rd, uint64_t val)
+{
+   if (rd->json_output)
+   jsonw_uint_field(rd->jw, "mrlen", val);
+   else
+   pr_out("mrlen %" PRIu64 " ", val);
+}
+
+static int res_mr_parse_cb(const struct nlmsghdr *nlh, void *data)
+{
+   struct nlattr *tb[RDMA_NLDEV_ATTR_MAX] = {};
+   struct nlattr *nla_table, *nla_entry;
+   struct rd *rd = data;
+   const char *name;
+   uint32_t idx;
+
+   mnl_attr_parse(nlh, 0, rd_attr_cb, tb);
+   if (!tb[RDMA_NLDEV_ATTR_DEV_INDEX] ||
+   !tb[RDMA_NLDEV_ATTR_DEV_NAME] ||
+   !tb[RDMA_NLDEV_ATTR_RES_MR])
+   return MNL_CB_ERROR;
+
+   name = mnl_attr_get_str(tb[RDMA_NLDEV_ATTR_DEV_NAME]);
+   idx =  mnl_attr_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+   nla_table = tb[RDMA_NLDEV_ATTR_RES_MR];
+
+   mnl_attr_for_each_nested(nla_entry, nla_table) {
+   struct nlattr *nla_line[RDMA_NLDEV_ATTR_MAX] = {};
+   uint32_t rkey = 0, lkey = 0;
+   uint64_t iova = 0, mrlen;
+   char *comm = NULL;
+   uint32_t pid = 0;
+   int err;
+
+   err = mnl_attr_parse_nested(nla_entry, rd_attr_cb, nla_line);
+   if (err != MNL_CB_OK)
+   return MNL_CB_ERROR;
+
+   if (!nla_line[RDMA_NLDEV_ATTR_RES_MRLEN] ||
+   (!nla_line[RDMA_NLDEV_ATTR_RES_PID] &&
+!nla_line[RDMA_NLDEV_ATTR_RES_KERN_NAME])) {
+   

[PATCH v4 iproute2-next 5/7] rdma: Add CQ resource tracking information

2018-03-29 Thread Steve Wise
Sample output:

# rdma resource show cq
dev cxgb4_0 cqe 46 users 2 pid 30503 comm rping
dev cxgb4_0 cqe 46 users 2 pid 30498 comm rping
dev mlx4_0 cqe 63 users 2 pid 30494 comm rping
dev mlx4_0 cqe 63 users 2 pid 30489 comm rping
dev mlx4_0 cqe 1023 users 2 poll_ctx WORKQUEUE pid 0 comm [ib_core]

# rdma resource show cq pid 30489
dev mlx4_0 cqe 63 users 2 pid 30489 comm rping

Signed-off-by: Steve Wise 
Reviewed-by: Leon Romanovsky 
---
 rdma/res.c   | 149 +++
 rdma/utils.c |   5 ++
 2 files changed, 154 insertions(+)

diff --git a/rdma/res.c b/rdma/res.c
index 5506cf3..bb5f3dd 100644
--- a/rdma/res.c
+++ b/rdma/res.c
@@ -21,6 +21,8 @@ static int res_help(struct rd *rd)
pr_out("  resource show qp link [DEV/PORT] [FILTER-NAME 
FILTER-VALUE]\n");
pr_out("  resource show cm_id link [DEV/PORT]\n");
pr_out("  resource show cm_id link [DEV/PORT] [FILTER-NAME 
FILTER-VALUE]\n");
+   pr_out("  resource show cq link [DEV/PORT]\n");
+   pr_out("  resource show cq link [DEV/PORT] [FILTER-NAME 
FILTER-VALUE]\n");
return 0;
 }
 
@@ -265,6 +267,16 @@ static void print_comm(struct rd *rd, const char *str,
pr_out("comm %s ", tmp);
 }
 
+static void print_dev(struct rd *rd, uint32_t idx, const char *name)
+{
+   if (rd->json_output) {
+   jsonw_uint_field(rd->jw, "ifindex", idx);
+   jsonw_string_field(rd->jw, "ifname", name);
+   } else {
+   pr_out("dev %s ", name);
+   }
+}
+
 static void print_link(struct rd *rd, uint32_t idx, const char *name,
   uint32_t port, struct nlattr **nla_line)
 {
@@ -674,6 +686,132 @@ static int res_cm_id_parse_cb(const struct nlmsghdr *nlh, 
void *data)
return MNL_CB_OK;
 }
 
+static void print_cqe(struct rd *rd, uint32_t val)
+{
+   if (rd->json_output)
+   jsonw_uint_field(rd->jw, "cqe", val);
+   else
+   pr_out("cqe %u ", val);
+}
+
+static void print_users(struct rd *rd, uint64_t val)
+{
+   if (rd->json_output)
+   jsonw_uint_field(rd->jw, "users", val);
+   else
+   pr_out("users %" PRIu64 " ", val);
+}
+
+static const char *poll_ctx_to_str(uint8_t idx)
+{
+   static const char * const cm_id_states_str[] = {
+   "DIRECT", "SOFTIRQ", "WORKQUEUE"};
+
+   if (idx < ARRAY_SIZE(cm_id_states_str))
+   return cm_id_states_str[idx];
+   return "UNKNOWN";
+}
+
+static void print_poll_ctx(struct rd *rd, uint8_t poll_ctx)
+{
+   if (rd->json_output) {
+   jsonw_string_field(rd->jw, "poll-ctx",
+  poll_ctx_to_str(poll_ctx));
+   return;
+   }
+   pr_out("poll-ctx %s ", poll_ctx_to_str(poll_ctx));
+}
+
+static int res_cq_parse_cb(const struct nlmsghdr *nlh, void *data)
+{
+   struct nlattr *tb[RDMA_NLDEV_ATTR_MAX] = {};
+   struct nlattr *nla_table, *nla_entry;
+   struct rd *rd = data;
+   const char *name;
+   uint32_t idx;
+
+   mnl_attr_parse(nlh, 0, rd_attr_cb, tb);
+   if (!tb[RDMA_NLDEV_ATTR_DEV_INDEX] ||
+   !tb[RDMA_NLDEV_ATTR_DEV_NAME] ||
+   !tb[RDMA_NLDEV_ATTR_RES_CQ])
+   return MNL_CB_ERROR;
+
+   name = mnl_attr_get_str(tb[RDMA_NLDEV_ATTR_DEV_NAME]);
+   idx =  mnl_attr_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+   nla_table = tb[RDMA_NLDEV_ATTR_RES_CQ];
+
+   mnl_attr_for_each_nested(nla_entry, nla_table) {
+   struct nlattr *nla_line[RDMA_NLDEV_ATTR_MAX] = {};
+   char *comm = NULL;
+   uint32_t pid = 0;
+   uint8_t poll_ctx = 0;
+   uint64_t users;
+   uint32_t cqe;
+   int err;
+
+   err = mnl_attr_parse_nested(nla_entry, rd_attr_cb, nla_line);
+   if (err != MNL_CB_OK)
+   return MNL_CB_ERROR;
+
+   if (!nla_line[RDMA_NLDEV_ATTR_RES_CQE] ||
+   !nla_line[RDMA_NLDEV_ATTR_RES_USECNT] ||
+   (!nla_line[RDMA_NLDEV_ATTR_RES_PID] &&
+!nla_line[RDMA_NLDEV_ATTR_RES_KERN_NAME])) {
+   return MNL_CB_ERROR;
+   }
+
+   cqe = mnl_attr_get_u32(nla_line[RDMA_NLDEV_ATTR_RES_CQE]);
+
+   users = mnl_attr_get_u64(nla_line[RDMA_NLDEV_ATTR_RES_USECNT]);
+   if (rd_check_is_filtered(rd, "users", users))
+   continue;
+
+   if (nla_line[RDMA_NLDEV_ATTR_RES_POLL_CTX]) {
+   poll_ctx = mnl_attr_get_u8(
+   nla_line[RDMA_NLDEV_ATTR_RES_POLL_CTX]);
+   if (rd_check_is_string_filtered(rd, "poll-ctx",
+   poll_ctx_to_str(poll_ctx)))
+   continue;
+   }
+
+ 

[PATCH v4 iproute2-next 4/7] rdma: Add CM_ID resource tracking information

2018-03-29 Thread Steve Wise
Sample output:

# rdma resource
2: cxgb4_0: pd 5 cq 2 qp 2 cm_id 3 mr 7
3: mlx4_0: pd 7 cq 3 qp 3 cm_id 3 mr 7

# rdma resource show cm_id
link cxgb4_0/- lqpn 0 qp-type RC state LISTEN ps TCP pid 30485 comm rping 
src-addr 0.0.0.0:7174
link cxgb4_0/2 lqpn 1048 qp-type RC state CONNECT ps TCP pid 30503 comm rping 
src-addr 172.16.2.1:7174 dst-addr 172.16.2.1:38246
link cxgb4_0/2 lqpn 1040 qp-type RC state CONNECT ps TCP pid 30498 comm rping 
src-addr 172.16.2.1:38246 dst-addr 172.16.2.1:7174
link mlx4_0/- lqpn 0 qp-type RC state LISTEN ps TCP pid 30485 comm rping 
src-addr 0.0.0.0:7174
link mlx4_0/1 lqpn 539 qp-type RC state CONNECT ps TCP pid 30494 comm rping 
src-addr 172.16.99.1:7174 dst-addr 172.16.99.1:43670
link mlx4_0/1 lqpn 538 qp-type RC state CONNECT ps TCP pid 30492 comm rping 
src-addr 172.16.99.1:43670 dst-addr 172.16.99.1:7174

# rdma resource show cm_id dst-port 7174
link cxgb4_0/2 lqpn 1040 qp-type RC state CONNECT ps TCP pid 30498 comm rping 
src-addr 172.16.2.1:38246 dst-addr 172.16.2.1:7174
link mlx4_0/1 lqpn 538 qp-type RC state CONNECT ps TCP pid 30492 comm rping 
src-addr 172.16.99.1:43670 dst-addr 172.16.99.1:7174

Signed-off-by: Steve Wise 
Reviewed-by: Leon Romanovsky 
---
 rdma/rdma.h  |   2 +
 rdma/res.c   | 262 ++-
 rdma/utils.c |   5 ++
 3 files changed, 268 insertions(+), 1 deletion(-)

diff --git a/rdma/rdma.h b/rdma/rdma.h
index 5809f70..1908fc4 100644
--- a/rdma/rdma.h
+++ b/rdma/rdma.h
@@ -17,7 +17,9 @@
 #include 
 #include 
 #include 
+#include 
 #include 
+#include 
 
 #include "list.h"
 #include "utils.h"
diff --git a/rdma/res.c b/rdma/res.c
index 62f5c54..5506cf3 100644
--- a/rdma/res.c
+++ b/rdma/res.c
@@ -16,9 +16,11 @@ static int res_help(struct rd *rd)
 {
pr_out("Usage: %s resource\n", rd->filename);
pr_out("  resource show [DEV]\n");
-   pr_out("  resource show [qp]\n");
+   pr_out("  resource show [qp|cm_id]\n");
pr_out("  resource show qp link [DEV/PORT]\n");
pr_out("  resource show qp link [DEV/PORT] [FILTER-NAME 
FILTER-VALUE]\n");
+   pr_out("  resource show cm_id link [DEV/PORT]\n");
+   pr_out("  resource show cm_id link [DEV/PORT] [FILTER-NAME 
FILTER-VALUE]\n");
return 0;
 }
 
@@ -433,6 +435,245 @@ static int res_qp_parse_cb(const struct nlmsghdr *nlh, 
void *data)
return MNL_CB_OK;
 }
 
+static void print_qp_type(struct rd *rd, uint32_t val)
+{
+   if (rd->json_output)
+   jsonw_string_field(rd->jw, "qp-type",
+  qp_types_to_str(val));
+   else
+   pr_out("qp-type %s ", qp_types_to_str(val));
+}
+
+static const char *cm_id_state_to_str(uint8_t idx)
+{
+   static const char * const cm_id_states_str[] = {
+   "IDLE", "ADDR_QUERY", "ADDR_RESOLVED", "ROUTE_QUERY",
+   "ROUTE_RESOLVED", "CONNECT", "DISCONNECT", "ADDR_BOUND",
+   "LISTEN", "DEVICE_REMOVAL", "DESTROYING" };
+
+   if (idx < ARRAY_SIZE(cm_id_states_str))
+   return cm_id_states_str[idx];
+   return "UNKNOWN";
+}
+
+static const char *cm_id_ps_to_str(uint32_t ps)
+{
+   switch (ps) {
+   case RDMA_PS_IPOIB:
+   return "IPoIB";
+   case RDMA_PS_IB:
+   return "IPoIB";
+   case RDMA_PS_TCP:
+   return "TCP";
+   case RDMA_PS_UDP:
+   return "UDP";
+   default:
+   return "---";
+   }
+}
+
+static void print_cm_id_state(struct rd *rd, uint8_t state)
+{
+   if (rd->json_output) {
+   jsonw_string_field(rd->jw, "state", cm_id_state_to_str(state));
+   return;
+   }
+   pr_out("state %s ", cm_id_state_to_str(state));
+}
+
+static void print_ps(struct rd *rd, uint32_t ps)
+{
+   if (rd->json_output) {
+   jsonw_string_field(rd->jw, "ps", cm_id_ps_to_str(ps));
+   return;
+   }
+   pr_out("ps %s ", cm_id_ps_to_str(ps));
+}
+
+static void print_ipaddr(struct rd *rd, const char *key, char *addrstr,
+uint16_t port)
+{
+   if (rd->json_output) {
+   int name_size = INET6_ADDRSTRLEN+strlen(":65535");
+   char json_name[name_size];
+
+   snprintf(json_name, name_size, "%s:%u", addrstr, port);
+   jsonw_string_field(rd->jw, key, json_name);
+   return;
+   }
+   pr_out("%s %s:%u ", key, addrstr, port);
+}
+
+static int ss_ntop(struct nlattr *nla_line, char *addr_str, uint16_t *port)
+{
+   struct __kernel_sockaddr_storage *addr;
+
+   addr = (struct __kernel_sockaddr_storage *)
+   mnl_attr_get_payload(nla_line);
+   switch (addr->ss_family) {
+   case AF_INET: {
+   struct sockaddr_in *sin = (struct sockaddr_in *)addr;
+
+   if 

[PATCH v4 iproute2-next 3/7] rdma: initialize the rd struct

2018-03-29 Thread Steve Wise
Initialize the rd struct so port_idx is 0 unless set otherwise.
Otherwise, strict_port queries end up passing an uninitialized PORT
nlattr.

Signed-off-by: Steve Wise 
Reviewed-by: Leon Romanovsky 
---
 rdma/rdma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rdma/rdma.c b/rdma/rdma.c
index ab2c960..b43e538 100644
--- a/rdma/rdma.c
+++ b/rdma/rdma.c
@@ -135,7 +135,7 @@ int main(int argc, char **argv)
bool json_output = false;
bool force = false;
char *filename;
-   struct rd rd;
+   struct rd rd = {};
int opt;
int err;
 
-- 
1.8.3.1



[PATCH v4 iproute2-next 1/7] rdma: update rdma_netlink.h

2018-03-29 Thread Steve Wise
From: Steve Wise 

Pull in the latest rdma_netlink.h which has support for
the rdma nldev resource tracking objects being added
with this patch series.

Signed-off-by: Steve Wise 
---
 rdma/include/uapi/rdma/rdma_netlink.h | 38 +++
 1 file changed, 38 insertions(+)

diff --git a/rdma/include/uapi/rdma/rdma_netlink.h 
b/rdma/include/uapi/rdma/rdma_netlink.h
index dbac3b8..9446a72 100644
--- a/rdma/include/uapi/rdma/rdma_netlink.h
+++ b/rdma/include/uapi/rdma/rdma_netlink.h
@@ -238,6 +238,14 @@ enum rdma_nldev_command {
 
RDMA_NLDEV_CMD_RES_QP_GET, /* can dump */
 
+   RDMA_NLDEV_CMD_RES_CM_ID_GET, /* can dump */
+
+   RDMA_NLDEV_CMD_RES_CQ_GET, /* can dump */
+
+   RDMA_NLDEV_CMD_RES_MR_GET, /* can dump */
+
+   RDMA_NLDEV_CMD_RES_PD_GET, /* can dump */
+
RDMA_NLDEV_NUM_OPS
 };
 
@@ -350,6 +358,36 @@ enum rdma_nldev_attr {
 */
RDMA_NLDEV_ATTR_RES_KERN_NAME,  /* string */
 
+   RDMA_NLDEV_ATTR_RES_CM_ID,  /* nested table */
+   RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY,/* nested table */
+   /*
+* rdma_cm_id port space.
+*/
+   RDMA_NLDEV_ATTR_RES_PS, /* u32 */
+   /*
+* Source and destination socket addresses
+*/
+   RDMA_NLDEV_ATTR_RES_SRC_ADDR,   /* __kernel_sockaddr_storage */
+   RDMA_NLDEV_ATTR_RES_DST_ADDR,   /* __kernel_sockaddr_storage */
+
+   RDMA_NLDEV_ATTR_RES_CQ, /* nested table */
+   RDMA_NLDEV_ATTR_RES_CQ_ENTRY,   /* nested table */
+   RDMA_NLDEV_ATTR_RES_CQE,/* u32 */
+   RDMA_NLDEV_ATTR_RES_USECNT, /* u64 */
+   RDMA_NLDEV_ATTR_RES_POLL_CTX,   /* u8 */
+
+   RDMA_NLDEV_ATTR_RES_MR, /* nested table */
+   RDMA_NLDEV_ATTR_RES_MR_ENTRY,   /* nested table */
+   RDMA_NLDEV_ATTR_RES_RKEY,   /* u32 */
+   RDMA_NLDEV_ATTR_RES_LKEY,   /* u32 */
+   RDMA_NLDEV_ATTR_RES_IOVA,   /* u64 */
+   RDMA_NLDEV_ATTR_RES_MRLEN,  /* u64 */
+
+   RDMA_NLDEV_ATTR_RES_PD, /* nested table */
+   RDMA_NLDEV_ATTR_RES_PD_ENTRY,   /* nested table */
+   RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY, /* u32 */
+   RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY, /* u32 */
+
RDMA_NLDEV_ATTR_MAX
 };
 #endif /* _RDMA_NETLINK_H */
-- 
1.8.3.1



[PATCH v4 iproute2-next 2/7] rdma: add UAPI rdma_user_cm.h

2018-03-29 Thread Steve Wise
This allows parsing rdma_cm_id UAPI values.

Signed-off-by: Steve Wise 
---
 rdma/include/uapi/rdma/rdma_user_cm.h | 324 ++
 1 file changed, 324 insertions(+)
 create mode 100644 rdma/include/uapi/rdma/rdma_user_cm.h

diff --git a/rdma/include/uapi/rdma/rdma_user_cm.h 
b/rdma/include/uapi/rdma/rdma_user_cm.h
new file mode 100644
index 000..da099af
--- /dev/null
+++ b/rdma/include/uapi/rdma/rdma_user_cm.h
@@ -0,0 +1,324 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR 
BSD-2-Clause) */
+/*
+ * Copyright (c) 2005-2006 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *  - Redistributions of source code must retain the above
+ *copyright notice, this list of conditions and the following
+ *disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above
+ *copyright notice, this list of conditions and the following
+ *disclaimer in the documentation and/or other materials
+ *provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _RDMA_USER_CM_H
+#define _RDMA_USER_CM_H
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define RDMA_USER_CM_ABI_VERSION   4
+
+#define RDMA_MAX_PRIVATE_DATA  256
+
+enum {
+   RDMA_USER_CM_CMD_CREATE_ID,
+   RDMA_USER_CM_CMD_DESTROY_ID,
+   RDMA_USER_CM_CMD_BIND_IP,
+   RDMA_USER_CM_CMD_RESOLVE_IP,
+   RDMA_USER_CM_CMD_RESOLVE_ROUTE,
+   RDMA_USER_CM_CMD_QUERY_ROUTE,
+   RDMA_USER_CM_CMD_CONNECT,
+   RDMA_USER_CM_CMD_LISTEN,
+   RDMA_USER_CM_CMD_ACCEPT,
+   RDMA_USER_CM_CMD_REJECT,
+   RDMA_USER_CM_CMD_DISCONNECT,
+   RDMA_USER_CM_CMD_INIT_QP_ATTR,
+   RDMA_USER_CM_CMD_GET_EVENT,
+   RDMA_USER_CM_CMD_GET_OPTION,
+   RDMA_USER_CM_CMD_SET_OPTION,
+   RDMA_USER_CM_CMD_NOTIFY,
+   RDMA_USER_CM_CMD_JOIN_IP_MCAST,
+   RDMA_USER_CM_CMD_LEAVE_MCAST,
+   RDMA_USER_CM_CMD_MIGRATE_ID,
+   RDMA_USER_CM_CMD_QUERY,
+   RDMA_USER_CM_CMD_BIND,
+   RDMA_USER_CM_CMD_RESOLVE_ADDR,
+   RDMA_USER_CM_CMD_JOIN_MCAST
+};
+
+/* See IBTA Annex A11, servies ID bytes 4 & 5 */
+enum rdma_ucm_port_space {
+   RDMA_PS_IPOIB = 0x0002,
+   RDMA_PS_IB= 0x013F,
+   RDMA_PS_TCP   = 0x0106,
+   RDMA_PS_UDP   = 0x0111,
+};
+
+/*
+ * command ABI structures.
+ */
+struct rdma_ucm_cmd_hdr {
+   __u32 cmd;
+   __u16 in;
+   __u16 out;
+};
+
+struct rdma_ucm_create_id {
+   __aligned_u64 uid;
+   __aligned_u64 response;
+   __u16 ps;  /* use enum rdma_ucm_port_space */
+   __u8  qp_type;
+   __u8  reserved[5];
+};
+
+struct rdma_ucm_create_id_resp {
+   __u32 id;
+};
+
+struct rdma_ucm_destroy_id {
+   __aligned_u64 response;
+   __u32 id;
+   __u32 reserved;
+};
+
+struct rdma_ucm_destroy_id_resp {
+   __u32 events_reported;
+};
+
+struct rdma_ucm_bind_ip {
+   __aligned_u64 response;
+   struct sockaddr_in6 addr;
+   __u32 id;
+};
+
+struct rdma_ucm_bind {
+   __u32 id;
+   __u16 addr_size;
+   __u16 reserved;
+   struct __kernel_sockaddr_storage addr;
+};
+
+struct rdma_ucm_resolve_ip {
+   struct sockaddr_in6 src_addr;
+   struct sockaddr_in6 dst_addr;
+   __u32 id;
+   __u32 timeout_ms;
+};
+
+struct rdma_ucm_resolve_addr {
+   __u32 id;
+   __u32 timeout_ms;
+   __u16 src_size;
+   __u16 dst_size;
+   __u32 reserved;
+   struct __kernel_sockaddr_storage src_addr;
+   struct __kernel_sockaddr_storage dst_addr;
+};
+
+struct rdma_ucm_resolve_route {
+   __u32 id;
+   __u32 timeout_ms;
+};
+
+enum {
+   RDMA_USER_CM_QUERY_ADDR,
+   RDMA_USER_CM_QUERY_PATH,
+   RDMA_USER_CM_QUERY_GID
+};
+
+struct rdma_ucm_query {
+   __aligned_u64 response;
+   __u32 id;
+   __u32 option;
+};
+
+struct rdma_ucm_query_route_resp {
+   __aligned_u64 node_guid;
+   struct ib_user_path_rec ib_route[2];
+   struct sockaddr_in6 src_addr;

[PATCH v4 iproute2-next 0/7] cm_id, cq, mr, and pd resource tracking

2018-03-29 Thread Steve Wise
This series enhances the iproute2 rdma tool to include dumping of
connection manager id (cm_id), completion queue (cq), memory region (mr),
and protection domain (pd) rdma resources.  It is the user-space part of
the kernel resource tracking series merged into rdma-next for 4.17 [1]
and [2].

Changes since v3:
- replaced rdma_cma.h inclusion with UAPI rdma_user_cm.h
- display only device names instead of device/port for cq, mr, and pd
since they are not associated with a specific port.

Changes since v2:
- pull in rdma-core:include/rdma/rdma_cma.h
- 80 column reformat
- add reviewed-by tags

Changes since v1/RFC:
- removed RFC tag
- initialize rd properly to avoid passing a garbage port number
- revert accidental change to qp_valid_filters 
- removed cm_id dev/network/transport types
- cm_id ip addrs now passed up as __kernel_sockaddr_storage
- cm_id ip address ports printed as "address:port" strings
- only parse/display memory keys and iova if available
- filter on "users" for cqs and pds
- fixed memory leaks 
- removed PD_FLAGS attribute
- filter on "mrlen" for mrs
- filter on "poll-ctx" for cqs
- don't require addrs or qp_type for parsing cm_ids
- only filter optional attrs if they are present
- remove PGSIZE MR attr to match kernel 

[1] https://www.spinics.net/lists/linux-rdma/msg61720.html
[2] https://www.spinics.net/lists/linux-rdma/msg62979.html
https://www.spinics.net/lists/linux-rdma/msg62980.html

---

Steve Wise (7):
  rdma: update rdma_netlink.h
  rdma: add UAPI rdma_user_cm.h
  rdma: initialize the rd struct
  rdma: Add CM_ID resource tracking information
  rdma: Add CQ resource tracking information
  rdma: Add MR resource tracking information
  rdma: Add PD resource tracking information

 include/json_writer.h |   2 +
 lib/json_writer.c |  11 +
 rdma/include/uapi/rdma/rdma_netlink.h |  38 ++
 rdma/include/uapi/rdma/rdma_user_cm.h | 324 +
 rdma/rdma.c   |   2 +-
 rdma/rdma.h   |   2 +
 rdma/res.c| 633 +-
 rdma/utils.c  |  16 +
 8 files changed, 1026 insertions(+), 2 deletions(-)
 create mode 100644 rdma/include/uapi/rdma/rdma_user_cm.h

-- 
1.8.3.1



Re: [PATCH iproute2] json_print: fix print_uint hidden type promotion

2018-03-29 Thread Kevin Darbyshire-Bryant


> On 29 Mar 2018, at 22:02, Stephen Hemminger  
> wrote:
> 
> On Thu, 29 Mar 2018 20:22:20 +0100
> Kevin Darbyshire-Bryant  wrote:
> 
>> print_int used 'int' type internally, whereas print_uint used 'uint64_t'
>> 
>> These helper functions eventually call vfprintf(fp, fmt, args) which is
>> a variable argument list function and is dependent upon 'fmt' containing
>> correct information about the length of the passed arguments.
>> 
>> Unfortunately print_int v print_uint offered no clue to the programmer
>> that internally passed ints to print_uint were being promoted to 64bits,
>> thus the format passed in 'fmt' string vs the actual passed integer
>> could be different lengths.  This is even more interesting on big endian
>> architectures where 'vfprintf' would be looking in the middle of an
>> int64 type.
>> 
>> print_u/int now stick with native int size.
>> 
>> Signed-off-by: Kevin Darbyshire-Bryant 
>> ---
>> include/json_print.h | 2 +-
>> lib/json_print.c | 2 +-
>> 2 files changed, 2 insertions(+), 2 deletions(-)
>> 
>> diff --git a/include/json_print.h b/include/json_print.h
>> index 2ca7830a..45bc653d 100644
>> --- a/include/json_print.h
>> +++ b/include/json_print.h
>> @@ -56,10 +56,10 @@ void close_json_array(enum output_type type, const char 
>> *delim);
>>  print_color_##type_name(t, COLOR_NONE, key, fmt, value);
>> \
>>  }
>> _PRINT_FUNC(int, int);
>> +_PRINT_FUNC(uint, unsigned int);
>> _PRINT_FUNC(bool, bool);
>> _PRINT_FUNC(null, const char*);
>> _PRINT_FUNC(string, const char*);
>> -_PRINT_FUNC(uint, uint64_t);
>> _PRINT_FUNC(hu, unsigned short);
>> _PRINT_FUNC(hex, unsigned int);
>> _PRINT_FUNC(0xhex, unsigned int);
>> diff --git a/lib/json_print.c b/lib/json_print.c
>> index 6518ba98..8d54d1d4 100644
>> --- a/lib/json_print.c
>> +++ b/lib/json_print.c
>> @@ -117,8 +117,8 @@ void close_json_array(enum output_type type, const char 
>> *str)
>>  }   \
>>  }
>> _PRINT_FUNC(int, int);
>> +_PRINT_FUNC(uint, unsigned int);
>> _PRINT_FUNC(hu, unsigned short);
>> -_PRINT_FUNC(uint, uint64_t);
>> _PRINT_FUNC(lluint, unsigned long long int);
>> _PRINT_FUNC(float, double);
>> #undef _PRINT_FUNC
> 
> 
> I am concerned that this will break output of 64 bit statistics on 32 bit 
> hosts.

I honestly don’t know what to do.  Without the patch I see breakage on <33 bit 
stats with 32 bit big endian hosts ‘cos the printf formatting doesn’t know the 
type passed internally by the function is 64bits long. e.g.

tc qdisc
qdisc noqueue 0: dev lo root refcnt 4486716 
qdisc fq_codel 0: dev eth1 root refcnt 4486716 limit 4498840p flows 4536204 
quantum 4539856 target 5.0ms interval 100.0ms memory_limit 4Mb ecn 
qdisc noqueue 0: dev br-lan root refcnt 4486716 
qdisc noqueue 0: dev eth1.2 root refcnt 4486716 
qdisc noqueue 0: dev br-wifi_guest root refcnt 4486716 
qdisc noqueue 0: dev eth1.15 root refcnt 4486716 
qdisc noqueue 0: dev wlan1 root refcnt 4486716 
qdisc noqueue 0: dev wlan0 root refcnt 4486716 
qdisc noqueue 0: dev wlan1-1 root refcnt 4486716 
qdisc noqueue 0: dev wlan0-1 root refcnt 4486716

I guess _PRINT_FUNC(int, int) could be _PRINT_FUNC(int, int64_t) and then at 
least we’d be consistent in doing hidden promotions and see breakage for both 
signed & unsigned types on certain architectures.

But I think I’ve hit my (lack of) skill limit and don’t really know how to take 
this further forward, or wish to break more protocols :-)






[net-next v2 2/5] tipc: refactor name table translate function

2018-03-29 Thread Jon Maloy
The function tipc_nametbl_translate() function is ugly and hard to
follow. This can be improved somewhat by introducing a stack variable
for holding the publication list to be used and re-ordering the if-
clauses for selection of algorithm.

Signed-off-by: Jon Maloy 
---
 net/tipc/name_table.c | 61 +--
 1 file changed, 25 insertions(+), 36 deletions(-)

diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index e06c7a8..4bdc580 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -399,29 +399,32 @@ struct publication *tipc_nametbl_remove_publ(struct net 
*net, u32 type,
 /**
  * tipc_nametbl_translate - perform service instance to socket translation
  *
- * On entry, 'destnode' is the search domain used during translation.
+ * On entry, 'dnode' is the search domain used during translation.
  *
  * On exit:
- * - if name translation is deferred to another node/cluster/zone,
- *   leaves 'destnode' unchanged (will be non-zero) and returns 0
- * - if name translation is attempted and succeeds, sets 'destnode'
- *   to publication node and returns port reference (will be non-zero)
- * - if name translation is attempted and fails, sets 'destnode' to 0
- *   and returns 0
+ * - if translation is deferred to another node, leave 'dnode' unchanged and
+ *   return 0
+ * - if translation is attempted and succeeds, set 'dnode' to the publishing
+ *   node and return the published (non-zero) port number
+ * - if translation is attempted and fails, set 'dnode' to 0 and return 0
+ *
+ * Note that for legacy users (node configured with Z.C.N address format) the
+ * 'closest-first' lookup algorithm must be maintained, i.e., if dnode is 0
+ * we must look in the local binding list first
  */
-u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance,
-  u32 *destnode)
+u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *dnode)
 {
struct tipc_net *tn = tipc_net(net);
bool legacy = tn->legacy_addr_format;
u32 self = tipc_own_addr(net);
struct service_range *sr;
struct tipc_service *sc;
+   struct list_head *list;
struct publication *p;
u32 port = 0;
u32 node = 0;
 
-   if (!tipc_in_scope(legacy, *destnode, self))
+   if (!tipc_in_scope(legacy, *dnode, self))
return 0;
 
rcu_read_lock();
@@ -434,43 +437,29 @@ u32 tipc_nametbl_translate(struct net *net, u32 type, u32 
instance,
if (unlikely(!sr))
goto no_match;
 
-   /* Closest-First Algorithm */
-   if (legacy && !*destnode) {
-   if (!list_empty(>local_publ)) {
-   p = list_first_entry(>local_publ,
-struct publication,
-local_publ);
-   list_move_tail(>local_publ,
-  >local_publ);
-   } else {
-   p = list_first_entry(>all_publ,
-struct publication,
-all_publ);
-   list_move_tail(>all_publ,
-  >all_publ);
-   }
-   }
-
-   /* Round-Robin Algorithm */
-   else if (*destnode == self) {
-   if (list_empty(>local_publ))
+   /* Select lookup algorithm: local, closest-first or round-robin */
+   if (*dnode == self) {
+   list = >local_publ;
+   if (list_empty(list))
goto no_match;
-   p = list_first_entry(>local_publ, struct publication,
-local_publ);
+   p = list_first_entry(list, struct publication, local_publ);
+   list_move_tail(>local_publ, >local_publ);
+   } else if (legacy && !*dnode && !list_empty(>local_publ)) {
+   list = >local_publ;
+   p = list_first_entry(list, struct publication, local_publ);
list_move_tail(>local_publ, >local_publ);
} else {
-   p = list_first_entry(>all_publ, struct publication,
-all_publ);
+   list = >all_publ;
+   p = list_first_entry(list, struct publication, all_publ);
list_move_tail(>all_publ, >all_publ);
}
-
port = p->port;
node = p->node;
 no_match:
spin_unlock_bh(>lock);
 not_found:
rcu_read_unlock();
-   *destnode = node;
+   *dnode = node;
return port;
 }
 
-- 
2.1.4



[net-next v2 5/5] tipc: avoid possible string overflow

2018-03-29 Thread Jon Maloy
gcc points out that the combined length of the fixed-length inputs to
l->name is larger than the destination buffer size:

net/tipc/link.c: In function 'tipc_link_create':
net/tipc/link.c:465:26: error: '%s' directive writing up to 32 bytes
into a region of size between 26 and 58 [-Werror=format-overflow=]
sprintf(l->name, "%s:%s-%s:unknown", self_str, if_name, peer_str);

net/tipc/link.c:465:2: note: 'sprintf' output 11 or more bytes
(assuming 75) into a destination of size 60
sprintf(l->name, "%s:%s-%s:unknown", self_str, if_name, peer_str);

A detailed analysis reveals that the theoretical maximum length of
a link name is:
max self_str + 1 + max if_name + 1 + max peer_str + 1 + max if_name =
16 + 1 + 15 + 1 + 16 + 1 + 15 = 65
Since we also need space for a trailing zero we now set MAX_LINK_NAME
to 68.

Just to be on the safe side we also replace the sprintf() call with
snprintf().

Fixes: 25b0b9c4e835 ("tipc: handle collisions of 32-bit node address
hash values")
Reported-by: Arnd Bergmann 

Signed-off-by: Jon Maloy 
---
 include/uapi/linux/tipc.h | 2 +-
 net/tipc/link.c   | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h
index 156224a..bf6d286 100644
--- a/include/uapi/linux/tipc.h
+++ b/include/uapi/linux/tipc.h
@@ -216,7 +216,7 @@ struct tipc_group_req {
 #define TIPC_MAX_MEDIA_NAME16
 #define TIPC_MAX_IF_NAME   16
 #define TIPC_MAX_BEARER_NAME   32
-#define TIPC_MAX_LINK_NAME 60
+#define TIPC_MAX_LINK_NAME 68
 
 #define SIOCGETLINKNAMESIOCPROTOPRIVATE
 
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 8f2a949..695acb7 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -462,7 +462,8 @@ bool tipc_link_create(struct net *net, char *if_name, int 
bearer_id,
sprintf(peer_str, "%x", peer);
}
/* Peer i/f name will be completed by reset/activate message */
-   sprintf(l->name, "%s:%s-%s:unknown", self_str, if_name, peer_str);
+   snprintf(l->name, sizeof(l->name), "%s:%s-%s:unknown",
+self_str, if_name, peer_str);
 
strcpy(l->if_name, if_name);
l->addr = peer;
-- 
2.1.4



[net-next v2 1/5] tipc: replace name table service range array with rb tree

2018-03-29 Thread Jon Maloy
The current design of the binding table has an unnecessary memory
consuming and complex data structure. It aggregates the service range
items into an array, which is expanded by a factor two every time it
becomes too small to hold a new item. Furthermore, the arrays never
shrink when the number of ranges diminishes.

We now replace this array with an RB tree that is holding the range
items as tree nodes, each range directly holding a list of bindings.

This, along with a few name changes, improves both readability and
volume of the code, as well as reducing memory consumption and hopefully
improving cache hit rate.

Signed-off-by: Jon Maloy 
---
 net/tipc/core.h   |1 +
 net/tipc/link.c   |2 +-
 net/tipc/name_table.c | 1032 ++---
 net/tipc/name_table.h |2 +-
 net/tipc/node.c   |4 +-
 net/tipc/subscr.h |4 +-
 6 files changed, 477 insertions(+), 568 deletions(-)

diff --git a/net/tipc/core.h b/net/tipc/core.h
index d0f64ca..8020a6c 100644
--- a/net/tipc/core.h
+++ b/net/tipc/core.h
@@ -58,6 +58,7 @@
 #include 
 #include 
 #include 
+#include 
 
 struct tipc_node;
 struct tipc_bearer;
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 1289b4b..8f2a949 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -1810,7 +1810,7 @@ int tipc_link_bc_nack_rcv(struct tipc_link *l, struct 
sk_buff *skb,
 
 void tipc_link_set_queue_limits(struct tipc_link *l, u32 win)
 {
-   int max_bulk = TIPC_MAX_PUBLICATIONS / (l->mtu / ITEM_SIZE);
+   int max_bulk = TIPC_MAX_PUBL / (l->mtu / ITEM_SIZE);
 
l->window = win;
l->backlog[TIPC_LOW_IMPORTANCE].limit  = max_t(u16, 50, win);
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index 4359605..e06c7a8 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -44,52 +44,40 @@
 #include "addr.h"
 #include "node.h"
 #include "group.h"
-#include 
-
-#define TIPC_NAMETBL_SIZE 1024 /* must be a power of 2 */
 
 /**
- * struct name_info - name sequence publication info
- * @node_list: list of publications on own node of this 
- * @all_publ: list of all publications of this 
+ * struct service_range - container for all bindings of a service range
+ * @lower: service range lower bound
+ * @upper: service range upper bound
+ * @tree_node: member of service range RB tree
+ * @local_publ: list of identical publications made from this node
+ *   Used by closest_first lookup and multicast lookup algorithm
+ * @all_publ: all publications identical to this one, whatever node and scope
+ *   Used by round-robin lookup algorithm
  */
-struct name_info {
-   struct list_head local_publ;
-   struct list_head all_publ;
-};
-
-/**
- * struct sub_seq - container for all published instances of a name sequence
- * @lower: name sequence lower bound
- * @upper: name sequence upper bound
- * @info: pointer to name sequence publication info
- */
-struct sub_seq {
+struct service_range {
u32 lower;
u32 upper;
-   struct name_info *info;
+   struct rb_node tree_node;
+   struct list_head local_publ;
+   struct list_head all_publ;
 };
 
 /**
- * struct name_seq - container for all published instances of a name type
- * @type: 32 bit 'type' value for name sequence
- * @sseq: pointer to dynamically-sized array of sub-sequences of this 'type';
- *sub-sequences are sorted in ascending order
- * @alloc: number of sub-sequences currently in array
- * @first_free: array index of first unused sub-sequence entry
- * @ns_list: links to adjacent name sequences in hash chain
- * @subscriptions: list of subscriptions for this 'type'
- * @lock: spinlock controlling access to publication lists of all sub-sequences
+ * struct tipc_service - container for all published instances of a service 
type
+ * @type: 32 bit 'type' value for service
+ * @ranges: rb tree containing all service ranges for this service
+ * @service_list: links to adjacent name ranges in hash chain
+ * @subscriptions: list of subscriptions for this service type
+ * @lock: spinlock controlling access to pertaining service ranges/publications
  * @rcu: RCU callback head used for deferred freeing
  */
-struct name_seq {
+struct tipc_service {
u32 type;
-   struct sub_seq *sseqs;
-   u32 alloc;
-   u32 first_free;
-   struct hlist_node ns_list;
+   struct rb_root ranges;
+   struct hlist_node service_list;
struct list_head subscriptions;
-   spinlock_t lock;
+   spinlock_t lock; /* Covers service range list */
struct rcu_head rcu;
 };
 
@@ -99,17 +87,16 @@ static int hash(int x)
 }
 
 /**
- * publ_create - create a publication structure
+ * tipc_publ_create - create a publication structure
  */
-static struct publication *publ_create(u32 type, u32 lower, u32 upper,
-  u32 scope, u32 node, u32 port,
-  

[net-next v2 3/5] tipc: permit overlapping service ranges in name table

2018-03-29 Thread Jon Maloy
With the new RB tree structure for service ranges it becomes possible to
solve an old problem; - we can now allow overlapping service ranges in
the table.

When inserting a new service range to the tree, we use 'lower' as primary
key, and when necessary 'upper' as secondary key.

Since there may now be multiple service ranges matching an indicated
'lower' value, we must also add the 'upper' value to the functions
used for removing publications, so that the correct, corresponding
range item can be found.

These changes guarantee that a well-formed publication/withdrawal item
from a peer node never will be rejected, and make it possible to
eliminate the problematic backlog functionality we currently have for
handling such cases.

Signed-off-by: Jon Maloy 
---
 net/tipc/name_distr.c | 90 +--
 net/tipc/name_distr.h |  1 -
 net/tipc/name_table.c | 64 +---
 net/tipc/name_table.h |  8 ++---
 net/tipc/net.c|  2 +-
 net/tipc/node.c   |  2 +-
 net/tipc/socket.c |  4 +--
 7 files changed, 60 insertions(+), 111 deletions(-)

diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
index 8240a85..51b4b96 100644
--- a/net/tipc/name_distr.c
+++ b/net/tipc/name_distr.c
@@ -204,12 +204,12 @@ void tipc_named_node_up(struct net *net, u32 dnode)
  */
 static void tipc_publ_purge(struct net *net, struct publication *publ, u32 
addr)
 {
-   struct tipc_net *tn = net_generic(net, tipc_net_id);
+   struct tipc_net *tn = tipc_net(net);
struct publication *p;
 
spin_lock_bh(>nametbl_lock);
-   p = tipc_nametbl_remove_publ(net, publ->type, publ->lower,
-publ->node, publ->port, publ->key);
+   p = tipc_nametbl_remove_publ(net, publ->type, publ->lower, publ->upper,
+publ->node, publ->key);
if (p)
tipc_node_unsubscribe(net, >binding_node, addr);
spin_unlock_bh(>nametbl_lock);
@@ -261,28 +261,31 @@ void tipc_publ_notify(struct net *net, struct list_head 
*nsub_list, u32 addr)
 static bool tipc_update_nametbl(struct net *net, struct distr_item *i,
u32 node, u32 dtype)
 {
-   struct publication *publ = NULL;
+   struct publication *p = NULL;
+   u32 lower = ntohl(i->lower);
+   u32 upper = ntohl(i->upper);
+   u32 type = ntohl(i->type);
+   u32 port = ntohl(i->port);
+   u32 key = ntohl(i->key);
 
if (dtype == PUBLICATION) {
-   publ = tipc_nametbl_insert_publ(net, ntohl(i->type),
-   ntohl(i->lower),
-   ntohl(i->upper),
-   TIPC_CLUSTER_SCOPE, node,
-   ntohl(i->port), ntohl(i->key));
-   if (publ) {
-   tipc_node_subscribe(net, >binding_node, node);
+   p = tipc_nametbl_insert_publ(net, type, lower, upper,
+TIPC_CLUSTER_SCOPE, node,
+port, key);
+   if (p) {
+   tipc_node_subscribe(net, >binding_node, node);
return true;
}
} else if (dtype == WITHDRAWAL) {
-   publ = tipc_nametbl_remove_publ(net, ntohl(i->type),
-   ntohl(i->lower),
-   node, ntohl(i->port),
-   ntohl(i->key));
-   if (publ) {
-   tipc_node_unsubscribe(net, >binding_node, node);
-   kfree_rcu(publ, rcu);
+   p = tipc_nametbl_remove_publ(net, type, lower,
+upper, node, key);
+   if (p) {
+   tipc_node_unsubscribe(net, >binding_node, node);
+   kfree_rcu(p, rcu);
return true;
}
+   pr_warn_ratelimited("Failed to remove binding %u,%u from %x\n",
+   type, lower, node);
} else {
pr_warn("Unrecognized name table message received\n");
}
@@ -290,53 +293,6 @@ static bool tipc_update_nametbl(struct net *net, struct 
distr_item *i,
 }
 
 /**
- * tipc_named_add_backlog - add a failed name table update to the backlog
- *
- */
-static void tipc_named_add_backlog(struct net *net, struct distr_item *i,
-  u32 type, u32 node)
-{
-   struct distr_queue_item *e;
-   struct tipc_net *tn = net_generic(net, tipc_net_id);
-   unsigned long now = get_jiffies_64();
-
-   e = kzalloc(sizeof(*e), GFP_ATOMIC);
-   if (!e)
-   return;
-   e->dtype = type;
-   e->node = node;
-   e->expires = now + 

[net-next v2 4/5] tipc: tipc: rename address types in user api

2018-03-29 Thread Jon Maloy
The three address type structs in the user API have names that in
reality reflect the specific, non-Linux environment where they were
originally created.

We now give them more intuitive names, in accordance with how TIPC is
described in the current documentation.

struct tipc_portid   -> struct tipc_socket_addr
struct tipc_name -> struct tipc_service_addr
struct tipc_name_seq -> struct tipc_service_range

To avoid confusion, we also update some comments and macro names to
 match the new terminology.

For compatibility, we add macros that map all old names to the new ones.

Signed-off-by: Jon Maloy 
---
 include/uapi/linux/tipc.h | 57 +++
 1 file changed, 33 insertions(+), 24 deletions(-)

diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h
index 4ac9f1f..156224a 100644
--- a/include/uapi/linux/tipc.h
+++ b/include/uapi/linux/tipc.h
@@ -45,33 +45,33 @@
  * TIPC addressing primitives
  */
 
-struct tipc_portid {
+struct tipc_socket_addr {
__u32 ref;
__u32 node;
 };
 
-struct tipc_name {
+struct tipc_service_addr {
__u32 type;
__u32 instance;
 };
 
-struct tipc_name_seq {
+struct tipc_service_range {
__u32 type;
__u32 lower;
__u32 upper;
 };
 
 /*
- * Application-accessible port name types
+ * Application-accessible service types
  */
 
-#define TIPC_CFG_SRV   0   /* configuration service name type */
-#define TIPC_TOP_SRV   1   /* topology service name type */
-#define TIPC_LINK_STATE2   /* link state name type */
-#define TIPC_RESERVED_TYPES64  /* lowest user-publishable name type */
+#define TIPC_NODE_STATE0   /* node state service type */
+#define TIPC_TOP_SRV   1   /* topology server service type */
+#define TIPC_LINK_STATE2   /* link state service type */
+#define TIPC_RESERVED_TYPES64  /* lowest user-allowed service type */
 
 /*
- * Publication scopes when binding port names and port name sequences
+ * Publication scopes when binding service / service range
  */
 enum tipc_scope {
TIPC_CLUSTER_SCOPE = 2, /* 0 can also be used */
@@ -108,28 +108,28 @@ enum tipc_scope {
  * TIPC topology subscription service definitions
  */
 
-#define TIPC_SUB_PORTS 0x01/* filter for port availability */
-#define TIPC_SUB_SERVICE   0x02/* filter for service availability */
-#define TIPC_SUB_CANCEL0x04/* cancel a subscription */
+#define TIPC_SUB_PORTS  0x01/* filter: evt at each match */
+#define TIPC_SUB_SERVICE0x02/* filter: evt at first up/last down */
+#define TIPC_SUB_CANCEL 0x04/* filter: cancel a subscription */
 
 #define TIPC_WAIT_FOREVER  (~0)/* timeout for permanent subscription */
 
 struct tipc_subscr {
-   struct tipc_name_seq seq;   /* name sequence of interest */
+   struct tipc_service_range seq;  /* range of interest */
__u32 timeout;  /* subscription duration (in ms) */
__u32 filter;   /* bitmask of filter options */
char usr_handle[8]; /* available for subscriber use */
 };
 
 #define TIPC_PUBLISHED 1   /* publication event */
-#define TIPC_WITHDRAWN 2   /* withdraw event */
+#define TIPC_WITHDRAWN 2   /* withdrawal event */
 #define TIPC_SUBSCR_TIMEOUT3   /* subscription timeout event */
 
 struct tipc_event {
__u32 event;/* event type */
-   __u32 found_lower;  /* matching name seq instances */
-   __u32 found_upper;  /*"  "" "  */
-   struct tipc_portid port;/* associated port */
+   __u32 found_lower;  /* matching range */
+   __u32 found_upper;  /*"  "*/
+   struct tipc_socket_addr port;   /* associated socket */
struct tipc_subscr s;   /* associated subscription */
 };
 
@@ -149,20 +149,20 @@ struct tipc_event {
 #define SOL_TIPC   271
 #endif
 
-#define TIPC_ADDR_NAMESEQ  1
-#define TIPC_ADDR_MCAST1
-#define TIPC_ADDR_NAME 2
-#define TIPC_ADDR_ID   3
+#define TIPC_ADDR_MCAST 1
+#define TIPC_SERVICE_RANGE  1
+#define TIPC_SERVICE_ADDR   2
+#define TIPC_SOCKET_ADDR3
 
 struct sockaddr_tipc {
unsigned short family;
unsigned char  addrtype;
signed   char  scope;
union {
-   struct tipc_portid id;
-   struct tipc_name_seq nameseq;
+   struct tipc_socket_addr id;
+   struct tipc_service_range nameseq;
struct {
-   struct tipc_name name;
+   struct tipc_service_addr name;
__u32 domain;
} name;
} addr;
@@ -230,8 +230,13 @@ struct 

[net-next v2 0/5] tipc: slim down name table

2018-03-29 Thread Jon Maloy
We clean up and improve the name binding table:

 - Replace the memory consuming 'sub_sequence/service range' array with
   an RB tree.
 - Introduce support for overlapping service sequences/ranges

 v2: #1: Fixed a missing initialization reported by David Miller
 #4: Obsoleted and replaced a few more macros to get a consistent
 terminology in the API.
 #5: Added new commit to fix a potential string overflow bug (it
 is still only in net-next) reported by Arnd Bergmann

Jon Maloy (5):
  tipc: replace name table service range array with rb tree
  tipc: refactor name table translate function
  tipc: permit overlapping service ranges in name table
  tipc: tipc: rename address types in user api
  tipc: avoid possible string overflow

 include/uapi/linux/tipc.h |   59 +--
 net/tipc/core.h   |1 +
 net/tipc/link.c   |5 +-
 net/tipc/name_distr.c |   90 +---
 net/tipc/name_distr.h |1 -
 net/tipc/name_table.c | 1075 -
 net/tipc/name_table.h |   10 +-
 net/tipc/net.c|2 +-
 net/tipc/node.c   |4 +-
 net/tipc/socket.c |4 +-
 net/tipc/subscr.h |4 +-
 11 files changed, 556 insertions(+), 699 deletions(-)

-- 
2.1.4



Re: possible deadlock in perf_event_detach_bpf_prog

2018-03-29 Thread Daniel Borkmann
On 03/29/2018 11:04 PM, syzbot wrote:
> Hello,
> 
> syzbot hit the following crash on upstream commit
> 3eb2ce825ea1ad89d20f7a3b5780df850e4be274 (Sun Mar 25 22:44:30 2018 +)
> Linux 4.16-rc7
> syzbot dashboard link: 
> https://syzkaller.appspot.com/bug?extid=dc5ca0e4c9bfafaf2bae
> 
> Unfortunately, I don't have any reproducer for this crash yet.
> Raw console output: 
> https://syzkaller.appspot.com/x/log.txt?id=4742532743299072
> Kernel config: https://syzkaller.appspot.com/x/.config?id=-8440362230543204781
> compiler: gcc (GCC) 7.1.1 20170620
> 
> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+dc5ca0e4c9bfafaf2...@syzkaller.appspotmail.com
> It will help syzbot understand when the bug is fixed. See footer for details.
> If you forward the report, please keep this part and the footer.
> 
> 
> ==
> WARNING: possible circular locking dependency detected
> 4.16.0-rc7+ #3 Not tainted
> --
> syz-executor7/24531 is trying to acquire lock:
>  (bpf_event_mutex){+.+.}, at: [<8a849b07>] 
> perf_event_detach_bpf_prog+0x92/0x3d0 kernel/trace/bpf_trace.c:854
> 
> but task is already holding lock:
>  (>mmap_sem){}, at: [<38768f87>] vm_mmap_pgoff+0x198/0x280 
> mm/util.c:353
> 
> which lock already depends on the new lock.
> 
> 
> the existing dependency chain (in reverse order) is:
> 
> -> #1 (>mmap_sem){}:
>    __might_fault+0x13a/0x1d0 mm/memory.c:4571
>    _copy_to_user+0x2c/0xc0 lib/usercopy.c:25
>    copy_to_user include/linux/uaccess.h:155 [inline]
>    bpf_prog_array_copy_info+0xf2/0x1c0 kernel/bpf/core.c:1694
>    perf_event_query_prog_array+0x1c7/0x2c0 kernel/trace/bpf_trace.c:891

Looks like we should move the two copy_to_user() outside of
bpf_event_mutex section to avoid the deadlock.

>    _perf_ioctl kernel/events/core.c:4750 [inline]
>    perf_ioctl+0x3e1/0x1480 kernel/events/core.c:4770
>    vfs_ioctl fs/ioctl.c:46 [inline]
>    do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:686
>    SYSC_ioctl fs/ioctl.c:701 [inline]
>    SyS_ioctl+0x8f/0xc0 fs/ioctl.c:692
>    do_syscall_64+0x281/0x940 arch/x86/entry/common.c:287
>    entry_SYSCALL_64_after_hwframe+0x42/0xb7
> 
> -> #0 (bpf_event_mutex){+.+.}:
>    lock_acquire+0x1d5/0x580 kernel/locking/lockdep.c:3920
>    __mutex_lock_common kernel/locking/mutex.c:756 [inline]
>    __mutex_lock+0x16f/0x1a80 kernel/locking/mutex.c:893
>    mutex_lock_nested+0x16/0x20 kernel/locking/mutex.c:908
>    perf_event_detach_bpf_prog+0x92/0x3d0 kernel/trace/bpf_trace.c:854
>    perf_event_free_bpf_prog kernel/events/core.c:8147 [inline]
>    _free_event+0xbdb/0x10f0 kernel/events/core.c:4116
>    put_event+0x24/0x30 kernel/events/core.c:4204
>    perf_mmap_close+0x60d/0x1010 kernel/events/core.c:5172
>    remove_vma+0xb4/0x1b0 mm/mmap.c:172
>    remove_vma_list mm/mmap.c:2490 [inline]
>    do_munmap+0x82a/0xdf0 mm/mmap.c:2731
>    mmap_region+0x59e/0x15a0 mm/mmap.c:1646
>    do_mmap+0x6c0/0xe00 mm/mmap.c:1483
>    do_mmap_pgoff include/linux/mm.h:2223 [inline]
>    vm_mmap_pgoff+0x1de/0x280 mm/util.c:355
>    SYSC_mmap_pgoff mm/mmap.c:1533 [inline]
>    SyS_mmap_pgoff+0x462/0x5f0 mm/mmap.c:1491
>    SYSC_mmap arch/x86/kernel/sys_x86_64.c:100 [inline]
>    SyS_mmap+0x16/0x20 arch/x86/kernel/sys_x86_64.c:91
>    do_syscall_64+0x281/0x940 arch/x86/entry/common.c:287
>    entry_SYSCALL_64_after_hwframe+0x42/0xb7
> 
> other info that might help us debug this:
> 
>  Possible unsafe locking scenario:
> 
>    CPU0    CPU1
>        
>   lock(>mmap_sem);
>    lock(bpf_event_mutex);
>    lock(>mmap_sem);
>   lock(bpf_event_mutex);
> 
>  *** DEADLOCK ***
> 
> 1 lock held by syz-executor7/24531:
>  #0:  (>mmap_sem){}, at: [<38768f87>] 
> vm_mmap_pgoff+0x198/0x280 mm/util.c:353
> 
> stack backtrace:
> CPU: 0 PID: 24531 Comm: syz-executor7 Not tainted 4.16.0-rc7+ #3
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS 
> Google 01/01/2011
> Call Trace:
>  __dump_stack lib/dump_stack.c:17 [inline]
>  dump_stack+0x194/0x24d lib/dump_stack.c:53
>  print_circular_bug.isra.38+0x2cd/0x2dc kernel/locking/lockdep.c:1223
>  check_prev_add kernel/locking/lockdep.c:1863 [inline]
>  check_prevs_add kernel/locking/lockdep.c:1976 [inline]
>  validate_chain kernel/locking/lockdep.c:2417 [inline]
>  __lock_acquire+0x30a8/0x3e00 kernel/locking/lockdep.c:3431
>  lock_acquire+0x1d5/0x580 kernel/locking/lockdep.c:3920
>  __mutex_lock_common kernel/locking/mutex.c:756 [inline]
>  __mutex_lock+0x16f/0x1a80 kernel/locking/mutex.c:893
>  mutex_lock_nested+0x16/0x20 kernel/locking/mutex.c:908
>  perf_event_detach_bpf_prog+0x92/0x3d0 kernel/trace/bpf_trace.c:854
>  

Re: Passing uninitialised local variable

2018-03-29 Thread Arend van Spriel

On 3/28/2018 1:20 PM, Himanshu Jha wrote:

Hello everyone,


You added everyone, but me :-(

Not really a problem, but it would help if the driver name was mentioned 
in the subject.



I recently found that a local variable is passed uninitialised to the
function at

drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c:2950

 u32 var;
 err = brcmf_fil_iovar_int_get(ifp, "dtim_assoc", );
 if (err) {
 brcmf_err("wl dtim_assoc failed (%d)\n", err);
 goto update_bss_info_out;
 }
 dtim_period = (u8)var;


Now, the brcmf_fil_iovar_int_get() is defined as:

s32
brcmf_fil_iovar_int_get(struct brcmf_if *ifp, char *name, u32 *data)
{
 __le32 data_le = cpu_to_le32(*data);
 s32 err;

 err = brcmf_fil_iovar_data_get(ifp, name, _le, sizeof(data_le));
 if (err == 0)
 *data = le32_to_cpu(data_le);
 return err;
}

We can clearly see that 'var' is used uninitialised in the very first line
which is an undefined behavior.


Why undefined? We copy some stack data and we do transfer that to the 
device. However in this case the device does nothing with it and it is 
simply overwritten by the response.



So, what could be a possible fix for the above ?

I'm not sure initialising 'var' to 0 would be the correct solution.


Coverity flagged this and probably still does. For this particular 
instance setting var to '0' is fine. However, there are quite a few 
other places. For some instances the data contains a selector value for 
obtaining info from the device, which is what we copy in 
brcmf_fil_iovar_int_get(). So maybe it would be best to have a separate 
function for those, eg. brcmf_fil_iovar_int_selget() or so.


Regards,
Arend


[next-queue PATCH v6 02/10] igb: Fix queue selection on MAC filters on i210

2018-03-29 Thread Vinicius Costa Gomes
On the RAH registers there are semantic differences on the meaning of
the "queue" parameter for traffic steering depending on the controller
model: there is the 82575 meaning, which "queue" means a RX Hardware
Queue, and the i350 meaning, where it is a reception pool.

The previous behaviour was having no effect for i210 based controllers
because the QSEL bit of the RAH register wasn't being set.

This patch separates the condition in discrete cases, so the different
handling is clearer.

Fixes: 83c21335c876 ("igb: improve MAC filter handling")
Signed-off-by: Vinicius Costa Gomes 
---
 drivers/net/ethernet/intel/igb/igb_main.c | 13 +
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c 
b/drivers/net/ethernet/intel/igb/igb_main.c
index c1c0bc30a16d..0a79fef3c4fb 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -8748,12 +8748,17 @@ static void igb_rar_set_index(struct igb_adapter 
*adapter, u32 index)
if (is_valid_ether_addr(addr))
rar_high |= E1000_RAH_AV;
 
-   if (hw->mac.type == e1000_82575)
+   switch (hw->mac.type) {
+   case e1000_82575:
+   case e1000_i210:
rar_high |= E1000_RAH_POOL_1 *
-   adapter->mac_table[index].queue;
-   else
+ adapter->mac_table[index].queue;
+   break;
+   default:
rar_high |= E1000_RAH_POOL_1 <<
-   adapter->mac_table[index].queue;
+   adapter->mac_table[index].queue;
+   break;
+   }
}
 
wr32(E1000_RAL(index), rar_low);
-- 
2.16.3



[next-queue PATCH v6 03/10] igb: Enable the hardware traffic class feature bit for igb models

2018-03-29 Thread Vinicius Costa Gomes
This will allow functionality depending on the hardware being traffic
class aware to work. In particular the tc-flower offloading checks
verifies that this bit is set.

Signed-off-by: Vinicius Costa Gomes 
---
 drivers/net/ethernet/intel/igb/igb_main.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c 
b/drivers/net/ethernet/intel/igb/igb_main.c
index 0a79fef3c4fb..976898d39d6e 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -2807,6 +2807,9 @@ static int igb_probe(struct pci_dev *pdev, const struct 
pci_device_id *ent)
if (hw->mac.type >= e1000_82576)
netdev->features |= NETIF_F_SCTP_CRC;
 
+   if (hw->mac.type >= e1000_i350)
+   netdev->features |= NETIF_F_HW_TC;
+
 #define IGB_GSO_PARTIAL_FEATURES (NETIF_F_GSO_GRE | \
  NETIF_F_GSO_GRE_CSUM | \
  NETIF_F_GSO_IPXIP4 | \
-- 
2.16.3



[next-queue PATCH v6 09/10] igb: Add the skeletons for tc-flower offloading

2018-03-29 Thread Vinicius Costa Gomes
This adds basic functions needed to implement offloading for filters
created by tc-flower.

Signed-off-by: Vinicius Costa Gomes 
---
 drivers/net/ethernet/intel/igb/igb_main.c | 66 +++
 1 file changed, 66 insertions(+)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c 
b/drivers/net/ethernet/intel/igb/igb_main.c
index 1b6fad88107a..e3f33fb8064e 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -36,6 +36,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -2498,6 +2499,69 @@ static int igb_offload_cbs(struct igb_adapter *adapter,
return 0;
 }
 
+static int igb_configure_clsflower(struct igb_adapter *adapter,
+  struct tc_cls_flower_offload *cls_flower)
+{
+   return -EOPNOTSUPP;
+}
+
+static int igb_delete_clsflower(struct igb_adapter *adapter,
+   struct tc_cls_flower_offload *cls_flower)
+{
+   return -EOPNOTSUPP;
+}
+
+static int igb_setup_tc_cls_flower(struct igb_adapter *adapter,
+  struct tc_cls_flower_offload *cls_flower)
+{
+   switch (cls_flower->command) {
+   case TC_CLSFLOWER_REPLACE:
+   return igb_configure_clsflower(adapter, cls_flower);
+   case TC_CLSFLOWER_DESTROY:
+   return igb_delete_clsflower(adapter, cls_flower);
+   case TC_CLSFLOWER_STATS:
+   return -EOPNOTSUPP;
+   default:
+   return -EINVAL;
+   }
+}
+
+static int igb_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
+void *cb_priv)
+{
+   struct igb_adapter *adapter = cb_priv;
+
+   if (!tc_cls_can_offload_and_chain0(adapter->netdev, type_data))
+   return -EOPNOTSUPP;
+
+   switch (type) {
+   case TC_SETUP_CLSFLOWER:
+   return igb_setup_tc_cls_flower(adapter, type_data);
+
+   default:
+   return -EOPNOTSUPP;
+   }
+}
+
+static int igb_setup_tc_block(struct igb_adapter *adapter,
+ struct tc_block_offload *f)
+{
+   if (f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
+   return -EOPNOTSUPP;
+
+   switch (f->command) {
+   case TC_BLOCK_BIND:
+   return tcf_block_cb_register(f->block, igb_setup_tc_block_cb,
+adapter, adapter);
+   case TC_BLOCK_UNBIND:
+   tcf_block_cb_unregister(f->block, igb_setup_tc_block_cb,
+   adapter);
+   return 0;
+   default:
+   return -EOPNOTSUPP;
+   }
+}
+
 static int igb_setup_tc(struct net_device *dev, enum tc_setup_type type,
void *type_data)
 {
@@ -2506,6 +2570,8 @@ static int igb_setup_tc(struct net_device *dev, enum 
tc_setup_type type,
switch (type) {
case TC_SETUP_QDISC_CBS:
return igb_offload_cbs(adapter, type_data);
+   case TC_SETUP_BLOCK:
+   return igb_setup_tc_block(adapter, type_data);
 
default:
return -EOPNOTSUPP;
-- 
2.16.3



[next-queue PATCH v6 04/10] igb: Add support for MAC address filters specifying source addresses

2018-03-29 Thread Vinicius Costa Gomes
Makes it possible to direct packets to queues based on their source
address. Documents the expected usage of the 'flags' parameter.

Signed-off-by: Vinicius Costa Gomes 
---
 drivers/net/ethernet/intel/igb/e1000_defines.h |  1 +
 drivers/net/ethernet/intel/igb/igb.h   |  1 +
 drivers/net/ethernet/intel/igb/igb_main.c  | 40 ++
 3 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/e1000_defines.h 
b/drivers/net/ethernet/intel/igb/e1000_defines.h
index 98534f765e0e..5417edbe3125 100644
--- a/drivers/net/ethernet/intel/igb/e1000_defines.h
+++ b/drivers/net/ethernet/intel/igb/e1000_defines.h
@@ -491,6 +491,7 @@
  * manageability enabled, allowing us room for 15 multicast addresses.
  */
 #define E1000_RAH_AV  0x8000/* Receive descriptor valid */
+#define E1000_RAH_ASEL_SRC_ADDR 0x0001
 #define E1000_RAL_MAC_ADDR_LEN 4
 #define E1000_RAH_MAC_ADDR_LEN 2
 #define E1000_RAH_POOL_MASK 0x03FC
diff --git a/drivers/net/ethernet/intel/igb/igb.h 
b/drivers/net/ethernet/intel/igb/igb.h
index 8dbc399b345e..f3ecda46f287 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -474,6 +474,7 @@ struct igb_mac_addr {
 
 #define IGB_MAC_STATE_DEFAULT  0x1
 #define IGB_MAC_STATE_IN_USE   0x2
+#define IGB_MAC_STATE_SRC_ADDR  0x4
 
 /* board specific private data structure */
 struct igb_adapter {
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c 
b/drivers/net/ethernet/intel/igb/igb_main.c
index 976898d39d6e..2033ec3c9b27 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -6844,8 +6844,14 @@ static void igb_set_default_mac_filter(struct 
igb_adapter *adapter)
igb_rar_set_index(adapter, 0);
 }
 
-static int igb_add_mac_filter(struct igb_adapter *adapter, const u8 *addr,
- const u8 queue)
+/* Add a MAC filter for 'addr' directing matching traffic to 'queue',
+ * 'flags' is used to indicate what kind of match is made, match is by
+ * default for the destination address, if matching by source address
+ * is desired the flag IGB_MAC_STATE_SRC_ADDR can be used.
+ */
+static int igb_add_mac_filter_flags(struct igb_adapter *adapter,
+   const u8 *addr, const u8 queue,
+   const u8 flags)
 {
struct e1000_hw *hw = >hw;
int rar_entries = hw->mac.rar_entry_count -
@@ -6865,7 +6871,7 @@ static int igb_add_mac_filter(struct igb_adapter 
*adapter, const u8 *addr,
 
ether_addr_copy(adapter->mac_table[i].addr, addr);
adapter->mac_table[i].queue = queue;
-   adapter->mac_table[i].state |= IGB_MAC_STATE_IN_USE;
+   adapter->mac_table[i].state |= IGB_MAC_STATE_IN_USE | flags;
 
igb_rar_set_index(adapter, i);
return i;
@@ -6874,8 +6880,21 @@ static int igb_add_mac_filter(struct igb_adapter 
*adapter, const u8 *addr,
return -ENOSPC;
 }
 
-static int igb_del_mac_filter(struct igb_adapter *adapter, const u8 *addr,
+static int igb_add_mac_filter(struct igb_adapter *adapter, const u8 *addr,
  const u8 queue)
+{
+   return igb_add_mac_filter_flags(adapter, addr, queue, 0);
+}
+
+/* Remove a MAC filter for 'addr' directing matching traffic to
+ * 'queue', 'flags' is used to indicate what kind of match need to be
+ * removed, match is by default for the destination address, if
+ * matching by source address is to be removed the flag
+ * IGB_MAC_STATE_SRC_ADDR can be used.
+ */
+static int igb_del_mac_filter_flags(struct igb_adapter *adapter,
+   const u8 *addr, const u8 queue,
+   const u8 flags)
 {
struct e1000_hw *hw = >hw;
int rar_entries = hw->mac.rar_entry_count -
@@ -6892,12 +6911,14 @@ static int igb_del_mac_filter(struct igb_adapter 
*adapter, const u8 *addr,
for (i = 0; i < rar_entries; i++) {
if (!(adapter->mac_table[i].state & IGB_MAC_STATE_IN_USE))
continue;
+   if ((adapter->mac_table[i].state & flags) != flags)
+   continue;
if (adapter->mac_table[i].queue != queue)
continue;
if (!ether_addr_equal(adapter->mac_table[i].addr, addr))
continue;
 
-   adapter->mac_table[i].state &= ~IGB_MAC_STATE_IN_USE;
+   adapter->mac_table[i].state = 0;
memset(adapter->mac_table[i].addr, 0, ETH_ALEN);
adapter->mac_table[i].queue = 0;
 
@@ -6908,6 +6929,12 @@ static int igb_del_mac_filter(struct igb_adapter 
*adapter, const u8 *addr,
return -ENOENT;
 }
 
+static int igb_del_mac_filter(struct igb_adapter *adapter, const u8 *addr,
+ const u8 queue)
+{
+   return 

[next-queue PATCH v6 06/10] igb: Allow filters to be added for the local MAC address

2018-03-29 Thread Vinicius Costa Gomes
Users expect that when adding a steering filter for the local MAC
address, that all the traffic directed to that address will go to some
queue.

Currently, it's not possible to configure entries in the "in use"
state, which is the normal state of the local MAC address entry (it is
the default). This patch allows overriding the steering configuration
of "in use" entries, if the filter to be added matches the address and
address type (source or destination) of an existing entry.

There is a bit of special handling for entries referring to the
local MAC address: when they are removed, only the steering
configuration is reset.

Signed-off-by: Vinicius Costa Gomes 
---
 drivers/net/ethernet/intel/igb/igb_main.c | 40 +++
 1 file changed, 36 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c 
b/drivers/net/ethernet/intel/igb/igb_main.c
index e3da35cab786..1b6fad88107a 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -6844,6 +6844,27 @@ static void igb_set_default_mac_filter(struct 
igb_adapter *adapter)
igb_rar_set_index(adapter, 0);
 }
 
+/* If the filter to be added and an already existing filter express
+ * the same address and address type, it should be possible to only
+ * override the other configurations, for example the queue to steer
+ * traffic.
+ */
+static bool igb_mac_entry_can_be_used(const struct igb_mac_addr *entry,
+ const u8 *addr, const u8 flags)
+{
+   if (!(entry->state & IGB_MAC_STATE_IN_USE))
+   return true;
+
+   if ((entry->state & IGB_MAC_STATE_SRC_ADDR) !=
+   (flags & IGB_MAC_STATE_SRC_ADDR))
+   return false;
+
+   if (!ether_addr_equal(addr, entry->addr))
+   return false;
+
+   return true;
+}
+
 /* Add a MAC filter for 'addr' directing matching traffic to 'queue',
  * 'flags' is used to indicate what kind of match is made, match is by
  * default for the destination address, if matching by source address
@@ -6866,7 +6887,8 @@ static int igb_add_mac_filter_flags(struct igb_adapter 
*adapter,
 * addresses.
 */
for (i = 0; i < rar_entries; i++) {
-   if (adapter->mac_table[i].state & IGB_MAC_STATE_IN_USE)
+   if (!igb_mac_entry_can_be_used(>mac_table[i],
+  addr, flags))
continue;
 
ether_addr_copy(adapter->mac_table[i].addr, addr);
@@ -6918,9 +6940,19 @@ static int igb_del_mac_filter_flags(struct igb_adapter 
*adapter,
if (!ether_addr_equal(adapter->mac_table[i].addr, addr))
continue;
 
-   adapter->mac_table[i].state = 0;
-   memset(adapter->mac_table[i].addr, 0, ETH_ALEN);
-   adapter->mac_table[i].queue = 0;
+   /* When a filter for the default address is "deleted",
+* we return it to its initial configuration
+*/
+   if (adapter->mac_table[i].state & IGB_MAC_STATE_DEFAULT) {
+   adapter->mac_table[i].state =
+   IGB_MAC_STATE_DEFAULT | IGB_MAC_STATE_IN_USE;
+   adapter->mac_table[i].queue =
+   adapter->vfs_allocated_count;
+   } else {
+   adapter->mac_table[i].state = 0;
+   adapter->mac_table[i].queue = 0;
+   memset(adapter->mac_table[i].addr, 0, ETH_ALEN);
+   }
 
igb_rar_set_index(adapter, i);
return 0;
-- 
2.16.3



[next-queue PATCH v6 00/10] igb: offloading of receive filters

2018-03-29 Thread Vinicius Costa Gomes

Hi,

Changes from v5:
 - Filters can now be added for local MAC addresses, when removed,
   they return to their initial configuration (thanks for the testing
   Aaron);

Changes from v4:
 - Added a new bit to the MAC address filters internal
 representation to mean that some filters are steering filters (i.e.
 they direct traffic to queues);
 - And, this is only supported in i210;

Changes from v3:
 - Addressed review comments from Aaron F. Brown and
   Jakub Kicinski;

Changes from v2:
 - Addressed review comments from Jakub Kicinski, mostly about coding
   style adjustments and more consistent error reporting;

Changes from v1:
 - Addressed review comments from Alexander Duyck and Florian
   Fainelli;
 - Adding and removing cls_flower filters are now proposed in the same
   patch;
 - cls_flower filters are kept in a separated list from "ethtool"
   filters (so that section of the original cover letter is no longer
   valid);
 - The patch adding support for ethtool filters is now independent from
   the rest of the series;

Original cover letter:

This series enables some ethtool and tc-flower filters to be offloaded
to igb-based network controllers. This is useful when the system
configurator want to steer kinds of traffic to a specific hardware
queue.

The first two commits are bug fixes.

The basis of this series is to export the internal API used to
configure address filters, so they can be used by ethtool, and
extending the functionality so a source address can be handled.

Then, we enable the tc-flower offloading implementation to re-use the
same infrastructure as ethtool, and store the filters in the
per-adapter "nfc" (Network Filter Config?) list. But for consistency,
for destructive access they are separated, i.e. a filter added by
tc-flower can only be removed by tc-flower, but ethtool can read them
all.

Only support for VLAN Prio, Source and Destination MAC Address, and
Ethertype is enabled for now.

Open question:
  - igb is initialized with the number of traffic classes as 1, if we
  want to use multiple traffic classes we need to increase this value,
  the only way I could find is to use mqprio (for example). Should igb
  be initialized with, say, the number of queues as its "num_tc"?


Vinicius Costa Gomes (10):
  igb: Fix not adding filter elements to the list
  igb: Fix queue selection on MAC filters on i210
  igb: Enable the hardware traffic class feature bit for igb models
  igb: Add support for MAC address filters specifying source addresses
  igb: Add support for enabling queue steering in filters
  igb: Allow filters to be added for the local MAC address
  igb: Enable nfc filters to specify MAC addresses
  igb: Add MAC address support for ethtool nftuple filters
  igb: Add the skeletons for tc-flower offloading
  igb: Add support for adding offloaded clsflower filters

 drivers/net/ethernet/intel/igb/e1000_defines.h |   2 +
 drivers/net/ethernet/intel/igb/igb.h   |  13 +
 drivers/net/ethernet/intel/igb/igb_ethtool.c   |  65 -
 drivers/net/ethernet/intel/igb/igb_main.c  | 370 -
 4 files changed, 433 insertions(+), 17 deletions(-)

--
2.16.3


[next-queue PATCH v6 10/10] igb: Add support for adding offloaded clsflower filters

2018-03-29 Thread Vinicius Costa Gomes
This allows filters added by tc-flower and specifying MAC addresses,
Ethernet types, and the VLAN priority field, to be offloaded to the
controller.

This reuses most of the infrastructure used by ethtool, but clsflower
filters are kept in a separated list, so they are invisible to
ethtool.

Signed-off-by: Vinicius Costa Gomes 
---
 drivers/net/ethernet/intel/igb/igb.h  |   2 +
 drivers/net/ethernet/intel/igb/igb_main.c | 188 +-
 2 files changed, 188 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb.h 
b/drivers/net/ethernet/intel/igb/igb.h
index b9b965921e9f..a413284fada6 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -465,6 +465,7 @@ struct igb_nfc_input {
 struct igb_nfc_filter {
struct hlist_node nfc_node;
struct igb_nfc_input filter;
+   unsigned long cookie;
u16 etype_reg_index;
u16 sw_idx;
u16 action;
@@ -604,6 +605,7 @@ struct igb_adapter {
 
/* RX network flow classification support */
struct hlist_head nfc_filter_list;
+   struct hlist_head cls_flower_list;
unsigned int nfc_filter_count;
/* lock for RX network flow classification filter */
spinlock_t nfc_lock;
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c 
b/drivers/net/ethernet/intel/igb/igb_main.c
index e3f33fb8064e..3c2e68dd0902 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -2499,16 +2499,197 @@ static int igb_offload_cbs(struct igb_adapter *adapter,
return 0;
 }
 
+#define ETHER_TYPE_FULL_MASK ((__force __be16)~0)
+#define VLAN_PRIO_FULL_MASK (0x07)
+
+static int igb_parse_cls_flower(struct igb_adapter *adapter,
+   struct tc_cls_flower_offload *f,
+   int traffic_class,
+   struct igb_nfc_filter *input)
+{
+   struct netlink_ext_ack *extack = f->common.extack;
+
+   if (f->dissector->used_keys &
+   ~(BIT(FLOW_DISSECTOR_KEY_BASIC) |
+ BIT(FLOW_DISSECTOR_KEY_CONTROL) |
+ BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS) |
+ BIT(FLOW_DISSECTOR_KEY_VLAN))) {
+   NL_SET_ERR_MSG_MOD(extack,
+  "Unsupported key used, only BASIC, CONTROL, 
ETH_ADDRS and VLAN are supported");
+   return -EOPNOTSUPP;
+   }
+
+   if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
+   struct flow_dissector_key_eth_addrs *key, *mask;
+
+   key = skb_flow_dissector_target(f->dissector,
+   FLOW_DISSECTOR_KEY_ETH_ADDRS,
+   f->key);
+   mask = skb_flow_dissector_target(f->dissector,
+FLOW_DISSECTOR_KEY_ETH_ADDRS,
+f->mask);
+
+   if (!is_zero_ether_addr(mask->dst)) {
+   if (!is_broadcast_ether_addr(mask->dst)) {
+   NL_SET_ERR_MSG_MOD(extack, "Only full masks are 
supported for destination MAC address");
+   return -EINVAL;
+   }
+
+   input->filter.match_flags |=
+   IGB_FILTER_FLAG_DST_MAC_ADDR;
+   ether_addr_copy(input->filter.dst_addr, key->dst);
+   }
+
+   if (!is_zero_ether_addr(mask->src)) {
+   if (!is_broadcast_ether_addr(mask->src)) {
+   NL_SET_ERR_MSG_MOD(extack, "Only full masks are 
supported for source MAC address");
+   return -EINVAL;
+   }
+
+   input->filter.match_flags |=
+   IGB_FILTER_FLAG_SRC_MAC_ADDR;
+   ether_addr_copy(input->filter.src_addr, key->src);
+   }
+   }
+
+   if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_BASIC)) {
+   struct flow_dissector_key_basic *key, *mask;
+
+   key = skb_flow_dissector_target(f->dissector,
+   FLOW_DISSECTOR_KEY_BASIC,
+   f->key);
+   mask = skb_flow_dissector_target(f->dissector,
+FLOW_DISSECTOR_KEY_BASIC,
+f->mask);
+
+   if (mask->n_proto) {
+   if (mask->n_proto != ETHER_TYPE_FULL_MASK) {
+   NL_SET_ERR_MSG_MOD(extack, "Only full mask is 
supported for EtherType filter");
+   return -EINVAL;
+   }
+
+   input->filter.match_flags |= IGB_FILTER_FLAG_ETHER_TYPE;
+ 

[next-queue PATCH v6 05/10] igb: Add support for enabling queue steering in filters

2018-03-29 Thread Vinicius Costa Gomes
On some igb models (82575 and i210) the MAC address filters can
control to which queue the packet will be assigned.

This extends the 'state' with one more state to signify that queue
selection should be enabled for that filter.

As 82575 parts are no longer easily obtained (and this was developed
against i210), only support for the i210 model is enabled.

These functions are exported and will be used in the next patch.

Signed-off-by: Vinicius Costa Gomes 
---
 drivers/net/ethernet/intel/igb/e1000_defines.h |  1 +
 drivers/net/ethernet/intel/igb/igb.h   |  6 ++
 drivers/net/ethernet/intel/igb/igb_main.c  | 26 ++
 3 files changed, 33 insertions(+)

diff --git a/drivers/net/ethernet/intel/igb/e1000_defines.h 
b/drivers/net/ethernet/intel/igb/e1000_defines.h
index 5417edbe3125..d3d1d868e7ba 100644
--- a/drivers/net/ethernet/intel/igb/e1000_defines.h
+++ b/drivers/net/ethernet/intel/igb/e1000_defines.h
@@ -492,6 +492,7 @@
  */
 #define E1000_RAH_AV  0x8000/* Receive descriptor valid */
 #define E1000_RAH_ASEL_SRC_ADDR 0x0001
+#define E1000_RAH_QSEL_ENABLE 0x1000
 #define E1000_RAL_MAC_ADDR_LEN 4
 #define E1000_RAH_MAC_ADDR_LEN 2
 #define E1000_RAH_POOL_MASK 0x03FC
diff --git a/drivers/net/ethernet/intel/igb/igb.h 
b/drivers/net/ethernet/intel/igb/igb.h
index f3ecda46f287..f48ba090fd6a 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -475,6 +475,7 @@ struct igb_mac_addr {
 #define IGB_MAC_STATE_DEFAULT  0x1
 #define IGB_MAC_STATE_IN_USE   0x2
 #define IGB_MAC_STATE_SRC_ADDR  0x4
+#define IGB_MAC_STATE_QUEUE_STEERING 0x8
 
 /* board specific private data structure */
 struct igb_adapter {
@@ -740,4 +741,9 @@ int igb_add_filter(struct igb_adapter *adapter,
 int igb_erase_filter(struct igb_adapter *adapter,
 struct igb_nfc_filter *input);
 
+int igb_add_mac_steering_filter(struct igb_adapter *adapter,
+   const u8 *addr, u8 queue, u8 flags);
+int igb_del_mac_steering_filter(struct igb_adapter *adapter,
+   const u8 *addr, u8 queue, u8 flags);
+
 #endif /* _IGB_H_ */
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c 
b/drivers/net/ethernet/intel/igb/igb_main.c
index 2033ec3c9b27..e3da35cab786 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -6935,6 +6935,28 @@ static int igb_del_mac_filter(struct igb_adapter 
*adapter, const u8 *addr,
return igb_del_mac_filter_flags(adapter, addr, queue, 0);
 }
 
+int igb_add_mac_steering_filter(struct igb_adapter *adapter,
+   const u8 *addr, u8 queue, u8 flags)
+{
+   struct e1000_hw *hw = >hw;
+
+   /* In theory, this should be supported on 82575 as well, but
+* that part wasn't easily accessible during development.
+*/
+   if (hw->mac.type != e1000_i210)
+   return -EOPNOTSUPP;
+
+   return igb_add_mac_filter_flags(adapter, addr, queue,
+   IGB_MAC_STATE_QUEUE_STEERING | flags);
+}
+
+int igb_del_mac_steering_filter(struct igb_adapter *adapter,
+   const u8 *addr, u8 queue, u8 flags)
+{
+   return igb_del_mac_filter_flags(adapter, addr, queue,
+   IGB_MAC_STATE_QUEUE_STEERING | flags);
+}
+
 static int igb_uc_sync(struct net_device *netdev, const unsigned char *addr)
 {
struct igb_adapter *adapter = netdev_priv(netdev);
@@ -8784,6 +8806,10 @@ static void igb_rar_set_index(struct igb_adapter 
*adapter, u32 index)
switch (hw->mac.type) {
case e1000_82575:
case e1000_i210:
+   if (adapter->mac_table[index].state &
+   IGB_MAC_STATE_QUEUE_STEERING)
+   rar_high |= E1000_RAH_QSEL_ENABLE;
+
rar_high |= E1000_RAH_POOL_1 *
  adapter->mac_table[index].queue;
break;
-- 
2.16.3



[next-queue PATCH v6 08/10] igb: Add MAC address support for ethtool nftuple filters

2018-03-29 Thread Vinicius Costa Gomes
This adds the capability of configuring the queue steering of arriving
packets based on their source and destination MAC addresses.

In practical terms this adds support for the following use cases,
characterized by these examples:

$ ethtool -N eth0 flow-type ether dst aa:aa:aa:aa:aa:aa action 0
(this will direct packets with destination address "aa:aa:aa:aa:aa:aa"
to the RX queue 0)

$ ethtool -N eth0 flow-type ether src 44:44:44:44:44:44 action 3
(this will direct packets with source address "44:44:44:44:44:44" to
the RX queue 3)

Signed-off-by: Vinicius Costa Gomes 
---
 drivers/net/ethernet/intel/igb/igb_ethtool.c | 35 
 1 file changed, 31 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c 
b/drivers/net/ethernet/intel/igb/igb_ethtool.c
index 31b2960a7869..491946e09f8e 100644
--- a/drivers/net/ethernet/intel/igb/igb_ethtool.c
+++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c
@@ -2495,6 +2495,23 @@ static int igb_get_ethtool_nfc_entry(struct igb_adapter 
*adapter,
fsp->h_ext.vlan_tci = rule->filter.vlan_tci;
fsp->m_ext.vlan_tci = htons(VLAN_PRIO_MASK);
}
+   if (rule->filter.match_flags & IGB_FILTER_FLAG_DST_MAC_ADDR) {
+   ether_addr_copy(fsp->h_u.ether_spec.h_dest,
+   rule->filter.dst_addr);
+   /* As we only support matching by the full
+* mask, return the mask to userspace
+*/
+   eth_broadcast_addr(fsp->m_u.ether_spec.h_dest);
+   }
+   if (rule->filter.match_flags & IGB_FILTER_FLAG_SRC_MAC_ADDR) {
+   ether_addr_copy(fsp->h_u.ether_spec.h_source,
+   rule->filter.src_addr);
+   /* As we only support matching by the full
+* mask, return the mask to userspace
+*/
+   eth_broadcast_addr(fsp->m_u.ether_spec.h_source);
+   }
+
return 0;
}
return -EINVAL;
@@ -2933,10 +2950,6 @@ static int igb_add_ethtool_nfc_entry(struct igb_adapter 
*adapter,
if ((fsp->flow_type & ~FLOW_EXT) != ETHER_FLOW)
return -EINVAL;
 
-   if (fsp->m_u.ether_spec.h_proto != ETHER_TYPE_FULL_MASK &&
-   fsp->m_ext.vlan_tci != htons(VLAN_PRIO_MASK))
-   return -EINVAL;
-
input = kzalloc(sizeof(*input), GFP_KERNEL);
if (!input)
return -ENOMEM;
@@ -2946,6 +2959,20 @@ static int igb_add_ethtool_nfc_entry(struct igb_adapter 
*adapter,
input->filter.match_flags = IGB_FILTER_FLAG_ETHER_TYPE;
}
 
+   /* Only support matching addresses by the full mask */
+   if (is_broadcast_ether_addr(fsp->m_u.ether_spec.h_source)) {
+   input->filter.match_flags |= IGB_FILTER_FLAG_SRC_MAC_ADDR;
+   ether_addr_copy(input->filter.src_addr,
+   fsp->h_u.ether_spec.h_source);
+   }
+
+   /* Only support matching addresses by the full mask */
+   if (is_broadcast_ether_addr(fsp->m_u.ether_spec.h_dest)) {
+   input->filter.match_flags |= IGB_FILTER_FLAG_DST_MAC_ADDR;
+   ether_addr_copy(input->filter.dst_addr,
+   fsp->h_u.ether_spec.h_dest);
+   }
+
if ((fsp->flow_type & FLOW_EXT) && fsp->m_ext.vlan_tci) {
if (fsp->m_ext.vlan_tci != htons(VLAN_PRIO_MASK)) {
err = -EINVAL;
-- 
2.16.3



[next-queue PATCH v6 07/10] igb: Enable nfc filters to specify MAC addresses

2018-03-29 Thread Vinicius Costa Gomes
This allows igb_add_filter()/igb_erase_filter() to work on filters
that include MAC addresses (both source and destination).

For now, this only exposes the functionality; the next commit glues
ethtool into this. Later in this series, these APIs are used to allow
offloading of cls_flower filters.

Signed-off-by: Vinicius Costa Gomes 
---
 drivers/net/ethernet/intel/igb/igb.h |  4 
 drivers/net/ethernet/intel/igb/igb_ethtool.c | 28 
 2 files changed, 32 insertions(+)

diff --git a/drivers/net/ethernet/intel/igb/igb.h 
b/drivers/net/ethernet/intel/igb/igb.h
index f48ba090fd6a..b9b965921e9f 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -442,6 +442,8 @@ struct hwmon_buff {
 enum igb_filter_match_flags {
IGB_FILTER_FLAG_ETHER_TYPE = 0x1,
IGB_FILTER_FLAG_VLAN_TCI   = 0x2,
+   IGB_FILTER_FLAG_SRC_MAC_ADDR   = 0x4,
+   IGB_FILTER_FLAG_DST_MAC_ADDR   = 0x8,
 };
 
 #define IGB_MAX_RXNFC_FILTERS 16
@@ -456,6 +458,8 @@ struct igb_nfc_input {
u8 match_flags;
__be16 etype;
__be16 vlan_tci;
+   u8 src_addr[ETH_ALEN];
+   u8 dst_addr[ETH_ALEN];
 };
 
 struct igb_nfc_filter {
diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c 
b/drivers/net/ethernet/intel/igb/igb_ethtool.c
index 5975d432836f..31b2960a7869 100644
--- a/drivers/net/ethernet/intel/igb/igb_ethtool.c
+++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c
@@ -2776,6 +2776,25 @@ int igb_add_filter(struct igb_adapter *adapter, struct 
igb_nfc_filter *input)
return err;
}
 
+   if (input->filter.match_flags & IGB_FILTER_FLAG_DST_MAC_ADDR) {
+   err = igb_add_mac_steering_filter(adapter,
+ input->filter.dst_addr,
+ input->action, 0);
+   err = min_t(int, err, 0);
+   if (err)
+   return err;
+   }
+
+   if (input->filter.match_flags & IGB_FILTER_FLAG_SRC_MAC_ADDR) {
+   err = igb_add_mac_steering_filter(adapter,
+ input->filter.src_addr,
+ input->action,
+ IGB_MAC_STATE_SRC_ADDR);
+   err = min_t(int, err, 0);
+   if (err)
+   return err;
+   }
+
if (input->filter.match_flags & IGB_FILTER_FLAG_VLAN_TCI)
err = igb_rxnfc_write_vlan_prio_filter(adapter, input);
 
@@ -2824,6 +2843,15 @@ int igb_erase_filter(struct igb_adapter *adapter, struct 
igb_nfc_filter *input)
igb_clear_vlan_prio_filter(adapter,
   ntohs(input->filter.vlan_tci));
 
+   if (input->filter.match_flags & IGB_FILTER_FLAG_SRC_MAC_ADDR)
+   igb_del_mac_steering_filter(adapter, input->filter.src_addr,
+   input->action,
+   IGB_MAC_STATE_SRC_ADDR);
+
+   if (input->filter.match_flags & IGB_FILTER_FLAG_DST_MAC_ADDR)
+   igb_del_mac_steering_filter(adapter, input->filter.dst_addr,
+   input->action, 0);
+
return 0;
 }
 
-- 
2.16.3



[next-queue PATCH v6 01/10] igb: Fix not adding filter elements to the list

2018-03-29 Thread Vinicius Costa Gomes
Because the order of the parameters passed to 'hlist_add_behind()' was
inverted, the 'parent' node was added "behind" the 'input'. As 'input'
is not in the list, this causes the 'input' node to be lost.

Fixes: 0e71def25281 ("igb: add support of RX network flow classification")
Signed-off-by: Vinicius Costa Gomes 
---
 drivers/net/ethernet/intel/igb/igb_ethtool.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c 
b/drivers/net/ethernet/intel/igb/igb_ethtool.c
index e77ba0d5866d..5975d432836f 100644
--- a/drivers/net/ethernet/intel/igb/igb_ethtool.c
+++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c
@@ -2865,7 +2865,7 @@ static int igb_update_ethtool_nfc_entry(struct 
igb_adapter *adapter,
 
/* add filter to the list */
if (parent)
-   hlist_add_behind(&parent->nfc_node, &input->nfc_node);
+   hlist_add_behind(&input->nfc_node, &parent->nfc_node);
else
hlist_add_head(>nfc_node, >nfc_filter_list);
 
-- 
2.16.3



Re: [PATCH iproute2-next] json_print: fix print_uint with helper type extensions

2018-03-29 Thread Kevin Darbyshire-Bryant


> On 29 Mar 2018, at 22:03, Stephen Hemminger  
> wrote:
> 
> On Thu, 29 Mar 2018 20:32:10 +0100
> Kevin Darbyshire-Bryant  wrote:
> 
>> Introduce print helper functions for int, uint, explicit int32, uint32,
>> int64 & uint64.
>> 
>> print_int used 'int' type internally, whereas print_uint used 'uint64_t'
>> 
>> These helper functions eventually call vfprintf(fp, fmt, args) which is
>> a variable argument list function and is dependent upon 'fmt' containing
>> correct information about the length of the passed arguments.
>> 
>> Unfortunately print_int v print_uint offered no clue to the programmer
>> that internally passed ints to print_uint were being promoted to 64bits,
>> thus the format passed in 'fmt' string vs the actual passed integer
>> could be different lengths.  This is even more interesting on big endian
>> architectures where 'vfprintf' would be looking in the middle of an
>> int64 type.
>> 
>> print_u/int now stick with native int size.  print_u/int32 & print
>> u/int64 functions offer explicit integer sizes.
>> 
>> To portably use these formats you should use the relevant PRIdN or PRIuN
>> formats as defined in inttypes.h
>> 
>> e.g.
>> 
>> print_uint64(PRINT_ANY, "refcnt", "refcnt %" PRIu64 " ", t->tcm_info)
>> 
>> Signed-off-by: Kevin Darbyshire-Bryant 
>> ---
>> include/json_print.h | 6 +-
>> lib/json_print.c | 6 +-
>> 2 files changed, 10 insertions(+), 2 deletions(-)
>> 
>> diff --git a/include/json_print.h b/include/json_print.h
>> index 2ca7830a..fb62b142 100644
>> --- a/include/json_print.h
>> +++ b/include/json_print.h
>> @@ -56,10 +56,14 @@ void close_json_array(enum output_type type, const char 
>> *delim);
>>  print_color_##type_name(t, COLOR_NONE, key, fmt, value);
>> \
>>  }
>> _PRINT_FUNC(int, int);
>> +_PRINT_FUNC(uint, unsigned int);
>> _PRINT_FUNC(bool, bool);
>> _PRINT_FUNC(null, const char*);
>> _PRINT_FUNC(string, const char*);
>> -_PRINT_FUNC(uint, uint64_t);
>> +_PRINT_FUNC(int32, int32_t);
>> +_PRINT_FUNC(uint32, uint32_t);
>> +_PRINT_FUNC(int64, int64_t);
>> +_PRINT_FUNC(uint64, uint64_t);
>> _PRINT_FUNC(hu, unsigned short);
>> _PRINT_FUNC(hex, unsigned int);
>> _PRINT_FUNC(0xhex, unsigned int);
>> diff --git a/lib/json_print.c b/lib/json_print.c
>> index bda72933..1194a6ec 100644
>> --- a/lib/json_print.c
>> +++ b/lib/json_print.c
>> @@ -116,8 +116,12 @@ void close_json_array(enum output_type type, const char 
>> *str)
>>  }   \
>>  }
>> _PRINT_FUNC(int, int);
>> +_PRINT_FUNC(uint, unsigned int);
>> _PRINT_FUNC(hu, unsigned short);
>> -_PRINT_FUNC(uint, uint64_t);
>> +_PRINT_FUNC(int32, int32_t);
>> +_PRINT_FUNC(uint32, uint32_t);
>> +_PRINT_FUNC(int64, int64_t);
>> +_PRINT_FUNC(uint64, uint64_t);
>> _PRINT_FUNC(lluint, unsigned long long int);
>> _PRINT_FUNC(float, double);
>> #undef _PRINT_FUNC
> 
> You sent patches to both trees. That is not the correct protocol.
> Choose one, get it reviewed.  iproute2-next will get merged from master (in 
> fact
> dave should be doing it regularly).

I got this from Dave "Kevin: I guess you need to split the patch. Extract the 
bug fix piece
and send for iproute2; enhancements go to iproute2-next.”

So I thought I was doing the right thing.

But to be blunt, I’m giving up now.


Cheers,

Kevin D-B

012C ACB2 28C6 C53E 9775  9123 B3A2 389B 9DE2 334A



possible deadlock in perf_event_detach_bpf_prog

2018-03-29 Thread syzbot

Hello,

syzbot hit the following crash on upstream commit
3eb2ce825ea1ad89d20f7a3b5780df850e4be274 (Sun Mar 25 22:44:30 2018 +)
Linux 4.16-rc7
syzbot dashboard link:  
https://syzkaller.appspot.com/bug?extid=dc5ca0e4c9bfafaf2bae


Unfortunately, I don't have any reproducer for this crash yet.
Raw console output:  
https://syzkaller.appspot.com/x/log.txt?id=4742532743299072
Kernel config:  
https://syzkaller.appspot.com/x/.config?id=-8440362230543204781

compiler: gcc (GCC) 7.1.1 20170620

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+dc5ca0e4c9bfafaf2...@syzkaller.appspotmail.com
It will help syzbot understand when the bug is fixed. See footer for  
details.

If you forward the report, please keep this part and the footer.


==
WARNING: possible circular locking dependency detected
4.16.0-rc7+ #3 Not tainted
--
syz-executor7/24531 is trying to acquire lock:
 (bpf_event_mutex){+.+.}, at: [<8a849b07>]  
perf_event_detach_bpf_prog+0x92/0x3d0 kernel/trace/bpf_trace.c:854


but task is already holding lock:
 (>mmap_sem){}, at: [<38768f87>] vm_mmap_pgoff+0x198/0x280  
mm/util.c:353


which lock already depends on the new lock.


the existing dependency chain (in reverse order) is:

-> #1 (>mmap_sem){}:
   __might_fault+0x13a/0x1d0 mm/memory.c:4571
   _copy_to_user+0x2c/0xc0 lib/usercopy.c:25
   copy_to_user include/linux/uaccess.h:155 [inline]
   bpf_prog_array_copy_info+0xf2/0x1c0 kernel/bpf/core.c:1694
   perf_event_query_prog_array+0x1c7/0x2c0 kernel/trace/bpf_trace.c:891
   _perf_ioctl kernel/events/core.c:4750 [inline]
   perf_ioctl+0x3e1/0x1480 kernel/events/core.c:4770
   vfs_ioctl fs/ioctl.c:46 [inline]
   do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:686
   SYSC_ioctl fs/ioctl.c:701 [inline]
   SyS_ioctl+0x8f/0xc0 fs/ioctl.c:692
   do_syscall_64+0x281/0x940 arch/x86/entry/common.c:287
   entry_SYSCALL_64_after_hwframe+0x42/0xb7

-> #0 (bpf_event_mutex){+.+.}:
   lock_acquire+0x1d5/0x580 kernel/locking/lockdep.c:3920
   __mutex_lock_common kernel/locking/mutex.c:756 [inline]
   __mutex_lock+0x16f/0x1a80 kernel/locking/mutex.c:893
   mutex_lock_nested+0x16/0x20 kernel/locking/mutex.c:908
   perf_event_detach_bpf_prog+0x92/0x3d0 kernel/trace/bpf_trace.c:854
   perf_event_free_bpf_prog kernel/events/core.c:8147 [inline]
   _free_event+0xbdb/0x10f0 kernel/events/core.c:4116
   put_event+0x24/0x30 kernel/events/core.c:4204
   perf_mmap_close+0x60d/0x1010 kernel/events/core.c:5172
   remove_vma+0xb4/0x1b0 mm/mmap.c:172
   remove_vma_list mm/mmap.c:2490 [inline]
   do_munmap+0x82a/0xdf0 mm/mmap.c:2731
   mmap_region+0x59e/0x15a0 mm/mmap.c:1646
   do_mmap+0x6c0/0xe00 mm/mmap.c:1483
   do_mmap_pgoff include/linux/mm.h:2223 [inline]
   vm_mmap_pgoff+0x1de/0x280 mm/util.c:355
   SYSC_mmap_pgoff mm/mmap.c:1533 [inline]
   SyS_mmap_pgoff+0x462/0x5f0 mm/mmap.c:1491
   SYSC_mmap arch/x86/kernel/sys_x86_64.c:100 [inline]
   SyS_mmap+0x16/0x20 arch/x86/kernel/sys_x86_64.c:91
   do_syscall_64+0x281/0x940 arch/x86/entry/common.c:287
   entry_SYSCALL_64_after_hwframe+0x42/0xb7

other info that might help us debug this:

 Possible unsafe locking scenario:

   CPU0CPU1
   
  lock(>mmap_sem);
   lock(bpf_event_mutex);
   lock(>mmap_sem);
  lock(bpf_event_mutex);

 *** DEADLOCK ***

1 lock held by syz-executor7/24531:
 #0:  (>mmap_sem){}, at: [<38768f87>]  
vm_mmap_pgoff+0x198/0x280 mm/util.c:353


stack backtrace:
CPU: 0 PID: 24531 Comm: syz-executor7 Not tainted 4.16.0-rc7+ #3
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011

Call Trace:
 __dump_stack lib/dump_stack.c:17 [inline]
 dump_stack+0x194/0x24d lib/dump_stack.c:53
 print_circular_bug.isra.38+0x2cd/0x2dc kernel/locking/lockdep.c:1223
 check_prev_add kernel/locking/lockdep.c:1863 [inline]
 check_prevs_add kernel/locking/lockdep.c:1976 [inline]
 validate_chain kernel/locking/lockdep.c:2417 [inline]
 __lock_acquire+0x30a8/0x3e00 kernel/locking/lockdep.c:3431
 lock_acquire+0x1d5/0x580 kernel/locking/lockdep.c:3920
 __mutex_lock_common kernel/locking/mutex.c:756 [inline]
 __mutex_lock+0x16f/0x1a80 kernel/locking/mutex.c:893
 mutex_lock_nested+0x16/0x20 kernel/locking/mutex.c:908
 perf_event_detach_bpf_prog+0x92/0x3d0 kernel/trace/bpf_trace.c:854
 perf_event_free_bpf_prog kernel/events/core.c:8147 [inline]
 _free_event+0xbdb/0x10f0 kernel/events/core.c:4116
 put_event+0x24/0x30 kernel/events/core.c:4204
 perf_mmap_close+0x60d/0x1010 kernel/events/core.c:5172
 remove_vma+0xb4/0x1b0 mm/mmap.c:172
 remove_vma_list mm/mmap.c:2490 [inline]
 do_munmap+0x82a/0xdf0 mm/mmap.c:2731
 mmap_region+0x59e/0x15a0 

Re: [PATCH iproute2-next] json_print: fix print_uint with helper type extensions

2018-03-29 Thread Stephen Hemminger
On Thu, 29 Mar 2018 20:32:10 +0100
Kevin Darbyshire-Bryant  wrote:

> Introduce print helper functions for int, uint, explicit int32, uint32,
> int64 & uint64.
> 
> print_int used 'int' type internally, whereas print_uint used 'uint64_t'
> 
> These helper functions eventually call vfprintf(fp, fmt, args) which is
> a variable argument list function and is dependent upon 'fmt' containing
> correct information about the length of the passed arguments.
> 
> Unfortunately print_int v print_uint offered no clue to the programmer
> that internally passed ints to print_uint were being promoted to 64bits,
> thus the format passed in 'fmt' string vs the actual passed integer
> could be different lengths.  This is even more interesting on big endian
> architectures where 'vfprintf' would be looking in the middle of an
> int64 type.
> 
> print_u/int now stick with native int size.  print_u/int32 & print
> u/int64 functions offer explicit integer sizes.
> 
> To portably use these formats you should use the relevant PRIdN or PRIuN
> formats as defined in inttypes.h
> 
> e.g.
> 
> print_uint64(PRINT_ANY, "refcnt", "refcnt %" PRIu64 " ", t->tcm_info)
> 
> Signed-off-by: Kevin Darbyshire-Bryant 
> ---
>  include/json_print.h | 6 +-
>  lib/json_print.c | 6 +-
>  2 files changed, 10 insertions(+), 2 deletions(-)
> 
> diff --git a/include/json_print.h b/include/json_print.h
> index 2ca7830a..fb62b142 100644
> --- a/include/json_print.h
> +++ b/include/json_print.h
> @@ -56,10 +56,14 @@ void close_json_array(enum output_type type, const char 
> *delim);
>   print_color_##type_name(t, COLOR_NONE, key, fmt, value);
> \
>   }
>  _PRINT_FUNC(int, int);
> +_PRINT_FUNC(uint, unsigned int);
>  _PRINT_FUNC(bool, bool);
>  _PRINT_FUNC(null, const char*);
>  _PRINT_FUNC(string, const char*);
> -_PRINT_FUNC(uint, uint64_t);
> +_PRINT_FUNC(int32, int32_t);
> +_PRINT_FUNC(uint32, uint32_t);
> +_PRINT_FUNC(int64, int64_t);
> +_PRINT_FUNC(uint64, uint64_t);
>  _PRINT_FUNC(hu, unsigned short);
>  _PRINT_FUNC(hex, unsigned int);
>  _PRINT_FUNC(0xhex, unsigned int);
> diff --git a/lib/json_print.c b/lib/json_print.c
> index bda72933..1194a6ec 100644
> --- a/lib/json_print.c
> +++ b/lib/json_print.c
> @@ -116,8 +116,12 @@ void close_json_array(enum output_type type, const char 
> *str)
>   }   \
>   }
>  _PRINT_FUNC(int, int);
> +_PRINT_FUNC(uint, unsigned int);
>  _PRINT_FUNC(hu, unsigned short);
> -_PRINT_FUNC(uint, uint64_t);
> +_PRINT_FUNC(int32, int32_t);
> +_PRINT_FUNC(uint32, uint32_t);
> +_PRINT_FUNC(int64, int64_t);
> +_PRINT_FUNC(uint64, uint64_t);
>  _PRINT_FUNC(lluint, unsigned long long int);
>  _PRINT_FUNC(float, double);
>  #undef _PRINT_FUNC

You sent patches to both trees. That is not the correct protocol.
Choose one, get it reviewed.  iproute2-next will get merged from master (in fact
dave should be doing it regularly).


Re: [PATCH iproute2] json_print: fix print_uint hidden type promotion

2018-03-29 Thread Stephen Hemminger
On Thu, 29 Mar 2018 20:22:20 +0100
Kevin Darbyshire-Bryant  wrote:

> print_int used 'int' type internally, whereas print_uint used 'uint64_t'
> 
> These helper functions eventually call vfprintf(fp, fmt, args) which is
> a variable argument list function and is dependent upon 'fmt' containing
> correct information about the length of the passed arguments.
> 
> Unfortunately print_int v print_uint offered no clue to the programmer
> that internally passed ints to print_uint were being promoted to 64bits,
> thus the format passed in 'fmt' string vs the actual passed integer
> could be different lengths.  This is even more interesting on big endian
> architectures where 'vfprintf' would be looking in the middle of an
> int64 type.
> 
> print_u/int now stick with native int size.
> 
> Signed-off-by: Kevin Darbyshire-Bryant 
> ---
>  include/json_print.h | 2 +-
>  lib/json_print.c | 2 +-
>  2 files changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/include/json_print.h b/include/json_print.h
> index 2ca7830a..45bc653d 100644
> --- a/include/json_print.h
> +++ b/include/json_print.h
> @@ -56,10 +56,10 @@ void close_json_array(enum output_type type, const char 
> *delim);
>   print_color_##type_name(t, COLOR_NONE, key, fmt, value);
> \
>   }
>  _PRINT_FUNC(int, int);
> +_PRINT_FUNC(uint, unsigned int);
>  _PRINT_FUNC(bool, bool);
>  _PRINT_FUNC(null, const char*);
>  _PRINT_FUNC(string, const char*);
> -_PRINT_FUNC(uint, uint64_t);
>  _PRINT_FUNC(hu, unsigned short);
>  _PRINT_FUNC(hex, unsigned int);
>  _PRINT_FUNC(0xhex, unsigned int);
> diff --git a/lib/json_print.c b/lib/json_print.c
> index 6518ba98..8d54d1d4 100644
> --- a/lib/json_print.c
> +++ b/lib/json_print.c
> @@ -117,8 +117,8 @@ void close_json_array(enum output_type type, const char 
> *str)
>   }   \
>   }
>  _PRINT_FUNC(int, int);
> +_PRINT_FUNC(uint, unsigned int);
>  _PRINT_FUNC(hu, unsigned short);
> -_PRINT_FUNC(uint, uint64_t);
>  _PRINT_FUNC(lluint, unsigned long long int);
>  _PRINT_FUNC(float, double);
>  #undef _PRINT_FUNC


I am concerned that this will break output of 64 bit statistics on 32 bit hosts.


aio poll and a new in-kernel poll API V8

2018-03-29 Thread Christoph Hellwig
Hi all,

this series adds support for the IOCB_CMD_POLL operation to poll for the
readiness of file descriptors using the aio subsystem.  The API is based
on patches that existed in RHAS2.1 and RHEL3, which means it already is
supported by libaio.  To implement the poll support efficiently new
methods to poll are introduced in struct file_operations:  get_poll_head
and poll_mask.  The first one returns a wait_queue_head to wait on
(lifetime is bound by the file), and the second does a non-blocking
check for the POLL* events.  This allows aio poll to work without
any additional context switches, unlike epoll.

This series sits on top of the aio-fsync series that also includes
support for io_pgetevents.

The changes were sponsored by Scylladb, and improve performance
of the seastar framework up to 10%, while also removing the need
for a privileged SCHED_FIFO epoll listener thread.

git://git.infradead.org/users/hch/vfs.git aio-poll.8

Gitweb:

http://git.infradead.org/users/hch/vfs.git/shortlog/refs/heads/aio-poll.8

Libaio changes:

https://pagure.io/libaio.git io-poll

Seastar changes (not updated for the new io_pgetevens ABI yet):

https://github.com/avikivity/seastar/commits/aio

Changes since V8:
 - make delayed cancellation safe and unconditional

Changes since V7:
 - reworked cancellation

Changes since V6:
 - small changelog updates
 - rebased on top of the aio-fsync changes

Changes since V4:
 - rebased ontop of Linux 4.16-rc4

Changes since V3:
 - remove the pre-sleep ->poll_mask call in vfs_poll,
   allow ->get_poll_head to return POLL* values.

Changes since V2:
 - removed a double initialization
 - new vfs_get_poll_head helper
 - document that ->get_poll_head can return NULL
 - call ->poll_mask before sleeping
 - various ACKs
 - add conversion of random to ->poll_mask
 - add conversion of af_alg to ->poll_mask
 - lacking ->poll_mask support now returns -EINVAL for IOCB_CMD_POLL
 - reshuffled the series so that prep patches and everything not
   requiring the new in-kernel poll API is in the beginning

Changes since V1:
 - handle the NULL ->poll case in vfs_poll
 - dropped the file argument to the ->poll_mask socket operation
 - replace the ->pre_poll socket operation with ->get_poll_head as
   in the file operations


[PATCH 02/30] fs: cleanup do_pollfd

2018-03-29 Thread Christoph Hellwig
Use straightline code with failure handling gotos instead of a lot
of nested conditionals.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Greg Kroah-Hartman 
Reviewed-by: Darrick J. Wong 
---
 fs/select.c | 48 +++-
 1 file changed, 23 insertions(+), 25 deletions(-)

diff --git a/fs/select.c b/fs/select.c
index 686de7b3a1db..c6c504a814f9 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -806,34 +806,32 @@ static inline __poll_t do_pollfd(struct pollfd *pollfd, 
poll_table *pwait,
 bool *can_busy_poll,
 __poll_t busy_flag)
 {
-   __poll_t mask;
-   int fd;
-
-   mask = 0;
-   fd = pollfd->fd;
-   if (fd >= 0) {
-   struct fd f = fdget(fd);
-   mask = EPOLLNVAL;
-   if (f.file) {
-   /* userland u16 ->events contains POLL... bitmap */
-   __poll_t filter = demangle_poll(pollfd->events) |
-   EPOLLERR | EPOLLHUP;
-   mask = DEFAULT_POLLMASK;
-   if (f.file->f_op->poll) {
-   pwait->_key = filter;
-   pwait->_key |= busy_flag;
-   mask = f.file->f_op->poll(f.file, pwait);
-   if (mask & busy_flag)
-   *can_busy_poll = true;
-   }
-   /* Mask out unneeded events. */
-   mask &= filter;
-   fdput(f);
-   }
+   int fd = pollfd->fd;
+   __poll_t mask = 0, filter;
+   struct fd f;
+
+   if (fd < 0)
+   goto out;
+   mask = EPOLLNVAL;
+   f = fdget(fd);
+   if (!f.file)
+   goto out;
+
+   /* userland u16 ->events contains POLL... bitmap */
+   filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP;
+   mask = DEFAULT_POLLMASK;
+   if (f.file->f_op->poll) {
+   pwait->_key = filter | busy_flag;
+   mask = f.file->f_op->poll(f.file, pwait);
+   if (mask & busy_flag)
+   *can_busy_poll = true;
}
+   mask &= filter; /* Mask out unneeded events. */
+   fdput(f);
+
+out:
/* ... and so does ->revents */
pollfd->revents = mangle_poll(mask);
-
return mask;
 }
 
-- 
2.14.2



[PATCH 04/30] fs: add new vfs_poll and file_can_poll helpers

2018-03-29 Thread Christoph Hellwig
These abstract out calls to the poll method in preparation for changes
in how we poll.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Greg Kroah-Hartman 
Reviewed-by: Darrick J. Wong 
---
 drivers/staging/comedi/drivers/serial2002.c |  4 ++--
 drivers/vfio/virqfd.c   |  2 +-
 drivers/vhost/vhost.c   |  2 +-
 fs/eventpoll.c  |  5 ++---
 fs/select.c | 23 ---
 include/linux/poll.h| 12 
 mm/memcontrol.c |  2 +-
 net/9p/trans_fd.c   | 18 --
 virt/kvm/eventfd.c  |  2 +-
 9 files changed, 32 insertions(+), 38 deletions(-)

diff --git a/drivers/staging/comedi/drivers/serial2002.c 
b/drivers/staging/comedi/drivers/serial2002.c
index b3f3b4a201af..5471b2212a62 100644
--- a/drivers/staging/comedi/drivers/serial2002.c
+++ b/drivers/staging/comedi/drivers/serial2002.c
@@ -113,7 +113,7 @@ static void serial2002_tty_read_poll_wait(struct file *f, 
int timeout)
long elapsed;
__poll_t mask;
 
-   mask = f->f_op->poll(f, );
+   mask = vfs_poll(f, );
if (mask & (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN |
EPOLLHUP | EPOLLERR)) {
break;
@@ -136,7 +136,7 @@ static int serial2002_tty_read(struct file *f, int timeout)
 
result = -1;
if (!IS_ERR(f)) {
-   if (f->f_op->poll) {
+   if (file_can_poll(f)) {
serial2002_tty_read_poll_wait(f, timeout);
 
if (kernel_read(f, , 1, ) == 1)
diff --git a/drivers/vfio/virqfd.c b/drivers/vfio/virqfd.c
index 085700f1be10..2a1be859ee71 100644
--- a/drivers/vfio/virqfd.c
+++ b/drivers/vfio/virqfd.c
@@ -166,7 +166,7 @@ int vfio_virqfd_enable(void *opaque,
init_waitqueue_func_entry(>wait, virqfd_wakeup);
init_poll_funcptr(>pt, virqfd_ptable_queue_proc);
 
-   events = irqfd.file->f_op->poll(irqfd.file, >pt);
+   events = vfs_poll(irqfd.file, >pt);
 
/*
 * Check if there was an event already pending on the eventfd
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 1b3e8d2d5c8b..4d27e288bb1d 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -208,7 +208,7 @@ int vhost_poll_start(struct vhost_poll *poll, struct file 
*file)
if (poll->wqh)
return 0;
 
-   mask = file->f_op->poll(file, >table);
+   mask = vfs_poll(file, >table);
if (mask)
vhost_poll_wakeup(>wait, 0, 0, poll_to_key(mask));
if (mask & EPOLLERR) {
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 0f3494ed3ed0..2bebae5a38cf 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -884,8 +884,7 @@ static __poll_t ep_item_poll(const struct epitem *epi, 
poll_table *pt,
 
pt->_key = epi->event.events;
if (!is_file_epoll(epi->ffd.file))
-   return epi->ffd.file->f_op->poll(epi->ffd.file, pt) &
-  epi->event.events;
+   return vfs_poll(epi->ffd.file, pt) & epi->event.events;
 
ep = epi->ffd.file->private_data;
poll_wait(epi->ffd.file, >poll_wait, pt);
@@ -2020,7 +2019,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 
/* The target file descriptor must support poll */
error = -EPERM;
-   if (!tf.file->f_op->poll)
+   if (!file_can_poll(tf.file))
goto error_tgt_fput;
 
/* Check if EPOLLWAKEUP is allowed */
diff --git a/fs/select.c b/fs/select.c
index c6c504a814f9..ba91103707ea 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -502,14 +502,10 @@ static int do_select(int n, fd_set_bits *fds, struct 
timespec64 *end_time)
continue;
f = fdget(i);
if (f.file) {
-   const struct file_operations *f_op;
-   f_op = f.file->f_op;
-   mask = DEFAULT_POLLMASK;
-   if (f_op->poll) {
-   wait_key_set(wait, in, out,
-bit, busy_flag);
-   mask = (*f_op->poll)(f.file, 
wait);
-   }
+   wait_key_set(wait, in, out, bit,
+busy_flag);
+   mask = vfs_poll(f.file, wait);
+
fdput(f);
if ((mask & POLLIN_SET) && (in & bit)) {
res_in 

[PATCH 03/30] fs: update documentation to mention __poll_t and match the code

2018-03-29 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig 
Reviewed-by: Darrick J. Wong 
Reviewed-by: Greg Kroah-Hartman 
---
 Documentation/filesystems/Locking | 2 +-
 Documentation/filesystems/vfs.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/filesystems/Locking 
b/Documentation/filesystems/Locking
index 75d2d57e2c44..220bba28f72b 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -439,7 +439,7 @@ prototypes:
ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
int (*iterate) (struct file *, struct dir_context *);
-   unsigned int (*poll) (struct file *, struct poll_table_struct *);
+   __poll_t (*poll) (struct file *, struct poll_table_struct *);
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
int (*mmap) (struct file *, struct vm_area_struct *);
diff --git a/Documentation/filesystems/vfs.txt 
b/Documentation/filesystems/vfs.txt
index 5fd325df59e2..f608180ad59d 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -856,7 +856,7 @@ struct file_operations {
ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
int (*iterate) (struct file *, struct dir_context *);
-   unsigned int (*poll) (struct file *, struct poll_table_struct *);
+   __poll_t (*poll) (struct file *, struct poll_table_struct *);
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
int (*mmap) (struct file *, struct vm_area_struct *);
-- 
2.14.2



[PATCH 06/30] aio: simplify cancellation

2018-03-29 Thread Christoph Hellwig
With the current aio code there is no need for the magic KIOCB_CANCELLED
value, as a cancelation just kicks the driver to queue the completion
ASAP, with all actual completion handling done in another thread. Given
that both the completion path and cancelation take the context lock there
is no need for magic cmpxchg loops either.  If we remove iocbs from the
active list before calling ->ki_cancel we can also rely on the invariant
that anything found on the list has a ->ki_cancel callback and can be
cancelled, further simplifying the code.

Signed-off-by: Christoph Hellwig 
---
 fs/aio.c | 49 ++---
 1 file changed, 6 insertions(+), 43 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 0df07d399a05..c724f1429176 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -162,19 +162,6 @@ struct fsync_iocb {
booldatasync;
 };
 
-/*
- * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either
- * cancelled or completed (this makes a certain amount of sense because
- * successful cancellation - io_cancel() - does deliver the completion to
- * userspace).
- *
- * And since most things don't implement kiocb cancellation and we'd really 
like
- * kiocb completion to be lockless when possible, we use ki_cancel to
- * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED
- * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel().
- */
-#define KIOCB_CANCELLED((void *) (~0ULL))
-
 struct aio_kiocb {
union {
struct kiocbrw;
@@ -572,27 +559,6 @@ void kiocb_set_cancel_fn(struct kiocb *iocb, 
kiocb_cancel_fn *cancel)
 }
 EXPORT_SYMBOL(kiocb_set_cancel_fn);
 
-static int kiocb_cancel(struct aio_kiocb *kiocb)
-{
-   kiocb_cancel_fn *old, *cancel;
-
-   /*
-* Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it
-* actually has a cancel function, hence the cmpxchg()
-*/
-
-   cancel = READ_ONCE(kiocb->ki_cancel);
-   do {
-   if (!cancel || cancel == KIOCB_CANCELLED)
-   return -EINVAL;
-
-   old = cancel;
-   cancel = cmpxchg(>ki_cancel, old, KIOCB_CANCELLED);
-   } while (cancel != old);
-
-   return cancel(>rw);
-}
-
 static void free_ioctx(struct work_struct *work)
 {
struct kioctx *ctx = container_of(work, struct kioctx, free_work);
@@ -633,9 +599,8 @@ static void free_ioctx_users(struct percpu_ref *ref)
while (!list_empty(>active_reqs)) {
req = list_first_entry(>active_reqs,
   struct aio_kiocb, ki_list);
-
list_del_init(>ki_list);
-   kiocb_cancel(req);
+   req->ki_cancel(>rw);
}
 
spin_unlock_irq(>ctx_lock);
@@ -1837,8 +1802,8 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct 
iocb __user *, iocb,
 {
struct kioctx *ctx;
struct aio_kiocb *kiocb;
+   int ret = -EINVAL;
u32 key;
-   int ret;
 
ret = get_user(key, >aio_key);
if (unlikely(ret))
@@ -1849,13 +1814,11 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, 
struct iocb __user *, iocb,
return -EINVAL;
 
spin_lock_irq(>ctx_lock);
-
kiocb = lookup_kiocb(ctx, iocb, key);
-   if (kiocb)
-   ret = kiocb_cancel(kiocb);
-   else
-   ret = -EINVAL;
-
+   if (kiocb) {
+   list_del_init(>ki_list);
+   ret = kiocb->ki_cancel(>rw);
+   }
spin_unlock_irq(>ctx_lock);
 
if (!ret) {
-- 
2.14.2



[PATCH 07/30] aio: add delayed cancel support

2018-03-29 Thread Christoph Hellwig
The upcoming aio poll support would like to be able to complete the
iocb inline from the cancellation context, but that would cause a
double lock of ctx_lock with the current locking scheme.  Move the
cancelation outside the context lock to avoid this reversal, which
suits the existing usb gadgets users just fine as well (in fact
both unconditionally disable irqs and thus seem broken without
this change).

To make this safe aio_complete needs to check if this call should
complete the iocb.  If it didn't the callers must not release any
other resources.

Signed-off-by: Christoph Hellwig 
---
 fs/aio.c | 60 
 1 file changed, 44 insertions(+), 16 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index c724f1429176..2406644e1ecc 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -177,6 +177,9 @@ struct aio_kiocb {
struct list_headki_list;/* the aio core uses this
 * for cancellation */
 
+   unsigned intflags;  /* protected by ctx->ctx_lock */
+#define AIO_IOCB_CANCELLED (1 << 0)
+
/*
 * If the aio_resfd field of the userspace iocb is not zero,
 * this is the underlying eventfd context to deliver events to.
@@ -543,9 +546,9 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int 
nr_events)
 #define AIO_EVENTS_FIRST_PAGE  ((PAGE_SIZE - sizeof(struct aio_ring)) / 
sizeof(struct io_event))
 #define AIO_EVENTS_OFFSET  (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
 
-void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
+static void __kiocb_set_cancel_fn(struct aio_kiocb *req,
+   kiocb_cancel_fn *cancel)
 {
-   struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, rw);
struct kioctx *ctx = req->ki_ctx;
unsigned long flags;
 
@@ -557,6 +560,12 @@ void kiocb_set_cancel_fn(struct kiocb *iocb, 
kiocb_cancel_fn *cancel)
req->ki_cancel = cancel;
spin_unlock_irqrestore(>ctx_lock, flags);
 }
+
+void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
+{
+   return __kiocb_set_cancel_fn(container_of(iocb, struct aio_kiocb, rw),
+   cancel);
+}
 EXPORT_SYMBOL(kiocb_set_cancel_fn);
 
 static void free_ioctx(struct work_struct *work)
@@ -593,18 +602,23 @@ static void free_ioctx_users(struct percpu_ref *ref)
 {
struct kioctx *ctx = container_of(ref, struct kioctx, users);
struct aio_kiocb *req;
+   LIST_HEAD(list);
 
spin_lock_irq(>ctx_lock);
-
while (!list_empty(>active_reqs)) {
req = list_first_entry(>active_reqs,
   struct aio_kiocb, ki_list);
+   req->flags |= AIO_IOCB_CANCELLED;
+   list_move_tail(>ki_list, );
+   }
+   spin_unlock_irq(>ctx_lock);
+
+   while (!list_empty()) {
+   req = list_first_entry(, struct aio_kiocb, ki_list);
list_del_init(>ki_list);
req->ki_cancel(>rw);
}
 
-   spin_unlock_irq(>ctx_lock);
-
percpu_ref_kill(>reqs);
percpu_ref_put(>reqs);
 }
@@ -1040,22 +1054,30 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
return ret;
 }
 
+#define AIO_COMPLETE_CANCEL(1 << 0)
+
 /* aio_complete
  * Called when the io request on the given iocb is complete.
  */
-static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
+static bool aio_complete(struct aio_kiocb *iocb, long res, long res2,
+   unsigned complete_flags)
 {
struct kioctx   *ctx = iocb->ki_ctx;
struct aio_ring *ring;
struct io_event *ev_page, *event;
unsigned tail, pos, head;
-   unsigned long   flags;
-
-   if (!list_empty_careful(>ki_list)) {
-   unsigned long flags;
+   unsigned long flags;
 
+   if (iocb->ki_cancel) {
spin_lock_irqsave(>ctx_lock, flags);
-   list_del(>ki_list);
+   if (!(complete_flags & AIO_COMPLETE_CANCEL) &&
+   (iocb->flags & AIO_IOCB_CANCELLED)) {
+   spin_unlock_irqrestore(>ctx_lock, flags);
+   return false;
+   }
+
+   if (!list_empty(>ki_list))
+   list_del(>ki_list);
spin_unlock_irqrestore(>ctx_lock, flags);
}
 
@@ -1131,6 +1153,7 @@ static void aio_complete(struct aio_kiocb *iocb, long 
res, long res2)
wake_up(>wait);
 
percpu_ref_put(>reqs);
+   return true;
 }
 
 /* aio_read_events_ring
@@ -1379,6 +1402,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
 static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
 {
struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, rw);
+   struct file *file = kiocb->ki_filp;
 
if (kiocb->ki_flags & IOCB_WRITE) {
struct inode *inode = 

[PATCH net-next 00/11] mlxsw: Various cleanups

2018-03-29 Thread Ido Schimmel
Hi,

The first 10 patches from Jiri perform small and unrelated cleanups. The
largest being the conversion of the KVD linear partitions from a list to
an array, which simplifies the code.

The last patch from Petr is a bug fix for a recent net-next commit that
prevented the "kvd" resource from being marked as the parent of its
various child resources (e.g., "/kvd/linear").

Jiri Pirko (10):
  mlxsw: spectrum_acl: Fix flex actions header ifndef define construct
  mlxsw: spectrum_kvdl: Fix handling of resource_size_param
  mlxsw: Constify devlink_resource_ops
  mlxsw: spectrum: Change KVD linear parts from list to array
  mlxsw: remove kvd_hash_granularity from config profile struct
  mlxsw: core: Fix arg name of MLXSW_CORE_RES_VALID and
MLXSW_CORE_RES_GET
  mlxsw: Move "used_kvd_sizes" check to mlxsw_pci_config_profile
  mlxsw: Move "resources_query_enable" out of mlxsw_config_profile
  devlink: convert occ_get op to separate registration
  mlxsw: spectrum: Pass mlxsw_core as arg of
mlxsw_sp_kvdl_resources_register()

Petr Machata (1):
  mlxsw: spectrum: Don't use resource ID of 0

 drivers/net/ethernet/mellanox/mlxsw/core.c |   5 +-
 drivers/net/ethernet/mellanox/mlxsw/core.h |  14 +-
 drivers/net/ethernet/mellanox/mlxsw/pci.c  |  11 +-
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c |  38 +--
 drivers/net/ethernet/mellanox/mlxsw/spectrum.h |   5 +-
 .../mellanox/mlxsw/spectrum_acl_flex_actions.h |   4 +-
 .../net/ethernet/mellanox/mlxsw/spectrum_kvdl.c| 339 +
 drivers/net/ethernet/mellanox/mlxsw/switchib.c |   1 -
 drivers/net/ethernet/mellanox/mlxsw/switchx2.c |   1 -
 include/net/devlink.h  |  40 ++-
 net/core/devlink.c |  74 -
 11 files changed, 258 insertions(+), 274 deletions(-)

-- 
2.14.3



[PATCH net-next 02/11] mlxsw: spectrum_kvdl: Fix handling of resource_size_param

2018-03-29 Thread Ido Schimmel
From: Jiri Pirko 

Current code uses global variables, adjusts them and passes pointer down
to devlink. With every other mlxsw_core instance, the previously passed
pointer values are rewritten. Fix this by de-globalize the variables.

Fixes: 7f47b19bd744 ("mlxsw: spectrum_kvdl: Add support for per part occupancy")
Signed-off-by: Jiri Pirko 
Acked-by: Arkadi Sharshevsky 
Signed-off-by: Ido Schimmel 
---
 .../net/ethernet/mellanox/mlxsw/spectrum_kvdl.c| 47 +++---
 1 file changed, 14 insertions(+), 33 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c 
b/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
index 85503e93b93f..9e61518c4945 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
@@ -420,67 +420,48 @@ static struct devlink_resource_ops 
mlxsw_sp_kvdl_chunks_large_ops = {
.occ_get = mlxsw_sp_kvdl_large_chunks_occ_get,
 };
 
-static struct devlink_resource_size_params mlxsw_sp_kvdl_single_size_params = {
-   .size_min = 0,
-   .size_granularity = 1,
-   .unit = DEVLINK_RESOURCE_UNIT_ENTRY,
-};
-
-static struct devlink_resource_size_params mlxsw_sp_kvdl_chunks_size_params = {
-   .size_min = 0,
-   .size_granularity = MLXSW_SP_CHUNK_MAX,
-   .unit = DEVLINK_RESOURCE_UNIT_ENTRY,
-};
-
-static struct devlink_resource_size_params 
mlxsw_sp_kvdl_large_chunks_size_params = {
-   .size_min = 0,
-   .size_granularity = MLXSW_SP_LARGE_CHUNK_MAX,
-   .unit = DEVLINK_RESOURCE_UNIT_ENTRY,
-};
-
-static void
-mlxsw_sp_kvdl_resource_size_params_prepare(struct devlink *devlink)
+int mlxsw_sp_kvdl_resources_register(struct devlink *devlink)
 {
struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
+   static struct devlink_resource_size_params size_params;
u32 kvdl_max_size;
+   int err;
 
kvdl_max_size = MLXSW_CORE_RES_GET(mlxsw_core, KVD_SIZE) -
MLXSW_CORE_RES_GET(mlxsw_core, KVD_SINGLE_MIN_SIZE) -
MLXSW_CORE_RES_GET(mlxsw_core, KVD_DOUBLE_MIN_SIZE);
 
-   mlxsw_sp_kvdl_single_size_params.size_max = kvdl_max_size;
-   mlxsw_sp_kvdl_chunks_size_params.size_max = kvdl_max_size;
-   mlxsw_sp_kvdl_large_chunks_size_params.size_max = kvdl_max_size;
-}
-
-int mlxsw_sp_kvdl_resources_register(struct devlink *devlink)
-{
-   int err;
-
-   mlxsw_sp_kvdl_resource_size_params_prepare(devlink);
+   devlink_resource_size_params_init(&size_params, 0, kvdl_max_size, 1,
+ DEVLINK_RESOURCE_UNIT_ENTRY);
err = devlink_resource_register(devlink, 
MLXSW_SP_RESOURCE_NAME_KVD_LINEAR_SINGLES,
MLXSW_SP_KVDL_SINGLE_SIZE,
MLXSW_SP_RESOURCE_KVD_LINEAR_SINGLE,
MLXSW_SP_RESOURCE_KVD_LINEAR,
-   &mlxsw_sp_kvdl_single_size_params,
+   &size_params,
_sp_kvdl_single_ops);
if (err)
return err;
 
+   devlink_resource_size_params_init(&size_params, 0, kvdl_max_size,
+ MLXSW_SP_CHUNK_MAX,
+ DEVLINK_RESOURCE_UNIT_ENTRY);
err = devlink_resource_register(devlink, 
MLXSW_SP_RESOURCE_NAME_KVD_LINEAR_CHUNKS,
MLXSW_SP_KVDL_CHUNKS_SIZE,
MLXSW_SP_RESOURCE_KVD_LINEAR_CHUNKS,
MLXSW_SP_RESOURCE_KVD_LINEAR,
-   &mlxsw_sp_kvdl_chunks_size_params,
+   &size_params,
_sp_kvdl_chunks_ops);
if (err)
return err;
 
+   devlink_resource_size_params_init(&size_params, 0, kvdl_max_size,
+ MLXSW_SP_LARGE_CHUNK_MAX,
+ DEVLINK_RESOURCE_UNIT_ENTRY);
err = devlink_resource_register(devlink, 
MLXSW_SP_RESOURCE_NAME_KVD_LINEAR_LARGE_CHUNKS,
MLXSW_SP_KVDL_LARGE_CHUNKS_SIZE,

MLXSW_SP_RESOURCE_KVD_LINEAR_LARGE_CHUNKS,
MLXSW_SP_RESOURCE_KVD_LINEAR,
-   &mlxsw_sp_kvdl_large_chunks_size_params,
+   &size_params,
_sp_kvdl_chunks_large_ops);
return err;
 }
-- 
2.14.3



[PATCH 08/30] aio: implement IOCB_CMD_POLL

2018-03-29 Thread Christoph Hellwig
Simple one-shot poll through the io_submit() interface.  To poll for
a file descriptor the application should submit an iocb of type
IOCB_CMD_POLL.  It will poll the fd for the events specified in the
the first 32 bits of the aio_buf field of the iocb.

Unlike poll or epoll without EPOLLONESHOT this interface always works
in one shot mode, that is once the iocb is completed, it will have to be
resubmitted.

Signed-off-by: Christoph Hellwig 
Acked-by: Jeff Moyer 
Reviewed-by: Greg Kroah-Hartman 
Reviewed-by: Darrick J. Wong 
---
 fs/aio.c | 97 +++-
 include/uapi/linux/aio_abi.h |  6 +--
 2 files changed, 98 insertions(+), 5 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 2406644e1ecc..e61e04b01f50 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -5,6 +5,7 @@
  * Implements an efficient asynchronous io interface.
  *
  * Copyright 2000, 2001, 2002 Red Hat, Inc.  All Rights Reserved.
+ * Copyright 2018 Christoph Hellwig.
  *
  * See ../COPYING for licensing terms.
  */
@@ -162,10 +163,18 @@ struct fsync_iocb {
booldatasync;
 };
 
+struct poll_iocb {
+   struct file *file;
+   __poll_tevents;
+   struct wait_queue_head  *head;
+   struct wait_queue_entry wait;
+};
+
 struct aio_kiocb {
union {
struct kiocbrw;
struct fsync_iocb   fsync;
+   struct poll_iocbpoll;
};
 
struct kioctx   *ki_ctx;
@@ -1577,7 +1586,6 @@ static int aio_fsync(struct fsync_iocb *req, struct iocb 
*iocb, bool datasync)
return -EINVAL;
if (iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags)
return -EINVAL;
-
req->file = fget(iocb->aio_fildes);
if (unlikely(!req->file))
return -EBADF;
@@ -1596,6 +1604,91 @@ static int aio_fsync(struct fsync_iocb *req, struct iocb 
*iocb, bool datasync)
return ret;
 }
 
+static void aio_complete_poll(struct poll_iocb *req, __poll_t mask)
+{
+   struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
+   struct file *file = req->file;
+
+   if (aio_complete(iocb, mangle_poll(mask), 0, 0))
+   fput(file);
+}
+
+static int aio_poll_cancel(struct kiocb *rw)
+{
+   struct aio_kiocb *iocb = container_of(rw, struct aio_kiocb, rw);
+   struct file *file = iocb->poll.file;
+
+   remove_wait_queue(iocb->poll.head, &iocb->poll.wait);
+   if (aio_complete(iocb, 0, 0, AIO_COMPLETE_CANCEL))
+   fput(file);
+   return 0;
+}
+
+static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int 
sync,
+   void *key)
+{
+   struct poll_iocb *req = container_of(wait, struct poll_iocb, wait);
+   struct file *file = req->file;
+   __poll_t mask = key_to_poll(key);
+
+   assert_spin_locked(&req->head->lock);
+
+   /* for instances that support it check for an event match first: */
+   if (mask && !(mask & req->events))
+   return 0;
+
+   mask = vfs_poll_mask(file, req->events);
+   if (!mask)
+   return 0;
+
+   __remove_wait_queue(req->head, &req->wait);
+   aio_complete_poll(req, mask);
+   return 1;
+}
+
+static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb)
+{
+   struct poll_iocb *req = &aiocb->poll;
+   unsigned long flags;
+   __poll_t mask;
+
+   /* reject any unknown events outside the normal event mask. */
+   if ((u16)iocb->aio_buf != iocb->aio_buf)
+   return -EINVAL;
+   /* reject fields that are not defined for poll */
+   if (iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags)
+   return -EINVAL;
+
+   req->events = demangle_poll(iocb->aio_buf) | POLLERR | POLLHUP;
+   req->file = fget(iocb->aio_fildes);
+   if (unlikely(!req->file))
+   return -EBADF;
+
+   req->head = vfs_get_poll_head(req->file, req->events);
+   if (!req->head) {
+   fput(req->file);
+   return -EINVAL; /* same as no support for IOCB_CMD_POLL */
+   }
+   if (IS_ERR(req->head)) {
+   mask = PTR_TO_POLL(req->head);
+   goto done;
+   }
+
+   init_waitqueue_func_entry(&req->wait, aio_poll_wake);
+
+   spin_lock_irqsave(&req->head->lock, flags);
+   mask = vfs_poll_mask(req->file, req->events);
+   if (!mask) {
+   __kiocb_set_cancel_fn(aiocb, aio_poll_cancel);
+   __add_wait_queue(req->head, &req->wait);
+   }
+   spin_unlock_irqrestore(&req->head->lock, flags);
+done:
+   if (mask)
+   aio_complete_poll(req, mask);
+   return -EIOCBQUEUED;
+}
+
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 struct iocb *iocb, bool compat)
 {
@@ -1664,6 

[PATCH net-next 01/11] mlxsw: spectrum_acl: Fix flex actions header ifndef define construct

2018-03-29 Thread Ido Schimmel
From: Jiri Pirko 

Fix copy error in flex actions header ifndef define construct

Signed-off-by: Jiri Pirko 
Signed-off-by: Ido Schimmel 
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.h 
b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.h
index 2726192836ad..bd6d552d95b9 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.h
@@ -33,8 +33,8 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef _MLXSW_SPECTRUM_ACL_FLEX_KEYS_H
-#define _MLXSW_SPECTRUM_ACL_FLEX_KEYS_H
+#ifndef _MLXSW_SPECTRUM_ACL_FLEX_ACTIONS_H
+#define _MLXSW_SPECTRUM_ACL_FLEX_ACTIONS_H
 
 #include "spectrum.h"
 
-- 
2.14.3



[PATCH net-next 04/11] mlxsw: spectrum: Change KVD linear parts from list to array

2018-03-29 Thread Ido Schimmel
From: Jiri Pirko 

The parts info is array. The parts copy this info array, yet they are a
list. So make the indexing according to the id and change the list of
parts into array of parts. This helps to eliminate lookups and
constructs like mlxsw_sp_kvdl_part_update() (took me some non-trivial
time to figure out what is going on there).
Alongside with that, introduce a helper macro to define the parts infos.

Signed-off-by: Jiri Pirko 
Signed-off-by: Ido Schimmel 
---
 .../net/ethernet/mellanox/mlxsw/spectrum_kvdl.c| 235 -
 1 file changed, 92 insertions(+), 143 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c 
b/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
index 201825c0019b..7b28f65d6407 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
@@ -55,24 +55,47 @@
 #define MLXSW_SP_KVDL_LARGE_CHUNKS_END \
(MLXSW_SP_KVDL_LARGE_CHUNKS_SIZE + MLXSW_SP_KVDL_LARGE_CHUNKS_BASE - 1)
 
-#define MLXSW_SP_CHUNK_MAX 32
-#define MLXSW_SP_LARGE_CHUNK_MAX 512
+#define MLXSW_SP_KVDL_SINGLE_ALLOC_SIZE 1
+#define MLXSW_SP_KVDL_CHUNKS_ALLOC_SIZE 32
+#define MLXSW_SP_KVDL_LARGE_CHUNKS_ALLOC_SIZE 512
 
 struct mlxsw_sp_kvdl_part_info {
unsigned int part_index;
unsigned int start_index;
unsigned int end_index;
unsigned int alloc_size;
+   enum mlxsw_sp_resource_id resource_id;
+};
+
+enum mlxsw_sp_kvdl_part_id {
+   MLXSW_SP_KVDL_PART_ID_SINGLE,
+   MLXSW_SP_KVDL_PART_ID_CHUNKS,
+   MLXSW_SP_KVDL_PART_ID_LARGE_CHUNKS,
 };
 
+#define MLXSW_SP_KVDL_PART_INFO(id)\
+[MLXSW_SP_KVDL_PART_ID_##id] = {   \
+   .start_index = MLXSW_SP_KVDL_##id##_BASE,   \
+   .end_index = MLXSW_SP_KVDL_##id##_END,  \
+   .alloc_size = MLXSW_SP_KVDL_##id##_ALLOC_SIZE,  \
+   .resource_id = MLXSW_SP_RESOURCE_KVD_LINEAR_##id,   \
+}
+
+static const struct mlxsw_sp_kvdl_part_info mlxsw_sp_kvdl_parts_info[] = {
+   MLXSW_SP_KVDL_PART_INFO(SINGLE),
+   MLXSW_SP_KVDL_PART_INFO(CHUNKS),
+   MLXSW_SP_KVDL_PART_INFO(LARGE_CHUNKS),
+};
+
+#define MLXSW_SP_KVDL_PARTS_INFO_LEN ARRAY_SIZE(mlxsw_sp_kvdl_parts_info)
+
 struct mlxsw_sp_kvdl_part {
-   struct list_head list;
-   struct mlxsw_sp_kvdl_part_info *info;
+   struct mlxsw_sp_kvdl_part_info info;
unsigned long usage[0]; /* Entries */
 };
 
 struct mlxsw_sp_kvdl {
-   struct list_head parts_list;
+   struct mlxsw_sp_kvdl_part *parts[MLXSW_SP_KVDL_PARTS_INFO_LEN];
 };
 
 static struct mlxsw_sp_kvdl_part *
@@ -80,11 +103,13 @@ mlxsw_sp_kvdl_alloc_size_part(struct mlxsw_sp_kvdl *kvdl,
  unsigned int alloc_size)
 {
struct mlxsw_sp_kvdl_part *part, *min_part = NULL;
+   int i;
 
-   list_for_each_entry(part, &kvdl->parts_list, list) {
-   if (alloc_size <= part->info->alloc_size &&
+   for (i = 0; i < MLXSW_SP_KVDL_PARTS_INFO_LEN; i++) {
+   part = kvdl->parts[i];
+   if (alloc_size <= part->info.alloc_size &&
(!min_part ||
-part->info->alloc_size <= min_part->info->alloc_size))
+part->info.alloc_size <= min_part->info.alloc_size))
min_part = part;
}
 
@@ -95,10 +120,12 @@ static struct mlxsw_sp_kvdl_part *
 mlxsw_sp_kvdl_index_part(struct mlxsw_sp_kvdl *kvdl, u32 kvdl_index)
 {
struct mlxsw_sp_kvdl_part *part;
+   int i;
 
-   list_for_each_entry(part, &kvdl->parts_list, list) {
-   if (kvdl_index >= part->info->start_index &&
-   kvdl_index <= part->info->end_index)
+   for (i = 0; i < MLXSW_SP_KVDL_PARTS_INFO_LEN; i++) {
+   part = kvdl->parts[i];
+   if (kvdl_index >= part->info.start_index &&
+   kvdl_index <= part->info.end_index)
return part;
}
 
@@ -122,7 +149,7 @@ mlxsw_sp_kvdl_index_entry_index(const struct 
mlxsw_sp_kvdl_part_info *info,
 static int mlxsw_sp_kvdl_part_alloc(struct mlxsw_sp_kvdl_part *part,
u32 *p_kvdl_index)
 {
-   const struct mlxsw_sp_kvdl_part_info *info = part->info;
+   const struct mlxsw_sp_kvdl_part_info *info = >info;
unsigned int entry_index, nr_entries;
 
nr_entries = (info->end_index - info->start_index + 1) /
@@ -132,8 +159,7 @@ static int mlxsw_sp_kvdl_part_alloc(struct 
mlxsw_sp_kvdl_part *part,
return -ENOBUFS;
__set_bit(entry_index, part->usage);
 
-   *p_kvdl_index = mlxsw_sp_entry_index_kvdl_index(part->info,
-   entry_index);
+   *p_kvdl_index = mlxsw_sp_entry_index_kvdl_index(info, entry_index);
 
return 0;
 }
@@ -141,10 +167,10 @@ static int 

[PATCH 09/30] net: refactor socket_poll

2018-03-29 Thread Christoph Hellwig
Factor out two busy poll related helpers for late reuse, and remove
a command that isn't very helpful, especially with the __poll_t
annotations in place.

Signed-off-by: Christoph Hellwig 
---
 include/net/busy_poll.h | 15 +++
 net/socket.c| 21 -
 2 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index 71c72a939bf8..c5187438af38 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -121,6 +121,21 @@ static inline void sk_busy_loop(struct sock *sk, int 
nonblock)
 #endif
 }
 
+static inline void sock_poll_busy_loop(struct socket *sock, __poll_t events)
+{
+   if (sk_can_busy_loop(sock->sk) &&
+   events && (events & POLL_BUSY_LOOP)) {
+   /* once, only if requested by syscall */
+   sk_busy_loop(sock->sk, 1);
+   }
+}
+
+/* if this socket can poll_ll, tell the system call */
+static inline __poll_t sock_poll_busy_flag(struct socket *sock)
+{
+   return sk_can_busy_loop(sock->sk) ? POLL_BUSY_LOOP : 0;
+}
+
 /* used in the NIC receive handler to mark the skb */
 static inline void skb_mark_napi_id(struct sk_buff *skb,
struct napi_struct *napi)
diff --git a/net/socket.c b/net/socket.c
index a93c99b518ca..3f859a07641a 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1117,24 +1117,11 @@ EXPORT_SYMBOL(sock_create_lite);
 /* No kernel lock held - perfect */
 static __poll_t sock_poll(struct file *file, poll_table *wait)
 {
-   __poll_t busy_flag = 0;
-   struct socket *sock;
-
-   /*
-*  We can't return errors to poll, so it's either yes or no.
-*/
-   sock = file->private_data;
-
-   if (sk_can_busy_loop(sock->sk)) {
-   /* this socket can poll_ll so tell the system call */
-   busy_flag = POLL_BUSY_LOOP;
-
-   /* once, only if requested by syscall */
-   if (wait && (wait->_key & POLL_BUSY_LOOP))
-   sk_busy_loop(sock->sk, 1);
-   }
+   struct socket *sock = file->private_data;
+   __poll_t events = poll_requested_events(wait);
 
-   return busy_flag | sock->ops->poll(file, sock, wait);
+   sock_poll_busy_loop(sock, events);
+   return sock->ops->poll(file, sock, wait) | sock_poll_busy_flag(sock);
 }
 
 static int sock_mmap(struct file *file, struct vm_area_struct *vma)
-- 
2.14.2



[PATCH net-next 06/11] mlxsw: core: Fix arg name of MLXSW_CORE_RES_VALID and MLXSW_CORE_RES_GET

2018-03-29 Thread Ido Schimmel
From: Jiri Pirko 

First arg of these helpers should be "mlxsw_core".

Signed-off-by: Jiri Pirko 
Signed-off-by: Ido Schimmel 
---
 drivers/net/ethernet/mellanox/mlxsw/core.h | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h 
b/drivers/net/ethernet/mellanox/mlxsw/core.h
index fd30eaf40475..0d6452699364 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.h
@@ -325,14 +325,14 @@ int mlxsw_core_kvd_sizes_get(struct mlxsw_core 
*mlxsw_core,
 bool mlxsw_core_res_valid(struct mlxsw_core *mlxsw_core,
  enum mlxsw_res_id res_id);
 
-#define MLXSW_CORE_RES_VALID(res, short_res_id)\
-   mlxsw_core_res_valid(res, MLXSW_RES_ID_##short_res_id)
+#define MLXSW_CORE_RES_VALID(mlxsw_core, short_res_id) \
+   mlxsw_core_res_valid(mlxsw_core, MLXSW_RES_ID_##short_res_id)
 
 u64 mlxsw_core_res_get(struct mlxsw_core *mlxsw_core,
   enum mlxsw_res_id res_id);
 
-#define MLXSW_CORE_RES_GET(res, short_res_id)  \
-   mlxsw_core_res_get(res, MLXSW_RES_ID_##short_res_id)
+#define MLXSW_CORE_RES_GET(mlxsw_core, short_res_id)   \
+   mlxsw_core_res_get(mlxsw_core, MLXSW_RES_ID_##short_res_id)
 
 #define MLXSW_BUS_F_TXRX   BIT(0)
 
-- 
2.14.3



[PATCH net-next 05/11] mlxsw: remove kvd_hash_granularity from config profile struct

2018-03-29 Thread Ido Schimmel
From: Jiri Pirko 

This should not be part of the struct, as the struct fields
are tightly coupled with the FW command payload of the same name.
Just use the "granularity" define directly, as in other places.

Signed-off-by: Jiri Pirko 
Signed-off-by: Ido Schimmel 
---
 drivers/net/ethernet/mellanox/mlxsw/core.h | 1 -
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 5 ++---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h 
b/drivers/net/ethernet/mellanox/mlxsw/core.h
index 5ddafd74dc00..fd30eaf40475 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.h
@@ -256,7 +256,6 @@ struct mlxsw_config_profile {
u16 adaptive_routing_group_cap;
u8  arn;
u32 kvd_linear_size;
-   u16 kvd_hash_granularity;
u8  kvd_hash_single_parts;
u8  kvd_hash_double_parts;
u8  resource_query_enable;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c 
b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 0e9ed41ce8bc..d503cdbeae29 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -3794,7 +3794,6 @@ static const struct mlxsw_config_profile 
mlxsw_sp_config_profile = {
.used_max_pkey  = 1,
.max_pkey   = 0,
.used_kvd_split_data= 1,
-   .kvd_hash_granularity   = MLXSW_SP_KVD_GRANULARITY,
.kvd_hash_single_parts  = 59,
.kvd_hash_double_parts  = 41,
.kvd_linear_size= MLXSW_SP_KVD_LINEAR_SIZE,
@@ -3902,7 +3901,7 @@ static int mlxsw_sp_resources_register(struct mlxsw_core 
*mlxsw_core)
double_size *= profile->kvd_hash_double_parts;
double_size /= profile->kvd_hash_double_parts +
   profile->kvd_hash_single_parts;
-   double_size = rounddown(double_size, profile->kvd_hash_granularity);
+   double_size = rounddown(double_size, MLXSW_SP_KVD_GRANULARITY);
err = devlink_resource_register(devlink, 
MLXSW_SP_RESOURCE_NAME_KVD_HASH_DOUBLE,
double_size,
MLXSW_SP_RESOURCE_KVD_HASH_DOUBLE,
@@ -3962,7 +3961,7 @@ static int mlxsw_sp_kvd_sizes_get(struct mlxsw_core 
*mlxsw_core,
double_size /= profile->kvd_hash_double_parts +
   profile->kvd_hash_single_parts;
*p_double_size = rounddown(double_size,
-  profile->kvd_hash_granularity);
+  MLXSW_SP_KVD_GRANULARITY);
}
 
err = devlink_resource_size_get(devlink,
-- 
2.14.3



[PATCH net-next 07/11] mlxsw: Move "used_kvd_sizes" check to mlxsw_pci_config_profile

2018-03-29 Thread Ido Schimmel
From: Jiri Pirko 

The check should be done directly in mlxsw_pci_config_profile, as for
other profile items. Also, be consistent in naming with the rest and
rename to "used_kvd_sizes".

Signed-off-by: Jiri Pirko 
Signed-off-by: Ido Schimmel 
---
 drivers/net/ethernet/mellanox/mlxsw/core.h | 3 +--
 drivers/net/ethernet/mellanox/mlxsw/pci.c  | 2 +-
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 5 ++---
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h 
b/drivers/net/ethernet/mellanox/mlxsw/core.h
index 0d6452699364..ff9daa09341d 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.h
@@ -235,8 +235,7 @@ struct mlxsw_config_profile {
used_max_pkey:1,
used_ar_sec:1,
used_adaptive_routing_group_cap:1,
-   used_kvd_split_data:1; /* indicate for the kvd's values */
-
+   used_kvd_sizes:1;
u8  max_vepa_channels;
u16 max_mid;
u16 max_pgt;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c 
b/drivers/net/ethernet/mellanox/mlxsw/pci.c
index e30c6ce3dcb4..5ab068aec033 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/pci.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c
@@ -1164,7 +1164,7 @@ static int mlxsw_pci_config_profile(struct mlxsw_pci 
*mlxsw_pci, char *mbox,
mlxsw_cmd_mbox_config_profile_adaptive_routing_group_cap_set(
mbox, profile->adaptive_routing_group_cap);
}
-   if (MLXSW_RES_VALID(res, KVD_SIZE)) {
+   if (profile->used_kvd_sizes && MLXSW_RES_VALID(res, KVD_SIZE)) {
err = mlxsw_pci_profile_get_kvd_sizes(mlxsw_pci, profile, res);
if (err)
return err;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c 
b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index d503cdbeae29..12062aab13c5 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -3793,7 +3793,7 @@ static const struct mlxsw_config_profile 
mlxsw_sp_config_profile = {
.max_ib_mc  = 0,
.used_max_pkey  = 1,
.max_pkey   = 0,
-   .used_kvd_split_data= 1,
+   .used_kvd_sizes = 1,
.kvd_hash_single_parts  = 59,
.kvd_hash_double_parts  = 41,
.kvd_linear_size= MLXSW_SP_KVD_LINEAR_SIZE,
@@ -3934,8 +3934,7 @@ static int mlxsw_sp_kvd_sizes_get(struct mlxsw_core 
*mlxsw_core,
int err;
 
if (!MLXSW_CORE_RES_VALID(mlxsw_core, KVD_SINGLE_MIN_SIZE) ||
-   !MLXSW_CORE_RES_VALID(mlxsw_core, KVD_DOUBLE_MIN_SIZE) ||
-   !profile->used_kvd_split_data)
+   !MLXSW_CORE_RES_VALID(mlxsw_core, KVD_DOUBLE_MIN_SIZE))
return -EIO;
 
/* The hash part is what left of the kvd without the
-- 
2.14.3



[PATCH 10/30] net: add support for ->poll_mask in proto_ops

2018-03-29 Thread Christoph Hellwig
The socket file operations still implement ->poll until all protocols are
switched over.

Signed-off-by: Christoph Hellwig 
---
 include/linux/net.h |  3 +++
 net/socket.c| 51 ++-
 2 files changed, 49 insertions(+), 5 deletions(-)

diff --git a/include/linux/net.h b/include/linux/net.h
index 91216b16feb7..ce3d4dacb51e 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -147,6 +147,9 @@ struct proto_ops {
int (*getname)   (struct socket *sock,
  struct sockaddr *addr,
  int *sockaddr_len, int peer);
+   struct wait_queue_head *(*get_poll_head)(struct socket *sock,
+ __poll_t events);
+   __poll_t(*poll_mask) (struct socket *sock, __poll_t events);
__poll_t(*poll)  (struct file *file, struct socket *sock,
  struct poll_table_struct *wait);
int (*ioctl) (struct socket *sock, unsigned int cmd,
diff --git a/net/socket.c b/net/socket.c
index 3f859a07641a..ceb69ddcd7bd 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -118,8 +118,10 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct 
iov_iter *from);
 static int sock_mmap(struct file *file, struct vm_area_struct *vma);
 
 static int sock_close(struct inode *inode, struct file *file);
-static __poll_t sock_poll(struct file *file,
- struct poll_table_struct *wait);
+static struct wait_queue_head *sock_get_poll_head(struct file *file,
+   __poll_t events);
+static __poll_t sock_poll_mask(struct file *file, __poll_t);
+static __poll_t sock_poll(struct file *file, struct poll_table_struct *wait);
 static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 #ifdef CONFIG_COMPAT
 static long compat_sock_ioctl(struct file *file,
@@ -142,6 +144,8 @@ static const struct file_operations socket_file_ops = {
.llseek =   no_llseek,
.read_iter =sock_read_iter,
.write_iter =   sock_write_iter,
+   .get_poll_head = sock_get_poll_head,
+   .poll_mask =sock_poll_mask,
.poll = sock_poll,
.unlocked_ioctl = sock_ioctl,
 #ifdef CONFIG_COMPAT
@@ -1114,14 +1118,51 @@ int sock_create_lite(int family, int type, int 
protocol, struct socket **res)
 }
 EXPORT_SYMBOL(sock_create_lite);
 
+static struct wait_queue_head *sock_get_poll_head(struct file *file,
+   __poll_t events)
+{
+   struct socket *sock = file->private_data;
+
+   if (!sock->ops->poll_mask)
+   return NULL;
+   if (sock->ops->get_poll_head)
+   return sock->ops->get_poll_head(sock, events);
+
+   sock_poll_busy_loop(sock, events);
+   return sk_sleep(sock->sk);
+}
+
+static __poll_t sock_poll_mask(struct file *file, __poll_t events)
+{
+   struct socket *sock = file->private_data;
+
+   /*
+* We need to be sure we are in sync with the socket flags modification.
+*
+* This memory barrier is paired in the wq_has_sleeper.
+*/
+   smp_mb();
+
+   /* this socket can poll_ll so tell the system call */
+   return sock->ops->poll_mask(sock, events) |
+   (sk_can_busy_loop(sock->sk) ? POLL_BUSY_LOOP : 0);
+}
+
 /* No kernel lock held - perfect */
 static __poll_t sock_poll(struct file *file, poll_table *wait)
 {
struct socket *sock = file->private_data;
-   __poll_t events = poll_requested_events(wait);
+   __poll_t events = poll_requested_events(wait), mask = 0;
 
-   sock_poll_busy_loop(sock, events);
-   return sock->ops->poll(file, sock, wait) | sock_poll_busy_flag(sock);
+   if (sock->ops->poll) {
+   sock_poll_busy_loop(sock, events);
+   mask = sock->ops->poll(file, sock, wait);
+   } else if (sock->ops->poll_mask) {
+   sock_poll_wait(file, sock_get_poll_head(file, events), wait);
+   mask = sock->ops->poll_mask(sock, events);
+   }
+
+   return mask | sock_poll_busy_flag(sock);
 }
 
 static int sock_mmap(struct file *file, struct vm_area_struct *vma)
-- 
2.14.2



[PATCH net-next 08/11] mlxsw: Move "resources_query_enable" out of mlxsw_config_profile

2018-03-29 Thread Ido Schimmel
From: Jiri Pirko 

As struct mlxsw_config_profile is mapped to the payload of the FW
command of the same name, resources_query_enable flag does not belong
there. Move it to struct mlxsw_driver.

Signed-off-by: Jiri Pirko 
Signed-off-by: Ido Schimmel 
---
 drivers/net/ethernet/mellanox/mlxsw/core.c | 5 +++--
 drivers/net/ethernet/mellanox/mlxsw/core.h | 2 +-
 drivers/net/ethernet/mellanox/mlxsw/pci.c  | 9 +++--
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 2 +-
 drivers/net/ethernet/mellanox/mlxsw/switchib.c | 1 -
 drivers/net/ethernet/mellanox/mlxsw/switchx2.c | 1 -
 6 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c 
b/drivers/net/ethernet/mellanox/mlxsw/core.c
index 3529b545675d..93ea56620a24 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -1008,6 +1008,7 @@ int mlxsw_core_bus_device_register(const struct 
mlxsw_bus_info *mlxsw_bus_info,
const char *device_kind = mlxsw_bus_info->device_kind;
struct mlxsw_core *mlxsw_core;
struct mlxsw_driver *mlxsw_driver;
+   struct mlxsw_res *res;
size_t alloc_size;
int err;
 
@@ -1032,8 +1033,8 @@ int mlxsw_core_bus_device_register(const struct 
mlxsw_bus_info *mlxsw_bus_info,
mlxsw_core->bus_priv = bus_priv;
mlxsw_core->bus_info = mlxsw_bus_info;
 
-   err = mlxsw_bus->init(bus_priv, mlxsw_core, mlxsw_driver->profile,
- &mlxsw_core->res);
+   res = mlxsw_driver->res_query_enabled ? &mlxsw_core->res : NULL;
+   err = mlxsw_bus->init(bus_priv, mlxsw_core, mlxsw_driver->profile, res);
if (err)
goto err_bus_init;
 
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h 
b/drivers/net/ethernet/mellanox/mlxsw/core.h
index ff9daa09341d..092d39399f3c 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.h
@@ -257,7 +257,6 @@ struct mlxsw_config_profile {
u32 kvd_linear_size;
u8  kvd_hash_single_parts;
u8  kvd_hash_double_parts;
-   u8  resource_query_enable;
struct mlxsw_swid_config swid_config[MLXSW_CONFIG_PROFILE_SWID_COUNT];
 };
 
@@ -314,6 +313,7 @@ struct mlxsw_driver {
 u64 *p_linear_size);
u8 txhdr_len;
const struct mlxsw_config_profile *profile;
+   bool res_query_enabled;
 };
 
 int mlxsw_core_kvd_sizes_get(struct mlxsw_core *mlxsw_core,
diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c 
b/drivers/net/ethernet/mellanox/mlxsw/pci.c
index 5ab068aec033..3a9381977d6d 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/pci.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c
@@ -1015,16 +1015,14 @@ mlxsw_pci_config_profile_swid_config(struct mlxsw_pci 
*mlxsw_pci,
 }
 
 static int mlxsw_pci_resources_query(struct mlxsw_pci *mlxsw_pci, char *mbox,
-struct mlxsw_res *res,
-u8 query_enabled)
+struct mlxsw_res *res)
 {
int index, i;
u64 data;
u16 id;
int err;
 
-   /* Not all the versions support resources query */
-   if (!query_enabled)
+   if (!res)
return 0;
 
mlxsw_cmd_mbox_zero(mbox);
@@ -1376,8 +1374,7 @@ static int mlxsw_pci_init(void *bus_priv, struct 
mlxsw_core *mlxsw_core,
if (err)
goto err_boardinfo;
 
-   err = mlxsw_pci_resources_query(mlxsw_pci, mbox, res,
-   profile->resource_query_enable);
+   err = mlxsw_pci_resources_query(mlxsw_pci, mbox, res);
if (err)
goto err_query_resources;
 
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c 
b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 12062aab13c5..b831af38e0a1 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -3803,7 +3803,6 @@ static const struct mlxsw_config_profile 
mlxsw_sp_config_profile = {
.type   = MLXSW_PORT_SWID_TYPE_ETH,
}
},
-   .resource_query_enable  = 1,
 };
 
 static u64 mlxsw_sp_resource_kvd_linear_occ_get(struct devlink *devlink)
@@ -4002,6 +4001,7 @@ static struct mlxsw_driver mlxsw_sp_driver = {
.kvd_sizes_get  = mlxsw_sp_kvd_sizes_get,
.txhdr_len  = MLXSW_TXHDR_LEN,
.profile= _sp_config_profile,
+   .res_query_enabled  = true,
 };
 
 bool mlxsw_sp_port_dev_check(const struct net_device *dev)
diff --git a/drivers/net/ethernet/mellanox/mlxsw/switchib.c 
b/drivers/net/ethernet/mellanox/mlxsw/switchib.c
index ab7a29846bfa..c698ec4fd9d4 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/switchib.c
+++ 

[PATCH 12/30] net/tcp: convert to ->poll_mask

2018-03-29 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig 
---
 include/net/tcp.h   |  4 ++--
 net/ipv4/af_inet.c  |  3 ++-
 net/ipv4/tcp.c  | 31 ++-
 net/ipv6/af_inet6.c |  3 ++-
 4 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index e3fc667f9ac2..fb52f93d556c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -387,8 +387,8 @@ bool tcp_peer_is_proven(struct request_sock *req, struct 
dst_entry *dst);
 void tcp_close(struct sock *sk, long timeout);
 void tcp_init_sock(struct sock *sk);
 void tcp_init_transfer(struct sock *sk, int bpf_op);
-__poll_t tcp_poll(struct file *file, struct socket *sock,
- struct poll_table_struct *wait);
+struct wait_queue_head *tcp_get_poll_head(struct socket *sock, __poll_t 
events);
+__poll_t tcp_poll_mask(struct socket *sock, __poll_t events);
 int tcp_getsockopt(struct sock *sk, int level, int optname,
   char __user *optval, int __user *optlen);
 int tcp_setsockopt(struct sock *sk, int level, int optname,
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index e4329e161943..ec32cc263b18 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -952,7 +952,8 @@ const struct proto_ops inet_stream_ops = {
.socketpair= sock_no_socketpair,
.accept= inet_accept,
.getname   = inet_getname,
-   .poll  = tcp_poll,
+   .get_poll_head = tcp_get_poll_head,
+   .poll_mask = tcp_poll_mask,
.ioctl = inet_ioctl,
.listen= inet_listen,
.shutdown  = inet_shutdown,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 48636aee23c3..ad8e281066a0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -484,33 +484,30 @@ static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
}
 }
 
+struct wait_queue_head *tcp_get_poll_head(struct socket *sock, __poll_t events)
+{
+   sock_poll_busy_loop(sock, events);
+   sock_rps_record_flow(sock->sk);
+   return sk_sleep(sock->sk);
+}
+EXPORT_SYMBOL(tcp_get_poll_head);
+
 /*
- * Wait for a TCP event.
- *
- * Note that we don't need to lock the socket, as the upper poll layers
- * take care of normal races (between the test and the event) and we don't
- * go look at any of the socket buffers directly.
+ * Socket is not locked. We are protected from async events by poll logic and
+ * correct handling of state changes made by other threads is impossible in
+ * any case.
  */
-__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
+__poll_t tcp_poll_mask(struct socket *sock, __poll_t events)
 {
-   __poll_t mask;
struct sock *sk = sock->sk;
const struct tcp_sock *tp = tcp_sk(sk);
+   __poll_t mask = 0;
int state;
 
-   sock_poll_wait(file, sk_sleep(sk), wait);
-
state = inet_sk_state_load(sk);
if (state == TCP_LISTEN)
return inet_csk_listen_poll(sk);
 
-   /* Socket is not locked. We are protected from async events
-* by poll logic and correct handling of state changes
-* made by other threads is impossible in any case.
-*/
-
-   mask = 0;
-
/*
 * EPOLLHUP is certainly not done right. But poll() doesn't
 * have a notion of HUP in just one direction, and for a
@@ -591,7 +588,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, 
poll_table *wait)
 
return mask;
 }
-EXPORT_SYMBOL(tcp_poll);
+EXPORT_SYMBOL(tcp_poll_mask);
 
 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 {
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 416917719a6f..c470549d6ef9 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -547,7 +547,8 @@ const struct proto_ops inet6_stream_ops = {
.socketpair= sock_no_socketpair,/* a do nothing */
.accept= inet_accept,   /* ok   */
.getname   = inet6_getname,
-   .poll  = tcp_poll,  /* ok   */
+   .get_poll_head = tcp_get_poll_head,
+   .poll_mask = tcp_poll_mask, /* ok   */
.ioctl = inet6_ioctl,   /* must change  */
.listen= inet_listen,   /* ok   */
.shutdown  = inet_shutdown, /* ok   */
-- 
2.14.2



[PATCH 11/30] net: remove sock_no_poll

2018-03-29 Thread Christoph Hellwig
Now that sock_poll handles a NULL ->poll or ->poll_mask there is no need
for a stub.

Signed-off-by: Christoph Hellwig 
---
 crypto/af_alg.c | 1 -
 crypto/algif_hash.c | 2 --
 crypto/algif_rng.c  | 1 -
 drivers/isdn/mISDN/socket.c | 1 -
 drivers/net/ppp/pptp.c  | 1 -
 include/net/sock.h  | 2 --
 net/bluetooth/bnep/sock.c   | 1 -
 net/bluetooth/cmtp/sock.c   | 1 -
 net/bluetooth/hidp/sock.c   | 1 -
 net/core/sock.c | 6 --
 10 files changed, 17 deletions(-)

diff --git a/crypto/af_alg.c b/crypto/af_alg.c
index c49766b03165..50d75de539f5 100644
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -347,7 +347,6 @@ static const struct proto_ops alg_proto_ops = {
.sendpage   =   sock_no_sendpage,
.sendmsg=   sock_no_sendmsg,
.recvmsg=   sock_no_recvmsg,
-   .poll   =   sock_no_poll,
 
.bind   =   alg_bind,
.release=   af_alg_release,
diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c
index 6c9b1927a520..bfcf595fd8f9 100644
--- a/crypto/algif_hash.c
+++ b/crypto/algif_hash.c
@@ -288,7 +288,6 @@ static struct proto_ops algif_hash_ops = {
.mmap   =   sock_no_mmap,
.bind   =   sock_no_bind,
.setsockopt =   sock_no_setsockopt,
-   .poll   =   sock_no_poll,
 
.release=   af_alg_release,
.sendmsg=   hash_sendmsg,
@@ -396,7 +395,6 @@ static struct proto_ops algif_hash_ops_nokey = {
.mmap   =   sock_no_mmap,
.bind   =   sock_no_bind,
.setsockopt =   sock_no_setsockopt,
-   .poll   =   sock_no_poll,
 
.release=   af_alg_release,
.sendmsg=   hash_sendmsg_nokey,
diff --git a/crypto/algif_rng.c b/crypto/algif_rng.c
index 150c2b6480ed..22df3799a17b 100644
--- a/crypto/algif_rng.c
+++ b/crypto/algif_rng.c
@@ -106,7 +106,6 @@ static struct proto_ops algif_rng_ops = {
.bind   =   sock_no_bind,
.accept =   sock_no_accept,
.setsockopt =   sock_no_setsockopt,
-   .poll   =   sock_no_poll,
.sendmsg=   sock_no_sendmsg,
.sendpage   =   sock_no_sendpage,
 
diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c
index c5603d1a07d6..c84270e16bdd 100644
--- a/drivers/isdn/mISDN/socket.c
+++ b/drivers/isdn/mISDN/socket.c
@@ -746,7 +746,6 @@ static const struct proto_ops base_sock_ops = {
.getname= sock_no_getname,
.sendmsg= sock_no_sendmsg,
.recvmsg= sock_no_recvmsg,
-   .poll   = sock_no_poll,
.listen = sock_no_listen,
.shutdown   = sock_no_shutdown,
.setsockopt = sock_no_setsockopt,
diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c
index 6dde9a0cfe76..87f892f1d0fe 100644
--- a/drivers/net/ppp/pptp.c
+++ b/drivers/net/ppp/pptp.c
@@ -627,7 +627,6 @@ static const struct proto_ops pptp_ops = {
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname= pptp_getname,
-   .poll   = sock_no_poll,
.listen = sock_no_listen,
.shutdown   = sock_no_shutdown,
.setsockopt = sock_no_setsockopt,
diff --git a/include/net/sock.h b/include/net/sock.h
index 169c92afcafa..d9249fe65859 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1585,8 +1585,6 @@ int sock_no_connect(struct socket *, struct sockaddr *, 
int, int);
 int sock_no_socketpair(struct socket *, struct socket *);
 int sock_no_accept(struct socket *, struct socket *, int, bool);
 int sock_no_getname(struct socket *, struct sockaddr *, int *, int);
-__poll_t sock_no_poll(struct file *, struct socket *,
- struct poll_table_struct *);
 int sock_no_ioctl(struct socket *, unsigned int, unsigned long);
 int sock_no_listen(struct socket *, int);
 int sock_no_shutdown(struct socket *, int);
diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c
index b5116fa9835e..00deacdcb51c 100644
--- a/net/bluetooth/bnep/sock.c
+++ b/net/bluetooth/bnep/sock.c
@@ -175,7 +175,6 @@ static const struct proto_ops bnep_sock_ops = {
.getname= sock_no_getname,
.sendmsg= sock_no_sendmsg,
.recvmsg= sock_no_recvmsg,
-   .poll   = sock_no_poll,
.listen = sock_no_listen,
.shutdown   = sock_no_shutdown,
.setsockopt = sock_no_setsockopt,
diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c
index ce86a7bae844..e08f28fadd65 100644
--- a/net/bluetooth/cmtp/sock.c
+++ b/net/bluetooth/cmtp/sock.c
@@ -178,7 +178,6 @@ static const struct proto_ops cmtp_sock_ops = {
.getname= sock_no_getname,
.sendmsg= sock_no_sendmsg,
.recvmsg= 

[PATCH net-next 09/11] devlink: convert occ_get op to separate registration

2018-03-29 Thread Ido Schimmel
From: Jiri Pirko 

This resolves race during initialization where the resources with
ops are registered before driver and the structures used by occ_get
op is initialized. So keep occ_get callbacks registered only when
all structs are initialized.

Signed-off-by: Jiri Pirko 
Signed-off-by: Ido Schimmel 
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 24 ++-
 drivers/net/ethernet/mellanox/mlxsw/spectrum.h |  1 -
 .../net/ethernet/mellanox/mlxsw/spectrum_kvdl.c| 67 
 include/net/devlink.h  | 40 
 net/core/devlink.c | 74 +++---
 5 files changed, 134 insertions(+), 72 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c 
b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index b831af38e0a1..0d95d2cb73e3 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -3805,18 +3805,6 @@ static const struct mlxsw_config_profile 
mlxsw_sp_config_profile = {
},
 };
 
-static u64 mlxsw_sp_resource_kvd_linear_occ_get(struct devlink *devlink)
-{
-   struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
-   struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
-
-   return mlxsw_sp_kvdl_occ_get(mlxsw_sp);
-}
-
-static const struct devlink_resource_ops mlxsw_sp_resource_kvd_linear_ops = {
-   .occ_get = mlxsw_sp_resource_kvd_linear_occ_get,
-};
-
 static void
 mlxsw_sp_resource_size_params_prepare(struct mlxsw_core *mlxsw_core,
  struct devlink_resource_size_params 
*kvd_size_params,
@@ -3877,8 +3865,7 @@ static int mlxsw_sp_resources_register(struct mlxsw_core 
*mlxsw_core)
err = devlink_resource_register(devlink, MLXSW_SP_RESOURCE_NAME_KVD,
kvd_size, MLXSW_SP_RESOURCE_KVD,
DEVLINK_RESOURCE_ID_PARENT_TOP,
-					&kvd_size_params,
-					NULL);
+					&kvd_size_params);
if (err)
return err;
 
@@ -3887,8 +3874,7 @@ static int mlxsw_sp_resources_register(struct mlxsw_core 
*mlxsw_core)
linear_size,
MLXSW_SP_RESOURCE_KVD_LINEAR,
MLXSW_SP_RESOURCE_KVD,
-					&linear_size_params,
-					&mlxsw_sp_resource_kvd_linear_ops);
+					&linear_size_params);
if (err)
return err;
 
@@ -3905,8 +3891,7 @@ static int mlxsw_sp_resources_register(struct mlxsw_core 
*mlxsw_core)
double_size,
MLXSW_SP_RESOURCE_KVD_HASH_DOUBLE,
MLXSW_SP_RESOURCE_KVD,
-					&hash_double_size_params,
-					NULL);
+					&hash_double_size_params);
if (err)
return err;
 
@@ -3915,8 +3900,7 @@ static int mlxsw_sp_resources_register(struct mlxsw_core 
*mlxsw_core)
single_size,
MLXSW_SP_RESOURCE_KVD_HASH_SINGLE,
MLXSW_SP_RESOURCE_KVD,
-					&hash_single_size_params,
-					NULL);
+					&hash_single_size_params);
if (err)
return err;
 
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h 
b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
index 21bee8f19894..c59a0d7d81d5 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
@@ -442,7 +442,6 @@ void mlxsw_sp_kvdl_free(struct mlxsw_sp *mlxsw_sp, int 
entry_index);
 int mlxsw_sp_kvdl_alloc_size_query(struct mlxsw_sp *mlxsw_sp,
   unsigned int entry_count,
   unsigned int *p_alloc_size);
-u64 mlxsw_sp_kvdl_occ_get(const struct mlxsw_sp *mlxsw_sp);
 int mlxsw_sp_kvdl_resources_register(struct devlink *devlink);
 
 struct mlxsw_sp_acl_rule_info {
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c 
b/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
index 7b28f65d6407..1b7280168e6b 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
@@ -315,8 +315,9 @@ static u64 mlxsw_sp_kvdl_part_occ(struct mlxsw_sp_kvdl_part 
*part)
return occ;
 }
 
-u64 mlxsw_sp_kvdl_occ_get(const struct mlxsw_sp *mlxsw_sp)
+static u64 mlxsw_sp_kvdl_occ_get(void *priv)
 {
+   const struct mlxsw_sp *mlxsw_sp = priv;

[PATCH 13/30] net/unix: convert to ->poll_mask

2018-03-29 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig 
---
 net/unix/af_unix.c | 30 +++---
 1 file changed, 11 insertions(+), 19 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 2d465bdeccbc..619c6921dd46 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -638,9 +638,8 @@ static int unix_stream_connect(struct socket *, struct 
sockaddr *,
 static int unix_socketpair(struct socket *, struct socket *);
 static int unix_accept(struct socket *, struct socket *, int, bool);
 static int unix_getname(struct socket *, struct sockaddr *, int *, int);
-static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
-static __poll_t unix_dgram_poll(struct file *, struct socket *,
-   poll_table *);
+static __poll_t unix_poll_mask(struct socket *, __poll_t);
+static __poll_t unix_dgram_poll_mask(struct socket *, __poll_t);
 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
 static int unix_shutdown(struct socket *, int);
 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
@@ -681,7 +680,7 @@ static const struct proto_ops unix_stream_ops = {
.socketpair =   unix_socketpair,
.accept =   unix_accept,
.getname =  unix_getname,
-   .poll = unix_poll,
+   .poll_mask =unix_poll_mask,
.ioctl =unix_ioctl,
.listen =   unix_listen,
.shutdown = unix_shutdown,
@@ -704,7 +703,7 @@ static const struct proto_ops unix_dgram_ops = {
.socketpair =   unix_socketpair,
.accept =   sock_no_accept,
.getname =  unix_getname,
-   .poll = unix_dgram_poll,
+   .poll_mask =unix_dgram_poll_mask,
.ioctl =unix_ioctl,
.listen =   sock_no_listen,
.shutdown = unix_shutdown,
@@ -726,7 +725,7 @@ static const struct proto_ops unix_seqpacket_ops = {
.socketpair =   unix_socketpair,
.accept =   unix_accept,
.getname =  unix_getname,
-   .poll = unix_dgram_poll,
+   .poll_mask =unix_dgram_poll_mask,
.ioctl =unix_ioctl,
.listen =   unix_listen,
.shutdown = unix_shutdown,
@@ -2640,13 +2639,10 @@ static int unix_ioctl(struct socket *sock, unsigned int 
cmd, unsigned long arg)
return err;
 }
 
-static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table 
*wait)
+static __poll_t unix_poll_mask(struct socket *sock, __poll_t events)
 {
struct sock *sk = sock->sk;
-   __poll_t mask;
-
-   sock_poll_wait(file, sk_sleep(sk), wait);
-   mask = 0;
+   __poll_t mask = 0;
 
/* exceptional events? */
if (sk->sk_err)
@@ -2675,15 +2671,11 @@ static __poll_t unix_poll(struct file *file, struct 
socket *sock, poll_table *wa
return mask;
 }
 
-static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
-   poll_table *wait)
+static __poll_t unix_dgram_poll_mask(struct socket *sock, __poll_t events)
 {
struct sock *sk = sock->sk, *other;
-   unsigned int writable;
-   __poll_t mask;
-
-   sock_poll_wait(file, sk_sleep(sk), wait);
-   mask = 0;
+   int writable;
+   __poll_t mask = 0;
 
/* exceptional events? */
	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
@@ -2709,7 +2701,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct 
socket *sock,
}
 
/* No write status requested, avoid expensive OUT tests. */
-   if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
+   if (!(events & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
return mask;
 
writable = unix_writable(sk);
-- 
2.14.2



[PATCH 14/30] net: convert datagram_poll users to ->poll_mask

2018-03-29 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig 
Reviewed-by: Greg Kroah-Hartman 
---
 drivers/isdn/mISDN/socket.c|  2 +-
 drivers/net/ppp/pppoe.c|  2 +-
 drivers/staging/ipx/af_ipx.c   |  2 +-
 drivers/staging/irda/net/af_irda.c |  6 +++---
 include/linux/skbuff.h |  3 +--
 include/net/udp.h  |  2 +-
 net/appletalk/ddp.c|  2 +-
 net/ax25/af_ax25.c |  2 +-
 net/bluetooth/hci_sock.c   |  2 +-
 net/can/bcm.c  |  2 +-
 net/can/raw.c  |  2 +-
 net/core/datagram.c| 13 -
 net/decnet/af_decnet.c |  6 +++---
 net/ieee802154/socket.c|  4 ++--
 net/ipv4/af_inet.c |  6 +++---
 net/ipv4/udp.c | 10 +-
 net/ipv6/af_inet6.c|  2 +-
 net/ipv6/raw.c |  4 ++--
 net/kcm/kcmsock.c  |  4 ++--
 net/key/af_key.c   |  2 +-
 net/l2tp/l2tp_ip.c |  2 +-
 net/l2tp/l2tp_ip6.c|  2 +-
 net/l2tp/l2tp_ppp.c|  2 +-
 net/llc/af_llc.c   |  2 +-
 net/netlink/af_netlink.c   |  2 +-
 net/netrom/af_netrom.c |  2 +-
 net/nfc/rawsock.c  |  4 ++--
 net/packet/af_packet.c |  9 -
 net/phonet/socket.c|  2 +-
 net/qrtr/qrtr.c|  2 +-
 net/rose/af_rose.c |  2 +-
 net/x25/af_x25.c   |  2 +-
 32 files changed, 52 insertions(+), 59 deletions(-)

diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c
index c84270e16bdd..61d6e4c9e7d1 100644
--- a/drivers/isdn/mISDN/socket.c
+++ b/drivers/isdn/mISDN/socket.c
@@ -589,7 +589,7 @@ static const struct proto_ops data_sock_ops = {
.getname= data_sock_getname,
.sendmsg= mISDN_sock_sendmsg,
.recvmsg= mISDN_sock_recvmsg,
-   .poll   = datagram_poll,
+   .poll_mask  = datagram_poll_mask,
.listen = sock_no_listen,
.shutdown   = sock_no_shutdown,
.setsockopt = data_sock_setsockopt,
diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c
index 5aa59f41bf8c..8c311e626884 100644
--- a/drivers/net/ppp/pppoe.c
+++ b/drivers/net/ppp/pppoe.c
@@ -1120,7 +1120,7 @@ static const struct proto_ops pppoe_ops = {
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname= pppoe_getname,
-   .poll   = datagram_poll,
+   .poll_mask  = datagram_poll_mask,
.listen = sock_no_listen,
.shutdown   = sock_no_shutdown,
.setsockopt = sock_no_setsockopt,
diff --git a/drivers/staging/ipx/af_ipx.c b/drivers/staging/ipx/af_ipx.c
index d21a9d128d3e..3373f7f67d35 100644
--- a/drivers/staging/ipx/af_ipx.c
+++ b/drivers/staging/ipx/af_ipx.c
@@ -1967,7 +1967,7 @@ static const struct proto_ops ipx_dgram_ops = {
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname= ipx_getname,
-   .poll   = datagram_poll,
+   .poll_mask  = datagram_poll_mask,
.ioctl  = ipx_ioctl,
 #ifdef CONFIG_COMPAT
.compat_ioctl   = ipx_compat_ioctl,
diff --git a/drivers/staging/irda/net/af_irda.c 
b/drivers/staging/irda/net/af_irda.c
index 2f1e9ab3d6d0..77659b1c40ba 100644
--- a/drivers/staging/irda/net/af_irda.c
+++ b/drivers/staging/irda/net/af_irda.c
@@ -2600,7 +2600,7 @@ static const struct proto_ops irda_seqpacket_ops = {
.socketpair =   sock_no_socketpair,
.accept =   irda_accept,
.getname =  irda_getname,
-   .poll = datagram_poll,
+   .poll_mask =datagram_poll_mask,
.ioctl =irda_ioctl,
 #ifdef CONFIG_COMPAT
.compat_ioctl = irda_compat_ioctl,
@@ -2624,7 +2624,7 @@ static const struct proto_ops irda_dgram_ops = {
.socketpair =   sock_no_socketpair,
.accept =   irda_accept,
.getname =  irda_getname,
-   .poll = datagram_poll,
+   .poll_mask =datagram_poll_mask,
.ioctl =irda_ioctl,
 #ifdef CONFIG_COMPAT
.compat_ioctl = irda_compat_ioctl,
@@ -2649,7 +2649,7 @@ static const struct proto_ops irda_ultra_ops = {
.socketpair =   sock_no_socketpair,
.accept =   sock_no_accept,
.getname =  irda_getname,
-   .poll = datagram_poll,
+   .poll_mask =datagram_poll_mask,
.ioctl =irda_ioctl,
 #ifdef CONFIG_COMPAT
.compat_ioctl = irda_compat_ioctl,
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ddf77cf4ff2d..1ac027bd33ec 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3246,8 +3246,7 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, 
unsigned flags,

  1   2   3   4   >