[PATCH RFC 0/2] Convert GRO receive over to hash table.

2018-06-23 Thread David Miller


When many parallel flows are present and being received on the same
RX queue, GRO processing can become expensive because each incoming
frame must traverse the per-NAPI GRO list at each protocol layer
of GRO receive (eth --> ipv{4,6} --> tcp).

Use the already computed hash to chain these SKBs in a hash table
instead of a simple list.
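
For reference, the core of the change in a minimal sketch (GRO_HASH_BUCKETS and
the gro_hash[] array come from patch 2; the helper name here is only for
illustration):

/* Illustrative sketch: pick the per-NAPI GRO chain for an incoming skb.
 * GRO_HASH_BUCKETS is a power of two, so masking the flow hash selects
 * a bucket without a modulo.
 */
static struct list_head *gro_chain_for_skb(struct napi_struct *napi,
                                           struct sk_buff *skb)
{
        u32 hash = skb_get_hash_raw(skb);       /* hash is already computed */

        return &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)];
}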

The first patch makes the GRO list a true list_head.

The second patch implements the hash table.

This series passes basic testing, and I added some diagnostics
to make sure we really were aggregating GRO frames :-)

Signed-off-by: David S. Miller 


[PATCH RFC 1/2] net: Convert GRO SKB handling to list_head.

2018-06-23 Thread David Miller


Manage pending per-NAPI GRO packets via list_head.

Return an SKB pointer from the GRO receive handlers.  When GRO receive
handlers return non-NULL, it means that this SKB needs to be completed
at this time and removed from the NAPI queue.

Several operations are greatly simplified by this transformation,
especially timing out the oldest SKB in the list when gro_count
exceeds MAX_GRO_SKBS, and napi_gro_flush() which walks the queue
in reverse order.
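
A hedged sketch of the new handler shape (the handler name is made up; the
list walk and the return value mirror the converted handlers below):

/* Sketch: a GRO receive handler after the list_head conversion.  Walk
 * the per-NAPI chain and return the skb that must be completed now,
 * or NULL if nothing needs completing.
 */
static struct sk_buff *example_gro_receive(struct list_head *head,
                                           struct sk_buff *skb)
{
        struct sk_buff *pp = NULL;
        struct sk_buff *p;

        list_for_each_entry(p, head, list) {
                if (!NAPI_GRO_CB(p)->same_flow)
                        continue;
                /* ...try to merge skb into p; if p is now complete, set
                 * pp = p so the caller flushes it and unlinks it from
                 * the NAPI queue...
                 */
        }
        return pp;
}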

Signed-off-by: David S. Miller 
---
 drivers/net/geneve.c| 11 +++---
 drivers/net/vxlan.c | 11 +++---
 include/linux/etherdevice.h |  3 +-
 include/linux/netdevice.h   | 32 -
 include/linux/skbuff.h  |  3 +-
 include/linux/udp.h |  4 +--
 include/net/inet_common.h   |  2 +-
 include/net/tcp.h   |  2 +-
 include/net/udp.h   |  4 +--
 include/net/udp_tunnel.h|  6 ++--
 net/8021q/vlan.c| 13 +++
 net/core/dev.c  | 68 +++--
 net/core/skbuff.c   |  4 +--
 net/ethernet/eth.c  | 12 +++
 net/ipv4/af_inet.c  | 12 +++
 net/ipv4/esp4_offload.c |  4 +--
 net/ipv4/fou.c  | 20 +--
 net/ipv4/gre_offload.c  |  8 ++---
 net/ipv4/tcp_offload.c  | 14 
 net/ipv4/udp_offload.c  | 13 +++
 net/ipv6/esp6_offload.c |  4 +--
 net/ipv6/ip6_offload.c  | 16 -
 net/ipv6/tcpv6_offload.c|  4 +--
 net/ipv6/udp_offload.c  |  4 +--
 24 files changed, 133 insertions(+), 141 deletions(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 750eaa53bf0c..3e94375b9b01 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -418,11 +418,12 @@ static int geneve_hlen(struct genevehdr *gh)
return sizeof(*gh) + gh->opt_len * 4;
 }
 
-static struct sk_buff **geneve_gro_receive(struct sock *sk,
-  struct sk_buff **head,
-  struct sk_buff *skb)
+static struct sk_buff *geneve_gro_receive(struct sock *sk,
+ struct list_head *head,
+ struct sk_buff *skb)
 {
-   struct sk_buff *p, **pp = NULL;
+   struct sk_buff *pp = NULL;
+   struct sk_buff *p;
struct genevehdr *gh, *gh2;
unsigned int hlen, gh_len, off_gnv;
const struct packet_offload *ptype;
@@ -449,7 +450,7 @@ static struct sk_buff **geneve_gro_receive(struct sock *sk,
goto out;
}
 
-   for (p = *head; p; p = p->next) {
+   list_for_each_entry(p, head, list) {
if (!NAPI_GRO_CB(p)->same_flow)
continue;
 
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index aee0e60471f1..cc14e0cd5647 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -568,11 +568,12 @@ static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb,
return vh;
 }
 
-static struct sk_buff **vxlan_gro_receive(struct sock *sk,
- struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *vxlan_gro_receive(struct sock *sk,
+struct list_head *head,
+struct sk_buff *skb)
 {
-   struct sk_buff *p, **pp = NULL;
+   struct sk_buff *pp = NULL;
+   struct sk_buff *p;
struct vxlanhdr *vh, *vh2;
unsigned int hlen, off_vx;
int flush = 1;
@@ -607,7 +608,7 @@ static struct sk_buff **vxlan_gro_receive(struct sock *sk,
 
skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */
 
-   for (p = *head; p; p = p->next) {
+   list_for_each_entry(p, head, list) {
if (!NAPI_GRO_CB(p)->same_flow)
continue;
 
diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 79563840c295..572e11bb8696 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -59,8 +59,7 @@ struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv,
   unsigned int rxqs);
 #define devm_alloc_etherdev(dev, sizeof_priv) devm_alloc_etherdev_mqs(dev, sizeof_priv, 1, 1)
 
-struct sk_buff **eth_gro_receive(struct sk_buff **head,
-struct sk_buff *skb);
+struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb);
 int eth_gro_complete(struct sk_buff *skb, int nhoff);
 
 /* Reserved Ethernet Addresses per IEEE 802.1Q */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3ec9850c7936..f176d9873910 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -322,7 +322,7 @@ struct napi_struct {
int poll_owner;
 #endif
struct net_device   *dev;
-   struct sk_buff  *gro_list;
+   struct list_head    gro_list;

[PATCH RFC 2/2] net: Convert NAPI gro list into a small hash table.

2018-06-23 Thread David Miller


Improve the performance of GRO receive by splitting flows into
multiple hash chains.

Suggested-by: Eric Dumazet 
Signed-off-by: David S. Miller 
---
 include/linux/netdevice.h |   3 +-
 net/core/dev.c| 105 --
 2 files changed, 81 insertions(+), 27 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f176d9873910..c6b377a15869 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -305,6 +305,7 @@ int __init netdev_boot_setup(char *str);
 /*
  * Structure for NAPI scheduling similar to tasklet but with weighting
  */
+#define GRO_HASH_BUCKETS   8
 struct napi_struct {
/* The poll_list must only be managed by the entity which
 * changes the state of the NAPI_STATE_SCHED bit.  This means
@@ -322,7 +323,7 @@ struct napi_struct {
int poll_owner;
 #endif
struct net_device   *dev;
-   struct list_headgro_list;
+   struct list_headgro_hash[GRO_HASH_BUCKETS];
struct sk_buff  *skb;
struct hrtimer  timer;
struct list_headdev_list;
diff --git a/net/core/dev.c b/net/core/dev.c
index aa61b9344b46..dffed642e686 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4875,15 +4875,12 @@ static int napi_gro_complete(struct sk_buff *skb)
return netif_receive_skb_internal(skb);
 }
 
-/* napi->gro_list contains packets ordered by age.
- * youngest packets at the head of it.
- * Complete skbs in reverse order to reduce latencies.
- */
-void napi_gro_flush(struct napi_struct *napi, bool flush_old)
+static void __napi_gro_flush_chain(struct napi_struct *napi, struct list_head *head,
+  bool flush_old)
 {
struct sk_buff *skb, *p;
 
-   list_for_each_entry_safe_reverse(skb, p, &napi->gro_list, list) {
+   list_for_each_entry_safe_reverse(skb, p, head, list) {
if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
return;
list_del_init(&skb->list);
@@ -4891,15 +4888,33 @@ void napi_gro_flush(struct napi_struct *napi, bool flush_old)
napi->gro_count--;
}
 }
+
+/* napi->gro_hash contains packets ordered by age.
+ * youngest packets at the head of it.
+ * Complete skbs in reverse order to reduce latencies.
+ */
+void napi_gro_flush(struct napi_struct *napi, bool flush_old)
+{
+   int i;
+
+   for (i = 0; i < GRO_HASH_BUCKETS; i++) {
+   struct list_head *head = &napi->gro_hash[i];
+
+   __napi_gro_flush_chain(napi, head, flush_old);
+   }
+}
 EXPORT_SYMBOL(napi_gro_flush);
 
-static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
+static struct list_head *gro_list_prepare(struct napi_struct *napi,
+ struct sk_buff *skb)
 {
unsigned int maclen = skb->dev->hard_header_len;
u32 hash = skb_get_hash_raw(skb);
+   struct list_head *head;
struct sk_buff *p;
 
-   list_for_each_entry(p, &napi->gro_list, list) {
+   head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)];
+   list_for_each_entry(p, head, list) {
unsigned long diffs;
 
NAPI_GRO_CB(p)->flush = 0;
@@ -4922,6 +4937,8 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
   maclen);
NAPI_GRO_CB(p)->same_flow = !diffs;
}
+
+   return head;
 }
 
 static void skb_gro_reset_offset(struct sk_buff *skb)
@@ -4964,11 +4981,45 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
}
 }
 
+static void gro_flush_oldest(struct napi_struct *napi)
+{
+   struct sk_buff *oldest = NULL;
+   unsigned long age = jiffies;
+   int i;
+
+   for (i = 0; i < GRO_HASH_BUCKETS; i++) {
+   struct list_head *head = &napi->gro_hash[i];
+   struct sk_buff *skb;
+
+   if (list_empty(head))
+   continue;
+
+   skb = list_last_entry(head, struct sk_buff, list);
+   if (!oldest || time_before(NAPI_GRO_CB(skb)->age, age)) {
+   oldest = skb;
+   age = NAPI_GRO_CB(skb)->age;
+   }
+   }
+
+   /* We are called with napi->gro_count >= MAX_GRO_SKBS, so this is
+* impossible.
+*/
+   if (WARN_ON_ONCE(!oldest))
+   return;
+
+   /* Do not adjust napi->gro_count, caller is adding a new SKB to
+* the chain.
+*/
+   list_del(&oldest->list);
+   napi_gro_complete(oldest);
+}
+
 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
struct list_head *head = &offload_base;
struct packet_offload *ptype;
__be16 type = skb->protocol;
+   struct list_head *gro_head;
struct sk_buff *pp = NULL;
enum gro_result ret;
int same_flow;
@@ -4977,7 +5028,7 @@ 

Re: [PATCH net] cxgb4: when disabling dcb set txq dcb priority to 0

2018-06-23 Thread David Miller
From: Ganesh Goudar 
Date: Sat, 23 Jun 2018 20:28:26 +0530

> When we are disabling DCB, store "0" in txq->dcb_prio
> since that's used for future TX Work Request "OVLAN_IDX"
> values. Setting non zero priority upon disabling DCB
> would halt the traffic.
> 
> Reported-by: AMG Zollner Robert 
> CC: David Ahern 
> Signed-off-by: Casey Leedom 
> Signed-off-by: Ganesh Goudar 

Applied, thanks.


Re: [PATCH] fib_rules: match rules based on suppress_* properties too

2018-06-23 Thread David Miller
From: "Jason A. Donenfeld" 
Date: Sat, 23 Jun 2018 17:59:30 +0200

> Two rules with different values of suppress_prefix or suppress_ifgroup
> are not the same. This fixes an -EEXIST when running:
> 
>$ ip -4 rule add table main suppress_prefixlength 0
> 
> Signed-off-by: Jason A. Donenfeld 
> Fixes: f9d4b0c1e969 ("fib_rules: move common handling of newrule delrule msgs into fib_nl2rule")

But the old rule_find() code didn't check this key either, so I can't
see how the behavior in this area changed.

I think the behavior changed for a different reason.

The commit mentioned in your Fixes: tag changed newrule semantics
wrt. defaults or "any" values.

The original code matched on pure values of the keys, whereas the new
code only compares the keys when the new rule is not specifying an
"any" value.

-   if (r->table != rule->table)
+   if (rule->table && r->table != rule->table)
continue;

And I think these changes are what make your test case fail after the
commit.  Some other key didn't match previously due to the handling of
"any" values.


[PATCH net-next] net: phy: fixed-phy: Make the error path simpler

2018-06-23 Thread Fabio Estevam
From: Fabio Estevam 

When platform_device_register_simple() fails we can return
the error immediately instead of jumping to the 'err_pdev'
label.

This makes the error path a bit simpler.

Signed-off-by: Fabio Estevam 
---
 drivers/net/phy/fixed_phy.c | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/drivers/net/phy/fixed_phy.c b/drivers/net/phy/fixed_phy.c
index 001fe1d..67b2608 100644
--- a/drivers/net/phy/fixed_phy.c
+++ b/drivers/net/phy/fixed_phy.c
@@ -259,10 +259,8 @@ static int __init fixed_mdio_bus_init(void)
int ret;
 
pdev = platform_device_register_simple("Fixed MDIO bus", 0, NULL, 0);
-   if (IS_ERR(pdev)) {
-   ret = PTR_ERR(pdev);
-   goto err_pdev;
-   }
+   if (IS_ERR(pdev))
+   return PTR_ERR(pdev);
 
fmb->mii_bus = mdiobus_alloc();
if (fmb->mii_bus == NULL) {
@@ -287,7 +285,6 @@ static int __init fixed_mdio_bus_init(void)
mdiobus_free(fmb->mii_bus);
 err_mdiobus_reg:
platform_device_unregister(pdev);
-err_pdev:
return ret;
 }
 module_init(fixed_mdio_bus_init);
-- 
2.7.4



Re: [PATCH v2 net-next] net/sched: add skbprio scheduler

2018-06-23 Thread Alexander Duyck
On Sat, Jun 23, 2018 at 1:47 PM, Nishanth Devarajan  wrote:
> net/sched: add skbprio scheduler
>
> Skbprio (SKB Priority Queue) is a queueing discipline that prioritizes packets
> according to their skb->priority field. Although Skbprio can be employed in any
> scenario in which a higher skb->priority field means a higher priority packet,
> Skbprio was conceived as a solution for denial-of-service defenses that need to
> route packets with different priorities.

Really this description is not very good. Reading it I was thinking to
myself "why do we need this, prio already does this". It wasn't until
I read through the code that I figured out that you are basically
adding dropping of lower priority frames.
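
For readers skimming the thread, here is a rough, self-contained sketch of that
drop-lowest-priority behavior (a toy structure, not the submitted qdisc; the
bands are assumed to be initialized with skb_queue_head_init()):

struct toy_skbprio {
        struct sk_buff_head bands[SKBPRIO_MAX_PRIORITY];
        unsigned int qlen;
        unsigned int limit;
};

static bool toy_skbprio_enqueue(struct toy_skbprio *q, struct sk_buff *skb)
{
        unsigned int prio = min_t(unsigned int, skb->priority,
                                  SKBPRIO_MAX_PRIORITY - 1);
        unsigned int lowest = 0;

        /* Find the lowest-priority band that currently holds packets. */
        while (lowest < SKBPRIO_MAX_PRIORITY - 1 &&
               skb_queue_empty(&q->bands[lowest]))
                lowest++;

        if (q->qlen >= q->limit) {
                if (prio <= lowest) {
                        kfree_skb(skb);         /* new packet is the loser */
                        return false;
                }
                kfree_skb(__skb_dequeue(&q->bands[lowest]));
                q->qlen--;
        }

        __skb_queue_tail(&q->bands[prio], skb);
        q->qlen++;
        return true;
}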

>
> v2
> *Use skb->priority field rather than DS field. Rename queueing discipline as
> SKB Priority Queue (previously Gatekeeper Priority Queue).
>
> *Queueing discipline is made classful to expose Skbprio's internal priority
> queues.
>
> Signed-off-by: Nishanth Devarajan 
> Reviewed-by: Sachin Paryani 
> Reviewed-by: Cody Doucette 
> Reviewed-by: Michel Machado 
> ---
>  include/uapi/linux/pkt_sched.h |  15 ++
>  net/sched/Kconfig  |  13 ++
>  net/sched/Makefile |   1 +
>  net/sched/sch_skbprio.c| 347 
> +
>  4 files changed, 376 insertions(+)
>  create mode 100644 net/sched/sch_skbprio.c
>
> diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
> index 37b5096..6fd07e8 100644
> --- a/include/uapi/linux/pkt_sched.h
> +++ b/include/uapi/linux/pkt_sched.h
> @@ -124,6 +124,21 @@ struct tc_fifo_qopt {
> __u32   limit;  /* Queue length: bytes for bfifo, packets for pfifo */
>  };
>
> +/* SKBPRIO section */
> +
> +/*
> + * Priorities go from zero to (SKBPRIO_MAX_PRIORITY - 1).
> + * SKBPRIO_MAX_PRIORITY should be at least 64 in order for skbprio to be able
> + * to map one to one the DS field of IPV4 and IPV6 headers.
> + * Memory allocation grows linearly with SKBPRIO_MAX_PRIORITY.
> + */
> +
> +#define SKBPRIO_MAX_PRIORITY 64
> +
> +struct tc_skbprio_qopt {
> +   __u32   limit;  /* Queue length in packets. */
> +};
> +
>  /* PRIO section */
>
>  #define TCQ_PRIO_BANDS 16
> diff --git a/net/sched/Kconfig b/net/sched/Kconfig
> index a01169f..9ac4b53 100644
> --- a/net/sched/Kconfig
> +++ b/net/sched/Kconfig
> @@ -240,6 +240,19 @@ config NET_SCH_MQPRIO
>
>   If unsure, say N.
>
> +config NET_SCH_SKBPRIO
> +   tristate "SKB priority queue scheduler (SKBPRIO)"
> +   help
> + Say Y here if you want to use the SKB priority queue
> + scheduler. This schedules packets according to skb->priority,
> + which is useful for request packets in DoS mitigation systems such
> + as Gatekeeper.
> +
> + To compile this driver as a module, choose M here: the module will
> + be called sch_skbprio.
> +
> + If unsure, say N.
> +
>  config NET_SCH_CHOKE
> tristate "CHOose and Keep responsive flow scheduler (CHOKE)"
> help
> diff --git a/net/sched/Makefile b/net/sched/Makefile
> index 8811d38..a4d8893 100644
> --- a/net/sched/Makefile
> +++ b/net/sched/Makefile
> @@ -46,6 +46,7 @@ obj-$(CONFIG_NET_SCH_NETEM)   += sch_netem.o
>  obj-$(CONFIG_NET_SCH_DRR)  += sch_drr.o
>  obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o
>  obj-$(CONFIG_NET_SCH_MQPRIO)   += sch_mqprio.o
> +obj-$(CONFIG_NET_SCH_SKBPRIO)  += sch_skbprio.o
>  obj-$(CONFIG_NET_SCH_CHOKE)+= sch_choke.o
>  obj-$(CONFIG_NET_SCH_QFQ)  += sch_qfq.o
>  obj-$(CONFIG_NET_SCH_CODEL)+= sch_codel.o
> diff --git a/net/sched/sch_skbprio.c b/net/sched/sch_skbprio.c
> new file mode 100644
> index 000..5e89446
> --- /dev/null
> +++ b/net/sched/sch_skbprio.c
> @@ -0,0 +1,347 @@
> +/*
> + * net/sched/sch_skbprio.c  SKB Priority Queue.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation; either version
> + * 2 of the License, or (at your option) any later version.
> + *
> + * Authors:Nishanth Devarajan, 
> + * Cody Doucette, 
> + * original idea by Michel Machado, Cody Doucette, and Qiaobin Fu
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +
> +/*   SKB Priority Queue
> + * =
> + *
> + * This qdisc schedules a packet according to skb->priority, where a higher
> + * value places the packet closer to the exit of the queue. When the queue is
> + * full, the lowest priority packet in the queue is dropped to make room for
> + * the packet to be added if it has higher priority. If the packet to be 
> added
> + * has lower priority than all packets in the queue, it is dropped.
> + *
> + * Without the SKB priority queue, queue 

Re: [PATCH v2 net-next] net/sched: add skbprio scheduler

2018-06-23 Thread Cong Wang
On Sat, Jun 23, 2018 at 1:47 PM, Nishanth Devarajan  wrote:
> diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
> index 37b5096..6fd07e8 100644
> --- a/include/uapi/linux/pkt_sched.h
> +++ b/include/uapi/linux/pkt_sched.h
...
> +#define SKBPRIO_MAX_PRIORITY 64
> +
> +struct tc_skbprio_qopt {
> +   __u32   limit;  /* Queue length in packets. */
> +};


Since this is just an integer, you can just make it NLA_U32 instead
of a struct?

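
For illustration, a hedged sketch of the NLA_U32 variant (the TCA_SKBPRIO_*
attribute names and the nested layout are invented for the example):

enum {
        TCA_SKBPRIO_UNSPEC,
        TCA_SKBPRIO_LIMIT,              /* invented attribute name */
        __TCA_SKBPRIO_MAX,
};
#define TCA_SKBPRIO_MAX (__TCA_SKBPRIO_MAX - 1)

static const struct nla_policy skbprio_policy[TCA_SKBPRIO_MAX + 1] = {
        [TCA_SKBPRIO_LIMIT]     = { .type = NLA_U32 },
};

static int skbprio_change_sketch(struct Qdisc *sch, struct nlattr *opt,
                                 struct netlink_ext_ack *extack)
{
        struct nlattr *tb[TCA_SKBPRIO_MAX + 1];
        int err;

        err = nla_parse_nested(tb, TCA_SKBPRIO_MAX, opt, skbprio_policy,
                               extack);
        if (err < 0)
                return err;
        if (!tb[TCA_SKBPRIO_LIMIT])
                return -EINVAL;

        sch->limit = nla_get_u32(tb[TCA_SKBPRIO_LIMIT]);
        return 0;
}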

> +static int skbprio_change(struct Qdisc *sch, struct nlattr *opt,
> +   struct netlink_ext_ack *extack)
> +{
> +   struct skbprio_sched_data *q = qdisc_priv(sch);
> +   struct tc_skbprio_qopt *ctl = nla_data(opt);
> +   const unsigned int min_limit = 1;
> +
> +   if (ctl->limit == (typeof(ctl->limit))-1)
> +   q->max_limit = max(qdisc_dev(sch)->tx_queue_len, min_limit);
> +   else if (ctl->limit < min_limit ||
> +   ctl->limit > qdisc_dev(sch)->tx_queue_len)
> +   return -EINVAL;
> +   else
> +   q->max_limit = ctl->limit;
> +
> +   return 0;
> +}

Isn't q->max_limit the same as sch->limit?

Also, please avoid dev->tx_queue_len here, it may change
independently of your qdisc change, unless you want to implement
ops->change_tx_queue_len().
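
If dev->tx_queue_len really is the intended cap, a hedged sketch of wiring up
that hook (the body is illustrative; max_limit is the field from the posted
patch):

static int skbprio_change_tx_queue_len(struct Qdisc *sch, unsigned int new_len)
{
        struct skbprio_sched_data *q = qdisc_priv(sch);

        /* Keep the cached cap in sync when txqueuelen changes under us. */
        q->max_limit = max_t(unsigned int, new_len, 1);
        return 0;
}
/* wired up via .change_tx_queue_len in struct Qdisc_ops */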


[PATCH v2 net-next] net/sched: add skbprio scheduler

2018-06-23 Thread Nishanth Devarajan
net/sched: add skbprio scheduler

Skbprio (SKB Priority Queue) is a queueing discipline that prioritizes packets
according to their skb->priority field. Although Skbprio can be employed in any
scenario in which a higher skb->priority field means a higher priority packet,
Skbprio was conceived as a solution for denial-of-service defenses that need to
route packets with different priorities.

v2
*Use skb->priority field rather than DS field. Rename queueing discipline as
SKB Priority Queue (previously Gatekeeper Priority Queue).

*Queueing discipline is made classful to expose Skbprio's internal priority
queues.

Signed-off-by: Nishanth Devarajan 
Reviewed-by: Sachin Paryani 
Reviewed-by: Cody Doucette 
Reviewed-by: Michel Machado 
---
 include/uapi/linux/pkt_sched.h |  15 ++
 net/sched/Kconfig  |  13 ++
 net/sched/Makefile |   1 +
 net/sched/sch_skbprio.c| 347 +
 4 files changed, 376 insertions(+)
 create mode 100644 net/sched/sch_skbprio.c

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 37b5096..6fd07e8 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -124,6 +124,21 @@ struct tc_fifo_qopt {
__u32   limit;  /* Queue length: bytes for bfifo, packets for pfifo */
 };
 
+/* SKBPRIO section */
+
+/*
+ * Priorities go from zero to (SKBPRIO_MAX_PRIORITY - 1).
+ * SKBPRIO_MAX_PRIORITY should be at least 64 in order for skbprio to be able
+ * to map one to one the DS field of IPV4 and IPV6 headers.
+ * Memory allocation grows linearly with SKBPRIO_MAX_PRIORITY.
+ */
+
+#define SKBPRIO_MAX_PRIORITY 64
+
+struct tc_skbprio_qopt {
+   __u32   limit;  /* Queue length in packets. */
+};
+
 /* PRIO section */
 
 #define TCQ_PRIO_BANDS 16
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index a01169f..9ac4b53 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -240,6 +240,19 @@ config NET_SCH_MQPRIO
 
  If unsure, say N.
 
+config NET_SCH_SKBPRIO
+   tristate "SKB priority queue scheduler (SKBPRIO)"
+   help
+ Say Y here if you want to use the SKB priority queue
+ scheduler. This schedules packets according to skb->priority,
+ which is useful for request packets in DoS mitigation systems such
+ as Gatekeeper.
+
+ To compile this driver as a module, choose M here: the module will
+ be called sch_skbprio.
+
+ If unsure, say N.
+
 config NET_SCH_CHOKE
tristate "CHOose and Keep responsive flow scheduler (CHOKE)"
help
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 8811d38..a4d8893 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -46,6 +46,7 @@ obj-$(CONFIG_NET_SCH_NETEM)   += sch_netem.o
 obj-$(CONFIG_NET_SCH_DRR)  += sch_drr.o
 obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o
 obj-$(CONFIG_NET_SCH_MQPRIO)   += sch_mqprio.o
+obj-$(CONFIG_NET_SCH_SKBPRIO)  += sch_skbprio.o
 obj-$(CONFIG_NET_SCH_CHOKE)+= sch_choke.o
 obj-$(CONFIG_NET_SCH_QFQ)  += sch_qfq.o
 obj-$(CONFIG_NET_SCH_CODEL)+= sch_codel.o
diff --git a/net/sched/sch_skbprio.c b/net/sched/sch_skbprio.c
new file mode 100644
index 000..5e89446
--- /dev/null
+++ b/net/sched/sch_skbprio.c
@@ -0,0 +1,347 @@
+/*
+ * net/sched/sch_skbprio.c  SKB Priority Queue.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors:Nishanth Devarajan, 
+ * Cody Doucette, 
+ * original idea by Michel Machado, Cody Doucette, and Qiaobin Fu
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+
+/*   SKB Priority Queue
+ * =
+ *
+ * This qdisc schedules a packet according to skb->priority, where a higher
+ * value places the packet closer to the exit of the queue. When the queue is
+ * full, the lowest priority packet in the queue is dropped to make room for
+ * the packet to be added if it has higher priority. If the packet to be added
+ * has lower priority than all packets in the queue, it is dropped.
+ *
+ * Without the SKB priority queue, queue length limits must be imposed
+ * for individual queues, and there is no easy way to enforce a global queue
+ * length limit across all priorities. With the SKBprio queue, a global
+ * queue length limit can be enforced while not restricting the queue lengths
+ * of individual priorities.
+ *
+ * This is especially useful for a denial-of-service defense system like
+ * Gatekeeper, which prioritizes packets in flows that demonstrate expected
+ * behavior of legitimate users. The queue is flexible to allow any number
+ * of packets of any priority up to the global 

[Patch net-next] net_sched: remove unused htb drop_list

2018-06-23 Thread Cong Wang
After commit a09ceb0e0814 ("sched: remove qdisc->drop"),
it is no longer used.

Cc: Florian Westphal 
Signed-off-by: Cong Wang 
---
 net/sched/sch_htb.c | 13 -
 1 file changed, 13 deletions(-)

diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 2a4ab7caf553..43c4bfe625a9 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -126,7 +126,6 @@ struct htb_class {
 
union {
struct htb_class_leaf {
-   struct list_head drop_list;
int deficit[TC_HTB_MAXDEPTH];
struct Qdisc*q;
} leaf;
@@ -171,7 +170,6 @@ struct htb_sched {
struct qdisc_watchdog   watchdog;
 
s64 now;/* cached dequeue time */
-   struct list_headdrops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
 
/* time of nearest event per level (row) */
s64 near_ev_cache[TC_HTB_MAXDEPTH];
@@ -562,8 +560,6 @@ static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
if (!cl->prio_activity) {
cl->prio_activity = 1 << cl->prio;
htb_activate_prios(q, cl);
-   list_add_tail(&cl->un.leaf.drop_list,
- q->drops + cl->prio);
}
 }
 
@@ -579,7 +575,6 @@ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
 
htb_deactivate_prios(q, cl);
cl->prio_activity = 0;
-   list_del_init(&cl->un.leaf.drop_list);
 }
 
 static void htb_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch,
@@ -981,7 +976,6 @@ static void htb_reset(struct Qdisc *sch)
else {
if (cl->un.leaf.q)
qdisc_reset(cl->un.leaf.q);
-   INIT_LIST_HEAD(&cl->un.leaf.drop_list);
}
cl->prio_activity = 0;
cl->cmode = HTB_CAN_SEND;
@@ -993,8 +987,6 @@ static void htb_reset(struct Qdisc *sch)
sch->qstats.backlog = 0;
memset(q->hlevel, 0, sizeof(q->hlevel));
memset(q->row_mask, 0, sizeof(q->row_mask));
-   for (i = 0; i < TC_HTB_NUMPRIO; i++)
-   INIT_LIST_HEAD(q->drops + i);
 }
 
 static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
@@ -1024,7 +1016,6 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt,
struct nlattr *tb[TCA_HTB_MAX + 1];
struct tc_htb_glob *gopt;
int err;
-   int i;
 
qdisc_watchdog_init(&q->watchdog, sch);
INIT_WORK(&q->work, htb_work_func);
@@ -1050,8 +1041,6 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt,
err = qdisc_class_hash_init(&q->clhash);
if (err < 0)
return err;
-   for (i = 0; i < TC_HTB_NUMPRIO; i++)
-   INIT_LIST_HEAD(q->drops + i);
 
qdisc_skb_head_init(&q->direct_queue);
 
@@ -1224,7 +1213,6 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
 
parent->level = 0;
memset(&parent->un.inner, 0, sizeof(parent->un.inner));
-   INIT_LIST_HEAD(&parent->un.leaf.drop_list);
parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
parent->tokens = parent->buffer;
parent->ctokens = parent->cbuffer;
@@ -1418,7 +1406,6 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
}
 
cl->children = 0;
-   INIT_LIST_HEAD(&cl->un.leaf.drop_list);
RB_CLEAR_NODE(&cl->pq_node);
 
for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
-- 
2.14.4



[PATCH] fib_rules: match rules based on suppress_* properties too

2018-06-23 Thread Jason A. Donenfeld
Two rules with different values of suppress_prefix or suppress_ifgroup
are not the same. This fixes an -EEXIST when running:

   $ ip -4 rule add table main suppress_prefixlength 0

Signed-off-by: Jason A. Donenfeld 
Fixes: f9d4b0c1e969 ("fib_rules: move common handling of newrule delrule msgs into fib_nl2rule")
---
 net/core/fib_rules.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 126ffc5bc630..665799311b98 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -416,6 +416,12 @@ static struct fib_rule *rule_find(struct fib_rules_ops *ops,
if (rule->mark && r->mark != rule->mark)
continue;
 
+   if (r->suppress_ifgroup != rule->suppress_ifgroup)
+   continue;
+
+   if (r->suppress_prefixlen != rule->suppress_prefixlen)
+   continue;
+
if (rule->mark_mask && r->mark_mask != rule->mark_mask)
continue;
 
-- 
2.17.1



[net regression] "fib_rules: move common handling of newrule delrule msgs into fib_nl2rule" breaks suppress_prefixlength

2018-06-23 Thread Jason A. Donenfeld
Hey Roopa,

On a kernel with a minimal networking config,
CONFIG_IP_MULTIPLE_TABLES appears to be broken for certain rules after
f9d4b0c1e9695e3de7af3768205bacc27312320c.

Try, for example, running:

$ ip -4 rule add table main suppress_prefixlength 0

It returns with EEXIST.

Perhaps the reason is that the new rule_find function does not match
on suppress_prefixlength? However, rule_exist from before didn't do
that either. I'll keep playing and see if I can track it down myself,
but thought I should let you know first.

A relevant .config can be found at https://א.cc/iq5HoUY0

Jason


Re: [PATCH net-next 3/4] netdevsim: add ipsec offload testing

2018-06-23 Thread Shannon Nelson

On 6/22/2018 9:07 PM, Jakub Kicinski wrote:

On Fri, 22 Jun 2018 17:31:37 -0700, Shannon Nelson wrote:

Implement the IPsec/XFRM offload API for testing.

Signed-off-by: Shannon Nelson 


Thanks for the patch!  Just a number of stylistic nit picks.


Thanks for the comments, I'll do a v2 in a couple of days.
sln




diff --git a/drivers/net/netdevsim/ipsec.c b/drivers/net/netdevsim/ipsec.c
new file mode 100644
index 000..ad64266
--- /dev/null
+++ b/drivers/net/netdevsim/ipsec.c
@@ -0,0 +1,345 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2018 Oracle and/or its affiliates. All rights reserved. */
+
+#include 
+#include 
+#include 
+#include "netdevsim.h"


Other files in the driver sort headers alphabetically and put an empty
line between global and local headers.


+#define NSIM_IPSEC_AUTH_BITS   128
+
+/**
+ * nsim_ipsec_dbg_read - read for ipsec data
+ * @filp: the opened file
+ * @buffer: where to write the data for the user to read
+ * @count: the size of the user's buffer
+ * @ppos: file position offset
+ **/
+static ssize_t nsim_dbg_netdev_ops_read(struct file *filp,


Doesn't match the kdoc.  Please run

./scripts/kernel-doc -none $file

if you want kdoc.  Although IMHO you may as well drop the kdoc, your
code is quite self explanatory and local.


+   char __user *buffer,
+   size_t count, loff_t *ppos)
+{
+   struct netdevsim *ns = filp->private_data;
+   struct nsim_ipsec *ipsec = &ns->ipsec;
+   size_t bufsize;
+   char *buf, *p;
+   int len;
+   int i;
+
+   /* don't allow partial reads */
+   if (*ppos != 0)
+   return 0;
+
+   /* the buffer needed is
+* (num SAs * 3 lines each * ~60 bytes per line) + one more line
+*/
+   bufsize = (ipsec->count * 4 * 60) + 60;
+   buf = kzalloc(bufsize, GFP_KERNEL);
+   if (!buf)
+   return -ENOMEM;
+
+   p = buf;
+   p += snprintf(p, bufsize - (p - buf),
+ "SA count=%u tx=%u\n",
+ ipsec->count, ipsec->tx);
+
+   for (i = 0; i < NSIM_IPSEC_MAX_SA_COUNT; i++) {
+   struct nsim_sa *sap = &ipsec->sa[i];
+
+   if (!sap->used)
+   continue;
+
+   p += snprintf(p, bufsize - (p - buf),
+ "sa[%i] %cx ipaddr=0x%08x %08x %08x %08x\n",
+ i, (sap->rx ? 'r' : 't'), sap->ipaddr[0],
+ sap->ipaddr[1], sap->ipaddr[2], sap->ipaddr[3]);
+   p += snprintf(p, bufsize - (p - buf),
+ "sa[%i]spi=0x%08x proto=0x%x salt=0x%08x crypt=%d\n",
+ i, be32_to_cpu(sap->xs->id.spi),
+ sap->xs->id.proto, sap->salt, sap->crypt);
+   p += snprintf(p, bufsize - (p - buf),
+ "sa[%i]key=0x%08x %08x %08x %08x\n",
+ i, sap->key[0], sap->key[1],
+ sap->key[2], sap->key[3]);
+   }
+
+   len = simple_read_from_buffer(buffer, count, ppos, buf, p - buf);


Why not seq_file for this?

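
For what it's worth, a hedged sketch of the seq_file variant (struct and field
names follow the posted patch; needs linux/seq_file.h; the dump is abbreviated
to one line per SA):

static int nsim_ipsec_dbg_show(struct seq_file *m, void *unused)
{
        /* single_open() below passes the debugfs inode's i_private,
         * i.e. the netdevsim pointer handed to debugfs_create_file().
         */
        struct netdevsim *ns = m->private;
        struct nsim_ipsec *ipsec = &ns->ipsec;
        int i;

        seq_printf(m, "SA count=%u tx=%u\n", ipsec->count, ipsec->tx);
        for (i = 0; i < NSIM_IPSEC_MAX_SA_COUNT; i++) {
                struct nsim_sa *sap = &ipsec->sa[i];

                if (!sap->used)
                        continue;
                seq_printf(m, "sa[%i] %cx ipaddr=0x%08x %08x %08x %08x\n",
                           i, sap->rx ? 'r' : 't', sap->ipaddr[0],
                           sap->ipaddr[1], sap->ipaddr[2], sap->ipaddr[3]);
        }
        return 0;
}

static int nsim_ipsec_dbg_open(struct inode *inode, struct file *file)
{
        return single_open(file, nsim_ipsec_dbg_show, inode->i_private);
}

static const struct file_operations ipsec_dbg_fops = {
        .owner   = THIS_MODULE,
        .open    = nsim_ipsec_dbg_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = single_release,
};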

+   kfree(buf);
+   return len;
+}
+
+static const struct file_operations ipsec_dbg_fops = {
+   .owner = THIS_MODULE,
+   .open = simple_open,
+   .read = nsim_dbg_netdev_ops_read,
+};
+
+/**
+ * nsim_ipsec_find_empty_idx - find the first unused security parameter index
+ * @ipsec: pointer to ipsec struct
+ **/
+static int nsim_ipsec_find_empty_idx(struct nsim_ipsec *ipsec)
+{
+   u32 i;
+
+   if (ipsec->count == NSIM_IPSEC_MAX_SA_COUNT)
+   return -ENOSPC;
+
+   /* search sa table */
+   for (i = 0; i < NSIM_IPSEC_MAX_SA_COUNT; i++) {
+   if (!ipsec->sa[i].used)
+   return i;
+   }
+
+   return -ENOSPC;


FWIW I personally find bitmaps and find_first_zero_bit() etc. nice and
concise for a small ID allocator, but no objection to open coding.

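
E.g., a hedged sketch of that variant (it assumes a hypothetical
DECLARE_BITMAP(used, NSIM_IPSEC_MAX_SA_COUNT) in struct nsim_ipsec in place of
the per-entry flag):

static int nsim_ipsec_find_empty_idx(struct nsim_ipsec *ipsec)
{
        unsigned long i;

        /* ipsec->used is the hypothetical bitmap described above */
        i = find_first_zero_bit(ipsec->used, NSIM_IPSEC_MAX_SA_COUNT);
        if (i >= NSIM_IPSEC_MAX_SA_COUNT)
                return -ENOSPC;
        set_bit(i, ipsec->used);
        return i;
}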

+}
+
+/**
+ * nsim_ipsec_parse_proto_keys - find the key and salt based on the protocol
+ * @xs: pointer to xfrm_state struct
+ * @mykey: pointer to key array to populate
+ * @mysalt: pointer to salt value to populate
+ *
+ * This copies the protocol keys and salt to our own data tables.  The
+ * 82599 family only supports the one algorithm.


82599 is a fine chip, it's not netdevsim tho? ;)


+ **/
+static int nsim_ipsec_parse_proto_keys(struct xfrm_state *xs,
+  u32 *mykey, u32 *mysalt)
+{
+   struct net_device *dev = xs->xso.dev;
+   unsigned char *key_data;
+   char *alg_name = NULL;
+   const char aes_gcm_name[] = "rfc4106(gcm(aes))";
+   int key_len;


reverse xmas tree please


+
+   if (!xs->aead) {
+   netdev_err(dev, "Unsupported IPsec algorithm\n");
+   return -EINVAL;
+   }
+
+   if (xs->aead->alg_icv_len != NSIM_IPSEC_AUTH_BITS) {
+

[PATCH net] cxgb4: when disabling dcb set txq dcb priority to 0

2018-06-23 Thread Ganesh Goudar
When we are disabling DCB, store "0" in txq->dcb_prio
since that's used for future TX Work Request "OVLAN_IDX"
values. Setting non zero priority upon disabling DCB
would halt the traffic.

Reported-by: AMG Zollner Robert 
CC: David Ahern 
Signed-off-by: Casey Leedom 
Signed-off-by: Ganesh Goudar 
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 35cb3ae..aaaf775 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -263,7 +263,7 @@ static void dcb_tx_queue_prio_enable(struct net_device *dev, int enable)
"Can't %s DCB Priority on port %d, TX Queue %d: err=%d\n",
enable ? "set" : "unset", pi->port_id, i, -err);
else
-   txq->dcb_prio = value;
+   txq->dcb_prio = enable ? value : 0;
}
 }
 
-- 
2.1.0



Re: [PATCH net-next 0/2] r8169: enable ASPM on RTL8168E-VL

2018-06-23 Thread David Miller
From: Heiner Kallweit 
Date: Sat, 23 Jun 2018 09:49:37 +0200

> This patch series enables ASPM for the RTL8168E-VL and, beforehand, aligns
> ASPM entry latency handling with the vendor driver.

Looks good to me, series applied, thank you.


Re: [PATCH 00/14] net: pch_gbe: Cleanups

2018-06-23 Thread David Miller
From: Paul Burton 
Date: Fri, 22 Jun 2018 20:17:39 -0700

> This series begins the process of cleaning up the pch_gbe network
> driver. Whilst my ultimate goal is to add support for using this driver
> on the MIPS Boston development board, this series sets that aside in
> favor of making some more general cleanups. My hope is that this will
> both make the driver a little more malleable & reduce the probability of
> me gouging out my eyes.
> 
> Applies cleanly atop net-next as of 5424ea27390f ("netns: get more
> entropy from net_hash_mix()").

It is rather awesome to see all of that HAL stuff disappear.

Series applied, thanks!


bpf-next is OPEN

2018-06-23 Thread Daniel Borkmann
Merge window is over, so a new bpf-next development round begins. Due to travel
I will be mostly offline this whole week, till this Sunday, but feel free to push
your patches out already so they land in patchwork in the meantime.


[PATCH net-next 1/2] r8169: align ASPM entry latency setting with vendor driver

2018-06-23 Thread Heiner Kallweit
The r8168 vendor driver always uses value 0x27. In r8169 we have a few
chips where 0x17 is used. So far this didn't matter because ASPM was
disabled anyway. Now that ASPM is being re-enabled, let's also use 0x27 only.

One of the chips affected by this change is RTL8168E-VL, on my system
with this chip value 0x27 works fine.

In addition rename rtl_csi_access_enable_2() to
rtl_set_def_aspm_entry_latency() to make clear that we set the default
ASPM entry latency.

Signed-off-by: Heiner Kallweit 
---
 drivers/net/ethernet/realtek/r8169.c | 43 
 1 file changed, 19 insertions(+), 24 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 06e14da1..d5e380d9 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -5235,12 +5235,7 @@ static void rtl_csi_access_enable(struct rtl8169_private *tp, u8 val)
rtl_csi_write(tp, 0x070c, csi | val << 24);
 }
 
-static void rtl_csi_access_enable_1(struct rtl8169_private *tp)
-{
-   rtl_csi_access_enable(tp, 0x17);
-}
-
-static void rtl_csi_access_enable_2(struct rtl8169_private *tp)
+static void rtl_set_def_aspm_entry_latency(struct rtl8169_private *tp)
 {
rtl_csi_access_enable(tp, 0x27);
 }
@@ -5347,7 +5342,7 @@ static void rtl_hw_start_8168cp_1(struct rtl8169_private *tp)
{ 0x07, 0,  0x2000 }
};
 
-   rtl_csi_access_enable_2(tp);
+   rtl_set_def_aspm_entry_latency(tp);
 
rtl_ephy_init(tp, e_info_8168cp, ARRAY_SIZE(e_info_8168cp));
 
@@ -5356,7 +5351,7 @@ static void rtl_hw_start_8168cp_2(struct rtl8169_private *tp)
 
 static void rtl_hw_start_8168cp_2(struct rtl8169_private *tp)
 {
-   rtl_csi_access_enable_2(tp);
+   rtl_set_def_aspm_entry_latency(tp);
 
RTL_W8(tp, Config3, RTL_R8(tp, Config3) & ~Beacon_en);
 
@@ -5369,7 +5364,7 @@ static void rtl_hw_start_8168cp_3(struct rtl8169_private *tp)
 
 static void rtl_hw_start_8168cp_3(struct rtl8169_private *tp)
 {
-   rtl_csi_access_enable_2(tp);
+   rtl_set_def_aspm_entry_latency(tp);
 
RTL_W8(tp, Config3, RTL_R8(tp, Config3) & ~Beacon_en);
 
@@ -5393,7 +5388,7 @@ static void rtl_hw_start_8168c_1(struct rtl8169_private *tp)
{ 0x06, 0x0080, 0x }
};
 
-   rtl_csi_access_enable_2(tp);
+   rtl_set_def_aspm_entry_latency(tp);
 
RTL_W8(tp, DBG_REG, 0x06 | FIX_NAK_1 | FIX_NAK_2);
 
@@ -5409,7 +5404,7 @@ static void rtl_hw_start_8168c_2(struct rtl8169_private *tp)
{ 0x03, 0x0400, 0x0220 }
};
 
-   rtl_csi_access_enable_2(tp);
+   rtl_set_def_aspm_entry_latency(tp);
 
rtl_ephy_init(tp, e_info_8168c_2, ARRAY_SIZE(e_info_8168c_2));
 
@@ -5423,14 +5418,14 @@ static void rtl_hw_start_8168c_3(struct rtl8169_private *tp)
 
 static void rtl_hw_start_8168c_4(struct rtl8169_private *tp)
 {
-   rtl_csi_access_enable_2(tp);
+   rtl_set_def_aspm_entry_latency(tp);
 
__rtl_hw_start_8168cp(tp);
 }
 
 static void rtl_hw_start_8168d(struct rtl8169_private *tp)
 {
-   rtl_csi_access_enable_2(tp);
+   rtl_set_def_aspm_entry_latency(tp);
 
rtl_disable_clock_request(tp);
 
@@ -5445,7 +5440,7 @@ static void rtl_hw_start_8168d(struct rtl8169_private *tp)
 
 static void rtl_hw_start_8168dp(struct rtl8169_private *tp)
 {
-   rtl_csi_access_enable_1(tp);
+   rtl_set_def_aspm_entry_latency(tp);
 
if (tp->dev->mtu <= ETH_DATA_LEN)
rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
@@ -5463,7 +5458,7 @@ static void rtl_hw_start_8168d_4(struct rtl8169_private *tp)
{ 0x0c, 0x0100, 0x0020 }
};
 
-   rtl_csi_access_enable_1(tp);
+   rtl_set_def_aspm_entry_latency(tp);
 
rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
@@ -5492,7 +5487,7 @@ static void rtl_hw_start_8168e_1(struct rtl8169_private *tp)
{ 0x0a, 0x, 0x0040 }
};
 
-   rtl_csi_access_enable_2(tp);
+   rtl_set_def_aspm_entry_latency(tp);
 
rtl_ephy_init(tp, e_info_8168e_1, ARRAY_SIZE(e_info_8168e_1));
 
@@ -5517,7 +5512,7 @@ static void rtl_hw_start_8168e_2(struct rtl8169_private *tp)
{ 0x19, 0x, 0x0224 }
};
 
-   rtl_csi_access_enable_1(tp);
+   rtl_set_def_aspm_entry_latency(tp);
 
rtl_ephy_init(tp, e_info_8168e_2, ARRAY_SIZE(e_info_8168e_2));
 
@@ -5550,7 +5545,7 @@ static void rtl_hw_start_8168e_2(struct rtl8169_private *tp)
 
 static void rtl_hw_start_8168f(struct rtl8169_private *tp)
 {
-   rtl_csi_access_enable_2(tp);
+   rtl_set_def_aspm_entry_latency(tp);
 
rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
@@ -5621,7 +5616,7 @@ static void rtl_hw_start_8168g(struct rtl8169_private *tp)
rtl_eri_write(tp, 0xd0, ERIAR_MASK_0001, 0x48, ERIAR_EXGMAC);
rtl_eri_write(tp, 0xe8, ERIAR_MASK_, 0x0016, ERIAR_EXGMAC);
 
-   

[PATCH net-next 2/2] r8169: enable ASPM on RTL8168E-VL

2018-06-23 Thread Heiner Kallweit
Let's enable ASPM also on the RTL8168E-VL (chip version 34).
Works fine on my Zotac Mini PC with this chip. Temperature when
being idle is significantly lower than before due to reaching
deeper PC states.

Signed-off-by: Heiner Kallweit 
---
 drivers/net/ethernet/realtek/r8169.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index d5e380d9..44715958 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -5541,6 +5541,8 @@ static void rtl_hw_start_8168e_2(struct rtl8169_private *tp)
RTL_W8(tp, DLLPR, RTL_R8(tp, DLLPR) | PFM_EN);
RTL_W32(tp, MISC, RTL_R32(tp, MISC) | PWM_EN);
RTL_W8(tp, Config5, RTL_R8(tp, Config5) & ~Spi_en);
+
+   rtl_hw_aspm_clkreq_enable(tp, true);
 }
 
 static void rtl_hw_start_8168f(struct rtl8169_private *tp)
-- 
2.18.0




[PATCH net-next 0/2] r8169: enable ASPM on RTL8168E-VL

2018-06-23 Thread Heiner Kallweit
This patch series enables ASPM for the RTL8168E-VL and, beforehand, aligns
ASPM entry latency handling with the vendor driver.

Heiner Kallweit (2):
  r8169: align ASPM entry latency setting with vendor driver
  r8169: enable ASPM on RTL8168E-VL

 drivers/net/ethernet/realtek/r8169.c | 45 +---
 1 file changed, 21 insertions(+), 24 deletions(-)

-- 
2.18.0



Re: [bpf PATCH v3 3/4] bpf: sockhash fix omitted bucket lock in sock_close

2018-06-23 Thread Martin KaFai Lau
On Fri, Jun 22, 2018 at 08:21:44AM -0700, John Fastabend wrote:
> First, in tcp_close, reduce the scope of sk_callback_lock(): the lock is
> only needed for protecting the maps list; the ingress and cork
> lists are protected by the sock lock. Having the lock in wider scope is
> harmless but may confuse the reader who may infer it is in fact
> needed.
> 
> Next, in sock_hash_delete_elem() the pattern is as follows,
> 
>   sock_hash_delete_elem()
>  [...]
>  spin_lock(bucket_lock)
>  l = lookup_elem_raw()
>  if (l)
> hlist_del_rcu()
> write_lock(sk_callback_lock)
>   destroy psock ...
> write_unlock(sk_callback_lock)
>  spin_unlock(bucket_lock)
> 
> The ordering is necessary because we only know the {p}sock after
> dereferencing the hash table which we can't do unless we have the
> bucket lock held. Once we have the bucket lock and the psock element
> it is deleted from the hashmap to ensure any other path doing a lookup
> will fail. Finally, the refcnt is decremented and if zero the psock
> is destroyed.
> 
> In parallel with the above (or with free'ing the map), a tcp close event
> may trigger tcp_close(), which at the moment omits the bucket lock
> altogether (oops!). The flow looks like this,
> 
>   bpf_tcp_close()
>  [...]
>  write_lock(sk_callback_lock)
>  for each psock->maps // list of maps this sock is part of
>  hlist_del_rcu(ref_hash_node);
>   destroy psock ...
>  write_unlock(sk_callback_lock)
> 
> Obviously, and demonstrated by syzbot, this is broken because
> we can have multiple threads deleting entries via hlist_del_rcu().
> 
> To fix this we might be tempted to wrap the hlist operation in a
> bucket lock but that would create a lock inversion problem. In
> summary to follow locking rules the psocks maps list needs the
> sk_callback_lock but we need the bucket lock to do the hlist_del_rcu.
> To resolve the lock inversion problem pop the head of the maps list
> repeatedly and remove the reference until no more are left. If a
> delete happens in parallel from the BPF API that is OK as well because
> it will do a similar action, lookup the lock in the map/hash, delete
> it from the map/hash, and dec the refcnt. We check for this case
> before doing a destroy on the psock to ensure we don't have two
> threads tearing down a psock. The new logic is as follows,
> 
>   bpf_tcp_close()
>   e = psock_map_pop(psock->maps) // done with sk_callback_lock
>   bucket_lock() // lock hash list bucket
>   l = lookup_elem_raw(head, hash, key, key_size);
>   if (l) {
>  //only get here if elmnt was not already removed
>  hlist_del_rcu()
>  ... destroy psock...
>   }
>   bucket_unlock()
> 
> And finally for all the above to work add missing sk_callback_lock
> around smap_list_remove in sock_hash_ctx_update_elem(). Otherwise
> delete and update may corrupt maps list. Then add RCU annotations and
> use rcu_dereference/rcu_assign_pointer to manage values relying on
> RCU so that the object is not free'd from sock_hash_free() while it
> is being referenced in bpf_tcp_close().
> 
> (As an aside the sk_callback_lock serves two purposes. The
>  first, is to update the sock callbacks sk_data_ready, sk_write_space,
>  etc. The second is to protect the psock 'maps' list. The 'maps' list
>  is used to (as shown above) to delete all map/hash references to a
>  sock when the sock is closed)
> 
> Reported-by: syzbot+0ce137753c78f7b6a...@syzkaller.appspotmail.com
> Fixes: 81110384441a ("bpf: sockmap, add hash map support")
> Signed-off-by: John Fastabend 
> ---
>  kernel/bpf/sockmap.c |  120 
> +++---
>  1 file changed, 84 insertions(+), 36 deletions(-)
> 
> diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
> index 69b26af..333427b 100644
> --- a/kernel/bpf/sockmap.c
> +++ b/kernel/bpf/sockmap.c
> @@ -72,6 +72,7 @@ struct bpf_htab {
>   u32 n_buckets;
>   u32 elem_size;
>   struct bpf_sock_progs progs;
> + struct rcu_head rcu;
>  };
>  
>  struct htab_elem {
> @@ -89,8 +90,8 @@ enum smap_psock_state {
>  struct smap_psock_map_entry {
>   struct list_head list;
>   struct sock **entry;
> - struct htab_elem *hash_link;
> - struct bpf_htab *htab;
> + struct htab_elem __rcu *hash_link;
> + struct bpf_htab __rcu *htab;
>  };
>  
>  struct smap_psock {
> @@ -258,16 +259,54 @@ static void bpf_tcp_release(struct sock *sk)
>   rcu_read_unlock();
>  }
>  
> +static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
> +  u32 hash, void *key, u32 key_size)
> +{
> + struct htab_elem *l;
> +
> + hlist_for_each_entry_rcu(l, head, hash_node) {
> + if (l->hash == hash && !memcmp(&l->key, key, key_size))
> + return l;
> + }
> +
> + return NULL;
> +}
> +
> +static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
> +{
> + return >buckets[hash & 

Re: [bpf PATCH v3 2/4] bpf: sockmap, fix smap_list_map_remove when psock is in many maps

2018-06-23 Thread Martin KaFai Lau
On Fri, Jun 22, 2018 at 08:21:39AM -0700, John Fastabend wrote:
> If a hashmap is free'd with open socks it removes the reference to
> the hash entry from the psock. If that is the last reference to the
> psock then it will also be free'd by the reference counting logic.
> However the current logic that removes the hash reference from the
> list of references is broken. In map_list_map_remove() we first check
s/map_list_map_remove/smap_list_remove/

> if the sockmap entry matches and then check if the hashmap entry
> matches. But, the sockmap entry still always matches because it's NULL in
> this case which causes the first entry to be removed from the list.
> If this is always the "right" entry (because the user adds/removes
> entries in order) then everything is OK but otherwise a subsequent
> bpf_tcp_close() may reference a free'd object.
> 
> To fix this create two list handlers one for sockmap and one for
> sockhash.
> 
> Reported-by: syzbot+0ce137753c78f7b6a...@syzkaller.appspotmail.com
> Fixes: 81110384441a ("bpf: sockmap, add hash map support")
> Signed-off-by: John Fastabend 
One nit.  Other than that,

Acked-by: Martin KaFai Lau 

> ---
>  kernel/bpf/sockmap.c |   33 +
>  1 file changed, 21 insertions(+), 12 deletions(-)
> 
> diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
> index d7fd17a..69b26af 100644
> --- a/kernel/bpf/sockmap.c
> +++ b/kernel/bpf/sockmap.c
> @@ -1602,17 +1602,26 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
>   return ERR_PTR(err);
>  }
>  
> -static void smap_list_remove(struct smap_psock *psock,
> -  struct sock **entry,
> -  struct htab_elem *hash_link)
> +static void smap_list_map_remove(struct smap_psock *psock,
> +  struct sock **entry)
>  {
>   struct smap_psock_map_entry *e, *tmp;
>  
>   list_for_each_entry_safe(e, tmp, &psock->maps, list) {
> - if (e->entry == entry || e->hash_link == hash_link) {
> + if (e->entry == entry)
> + list_del(&e->list);
> + }
> +}
Nit. Add an empty line.

> +static void smap_list_hash_remove(struct smap_psock *psock,
> +   struct htab_elem *hash_link)
> +{
> + struct smap_psock_map_entry *e, *tmp;
> +
> + list_for_each_entry_safe(e, tmp, &psock->maps, list) {
> + struct htab_elem *c = e->hash_link;
> +
> + if (c == hash_link)
>   list_del(&e->list);
> - break;
> - }
>   }
>  }
>  
> @@ -1647,7 +1656,7 @@ static void sock_map_free(struct bpf_map *map)
>* to be null and queued for garbage collection.
>*/
>   if (likely(psock)) {
> - smap_list_remove(psock, &stab->sock_map[i], NULL);
> + smap_list_map_remove(psock, &stab->sock_map[i]);
>   smap_release_sock(psock, sock);
>   }
>   write_unlock_bh(&sock->sk_callback_lock);
> @@ -1706,7 +1715,7 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key)
>  
>   if (psock->bpf_parse)
>   smap_stop_sock(psock, sock);
> - smap_list_remove(psock, &stab->sock_map[k], NULL);
> + smap_list_map_remove(psock, &stab->sock_map[k]);
>   smap_release_sock(psock, sock);
>  out:
>   write_unlock_bh(&sock->sk_callback_lock);
> @@ -1908,7 +1917,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
>   struct smap_psock *opsock = smap_psock_sk(osock);
>  
>   write_lock_bh(&osock->sk_callback_lock);
> - smap_list_remove(opsock, &stab->sock_map[i], NULL);
> + smap_list_map_remove(opsock, &stab->sock_map[i]);
>   write_unlock_bh(&osock->sk_callback_lock);
>   write_unlock_bh(>sk_callback_lock);
>   }
> @@ -2124,7 +2133,7 @@ static void sock_hash_free(struct bpf_map *map)
>* (psock) to be null and queued for garbage collection.
>*/
>   if (likely(psock)) {
> - smap_list_remove(psock, NULL, l);
> + smap_list_hash_remove(psock, l);
>   smap_release_sock(psock, sock);
>   }
>   write_unlock_bh(&sock->sk_callback_lock);
> @@ -2304,7 +2313,7 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
>   psock = smap_psock_sk(l_old->sk);
>  
> - hlist_del_rcu(&l_old->hash_node);
> - smap_list_remove(psock, NULL, l_old);
> + smap_list_hash_remove(psock, l_old);
>   smap_release_sock(psock, l_old->sk);
>   free_htab_elem(htab, l_old);
>   }
> @@ -2372,7 +2381,7 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key)
>* to be null and queued for garbage collection.
>*/
>   if (likely(psock)) {
> -   

Re: [bpf PATCH v3 1/4] bpf: sockmap, fix crash when ipv6 sock is added

2018-06-23 Thread Martin KaFai Lau
On Fri, Jun 22, 2018 at 08:21:34AM -0700, John Fastabend wrote:
> This fixes a crash where we assign tcp_prot to IPv6 sockets instead
> of tcpv6_prot.
> 
> Previously we overwrote the sk->prot field with tcp_prot even in the
> AF_INET6 case. This patch ensures the correct tcp_prot and tcpv6_prot
> are used.
> 
> Tested with 'netserver -6' and 'netperf -H [IPv6]' as well as
> 'netperf -H [IPv4]'. The ESTABLISHED check resolves the previously
> crashing case here.
> 
> Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support")
> Reported-by: syzbot+5c063698bdbfac19f...@syzkaller.appspotmail.com
> Signed-off-by: John Fastabend 
> Signed-off-by: Wei Wang 
Acked-by: Martin KaFai Lau 

> ---
>  kernel/bpf/sockmap.c |   58 
> +-
>  1 file changed, 48 insertions(+), 10 deletions(-)
> 
> diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
> index 52a91d8..d7fd17a 100644
> --- a/kernel/bpf/sockmap.c
> +++ b/kernel/bpf/sockmap.c
> @@ -140,6 +140,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
>  static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
>  static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
>   int offset, size_t size, int flags);
> +static void bpf_tcp_close(struct sock *sk, long timeout);
>  
>  static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
>  {
> @@ -161,7 +162,42 @@ static bool bpf_tcp_stream_read(const struct sock *sk)
>   return !empty;
>  }
>  
> -static struct proto tcp_bpf_proto;
> +enum {
> + SOCKMAP_IPV4,
> + SOCKMAP_IPV6,
> + SOCKMAP_NUM_PROTS,
> +};
> +
> +enum {
> + SOCKMAP_BASE,
> + SOCKMAP_TX,
> + SOCKMAP_NUM_CONFIGS,
> +};
> +
> +static struct proto *saved_tcpv6_prot __read_mostly;
> +static DEFINE_SPINLOCK(tcpv6_prot_lock);
> +static struct proto bpf_tcp_prots[SOCKMAP_NUM_PROTS][SOCKMAP_NUM_CONFIGS];
> +static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS],
> +  struct proto *base)
> +{
> + prot[SOCKMAP_BASE]  = *base;
> + prot[SOCKMAP_BASE].close= bpf_tcp_close;
> + prot[SOCKMAP_BASE].recvmsg  = bpf_tcp_recvmsg;
> + prot[SOCKMAP_BASE].stream_memory_read   = bpf_tcp_stream_read;
> +
> + prot[SOCKMAP_TX]= prot[SOCKMAP_BASE];
> + prot[SOCKMAP_TX].sendmsg= bpf_tcp_sendmsg;
> + prot[SOCKMAP_TX].sendpage   = bpf_tcp_sendpage;
> +}
> +
> +static void update_sk_prot(struct sock *sk, struct smap_psock *psock)
> +{
> + int family = sk->sk_family == AF_INET6 ? SOCKMAP_IPV6 : SOCKMAP_IPV4;
> + int conf = psock->bpf_tx_msg ? SOCKMAP_TX : SOCKMAP_BASE;
> +
> + sk->sk_prot = &bpf_tcp_prots[family][conf];
> +}
> +
>  static int bpf_tcp_init(struct sock *sk)
>  {
>   struct smap_psock *psock;
> @@ -181,14 +217,17 @@ static int bpf_tcp_init(struct sock *sk)
>   psock->save_close = sk->sk_prot->close;
>   psock->sk_proto = sk->sk_prot;
>  
> - if (psock->bpf_tx_msg) {
> - tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg;
> - tcp_bpf_proto.sendpage = bpf_tcp_sendpage;
> - tcp_bpf_proto.recvmsg = bpf_tcp_recvmsg;
> - tcp_bpf_proto.stream_memory_read = bpf_tcp_stream_read;
> + /* Build IPv6 sockmap whenever the address of tcpv6_prot changes */
> + if (sk->sk_family == AF_INET6 &&
> + unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) {
> + spin_lock_bh(&tcpv6_prot_lock);
> + if (likely(sk->sk_prot != saved_tcpv6_prot)) {
> + build_protos(bpf_tcp_prots[SOCKMAP_IPV6], sk->sk_prot);
> + smp_store_release(&saved_tcpv6_prot, sk->sk_prot);
> + }
> + spin_unlock_bh(&tcpv6_prot_lock);
>   }
> -
> - sk->sk_prot = &tcp_bpf_proto;
> + update_sk_prot(sk, psock);
>   rcu_read_unlock();
>   return 0;
>  }
> @@ -,8 +1150,7 @@ static void bpf_tcp_msg_add(struct smap_psock *psock,
>  
>  static int bpf_tcp_ulp_register(void)
>  {
> - tcp_bpf_proto = tcp_prot;
> - tcp_bpf_proto.close = bpf_tcp_close;
> + build_protos(bpf_tcp_prots[SOCKMAP_IPV4], &tcp_prot);
>   /* Once BPF TX ULP is registered it is never unregistered. It
>* will be in the ULP list for the lifetime of the system. Doing
>* duplicate registers is not a problem.
>