[PATCH v2] net/9p: trans_xen: add missing MODULE_AUTHOR/DESCRIPTION/LICENSE

2017-11-19 Thread Jesse Chan
This change resolves a new compile-time warning
when built as a loadable module:

WARNING: modpost: missing MODULE_LICENSE() in net/9p/9pnet_xen.o
see include/linux/module.h for more information

This adds the license as "Dual MIT/GPL", which matches the header of the file.

MODULE_DESCRIPTION and MODULE_AUTHOR are also added.

Signed-off-by: Jesse Chan 
---
 net/9p/trans_xen.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c
index 6ad3e043c617..90402e744fbf 100644
--- a/net/9p/trans_xen.c
+++ b/net/9p/trans_xen.c
@@ -543,3 +543,7 @@ static void p9_trans_xen_exit(void)
return xenbus_unregister_driver(_9pfs_front_driver);
 }
 module_exit(p9_trans_xen_exit);
+
+MODULE_AUTHOR("Stefano Stabellini ");
+MODULE_DESCRIPTION("Xen Transport for 9P");
+MODULE_LICENSE("Dual MIT/GPL");
-- 
2.14.1



[PATCH RFC 5/5] esp: Don't require synchronous crypto fallback on offloading anymore.

2017-11-19 Thread Steffen Klassert
We support asynchronous crypto on layer 2 ESP now.
So no need to force synchronous crypto fallback on
offloading anymore.

Signed-off-by: Steffen Klassert 
---
 net/ipv4/esp4.c | 12 ++--
 net/ipv6/esp6.c | 12 ++--
 2 files changed, 4 insertions(+), 20 deletions(-)

diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 7948833dc204..6f00e43120a8 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -843,17 +843,13 @@ static int esp_init_aead(struct xfrm_state *x)
char aead_name[CRYPTO_MAX_ALG_NAME];
struct crypto_aead *aead;
int err;
-   u32 mask = 0;
 
err = -ENAMETOOLONG;
if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)",
 x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME)
goto error;
 
-   if (x->xso.offload_handle)
-   mask |= CRYPTO_ALG_ASYNC;
-
-   aead = crypto_alloc_aead(aead_name, 0, mask);
+   aead = crypto_alloc_aead(aead_name, 0, 0);
err = PTR_ERR(aead);
if (IS_ERR(aead))
goto error;
@@ -883,7 +879,6 @@ static int esp_init_authenc(struct xfrm_state *x)
char authenc_name[CRYPTO_MAX_ALG_NAME];
unsigned int keylen;
int err;
-   u32 mask = 0;
 
err = -EINVAL;
if (!x->ealg)
@@ -909,10 +904,7 @@ static int esp_init_authenc(struct xfrm_state *x)
goto error;
}
 
-   if (x->xso.offload_handle)
-   mask |= CRYPTO_ALG_ASYNC;
-
-   aead = crypto_alloc_aead(authenc_name, 0, mask);
+   aead = crypto_alloc_aead(authenc_name, 0, 0);
err = PTR_ERR(aead);
if (IS_ERR(aead))
goto error;
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 08a424fa8009..7c888c6e53a9 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -752,17 +752,13 @@ static int esp_init_aead(struct xfrm_state *x)
char aead_name[CRYPTO_MAX_ALG_NAME];
struct crypto_aead *aead;
int err;
-   u32 mask = 0;
 
err = -ENAMETOOLONG;
if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)",
 x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME)
goto error;
 
-   if (x->xso.offload_handle)
-   mask |= CRYPTO_ALG_ASYNC;
-
-   aead = crypto_alloc_aead(aead_name, 0, mask);
+   aead = crypto_alloc_aead(aead_name, 0, 0);
err = PTR_ERR(aead);
if (IS_ERR(aead))
goto error;
@@ -792,7 +788,6 @@ static int esp_init_authenc(struct xfrm_state *x)
char authenc_name[CRYPTO_MAX_ALG_NAME];
unsigned int keylen;
int err;
-   u32 mask = 0;
 
err = -EINVAL;
if (!x->ealg)
@@ -818,10 +813,7 @@ static int esp_init_authenc(struct xfrm_state *x)
goto error;
}
 
-   if (x->xso.offload_handle)
-   mask |= CRYPTO_ALG_ASYNC;
-
-   aead = crypto_alloc_aead(authenc_name, 0, mask);
+   aead = crypto_alloc_aead(authenc_name, 0, 0);
err = PTR_ERR(aead);
if (IS_ERR(aead))
goto error;
-- 
2.14.1



[PATCH RFC 2/5] net: Add asynchronous callbacks for xfrm on layer 2.

2017-11-19 Thread Steffen Klassert
This patch implements asynchronous crypto callbacks
and a backlog handler that can be used when IPsec
is done at layer 2 in the TX path. It also extends
the skb validate functions so that we can update
the driver transmit return codes based on async
crypto operation or to indicate that we queued the
packet in a backlog queue.

Joint work with: Aviv Heller 

Signed-off-by: Steffen Klassert 
---
 include/linux/netdevice.h |   6 ++-
 include/net/xfrm.h|  15 ++-
 net/core/dev.c|  16 +---
 net/ipv4/esp4.c   |  24 +--
 net/ipv6/esp6.c   |  24 +--
 net/packet/af_packet.c|   3 +-
 net/sched/sch_generic.c   |  18 -
 net/xfrm/xfrm_device.c| 100 +-
 8 files changed, 171 insertions(+), 35 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 6b274bfe489f..19d5c7d7587d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2791,7 +2791,9 @@ struct softnet_data {
struct Qdisc*output_queue;
struct Qdisc**output_queue_tailp;
struct sk_buff  *completion_queue;
-
+#ifdef CONFIG_XFRM_OFFLOAD
+   struct sk_buff_head xfrm_backlog;
+#endif
 #ifdef CONFIG_RPS
/* input_queue_head should be written by cpu owning this struct,
 * and only read by other cpus. Worth using a cache line.
@@ -3323,7 +3325,7 @@ int dev_get_phys_port_id(struct net_device *dev,
 int dev_get_phys_port_name(struct net_device *dev,
   char *name, size_t len);
 int dev_change_proto_down(struct net_device *dev, bool proto_down);
-struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device 
*dev);
+struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device 
*dev, bool *again);
 struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device 
*dev,
struct netdev_queue *txq, int *ret);
 
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 509e85f330c9..76ae5c306776 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1021,6 +1021,7 @@ struct xfrm_offload {
 #defineXFRM_GSO_SEGMENT16
 #defineXFRM_GRO32
 #defineXFRM_ESP_NO_TRAILER 64
+#defineXFRM_DEV_RESUME 128
 
__u32   status;
 #define CRYPTO_SUCCESS 1
@@ -1858,7 +1859,9 @@ static inline struct xfrm_offload *xfrm_offload(struct 
sk_buff *skb)
 void __net_init xfrm_dev_init(void);
 
 #ifdef CONFIG_XFRM_OFFLOAD
-struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t 
features);
+void xfrm_dev_resume(struct sk_buff *skb);
+void xfrm_dev_backlog(struct softnet_data *sd);
+struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t 
features, bool *again);
 int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
   struct xfrm_user_offload *xuo);
 bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x);
@@ -1897,7 +1900,15 @@ static inline void xfrm_dev_state_free(struct xfrm_state 
*x)
}
 }
 #else
-static inline struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, 
netdev_features_t features)
+static inline void xfrm_dev_resume(struct sk_buff *skb)
+{
+}
+
+static inline void xfrm_dev_backlog(struct softnet_data *sd)
+{
+}
+
+static inline struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, 
netdev_features_t features, bool *again)
 {
return skb;
 }
diff --git a/net/core/dev.c b/net/core/dev.c
index 8434669f8f36..f893f98df128 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3041,7 +3041,7 @@ int skb_csum_hwoffload_help(struct sk_buff *skb,
 }
 EXPORT_SYMBOL(skb_csum_hwoffload_help);
 
-static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct 
net_device *dev)
+static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct 
net_device *dev, bool *again)
 {
netdev_features_t features;
 
@@ -3081,7 +3081,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff 
*skb, struct net_device
}
}
 
-   skb = validate_xmit_xfrm(skb, features);
+   skb = validate_xmit_xfrm(skb, features, again);
 
return skb;
 
@@ -3092,7 +3092,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff 
*skb, struct net_device
return NULL;
 }
 
-struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device 
*dev)
+struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device 
*dev, bool *again)
 {
struct sk_buff *next, *head = NULL, *tail;
 
@@ -3103,7 +3103,7 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff 
*skb, struct net_device *d
/* in case skb wont be segmented, point to itself */
skb->prev = skb;
 
-   skb = validate_xmit_skb(skb, dev);
+  

[PATCH RFC 4/5] xfrm: Allow IPsec GSO with software crypto for local sockets.

2017-11-19 Thread Steffen Klassert
With support of async crypto operations in the GSO codepath
we have everything in place to allow GSO for local sockets.
This patch enables the GSO codepath.

Signed-off-by: Steffen Klassert 
---
 include/net/xfrm.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 76ae5c306776..80b6a1f1290e 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1873,6 +1873,8 @@ static inline bool xfrm_dst_offload_ok(struct dst_entry 
*dst)
if (!x || !x->type_offload)
return false;
 
+   if (!x->xso.offload_handle && !dst->child->xfrm)
+   return true;
if (x->xso.offload_handle && (x->xso.dev == dst->path->dev) &&
!dst->child->xfrm)
return true;
-- 
2.14.1



[PATCH RFC 0/5] Support asynchronous crypto for IPsec GSO.

2017-11-19 Thread Steffen Klassert
This patchset implements asynchronous crypto handling
in the layer 2 TX path. With this we can allow IPsec
ESP GSO for software crypto. This also merges the IPsec
GSO and non-GSO paths to both use validate_xmit_xfrm().

1) Separate ESP handling from segmentation for GRO packets.
   This unifies the IPsec GSO and non GSO codepath.

2) Add asynchronous callbacks for xfrm on layer 2. This
   adds the necessary infrastructure to core networking.

3) Allow to use the layer2 IPsec GSO codepath for software
   crypto, all infrastructure is there now.

4) Also allow IPsec GSO with software crypto for local sockets.

5) Don't require synchronous crypto fallback on IPsec offloading,
   it is not needed anymore.


[PATCH RFC 1/5] xfrm: Separate ESP handling from segmentation for GRO packets.

2017-11-19 Thread Steffen Klassert
We change the ESP GSO handlers to only segment the packets.
The ESP handling and encryption is deferred to validate_xmit_xfrm()
where this is done for non GRO packets too. This makes the code
more robust and prepares for asynchronous crypto handling.

Signed-off-by: Steffen Klassert 
---
 include/net/xfrm.h   |  6 +--
 net/core/dev.c   |  5 +--
 net/ipv4/esp4_offload.c  | 73 +++--
 net/ipv4/xfrm4_mode_tunnel.c |  5 +--
 net/ipv6/esp6_offload.c  | 80 
 net/ipv6/xfrm6_mode_tunnel.c |  5 +--
 net/xfrm/xfrm_device.c   | 87 +++-
 7 files changed, 129 insertions(+), 132 deletions(-)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index dc28a98ce97c..509e85f330c9 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1858,7 +1858,7 @@ static inline struct xfrm_offload *xfrm_offload(struct 
sk_buff *skb)
 void __net_init xfrm_dev_init(void);
 
 #ifdef CONFIG_XFRM_OFFLOAD
-int validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features);
+struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t 
features);
 int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
   struct xfrm_user_offload *xuo);
 bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x);
@@ -1897,9 +1897,9 @@ static inline void xfrm_dev_state_free(struct xfrm_state 
*x)
}
 }
 #else
-static inline int validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t 
features)
+static inline struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, 
netdev_features_t features)
 {
-   return 0;
+   return skb;
 }
 
 static inline int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, 
struct xfrm_user_offload *xuo)
diff --git a/net/core/dev.c b/net/core/dev.c
index 8ee29f4f5fa9..8434669f8f36 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3065,9 +3065,6 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff 
*skb, struct net_device
__skb_linearize(skb))
goto out_kfree_skb;
 
-   if (validate_xmit_xfrm(skb, features))
-   goto out_kfree_skb;
-
/* If packet is not checksummed and device does not
 * support checksumming for this protocol, complete
 * checksumming here.
@@ -3084,6 +3081,8 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff 
*skb, struct net_device
}
}
 
+   skb = validate_xmit_xfrm(skb, features);
+
return skb;
 
 out_kfree_skb:
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index f8b918c766b0..c359f3cfeec3 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -108,75 +108,36 @@ static void esp4_gso_encap(struct xfrm_state *x, struct 
sk_buff *skb)
 static struct sk_buff *esp4_gso_segment(struct sk_buff *skb,
netdev_features_t features)
 {
-   __u32 seq;
-   int err = 0;
-   struct sk_buff *skb2;
struct xfrm_state *x;
struct ip_esp_hdr *esph;
struct crypto_aead *aead;
-   struct sk_buff *segs = ERR_PTR(-EINVAL);
netdev_features_t esp_features = features;
struct xfrm_offload *xo = xfrm_offload(skb);
 
if (!xo)
-   goto out;
-
-   seq = xo->seq.low;
+   return ERR_PTR(-EINVAL);
 
x = skb->sp->xvec[skb->sp->len - 1];
aead = x->data;
esph = ip_esp_hdr(skb);
 
if (esph->spi != x->id.spi)
-   goto out;
+   return ERR_PTR(-EINVAL);
 
if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)))
-   goto out;
+   return ERR_PTR(-EINVAL);
 
__skb_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead));
 
skb->encap_hdr_csum = 1;
 
-   if (!(features & NETIF_F_HW_ESP))
+   if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle ||
+   (x->xso.dev != skb->dev))
esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK);
 
-   segs = x->outer_mode->gso_segment(x, skb, esp_features);
-   if (IS_ERR_OR_NULL(segs))
-   goto out;
-
-   __skb_pull(skb, skb->data - skb_mac_header(skb));
-
-   skb2 = segs;
-   do {
-   struct sk_buff *nskb = skb2->next;
-
-   xo = xfrm_offload(skb2);
-   xo->flags |= XFRM_GSO_SEGMENT;
-   xo->seq.low = seq;
-   xo->seq.hi = xfrm_replay_seqhi(x, seq);
+   xo->flags |= XFRM_GSO_SEGMENT;
 
-   if(!(features & NETIF_F_HW_ESP))
-   xo->flags |= CRYPTO_FALLBACK;
-
-   x->outer_mode->xmit(x, skb2);
-
-   err = x->type_offload->xmit(x, skb2, esp_features);
-   if (err) {
-   kfree_skb_list(segs);
-   

[PATCH RFC 3/5] xfrm: Allow to use the layer2 IPsec GSO codepath for software crypto.

2017-11-19 Thread Steffen Klassert
We now have support for asynchronous crypto operations in the layer 2 TX
path. This was the missing part to allow the GSO codepath for software
crypto, so allow this codepath now.

Signed-off-by: Steffen Klassert 
---
 net/xfrm/xfrm_device.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index ae00fda7a0dc..d04fd64b63f3 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -202,8 +202,8 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct 
xfrm_state *x)
if (!x->type_offload || x->encap)
return false;
 
-   if ((x->xso.offload_handle && (dev == dst->path->dev)) &&
-!dst->child->xfrm && x->type->get_mtu) {
+   if ((!dev || (x->xso.offload_handle && (dev == dst->path->dev))) &&
+   (!dst->child->xfrm && x->type->get_mtu)) {
mtu = x->type->get_mtu(x, xdst->child_mtu_cached);
 
if (skb->len <= mtu)
-- 
2.14.1



Re: Linux ECN Handling

2017-11-19 Thread Steve Ibanez
Hi Folks,

I wanted to check back in on this for another update and to solicit
some more suggestions. I did a bit more digging to try to isolate the
problem.

As I explained earlier, the log generated by tcp_probe indicates that
the snd_cwnd is set to 1 just before the end host receives an ECN
marked ACK and unexpectedly enters a timeout (
https://drive.google.com/open?id=1iyt8PvBxQga2jpRpBJ8KdQw3Q_mPTzZF ).
I was trying to track down where this is happening, but the only place
I could find that might be setting the snd_cwnd to 1 is in the
tcp_enter_loss() function. I inserted a printk() call in this function
to see when it is being invoked and it looks like it is only called by
the tcp_retransmit_timer() function after the timer expires.

I decided to try recording the snd_cwnd, ss-thresh, and icsk_ca_state
inside the tcp_fastretrans_alert() function whenever it processes an
ECN marked ACK (
https://drive.google.com/open?id=17GD77lb9lkCSu0_s9p40GZ5r4EU8B4VB )
This plot also shows when the tcp_retransmit_timer() and
tcp_enter_loss() functions are invoked (red and purple dots
respectively). And I see that the ACK state machine is always either
in the TCP_CA_Open or TCP_CA_CWR state whenever the
tcp_fastretrans_alert() function processes ECN marked ACKs (
https://drive.google.com/open?id=1xwuPxjgwriT9DSblFx2uILfQ95Fy-Eqq ).
So I'm not sure where the snd_cwnd is being set to 1 (or possibly 0 as
Neal suggested) just before entering a timeout. Any suggestions here?

In order to do a bit of profiling of the tcp_dctcp code I added
support into tcp_probe for recording the dctcp alpha parameter. I see
that alpha oscillates around about 0.1 when the flow rates have
converged, it goes to zero when the other host enters a timeout, and I
don't see any unexpected behavior just before the timeout (
https://drive.google.com/open?id=1zPdyS57TrUYZIekbid9p1UNyraLYrdw7 ).

So I haven't had much luck yet trying to track down where the problem
is. If you have any suggestions that would help me to focus my search
efforts, I would appreciate the comments.

Thanks!
-Steve


On Mon, Nov 6, 2017 at 3:31 PM, Steve Ibanez  wrote:
> Hi Daniel,
>
> Apologies for the delay. I tried out Neal's suggestion to printk the
> cwnd and ss-thresh in the tcp_init_cwnd_reduction(),
> tcp_cwnd_reduction(), and tcp_end_cwnd_reduction() functions in
> tcp_input.c. From what I can tell, none of these functions are setting
> the cwnd to 0.
>
> Here is the kernel log with the cwnd and ss-thresh print statements:
> https://drive.google.com/open?id=1LEWIkz64NuZN3yuDpBOAXbUfJfiju55O
> And here is the corresponding packet trace at this end host:
> https://drive.google.com/open?id=1qf4cSW3wzsiwPngcYpZY-AoBspuqONLH
> (The kernel log buffer was not large enough to capture the full
> 3-second experiment, so there's only about a second of data for the
> log buffer and about 3 seconds for the packet trace.)
>
> Here are plots of the cwnd and ss-thresh from each of the three functions:
> - tcp_init_cwnd_reduction:
> https://drive.google.com/open?id=1KOEXG2ISJQMi9c6KyPOQ6rpVUVsQwtWU
> - tcp_cwnd_reduction:
> https://drive.google.com/open?id=1awoPWC3hi4CGZt7HyuI4aAaLG1LPLwJE
> - tcp_end_cwnd_reduction:
> https://drive.google.com/open?id=1G7XUSnkX8tP7Z5XdY2O97OWj6jguQHO5
>
> Here is a plot of the measured flow rates:
> https://drive.google.com/open?id=1XwmGve10J4qa1nPE3LustK8NbvhZscac
>
> The kernel log and packet trace data was collected on the 10.0.0.3
> host. The cwnd and ss-thresh plots are from the final second or so of
> the experiment and they show two timeout events. In the first event,
> the 10.0.0.1 host times out allowing 10.0.0.3 to increase its cwnd.
> And in the second event, the 10.0.0.3 host times out causing the cwnd
> to decrease from ~100 to about ~10. The cwnd samples from tcp_probe (
> https://drive.google.com/open?id=1QCuPspLqbGoA68MKTaAh7rx2wCv3Cr_e )
> indicate that the cwnd is 1 MSS just before the timeout event, but I
> don't see that in the data collected from the tcp_*_cwnd_reduction
> functions.
>
> Here is a diff of the changes that I applied to the tcp_input.c file:
> https://drive.google.com/open?id=1k5x3AkfTr3tJhohSIcmQp-3g2yTVNMWm
>
> Are there other places in the code that you would suggest I check for
> how the cwnd and ss-thresh are changing?
>
> Thanks,
> -Steve
>
>
> On Mon, Nov 6, 2017 at 6:08 AM, Daniel Borkmann  wrote:
>> On 10/24/2017 03:11 AM, Neal Cardwell wrote:
>>>
>>> On Mon, Oct 23, 2017 at 6:15 PM, Steve Ibanez 
>>> wrote:

 Hi All,

 I upgraded the kernel on all of our machines to Linux
 4.13.8-041308-lowlatency. However, I'm still observing the same
 behavior where the source enters a timeout when the CWND=1MSS and it
 receives ECN marks.

 Here are the measured flow rates:

 

 Here are snapshots of the 

Re: [PATCH net 04/10] bpf: offload: move offload device validation out to the drivers

2017-11-19 Thread Jiri Pirko
Mon, Nov 20, 2017 at 05:55:16AM CET, jakub.kicin...@netronome.com wrote:
>With TC shared block changes we can't depend on correct netdev
>pointer being available in cls_bpf.  Move the device validation
>to the driver.  Core will only make sure that offloaded programs
>are always attached in the driver (or in HW by the driver).  We
>trust that drivers which implement offload callbacks will perform
>necessary checks.
>
>Moving the checks to the driver is generally a useful thing,
>in practice the check should be against a switchdev instance,
>not a netdev, given that most ASICs will probably allow using
>the same program on many ports.
>
>Signed-off-by: Jakub Kicinski 
>Reviewed-by: Quentin Monnet 
>Acked-by: Alexei Starovoitov 
>Acked-by: Daniel Borkmann 

Acked-by: Jiri Pirko 


Re: [PATCH net,stable] net: qmi_wwan: add Quectel BG96 2c7c:0296

2017-11-19 Thread Bjørn Mork


On November 20, 2017 5:19:21 AM GMT+01:00, ssjoh...@mac.com wrote:
>From: ssjoholm 
>
>Signed-off-by: ssjoholm 
>
>Quectel BG96 is a Qualcomm MDM9206-based IoT modem, supporting both
>CAT-M and NB-IoT. Tested hardware is BG96 mounted on Quectel
>development board (EVB).
>The USB id is added to qmi_wwan.c to allow QMI communication with the
>BG96.
>---
> drivers/net/usb/qmi_wwan.c | 1 +
> 1 file changed, 1 insertion(+)
>
>diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
>index 720a3a248070..c750cf7c042b 100644
>--- a/drivers/net/usb/qmi_wwan.c
>+++ b/drivers/net/usb/qmi_wwan.c
>@@ -1239,6 +1239,7 @@ static const struct usb_device_id products[] = {
>   {QMI_FIXED_INTF(0x1e0e, 0x9001, 5)},/* SIMCom 7230E */
>   {QMI_QUIRK_SET_DTR(0x2c7c, 0x0125, 4)}, /* Quectel EC25, EC20 R2.0 
>Mini PCIe */
>   {QMI_QUIRK_SET_DTR(0x2c7c, 0x0121, 4)}, /* Quectel EC21 Mini PCIe */
>+  {QMI_FIXED_INTF(0x2c7c, 0x0296, 4)},/* Quectel BG96 */
> 
>   /* 4. Gobi 1000 devices */
>   {QMI_GOBI1K_DEVICE(0x05c6, 0x9212)},/* Acer Gobi Modem Device */

Patch looks fine. But you need to use your full name in the tags. 
See the part about identity;
https://git-scm.com/book/en/v2/Getting-Started-First-Time-Git-Setup

And the SOB and other tags go after the rest of the commit message. Your SOB 
should always be the last line. 



Bjørn 


RE: NETDEV WATCHDOG: eth0 (dwc-eth-dwmac): transmit queue 1 timed out

2017-11-19 Thread Bhadram Varka
Hi Joao/Peppe,

Observed this issue more frequently with multi-channel case. Am I missing 
something in DT ?
Please help here to understand the issue.

Thanks,
Bhadram

-Original Message-
From: Bhadram Varka 
Sent: Thursday, November 16, 2017 9:41 AM
To: linux-netdev 
Subject: NETDEV WATCHDOG: eth0 (dwc-eth-dwmac): transmit queue 1 timed out

Hi,

I am trying to enable multi-queue in Tegra186 EQOS (which has support for 4 
channels). Observed below netdev watchdog warning. It's easily reproducible with 
an iperf test. 
In normal ping scenario this is not observed. I did not observe any issue if we 
disable TSO. Looks like issue in stmmac_tso_xmit() in multi-channel scenario.

[   88.801672] NETDEV WATCHDOG: eth0 (dwc-eth-dwmac): transmit queue 0 timed out
[   88.808818] [ cut here ]
[   88.813435] WARNING: CPU: 5 PID: 0 at net/sched/sch_generic.c:320 
dev_watchdog+0x2cc/0x2d8
[   88.821681] Modules linked in: dwmac_dwc_qos_eth stmmac_platform crc32_ce 
crct10dif_ce stmmac ip_tables x_tables ipv6
[   88.832290] CPU: 5 PID: 0 Comm: swapper/5 Tainted: G S  
4.14.0-rc7-01956-g9395db5-dirty #21
[   88.841663] Hardware name: NVIDIA Tegra186 P2771- Development Board (DT)
[   88.848697] task: 8001ec8fd400 task.stack: 09e38000
[   88.854606] PC is at dev_watchdog+0x2cc/0x2d8
[   88.858952] LR is at dev_watchdog+0x2cc/0x2d8
[   88.863300] pc : [] lr : [] pstate: 
2145
[   88.870678] sp : 0802bd80
[   88.873983] x29: 0802bd80 x28: 00a0
[   88.879287] x27:  x26: 8001eae2c3b0
[   88.884589] x25: 0005 x24: 8001ecb6be80
[   88.889891] x23: 8001eae2c39c x22: 8001eae2bfb0
[   88.895192] x21: 8001eae2c000 x20: 08fe7000
[   88.900493] x19: 0001 x18: 0010
[   88.905795] x17:  x16: 
[   88.911098] x15:  x14: 756f2064656d6974
[   88.916399] x13: 2031206575657571 x12: 08fe9df0
[   88.921699] x11: 08586180 x10: 642d6874652d6377
[   88.927000] x9 : 0016 x8 : 3a474f4448435441
[   88.932301] x7 : 572056454454454e x6 : 014f
[   88.937602] x5 : 0020 x4 : 
[   88.942902] x3 :  x2 : 08fec4c0
[   88.948203] x1 : 8001ec8fd400 x0 : 0041
[   88.953504] Call trace:
[   88.955944] Exception stack(0x0802bc40 to 0x0802bd80)
[   88.962371] bc40: 0041 8001ec8fd400 08fec4c0 

[   88.970184] bc60:  0020 014f 
572056454454454e
[   88.977998] bc80: 3a474f4448435441 0016 642d6874652d6377 
08586180
[   88.985811] bca0: 08fe9df0 2031206575657571 756f2064656d6974 

[   88.993624] bcc0:   0010 
0001
[   89.001439] bce0: 08fe7000 8001eae2c000 8001eae2bfb0 
8001eae2c39c
[   89.009252] bd00: 8001ecb6be80 0005 8001eae2c3b0 

[   89.017065] bd20: 00a0 0802bd80 0894a76c 
0802bd80
[   89.024879] bd40: 0894a76c 2145 00b67570 
0001
[   89.032693] bd60: 0001 8001ecb6b200 0802bd80 
0894a76c
[   89.040508] [] dev_watchdog+0x2cc/0x2d8
[   89.045900] [] call_timer_fn.isra.5+0x24/0x80
[   89.051809] [] expire_timers+0xa4/0xb0
[   89.057111] [] run_timer_softirq+0x140/0x170
[   89.062933] [] __do_softirq+0x12c/0x228
[   89.068323] [] irq_exit+0xd0/0x108
[   89.073278] [] __handle_domain_irq+0x60/0xb8
[   89.079098] [] gic_handle_irq+0x58/0xa8
[   89.084484] Exception stack(0x09e3be20 to 0x09e3bf60)
[   89.090910] be20:   0001 

[   89.098724] be40:  09e3bf60 8001ecffd000 
0001
[   89.106537] be60: 0002 09e3bee0 0a00 

[   89.114351] be80: 0001  001c3dfbd9959589 
1daf5b7a4860
[   89.122164] bea0: 0825b000  c0311284 
08fc5000
[   89.129978] bec0: 08fe9000 08fe9000 08fd04a0 
08fe9e90
[   89.137792] bee0:   8001ec8fd400 

[   89.145605] bf00:  09e3bf60 0808548c 
09e3bf60
[   89.153418] bf20: 08085490 0145  

[   89.161231] bf40:  081409c4 09e3bf60 
08085490
[   89.169044] [] el1_irq+0xb0/0x124
[   89.173912] [] arch_cpu_idle+0x10/0x18
[   89.179213] [] do_idle+0x120/0x1e0
[   89.184166] [] cpu_startup_entry+0x24/0x28
[   89.189814] [] secondary_start_kernel+0x110/0x120
[   89.196067] ---[ end trace 039d403d63546b77 ]---
 
Below are the DT 

[PATCH] net/9p: trans_xen: add missing MODULE_AUTHOR/DESCRIPTION/LICENSE

2017-11-19 Thread Jesse Chan
Signed-off-by: Jesse Chan 
---
 net/9p/trans_xen.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c
index 6ad3e043c617..90402e744fbf 100644
--- a/net/9p/trans_xen.c
+++ b/net/9p/trans_xen.c
@@ -543,3 +543,7 @@ static void p9_trans_xen_exit(void)
return xenbus_unregister_driver(_9pfs_front_driver);
 }
 module_exit(p9_trans_xen_exit);
+
+MODULE_AUTHOR("Stefano Stabellini ");
+MODULE_DESCRIPTION("Xen Transport for 9P");
+MODULE_LICENSE("Dual MIT/GPL");
-- 
2.14.1



Re: [PATCH net] net: accept UFO datagrams from tuntap and packet

2017-11-19 Thread David Miller
From: Jason Wang 
Date: Mon, 20 Nov 2017 11:03:54 +0800

> Looks good to me. The only concern is that whether or not stable can
> accept this patch:

Stable submission will not be a problem, please do not worry about
it :-)


[PATCH net,stable] net: qmi_wwan: add Quectel BG96 2c7c:0296

2017-11-19 Thread ssjoholm
From: ssjoholm 

Signed-off-by: ssjoholm 

Quectel BG96 is a Qualcomm MDM9206-based IoT modem, supporting both CAT-M and 
NB-IoT. Tested hardware is BG96 mounted on Quectel development board (EVB).
The USB id is added to qmi_wwan.c to allow QMI communication with the BG96.
---
 drivers/net/usb/qmi_wwan.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
index 720a3a248070..c750cf7c042b 100644
--- a/drivers/net/usb/qmi_wwan.c
+++ b/drivers/net/usb/qmi_wwan.c
@@ -1239,6 +1239,7 @@ static const struct usb_device_id products[] = {
{QMI_FIXED_INTF(0x1e0e, 0x9001, 5)},/* SIMCom 7230E */
{QMI_QUIRK_SET_DTR(0x2c7c, 0x0125, 4)}, /* Quectel EC25, EC20 R2.0  
Mini PCIe */
{QMI_QUIRK_SET_DTR(0x2c7c, 0x0121, 4)}, /* Quectel EC21 Mini PCIe */
+   {QMI_FIXED_INTF(0x2c7c, 0x0296, 4)},/* Quectel BG96 */
 
/* 4. Gobi 1000 devices */
{QMI_GOBI1K_DEVICE(0x05c6, 0x9212)},/* Acer Gobi Modem Device */
-- 
2.11.0 (Apple Git-81)



[PATCH net 01/10] bpf: offload: add comment warning developers about double destroy

2017-11-19 Thread Jakub Kicinski
Offload state may get destroyed either because the device for which
it was constructed is going away, or because the refcount of bpf
program itself has reached 0.  In both of those cases we will call
__bpf_prog_offload_destroy() to unlink the offload from the device.
We may in fact call it twice, which works just fine, but we should
make clear this is intended and caution others trying to extend the
function.

Signed-off-by: Jakub Kicinski 
Reviewed-by: Quentin Monnet 
Acked-by: Alexei Starovoitov 
Acked-by: Daniel Borkmann 
---
 kernel/bpf/offload.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 2816feb38be1..fd696d3dd429 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -85,6 +85,10 @@ static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
struct bpf_dev_offload *offload = prog->aux->offload;
struct netdev_bpf data = {};
 
+   /* Caution - if netdev is destroyed before the program, this function
+* will be called twice.
+*/
+
data.offload.prog = prog;
 
if (offload->verifier_running)
-- 
2.14.1



[PATCH net 06/10] bpf: turn bpf_prog_get_type() into a wrapper

2017-11-19 Thread Jakub Kicinski
bpf_prog_get_type() is identical to bpf_prog_get_type_dev(),
with false passed as attach_drv.  Instead of keeping it as
an exported symbol turn it into static inline wrapper.

Signed-off-by: Jakub Kicinski 
Reviewed-by: Quentin Monnet 
Acked-by: Alexei Starovoitov 
Acked-by: Daniel Borkmann 
---
 include/linux/bpf.h  | 13 ++---
 kernel/bpf/syscall.c | 10 --
 2 files changed, 6 insertions(+), 17 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f82be640731e..37bbab8c0f56 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -334,7 +334,6 @@ extern const struct bpf_verifier_ops 
tc_cls_act_analyzer_ops;
 extern const struct bpf_verifier_ops xdp_analyzer_ops;
 
 struct bpf_prog *bpf_prog_get(u32 ufd);
-struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type);
 struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
   bool attach_drv);
 struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i);
@@ -425,12 +424,6 @@ static inline struct bpf_prog *bpf_prog_get(u32 ufd)
return ERR_PTR(-EOPNOTSUPP);
 }
 
-static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
-enum bpf_prog_type type)
-{
-   return ERR_PTR(-EOPNOTSUPP);
-}
-
 static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd,
 enum bpf_prog_type type,
 bool attach_drv)
@@ -514,6 +507,12 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry 
*rcpu,
 }
 #endif /* CONFIG_BPF_SYSCALL */
 
+static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
+enum bpf_prog_type type)
+{
+   return bpf_prog_get_type_dev(ufd, type, false);
+}
+
 int bpf_prog_offload_compile(struct bpf_prog *prog);
 void bpf_prog_offload_destroy(struct bpf_prog *prog);
 u32 bpf_prog_offload_ifindex(struct bpf_prog *prog);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 38da55905ab0..41509cf825d8 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1097,16 +1097,6 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
return __bpf_prog_get(ufd, NULL, false);
 }
 
-struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
-{
-   struct bpf_prog *prog = __bpf_prog_get(ufd, &type, false);
-
-   if (!IS_ERR(prog))
-   trace_bpf_prog_get_type(prog);
-   return prog;
-}
-EXPORT_SYMBOL_GPL(bpf_prog_get_type);
-
 struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
   bool attach_drv)
 {
-- 
2.14.1



[PATCH net 00/10] bpf: offload: check netdev pointer in the drivers and namespace trouble

2017-11-19 Thread Jakub Kicinski
Hi!

This series addresses some late comments and moves checking if program
has been loaded for the correct device to the drivers.  There are also
some problems with net namespaces which I didn't take into consideration.
On the kernel side we will now simply ignore namespace moves.  Since the
user space API is not reporting any namespace identification we have to
remove the ifindex until a correct way of reporting is agreed upon.


Jakub Kicinski (10):
  bpf: offload: add comment warning developers about double destroy
  bpf: offload: limit offload to cls_bpf and xdp programs only
  bpf: offload: rename the ifindex field
  bpf: offload: move offload device validation out to the drivers
  net: xdp: don't allow device-bound programs in driver mode
  bpf: turn bpf_prog_get_type() into a wrapper
  bpf: offload: ignore namespace moves
  bpftool: revert printing program device bound info
  bpf: revert report offload info to user space
  bpf: make bpf_prog_offload_verifier_prep() static inline

 drivers/net/ethernet/netronome/nfp/bpf/offload.c | 10 --
 include/linux/bpf.h  | 18 +--
 include/linux/bpf_verifier.h |  2 +-
 include/uapi/linux/bpf.h |  8 +
 kernel/bpf/offload.c | 27 +++-
 kernel/bpf/syscall.c | 40 
 net/core/dev.c   | 14 ++---
 net/sched/cls_bpf.c  |  8 ++---
 tools/bpf/bpftool/prog.c | 31 --
 tools/include/uapi/linux/bpf.h   |  8 +
 10 files changed, 56 insertions(+), 110 deletions(-)

-- 
2.14.1



[PATCH net 03/10] bpf: offload: rename the ifindex field

2017-11-19 Thread Jakub Kicinski
prog_target_ifindex seems long and clunky, rename it to prog_ifindex.
We don't want to call this field just ifindex, because maps
may need a similar field in the future and bpf_attr members for
programs and maps are unnamed.

Signed-off-by: Jakub Kicinski 
Reviewed-by: Quentin Monnet 
Acked-by: Alexei Starovoitov 
Acked-by: Daniel Borkmann 
---
 include/uapi/linux/bpf.h   | 2 +-
 kernel/bpf/offload.c   | 2 +-
 kernel/bpf/syscall.c   | 4 ++--
 tools/include/uapi/linux/bpf.h | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e880ae6434ee..3f626df42516 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -262,7 +262,7 @@ union bpf_attr {
__u32   kern_version;   /* checked when 
prog_type=kprobe */
__u32   prog_flags;
char    prog_name[BPF_OBJ_NAME_LEN];
-   __u32   prog_target_ifindex;/* ifindex of netdev to 
prep for */
+   __u32   prog_ifindex;   /* ifindex of netdev to prep 
for */
};
 
struct { /* anonymous struct used by BPF_OBJ_* commands */
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index ac187f9ee182..a778e5df7e26 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -29,7 +29,7 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union 
bpf_attr *attr)
init_waitqueue_head(&offload->verifier_done);
 
rtnl_lock();
-   offload->netdev = __dev_get_by_index(net, attr->prog_target_ifindex);
+   offload->netdev = __dev_get_by_index(net, attr->prog_ifindex);
if (!offload->netdev) {
rtnl_unlock();
kfree(offload);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 09badc37e864..8e9d065bb7cd 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1118,7 +1118,7 @@ struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum 
bpf_prog_type type,
 EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev);
 
 /* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD prog_target_ifindex
+#define BPF_PROG_LOAD_LAST_FIELD prog_ifindex
 
 static int bpf_prog_load(union bpf_attr *attr)
 {
@@ -1181,7 +1181,7 @@ static int bpf_prog_load(union bpf_attr *attr)
atomic_set(&prog->aux->refcnt, 1);
prog->gpl_compatible = is_gpl ? 1 : 0;
 
-   if (attr->prog_target_ifindex) {
+   if (attr->prog_ifindex) {
err = bpf_prog_offload_init(prog, attr);
if (err)
goto free_prog;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index e880ae6434ee..3f626df42516 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -262,7 +262,7 @@ union bpf_attr {
__u32   kern_version;   /* checked when 
prog_type=kprobe */
__u32   prog_flags;
char    prog_name[BPF_OBJ_NAME_LEN];
-   __u32   prog_target_ifindex;/* ifindex of netdev to 
prep for */
+   __u32   prog_ifindex;   /* ifindex of netdev to prep 
for */
};
 
struct { /* anonymous struct used by BPF_OBJ_* commands */
-- 
2.14.1



[PATCH net 04/10] bpf: offload: move offload device validation out to the drivers

2017-11-19 Thread Jakub Kicinski
With TC shared block changes we can't depend on correct netdev
pointer being available in cls_bpf.  Move the device validation
to the driver.  Core will only make sure that offloaded programs
are always attached in the driver (or in HW by the driver).  We
trust that drivers which implement offload callbacks will perform
necessary checks.

Moving the checks to the driver is generally a useful thing,
in practice the check should be against a switchdev instance,
not a netdev, given that most ASICs will probably allow using
the same program on many ports.

Signed-off-by: Jakub Kicinski 
Reviewed-by: Quentin Monnet 
Acked-by: Alexei Starovoitov 
Acked-by: Daniel Borkmann 
---
 drivers/net/ethernet/netronome/nfp/bpf/offload.c | 10 --
 include/linux/bpf.h  |  4 ++--
 kernel/bpf/syscall.c | 23 ---
 net/core/dev.c   |  7 ++-
 net/sched/cls_bpf.c  |  8 +++-
 5 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c 
b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index b6cee71f49d3..bc879aeb62d4 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -214,8 +214,14 @@ int nfp_net_bpf_offload(struct nfp_net *nn, struct 
bpf_prog *prog,
 {
int err;
 
-   if (prog && !prog->aux->offload)
-   return -EINVAL;
+   if (prog) {
+   struct bpf_dev_offload *offload = prog->aux->offload;
+
+   if (!offload)
+   return -EINVAL;
+   if (offload->netdev != nn->dp.netdev)
+   return -EINVAL;
+   }
 
if (prog && old_prog) {
u8 cap;
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c397934f91dd..f82be640731e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -336,7 +336,7 @@ extern const struct bpf_verifier_ops xdp_analyzer_ops;
 struct bpf_prog *bpf_prog_get(u32 ufd);
 struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type);
 struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
-  struct net_device *netdev);
+  bool attach_drv);
 struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i);
 void bpf_prog_sub(struct bpf_prog *prog, int i);
 struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog);
@@ -433,7 +433,7 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
 
 static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd,
 enum bpf_prog_type type,
-struct net_device *netdev)
+bool attach_drv)
 {
return ERR_PTR(-EOPNOTSUPP);
 }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 8e9d065bb7cd..38da55905ab0 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1057,22 +1057,23 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog 
*prog)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
 
-static bool bpf_prog_can_attach(struct bpf_prog *prog,
-   enum bpf_prog_type *attach_type,
-   struct net_device *netdev)
+static bool bpf_prog_get_ok(struct bpf_prog *prog,
+   enum bpf_prog_type *attach_type, bool attach_drv)
 {
-   struct bpf_dev_offload *offload = prog->aux->offload;
+   /* not an attachment, just a refcount inc, always allow */
+   if (!attach_type)
+   return true;
 
if (prog->type != *attach_type)
return false;
-   if (offload && offload->netdev != netdev)
+   if (bpf_prog_is_dev_bound(prog->aux) && !attach_drv)
return false;
 
return true;
 }
 
 static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type 
*attach_type,
-  struct net_device *netdev)
+  bool attach_drv)
 {
struct fd f = fdget(ufd);
struct bpf_prog *prog;
@@ -1080,7 +1081,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum 
bpf_prog_type *attach_type,
prog = bpf_prog_get(f);
if (IS_ERR(prog))
return prog;
-   if (attach_type && !bpf_prog_can_attach(prog, attach_type, netdev)) {
+   if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) {
prog = ERR_PTR(-EINVAL);
goto out;
}
@@ -1093,12 +1094,12 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum 
bpf_prog_type *attach_type,
 
 struct bpf_prog *bpf_prog_get(u32 ufd)
 {
-   return __bpf_prog_get(ufd, NULL, NULL);
+   

[PATCH net 08/10] bpftool: revert printing program device bound info

2017-11-19 Thread Jakub Kicinski
This reverts commit 928631e05495 ("bpftool: print program device bound
info").  We will remove this API and redo it right in -next.

Signed-off-by: Jakub Kicinski 
---
 tools/bpf/bpftool/prog.c   | 31 ---
 tools/include/uapi/linux/bpf.h |  6 --
 2 files changed, 37 deletions(-)

diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index f45c44ef9bec..ad619b96c276 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -41,7 +41,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 
@@ -230,21 +229,6 @@ static void print_prog_json(struct bpf_prog_info *info, 
int fd)
 info->tag[0], info->tag[1], info->tag[2], info->tag[3],
 info->tag[4], info->tag[5], info->tag[6], info->tag[7]);
 
-   if (info->status & BPF_PROG_STATUS_DEV_BOUND) {
-   jsonw_name(json_wtr, "dev");
-   if (info->ifindex) {
-   char name[IF_NAMESIZE];
-
-   if (!if_indextoname(info->ifindex, name))
-   jsonw_printf(json_wtr, "\"ifindex:%d\"",
-info->ifindex);
-   else
-   jsonw_printf(json_wtr, "\"%s\"", name);
-   } else {
-   jsonw_printf(json_wtr, "\"unknown\"");
-   }
-   }
-
if (info->load_time) {
char buf[32];
 
@@ -302,21 +286,6 @@ static void print_prog_plain(struct bpf_prog_info *info, 
int fd)
 
printf("tag ");
fprint_hex(stdout, info->tag, BPF_TAG_SIZE, "");
-   printf(" ");
-
-   if (info->status & BPF_PROG_STATUS_DEV_BOUND) {
-   printf("dev ");
-   if (info->ifindex) {
-   char name[IF_NAMESIZE];
-
-   if (!if_indextoname(info->ifindex, name))
-   printf("ifindex:%d ", info->ifindex);
-   else
-   printf("%s ", name);
-   } else {
-   printf("unknown ");
-   }
-   }
printf("\n");
 
if (info->load_time) {
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 3f626df42516..4c223ab30293 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -897,10 +897,6 @@ enum sk_action {
 
 #define BPF_TAG_SIZE   8
 
-enum bpf_prog_status {
-   BPF_PROG_STATUS_DEV_BOUND   = (1 << 0),
-};
-
 struct bpf_prog_info {
__u32 type;
__u32 id;
@@ -914,8 +910,6 @@ struct bpf_prog_info {
__u32 nr_map_ids;
__aligned_u64 map_ids;
char name[BPF_OBJ_NAME_LEN];
-   __u32 ifindex;
-   __u32 status;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
-- 
2.14.1



[PATCH net 09/10] bpf: revert report offload info to user space

2017-11-19 Thread Jakub Kicinski
This reverts commit bd601b6ada11 ("bpf: report offload info to user
space").  The ifindex by itself is not sufficient, we should provide
information on which network namespace this ifindex belongs to.
After considering some options we concluded that it's best to just
remove this API for now, and rework it in -next.

Signed-off-by: Jakub Kicinski 
---
 include/linux/bpf.h  |  1 -
 include/uapi/linux/bpf.h |  6 --
 kernel/bpf/offload.c | 12 
 kernel/bpf/syscall.c |  5 -
 4 files changed, 24 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 37bbab8c0f56..76c577281d78 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -515,7 +515,6 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
 
 int bpf_prog_offload_compile(struct bpf_prog *prog);
 void bpf_prog_offload_destroy(struct bpf_prog *prog);
-u32 bpf_prog_offload_ifindex(struct bpf_prog *prog);
 
 #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
 int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 3f626df42516..4c223ab30293 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -897,10 +897,6 @@ enum sk_action {
 
 #define BPF_TAG_SIZE   8
 
-enum bpf_prog_status {
-   BPF_PROG_STATUS_DEV_BOUND   = (1 << 0),
-};
-
 struct bpf_prog_info {
__u32 type;
__u32 id;
@@ -914,8 +910,6 @@ struct bpf_prog_info {
__u32 nr_map_ids;
__aligned_u64 map_ids;
char name[BPF_OBJ_NAME_LEN];
-   __u32 ifindex;
-   __u32 status;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index d4267c674fec..68ec884440b7 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -149,18 +149,6 @@ int bpf_prog_offload_compile(struct bpf_prog *prog)
return bpf_prog_offload_translate(prog);
 }
 
-u32 bpf_prog_offload_ifindex(struct bpf_prog *prog)
-{
-   struct bpf_dev_offload *offload = prog->aux->offload;
-   u32 ifindex;
-
-   rtnl_lock();
-   ifindex = offload->netdev ? offload->netdev->ifindex : 0;
-   rtnl_unlock();
-
-   return ifindex;
-}
-
 const struct bpf_prog_ops bpf_offload_prog_ops = {
 };
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 41509cf825d8..2c4cfeaa8d5e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1616,11 +1616,6 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
return -EFAULT;
}
 
-   if (bpf_prog_is_dev_bound(prog->aux)) {
-   info.status |= BPF_PROG_STATUS_DEV_BOUND;
-   info.ifindex = bpf_prog_offload_ifindex(prog);
-   }
-
 done:
if (copy_to_user(uinfo, &info, info_len) ||
put_user(info_len, &uattr->info.info_len))
-- 
2.14.1



[PATCH net 02/10] bpf: offload: limit offload to cls_bpf and xdp programs only

2017-11-19 Thread Jakub Kicinski
We are currently only allowing attachment of device-bound
cls_bpf and XDP programs.  Make this restriction explicit in
the BPF offload code.  This way we can potentially reuse the
ifindex field in the future.

Since XDP and cls_bpf programs can only be loaded by admin,
we can drop the explicit capability check from offload code.

Signed-off-by: Jakub Kicinski 
Reviewed-by: Quentin Monnet 
Acked-by: Alexei Starovoitov 
Acked-by: Daniel Borkmann 
---
 kernel/bpf/offload.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index fd696d3dd429..ac187f9ee182 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -14,8 +14,9 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union 
bpf_attr *attr)
struct net *net = current->nsproxy->net_ns;
struct bpf_dev_offload *offload;
 
-   if (!capable(CAP_SYS_ADMIN))
-   return -EPERM;
+   if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS &&
+   attr->prog_type != BPF_PROG_TYPE_XDP)
+   return -EINVAL;
 
if (attr->prog_flags)
return -EINVAL;
-- 
2.14.1



[PATCH net 05/10] net: xdp: don't allow device-bound programs in driver mode

2017-11-19 Thread Jakub Kicinski
Currently device-bound programs are not able to run on the host
to save resources (host JIT is not invoked).  Don't allow XDP
programs to be attached without the HW_MODE flag.  In theory
if program is already translated for device offload the driver
should choose to offload it instead of loading it in the driver.
However, offloading translated program may still fail resulting
in device-bound program being run on the host.

Prevent this by refusing to attach device bound programs if
XDP_FLAGS_HW_MODE is not set.

Signed-off-by: Jakub Kicinski 
Reviewed-by: Quentin Monnet 
Acked-by: Alexei Starovoitov 
Acked-by: Daniel Borkmann 
---
 net/core/dev.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/net/core/dev.c b/net/core/dev.c
index 09525a27319c..21de2d37a0ba 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7143,6 +7143,13 @@ int dev_change_xdp_fd(struct net_device *dev, struct 
netlink_ext_ack *extack,
 bpf_op == ops->ndo_bpf);
if (IS_ERR(prog))
return PTR_ERR(prog);
+
+   if (!(flags & XDP_FLAGS_HW_MODE) &&
+   bpf_prog_is_dev_bound(prog->aux)) {
+   NL_SET_ERR_MSG_MOD(extack, "using device-bound program 
without HW_MODE flag not supported");
+   bpf_prog_put(prog);
+   return -EINVAL;
+   }
}
 
err = dev_xdp_install(dev, bpf_op, extack, flags, prog);
-- 
2.14.1



[PATCH net 07/10] bpf: offload: ignore namespace moves

2017-11-19 Thread Jakub Kicinski
We are currently destroying the device offload state when device
moves to another net namespace.  This doesn't break with current
NFP code, because offload state is not used on program removal,
but it's not correct behaviour.

Ignore the device unregister notifications on namespace move.

Signed-off-by: Jakub Kicinski 
---
 kernel/bpf/offload.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index a778e5df7e26..d4267c674fec 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -174,6 +174,10 @@ static int bpf_offload_notification(struct notifier_block 
*notifier,
 
switch (event) {
case NETDEV_UNREGISTER:
+   /* ignore namespace changes */
+   if (netdev->reg_state != NETREG_UNREGISTERING)
+   break;
+
list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs,
 offloads) {
if (offload->netdev == netdev)
-- 
2.14.1



[PATCH net 10/10] bpf: make bpf_prog_offload_verifier_prep() static inline

2017-11-19 Thread Jakub Kicinski
Header implementation of bpf_prog_offload_verifier_prep() which
is used if CONFIG_NET=n should be a static inline.

Signed-off-by: Jakub Kicinski 
---
 include/linux/bpf_verifier.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 07b96aaca256..b61482d354a2 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -171,7 +171,7 @@ static inline struct bpf_reg_state *cur_regs(struct 
bpf_verifier_env *env)
 #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
 int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env);
 #else
-int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env)
+static inline int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env)
 {
return -EOPNOTSUPP;
 }
-- 
2.14.1



[PATCH iproute2] iproute2: fixes to compile on some systems.

2017-11-19 Thread Lorenzo Colitti
1. Put the declarations of strlcpy and strlcat inside
   an #ifdef NEED_STRLCPY. Their declarations were already in a
   similar #ifdef.
2. In bpf_scm.h, include sys/un.h for struct sockaddr_un.
3. In utils.h, include time.h for struct timeval.

Tested: builds on ubuntu 14.04 with "make clean distclean; ./configure && make 
-j64"
Tested: 4.14.1 builds on Android with Android-specific #ifndefs for missing 
library code
Signed-off-by: Lorenzo Colitti 
---
 include/bpf_scm.h | 1 +
 include/utils.h   | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/include/bpf_scm.h b/include/bpf_scm.h
index 35117d11ee..122d59fc43 100644
--- a/include/bpf_scm.h
+++ b/include/bpf_scm.h
@@ -3,6 +3,7 @@
 
 #include 
 #include 
+#include <sys/un.h>
 
 #include "utils.h"
 #include "bpf_elf.h"
diff --git a/include/utils.h b/include/utils.h
index 3d91c50db0..10749fbee1 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -6,6 +6,7 @@
 #include 
 #include 
 #include 
+#include <time.h>
 
 #include "libnetlink.h"
 #include "ll_map.h"
@@ -256,7 +257,9 @@ int make_path(const char *path, mode_t mode);
 char *find_cgroup2_mount(void);
 int get_command_name(const char *pid, char *comm, size_t len);
 
+#ifdef NEED_STRLCPY
 size_t strlcpy(char *dst, const char *src, size_t size);
 size_t strlcat(char *dst, const char *src, size_t size);
+#endif
 
 #endif /* __UTILS_H__ */
-- 
2.15.0.448.gf294e3d99a-goog



[PATCHv2 net-next 1/1] forcedeth: replace pci_unmap_page with dma_unmap_page

2017-11-19 Thread Zhu Yanjun
The function pci_unmap_page is obsolete. So it is replaced with
the function dma_unmap_page.

CC: Srinivas Eeda 
CC: Joe Jin 
CC: Junxiao Bi 
Signed-off-by: Zhu Yanjun 
---
V1->V2: fix direction flag error.
---
 drivers/net/ethernet/nvidia/forcedeth.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/nvidia/forcedeth.c 
b/drivers/net/ethernet/nvidia/forcedeth.c
index ac8439c..481876b 100644
--- a/drivers/net/ethernet/nvidia/forcedeth.c
+++ b/drivers/net/ethernet/nvidia/forcedeth.c
@@ -1986,9 +1986,9 @@ static void nv_unmap_txskb(struct fe_priv *np, struct 
nv_skb_map *tx_skb)
 tx_skb->dma_len,
 DMA_TO_DEVICE);
else
-   pci_unmap_page(np->pci_dev, tx_skb->dma,
+   dma_unmap_page(&np->pci_dev->dev, tx_skb->dma,
   tx_skb->dma_len,
-  PCI_DMA_TODEVICE);
+  DMA_TO_DEVICE);
tx_skb->dma = 0;
}
 }
-- 
2.7.4



Re: [PATCH net] net: accept UFO datagrams from tuntap and packet

2017-11-19 Thread Jason Wang



On 2017年11月18日 06:59, Willem de Bruijn wrote:

From: Willem de Bruijn

Tuntap and similar devices can inject GSO packets. Accept type
VIRTIO_NET_HDR_GSO_UDP, even though not generating UFO natively.

Processes are expected to use feature negotiation such as TUNSETOFFLOAD
to detect supported offload types and refrain from injecting other
packets. This process breaks down with live migration: guest kernels
do not renegotiate flags, so destination hosts need to expose all
features that the source host does.

Partially revert the UFO removal from 182e0b6b5846~1..d9d30adf5677.
This patch introduces nearly(*) no new code to simplify verification.
It brings back verbatim tuntap UFO negotiation, VIRTIO_NET_HDR_GSO_UDP
insertion and software UFO segmentation.

It does not reinstate protocol stack support, hardware offload
(NETIF_F_UFO), SKB_GSO_UDP tunneling in SKB_GSO_SOFTWARE or reception
of VIRTIO_NET_HDR_GSO_UDP packets in tuntap.

To support SKB_GSO_UDP reappearing in the stack, also reinstate
logic in act_csum and openvswitch. Achieve equivalence with v4.13 HEAD
by squashing in commit 939912216fa8 ("net: skb_needs_check() removes
CHECKSUM_UNNECESSARY check for tx.") and reverting commit 8d63bee643f1
("net: avoid skb_warn_bad_offload false positives on UFO").

(*) To avoid having to bring back skb_shinfo(skb)->ip6_frag_id,
ipv6_proxy_select_ident is changed to return a __be32, which is
assigned directly to the frag_hdr. Also, SKB_GSO_UDP is inserted
at the end of the enum to minimize code churn.


Re: [PATCH net-next 1/1] forcedeth: replace pci_unmap_page with dma_unmap_page

2017-11-19 Thread David Miller
From: Zhu Yanjun 
Date: Sun, 19 Nov 2017 09:05:31 -0500

> @@ -1986,7 +1986,7 @@ static void nv_unmap_txskb(struct fe_priv *np, struct 
> nv_skb_map *tx_skb)
>tx_skb->dma_len,
>DMA_TO_DEVICE);
>   else
> - pci_unmap_page(np->pci_dev, tx_skb->dma,
> + dma_unmap_page(&np->pci_dev->dev, tx_skb->dma,
>  tx_skb->dma_len,
>  PCI_DMA_TODEVICE);

So you're going to call a dma_*() function using a PCI_DMA_* direction
flag?

Please correct this.

Thank you.


Re: [PATCH net] net: ena: fix race condition between device reset and link up setup

2017-11-19 Thread David Miller
From: 
Date: Sun, 19 Nov 2017 18:03:40 +

> From: Netanel Belgazal 
> 
> In rare cases, ena driver would reset and re-start the device,
> for example, in case of misbehaving application that causes
> transmit timeout
> 
> The first step in the reset procedure is to stop the Tx traffic by
> calling ena_carrier_off().
> 
> After the driver has just started the device reset procedure, device
> happens to send an asynchronous notification (via AENQ) to the driver
> that there was a link change (to link-up state).
> This link change is mapped to a call to netif_carrier_on() which
> re-activates the Tx queues, violating the assumption of no tx traffic
> until device reset is completed, as the reset task might still be in
> the process of queues initialization, leading to an access to
> uninitialized memory.
> 
> Signed-off-by: Netanel Belgazal 

Applied.


Re: [PATCH] net: vxge: Fix some indentation issues

2017-11-19 Thread David Miller
From: Christophe JAILLET 
Date: Sun, 19 Nov 2017 13:41:33 +0100

> Some statements are not enough or too much indented.
> Fix it to improve readability.
> 
> Signed-off-by: Christophe JAILLET 

Applied.


Re: [kernel-hardening] [PATCH v4] scripts: add leaking_addresses.pl

2017-11-19 Thread Tobin C. Harding
On Fri, Nov 10, 2017 at 07:26:34PM +0530, kaiwan.billimo...@gmail.com wrote:
> On Tue, 2017-11-07 at 21:32 +1100, Tobin C. Harding wrote:
[snip]

> Finally, unsure if am working against the latest ver of your script Tobin, 
> apologies if not.

The latest version of leaking_addresses.pl is now in Linus' tree.

thanks,
Tobin.


Re: [patch net-next v2 2/4] net: sched: introduce per-egress action device callbacks

2017-11-19 Thread Jiri Pirko
Sun, Nov 19, 2017 at 08:37:49PM CET, manish.cho...@cavium.com wrote:
>> -Original Message-
>> From: netdev-ow...@vger.kernel.org [mailto:netdev-ow...@vger.kernel.org]
>> On Behalf Of Jiri Pirko
>> Sent: Wednesday, October 11, 2017 1:11 PM
>> To: netdev@vger.kernel.org
>> Cc: da...@davemloft.net; j...@mojatatu.com; xiyou.wangc...@gmail.com;
>> sae...@mellanox.com; mat...@mellanox.com; leo...@mellanox.com;
>> ml...@mellanox.com; david.lai...@aculab.com; gerlitz...@gmail.com
>> Subject: [patch net-next v2 2/4] net: sched: introduce per-egress action 
>> device
>> callbacks
>> 
>> From: Jiri Pirko 
>> 
>> Introduce infrastructure that allows drivers to register callbacks that are 
>> called
>> whenever tc would offload inserted rule and specified device acts as tc 
>> action
>> egress device.
>> 
>> Signed-off-by: Jiri Pirko 
>> ---
>> v1->v2:
>> - take rtnl for register/unregister
>> ---
>>  include/net/act_api.h |  34 
>>  include/net/pkt_cls.h |   2 +
>>  net/sched/act_api.c   | 220
>> ++
>>  net/sched/cls_api.c   |  30 +++
>>  4 files changed, 286 insertions(+)
>> 
>> diff --git a/include/net/act_api.h b/include/net/act_api.h index
>> 900168a..f5e8c90 100644
>> --- a/include/net/act_api.h
>> +++ b/include/net/act_api.h
>> @@ -174,4 +174,38 @@ static inline void tcf_action_stats_update(struct
>> tc_action *a, u64 bytes,  #endif  }
>> 
>> +typedef int tc_setup_cb_t(enum tc_setup_type type,
>> +  void *type_data, void *cb_priv);
>> +
>> +#ifdef CONFIG_NET_CLS_ACT
>> +int tc_setup_cb_egdev_register(const struct net_device *dev,
>> +   tc_setup_cb_t *cb, void *cb_priv); void
>> +tc_setup_cb_egdev_unregister(const struct net_device *dev,
>> +  tc_setup_cb_t *cb, void *cb_priv); int
>> +tc_setup_cb_egdev_call(const struct net_device *dev,
>> +   enum tc_setup_type type, void *type_data,
>> +   bool err_stop);
>> +#else
>> +static inline
>> +int tc_setup_cb_egdev_register(const struct net_device *dev,
>> +   tc_setup_cb_t *cb, void *cb_priv) {
>> +return 0;
>> +}
>> +
>> +static inline
>> +void tc_setup_cb_egdev_unregister(const struct net_device *dev,
>> +  tc_setup_cb_t *cb, void *cb_priv) { }
>> +
>> +static inline
>> +int tc_setup_cb_egdev_call(const struct net_device *dev,
>> +   enum tc_setup_type type, void *type_data,
>> +   bool err_stop)
>> +{
>> +return 0;
>> +}
>> +#endif
>> +
>>  #endif
>> diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index
>> e80edd8..6f8149c 100644
>> --- a/include/net/pkt_cls.h
>> +++ b/include/net/pkt_cls.h
>> @@ -206,6 +206,8 @@ int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts
>> *exts);  int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts 
>> *exts);  int
>> tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts,
>>   struct net_device **hw_dev);
>> +int tcf_exts_egdev_cb_call(struct tcf_exts *exts, enum tc_setup_type type,
>> +   void *type_data, bool err_stop);
>> 
>>  /**
>>   * struct tcf_pkt_info - packet information diff --git a/net/sched/act_api.c
>> b/net/sched/act_api.c index da6fa82..ac97db9 100644
>> --- a/net/sched/act_api.c
>> +++ b/net/sched/act_api.c
>> @@ -21,6 +21,8 @@
>>  #include 
>>  #include 
>>  #include 
>> +#include 
>> +#include 
>>  #include 
>>  #include 
>>  #include 
>> @@ -1249,8 +1251,226 @@ static int tc_dump_action(struct sk_buff *skb,
>> struct netlink_callback *cb)
>>  return skb->len;
>>  }
>> 
>> +struct tcf_action_net {
>> +struct rhashtable egdev_ht;
>> +};
>> +
>> +static unsigned int tcf_action_net_id;
>> +
>> +struct tcf_action_egdev_cb {
>> +struct list_head list;
>> +tc_setup_cb_t *cb;
>> +void *cb_priv;
>> +};
>> +
>> +struct tcf_action_egdev {
>> +struct rhash_head ht_node;
>> +const struct net_device *dev;
>> +unsigned int refcnt;
>> +struct list_head cb_list;
>> +};
>> +
>> +static const struct rhashtable_params tcf_action_egdev_ht_params = {
>> +.key_offset = offsetof(struct tcf_action_egdev, dev),
>> +.head_offset = offsetof(struct tcf_action_egdev, ht_node),
>> +.key_len = sizeof(const struct net_device *), };
>> +
>> +static struct tcf_action_egdev *
>> +tcf_action_egdev_lookup(const struct net_device *dev) {
>> +struct net *net = dev_net(dev);
>> +struct tcf_action_net *tan = net_generic(net, tcf_action_net_id);
>> +
>> +return rhashtable_lookup_fast(&tan->egdev_ht, &dev,
>> +  tcf_action_egdev_ht_params);
>> +}
>> +
>> +static struct tcf_action_egdev *
>> +tcf_action_egdev_get(const struct net_device *dev) {
>> +struct tcf_action_egdev *egdev;
>> +struct tcf_action_net *tan;
>> +
>> +egdev = tcf_action_egdev_lookup(dev);
>> +if (egdev)
>> +  

RE: [patch net-next v2 2/4] net: sched: introduce per-egress action device callbacks

2017-11-19 Thread Chopra, Manish
> -Original Message-
> From: netdev-ow...@vger.kernel.org [mailto:netdev-ow...@vger.kernel.org]
> On Behalf Of Jiri Pirko
> Sent: Wednesday, October 11, 2017 1:11 PM
> To: netdev@vger.kernel.org
> Cc: da...@davemloft.net; j...@mojatatu.com; xiyou.wangc...@gmail.com;
> sae...@mellanox.com; mat...@mellanox.com; leo...@mellanox.com;
> ml...@mellanox.com; david.lai...@aculab.com; gerlitz...@gmail.com
> Subject: [patch net-next v2 2/4] net: sched: introduce per-egress action 
> device
> callbacks
> 
> From: Jiri Pirko 
> 
> Introduce infrastructure that allows drivers to register callbacks that are 
> called
> whenever tc would offload inserted rule and specified device acts as tc action
> egress device.
> 
> Signed-off-by: Jiri Pirko 
> ---
> v1->v2:
> - take rtnl for register/unregister
> ---
>  include/net/act_api.h |  34 
>  include/net/pkt_cls.h |   2 +
>  net/sched/act_api.c   | 220
> ++
>  net/sched/cls_api.c   |  30 +++
>  4 files changed, 286 insertions(+)
> 
> diff --git a/include/net/act_api.h b/include/net/act_api.h index
> 900168a..f5e8c90 100644
> --- a/include/net/act_api.h
> +++ b/include/net/act_api.h
> @@ -174,4 +174,38 @@ static inline void tcf_action_stats_update(struct
> tc_action *a, u64 bytes,  #endif  }
> 
> +typedef int tc_setup_cb_t(enum tc_setup_type type,
> +   void *type_data, void *cb_priv);
> +
> +#ifdef CONFIG_NET_CLS_ACT
> +int tc_setup_cb_egdev_register(const struct net_device *dev,
> +tc_setup_cb_t *cb, void *cb_priv); void
> +tc_setup_cb_egdev_unregister(const struct net_device *dev,
> +   tc_setup_cb_t *cb, void *cb_priv); int
> +tc_setup_cb_egdev_call(const struct net_device *dev,
> +enum tc_setup_type type, void *type_data,
> +bool err_stop);
> +#else
> +static inline
> +int tc_setup_cb_egdev_register(const struct net_device *dev,
> +tc_setup_cb_t *cb, void *cb_priv) {
> + return 0;
> +}
> +
> +static inline
> +void tc_setup_cb_egdev_unregister(const struct net_device *dev,
> +   tc_setup_cb_t *cb, void *cb_priv) { }
> +
> +static inline
> +int tc_setup_cb_egdev_call(const struct net_device *dev,
> +enum tc_setup_type type, void *type_data,
> +bool err_stop)
> +{
> + return 0;
> +}
> +#endif
> +
>  #endif
> diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index
> e80edd8..6f8149c 100644
> --- a/include/net/pkt_cls.h
> +++ b/include/net/pkt_cls.h
> @@ -206,6 +206,8 @@ int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts
> *exts);  int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts); 
>  int
> tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts,
>struct net_device **hw_dev);
> +int tcf_exts_egdev_cb_call(struct tcf_exts *exts, enum tc_setup_type type,
> +void *type_data, bool err_stop);
> 
>  /**
>   * struct tcf_pkt_info - packet information diff --git a/net/sched/act_api.c
> b/net/sched/act_api.c index da6fa82..ac97db9 100644
> --- a/net/sched/act_api.c
> +++ b/net/sched/act_api.c
> @@ -21,6 +21,8 @@
>  #include 
>  #include 
>  #include 
> +#include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -1249,8 +1251,226 @@ static int tc_dump_action(struct sk_buff *skb,
> struct netlink_callback *cb)
>   return skb->len;
>  }
> 
> +struct tcf_action_net {
> + struct rhashtable egdev_ht;
> +};
> +
> +static unsigned int tcf_action_net_id;
> +
> +struct tcf_action_egdev_cb {
> + struct list_head list;
> + tc_setup_cb_t *cb;
> + void *cb_priv;
> +};
> +
> +struct tcf_action_egdev {
> + struct rhash_head ht_node;
> + const struct net_device *dev;
> + unsigned int refcnt;
> + struct list_head cb_list;
> +};
> +
> +static const struct rhashtable_params tcf_action_egdev_ht_params = {
> + .key_offset = offsetof(struct tcf_action_egdev, dev),
> + .head_offset = offsetof(struct tcf_action_egdev, ht_node),
> + .key_len = sizeof(const struct net_device *), };
> +
> +static struct tcf_action_egdev *
> +tcf_action_egdev_lookup(const struct net_device *dev) {
> + struct net *net = dev_net(dev);
> + struct tcf_action_net *tan = net_generic(net, tcf_action_net_id);
> +
> + return rhashtable_lookup_fast(&tan->egdev_ht, &dev,
> +   tcf_action_egdev_ht_params);
> +}
> +
> +static struct tcf_action_egdev *
> +tcf_action_egdev_get(const struct net_device *dev) {
> + struct tcf_action_egdev *egdev;
> + struct tcf_action_net *tan;
> +
> + egdev = tcf_action_egdev_lookup(dev);
> + if (egdev)
> + goto inc_ref;
> +
> + egdev = kzalloc(sizeof(*egdev), GFP_KERNEL);
> + if (!egdev)
> + return NULL;
> + INIT_LIST_HEAD(&egdev->cb_list);
> + tan = 

[PATCH net] net: ena: fix race condition between device reset and link up setup

2017-11-19 Thread netanel
From: Netanel Belgazal 

In rare cases, ena driver would reset and re-start the device,
for example, in case of misbehaving application that causes
transmit timeout.

The first step in the reset procedure is to stop the Tx traffic by
calling ena_carrier_off().

After the driver has just started the device reset procedure, device
happens to send an asynchronous notification (via AENQ) to the driver
that there was a link change (to link-up state).
This link change is mapped to a call to netif_carrier_on() which
re-activates the Tx queues, violating the assumption of no tx traffic
until device reset is completed, as the reset task might still be in
the process of queues initialization, leading to an access to
uninitialized memory.

Signed-off-by: Netanel Belgazal 
---
 drivers/net/ethernet/amazon/ena/ena_netdev.c | 11 +--
 drivers/net/ethernet/amazon/ena/ena_netdev.h |  3 ++-
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c 
b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index 5417e4da64ca..988d0383b4e7 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -2579,6 +2579,7 @@ static int ena_restore_device(struct ena_adapter *adapter)
bool wd_state;
int rc;
 
+   set_bit(ENA_FLAG_ONGOING_RESET, >flags);
rc = ena_device_init(ena_dev, adapter->pdev, _feat_ctx, _state);
if (rc) {
dev_err(>dev, "Can not initialize device\n");
@@ -2592,6 +2593,11 @@ static int ena_restore_device(struct ena_adapter 
*adapter)
goto err_device_destroy;
}
 
+   clear_bit(ENA_FLAG_ONGOING_RESET, >flags);
+   /* Make sure we don't have a race with AENQ Links state handler */
+   if (test_bit(ENA_FLAG_LINK_UP, >flags))
+   netif_carrier_on(adapter->netdev);
+
rc = ena_enable_msix_and_set_admin_interrupts(adapter,
  adapter->num_queues);
if (rc) {
@@ -2618,7 +2624,7 @@ static int ena_restore_device(struct ena_adapter *adapter)
ena_com_admin_destroy(ena_dev);
 err:
clear_bit(ENA_FLAG_DEVICE_RUNNING, >flags);
-
+   clear_bit(ENA_FLAG_ONGOING_RESET, >flags);
dev_err(>dev,
"Reset attempt failed. Can not reset the device\n");
 
@@ -3495,7 +3501,8 @@ static void ena_update_on_link_change(void *adapter_data,
if (status) {
netdev_dbg(adapter->netdev, "%s\n", __func__);
set_bit(ENA_FLAG_LINK_UP, >flags);
-   netif_carrier_on(adapter->netdev);
+   if (!test_bit(ENA_FLAG_ONGOING_RESET, >flags))
+   netif_carrier_on(adapter->netdev);
} else {
clear_bit(ENA_FLAG_LINK_UP, >flags);
netif_carrier_off(adapter->netdev);
diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.h 
b/drivers/net/ethernet/amazon/ena/ena_netdev.h
index ed8bd0a579c4..3bbc003871de 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.h
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.h
@@ -272,7 +272,8 @@ enum ena_flags_t {
ENA_FLAG_DEV_UP,
ENA_FLAG_LINK_UP,
ENA_FLAG_MSIX_ENABLED,
-   ENA_FLAG_TRIGGER_RESET
+   ENA_FLAG_TRIGGER_RESET,
+   ENA_FLAG_ONGOING_RESET
 };
 
 /* adapter specific private data structure */
-- 
2.7.3.AMZN



Re: [PATCH] net: sched: fix crash when deleting secondary chains

2017-11-19 Thread Jiri Pirko
Sun, Nov 19, 2017 at 12:44:05PM CET, c...@rkapl.cz wrote:
>If you flush (delete) a filter chain other than chain 0 (such as when
>deleting the device), the kernel may run into a use-after-free. The
>chain refcount must not be decremented unless we are sure we are done
>with the chain.
>
>To reproduce the bug, run:
>ip link add dtest type dummy
>tc qdisc add dev dtest ingress
>tc filter add dev dtest chain 1  parent : flower
>ip link del dtest
>
>Introduced in: commit f93e1cdcf42c ("net/sched: fix filter flushing"),
>but unless you have KAsan or luck, you won't notice it until
>commit 0dadc117ac8b ("cls_flower: use tcf_exts_get_net() before call_rcu()")


Patch looks fine. Please repost with proper "Fixes: " tag.
Feel free to add my:
Acked-by: Jiri Pirko 
tag.

Thanks!



>
>Signed-off-by: Roman Kapl 
>---
> net/sched/cls_api.c | 7 ---
> 1 file changed, 4 insertions(+), 3 deletions(-)
>
>diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
>index ab255b421781..7d97f612c9b9 100644
>--- a/net/sched/cls_api.c
>+++ b/net/sched/cls_api.c
>@@ -205,13 +205,14 @@ static void tcf_chain_head_change(struct tcf_chain 
>*chain,
> 
> static void tcf_chain_flush(struct tcf_chain *chain)
> {
>-  struct tcf_proto *tp;
>+  struct tcf_proto *tp = rtnl_dereference(chain->filter_chain);
> 
>   tcf_chain_head_change(chain, NULL);
>-  while ((tp = rtnl_dereference(chain->filter_chain)) != NULL) {
>+  while (tp) {
>   RCU_INIT_POINTER(chain->filter_chain, tp->next);
>-  tcf_chain_put(chain);
>   tcf_proto_destroy(tp);
>+  tp = rtnl_dereference(chain->filter_chain);
>+  tcf_chain_put(chain);
>   }
> }
> 
>-- 
>2.15.0
>


Re: [patch net-next RFC v2 08/11] mlxsw: spectrum_dpipe: Connect dpipe tables to resources

2017-11-19 Thread David Ahern
On 11/19/17 2:16 AM, Arkadi Sharshevsky wrote:
> 
> 
> On 11/18/2017 09:19 PM, David Ahern wrote:
>> On 11/14/17 9:18 AM, Jiri Pirko wrote:
>>> From: Arkadi Sharshevsky 
>>>
>>> Connect current dpipe tables to resources. The tables are connected
>>> in the following fashion:
>>> 1. IPv4 host - KVD hash single
>>> 2. IPv6 host - KVD hash double
>>> 3. Adjacency - KVD linear
>>
>> Those descriptions would be helpful to the user. A description attribute
>> for the resources?
>>
> 
> As described in the cover letter, these resources are used by the
> majority of the ASICs lookup processes. So currently there is a one
> to one mapping, but it should increase as more tables are exposed,
> so I don't think it's a good idea to maintain such an attribute.
> 

'IPv4 host' yes, but I mean the term 'KVD hash single'? Is it the same
across all h/w vendors? I have only seen that in the context of MLX. If
it is a MLX term then a description to the user that KVD hash single ==
IPv4 host is warranted.


Re: [patch net-next RFC v2 02/11] devlink: Add support for resource abstraction

2017-11-19 Thread David Ahern
On 11/19/17 1:17 AM, Arkadi Sharshevsky wrote:
> 
> 
> On 11/18/2017 08:34 PM, David Ahern wrote:
>> On 11/14/17 9:18 AM, Jiri Pirko wrote:
>>> diff --git a/include/net/devlink.h b/include/net/devlink.h
>>> index 4d2c6fc..960e80a 100644
>>> --- a/include/net/devlink.h
>>> +++ b/include/net/devlink.h
>> ...
>>
>>> @@ -469,6 +523,32 @@ devlink_dpipe_match_put(struct sk_buff *skb,
>>> return 0;
>>>  }
>>>  
>>> +static inline int
>>> +devlink_resource_register(struct devlink *devlink,
>>> + const char *resource_name,
>>> + bool top_hierarchy,
>>> + bool reload_required,
>>> + u64 resource_size,
>>> + u64 resource_id,
>>> + u64 parent_resource_id,
>>> + struct devlink_resource_ops *resource_ops)
>>> +{
>>> +   return 0;
>>> +}
>>> +
>>> +static inline void
>>> +devlink_resources_unregister(struct devlink *devlink,
>>> +struct devlink_resource *resource)
>>> +{
>>> +}
>>> +
>>> +static inline int
>>> +devlink_resource_size_get(struct devlink *devlink, u64 resource_id,
>>> + u64 *p_resource_size)
>>> +{
>>> +   return -EINVAL;
>>
>> It's compiled out so -EOPNOTSUPP seems more appropriate.
>>
> 
> will fix
> 
>>
>>
>>> diff --git a/net/core/devlink.c b/net/core/devlink.c
>>> index 0114dfc..6ae644f 100644
>>> --- a/net/core/devlink.c
>>> +++ b/net/core/devlink.c
>>> +static int devlink_nl_cmd_resource_set(struct sk_buff *skb,
>>> +  struct genl_info *info)
>>> +{
>>> +   struct devlink *devlink = info->user_ptr[0];
>>> +   struct devlink_resource *resource;
>>> +   u64 resource_id;
>>> +   u64 size;
>>> +   int err;
>>> +
>>> +   if (!info->attrs[DEVLINK_ATTR_RESOURCE_ID] ||
>>> +   !info->attrs[DEVLINK_ATTR_RESOURCE_SIZE])
>>> +   return -EINVAL;
>>
>> several of the DEVLINK_ATTR_RESOURCE attributes are kernel to
>> user only (e.g., DEVLINK_ATTR_RESOURCE_SIZE_NEW and
>> DEVLINK_ATTR_RESOURCE_RELOAD_REQUIRED), so if they are given by the user
>> that should be an error too right?
>>
> 
> Not sure I understood. As you see, I only check for the mandatory
> attributes; if the user provides irrelevant data, it's ignored.
> 
> We use one single nla_policy for all the commands (devlink_nl_policy)
> 
>>
>>> +   resource_id = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_ID]);
>>
>> I don't see where these attributes are validated for proper size.
>>
> 
> right, forgot to update the policy.
> 
>>> +
>>> +   resource = devlink_resource_find(devlink, NULL, resource_id);
>>> +   if (!resource)
>>> +   return -EINVAL;
>>> +
>>> +   if (!resource->resource_ops->size_validate)
>>> +   return -EINVAL;
>>
>> genl_info has extack; please add user messages for the above failures.
>>
> 
> Isn't EOPNOTSUPP enough ?

No, I mean every failure above returns EINVAL. Add an extack message
telling the user what is wrong. e.g,

resource = devlink_resource_find(devlink, NULL, resource_id);
if (!resource) {
NL_SET_ERR_MSG(extack, "Invalid resource id");
return -EINVAL;
}

similarly for the rest.


[PATCH net-next 1/1] forcedeth: replace pci_unmap_page with dma_unmap_page

2017-11-19 Thread Zhu Yanjun
The function pci_unmap_page is obsolete. So it is replaced with
the function dma_unmap_page.

CC: Srinivas Eeda 
CC: Joe Jin 
CC: Junxiao Bi 
Signed-off-by: Zhu Yanjun 
---
 drivers/net/ethernet/nvidia/forcedeth.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/nvidia/forcedeth.c 
b/drivers/net/ethernet/nvidia/forcedeth.c
index ac8439c..0febe41 100644
--- a/drivers/net/ethernet/nvidia/forcedeth.c
+++ b/drivers/net/ethernet/nvidia/forcedeth.c
@@ -1986,7 +1986,7 @@ static void nv_unmap_txskb(struct fe_priv *np, struct 
nv_skb_map *tx_skb)
 tx_skb->dma_len,
 DMA_TO_DEVICE);
else
-   pci_unmap_page(np->pci_dev, tx_skb->dma,
+   dma_unmap_page(>pci_dev->dev, tx_skb->dma,
   tx_skb->dma_len,
   PCI_DMA_TODEVICE);
tx_skb->dma = 0;
-- 
2.7.4



[PATCH net-next] netlink: optimize err assignment

2017-11-19 Thread yuan linyu
From: yuan linyu 

Signed-off-by: yuan linyu 
---
 net/netlink/af_netlink.c | 54 +---
 1 file changed, 23 insertions(+), 31 deletions(-)

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index b9e0ee4..37ce0d3 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -533,14 +533,16 @@ static int netlink_insert(struct sock *sk, u32 portid)
 
lock_sock(sk);
 
-   err = nlk_sk(sk)->portid == portid ? 0 : -EBUSY;
-   if (nlk_sk(sk)->bound)
+   if (nlk_sk(sk)->bound) {
+   err = nlk_sk(sk)->portid == portid ? 0 : -EBUSY;
goto err;
+   }
 
-   err = -ENOMEM;
if (BITS_PER_LONG > 32 &&
-   unlikely(atomic_read(>hash.nelems) >= UINT_MAX))
+   unlikely(atomic_read(>hash.nelems) >= UINT_MAX)) {
+   err = -ENOMEM;
goto err;
+   }
 
nlk_sk(sk)->portid = portid;
sock_hold(sk);
@@ -1585,8 +1587,8 @@ static int netlink_setsockopt(struct socket *sock, int 
level, int optname,
 {
struct sock *sk = sock->sk;
struct netlink_sock *nlk = nlk_sk(sk);
-   unsigned int val = 0;
-   int err;
+   unsigned int val;
+   int err = 0;
 
if (level != SOL_NETLINK)
return -ENOPROTOOPT;
@@ -1601,7 +1603,6 @@ static int netlink_setsockopt(struct socket *sock, int 
level, int optname,
nlk->flags |= NETLINK_F_RECV_PKTINFO;
else
nlk->flags &= ~NETLINK_F_RECV_PKTINFO;
-   err = 0;
break;
case NETLINK_ADD_MEMBERSHIP:
case NETLINK_DROP_MEMBERSHIP: {
@@ -1623,8 +1624,6 @@ static int netlink_setsockopt(struct socket *sock, int 
level, int optname,
netlink_table_ungrab();
if (optname == NETLINK_DROP_MEMBERSHIP && nlk->netlink_unbind)
nlk->netlink_unbind(sock_net(sk), val);
-
-   err = 0;
break;
}
case NETLINK_BROADCAST_ERROR:
@@ -1632,7 +1631,6 @@ static int netlink_setsockopt(struct socket *sock, int 
level, int optname,
nlk->flags |= NETLINK_F_BROADCAST_SEND_ERROR;
else
nlk->flags &= ~NETLINK_F_BROADCAST_SEND_ERROR;
-   err = 0;
break;
case NETLINK_NO_ENOBUFS:
if (val) {
@@ -1642,7 +1640,6 @@ static int netlink_setsockopt(struct socket *sock, int 
level, int optname,
} else {
nlk->flags &= ~NETLINK_F_RECV_NO_ENOBUFS;
}
-   err = 0;
break;
case NETLINK_LISTEN_ALL_NSID:
if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST))
@@ -1652,21 +1649,18 @@ static int netlink_setsockopt(struct socket *sock, int 
level, int optname,
nlk->flags |= NETLINK_F_LISTEN_ALL_NSID;
else
nlk->flags &= ~NETLINK_F_LISTEN_ALL_NSID;
-   err = 0;
break;
case NETLINK_CAP_ACK:
if (val)
nlk->flags |= NETLINK_F_CAP_ACK;
else
nlk->flags &= ~NETLINK_F_CAP_ACK;
-   err = 0;
break;
case NETLINK_EXT_ACK:
if (val)
nlk->flags |= NETLINK_F_EXT_ACK;
else
nlk->flags &= ~NETLINK_F_EXT_ACK;
-   err = 0;
break;
default:
err = -ENOPROTOOPT;
@@ -1679,7 +1673,7 @@ static int netlink_getsockopt(struct socket *sock, int 
level, int optname,
 {
struct sock *sk = sock->sk;
struct netlink_sock *nlk = nlk_sk(sk);
-   int len, val, err;
+   int len, val, err = 0;
 
if (level != SOL_NETLINK)
return -ENOPROTOOPT;
@@ -1698,7 +1692,6 @@ static int netlink_getsockopt(struct socket *sock, int 
level, int optname,
if (put_user(len, optlen) ||
put_user(val, optval))
return -EFAULT;
-   err = 0;
break;
case NETLINK_BROADCAST_ERROR:
if (len < sizeof(int))
@@ -1708,7 +1701,6 @@ static int netlink_getsockopt(struct socket *sock, int 
level, int optname,
if (put_user(len, optlen) ||
put_user(val, optval))
return -EFAULT;
-   err = 0;
break;
case NETLINK_NO_ENOBUFS:
if (len < sizeof(int))
@@ -1718,12 +1710,10 @@ static int netlink_getsockopt(struct socket *sock, int 
level, int optname,
if (put_user(len, optlen) ||
put_user(val, optval))
return -EFAULT;
-   err = 0;
break;
  

[PATCH ethtool] ethtool: Add extended compliance codes parsing to sfp modules

2017-11-19 Thread Gal Pressman
Update parsing according to SFP28 spec with extended compliance codes.
SFF-8472, SFF-8024 specify the description of module capability present
in the 36th byte.

Signed-off-by: Gal Pressman 
Reviewed-by: Eran Ben Elisha 
---
 sfpid.c | 25 +++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/sfpid.c b/sfpid.c
index 1732e5e..a1753d3 100644
--- a/sfpid.c
+++ b/sfpid.c
@@ -40,10 +40,10 @@ static void sff8079_show_transceiver(const __u8 *id)
"\tTransceiver type  :";
 
printf("\t%-41s : 0x%02x 0x%02x 0x%02x " \
-  "0x%02x 0x%02x 0x%02x 0x%02x 0x%02x\n",
+  "0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x\n",
   "Transceiver codes",
   id[3], id[4], id[5], id[6],
-  id[7], id[8], id[9], id[10]);
+  id[7], id[8], id[9], id[10], id[36]);
/* 10G Ethernet Compliance Codes */
if (id[3] & (1 << 7))
printf("%s 10G Ethernet: 10G Base-ER" \
@@ -168,6 +168,27 @@ static void sff8079_show_transceiver(const __u8 *id)
printf("%s FC: 200 MBytes/sec\n", pfx);
if (id[10] & (1 << 0))
printf("%s FC: 100 MBytes/sec\n", pfx);
+   /* Extended Specification Compliance Codes from SFF-8024 */
+   if (id[36] == 0x1)
+   printf("%s Extended: 100G AOC or 25GAUI C2M AOC with worst BER 
of 5x10^(-5)\n", pfx);
+   if (id[36] == 0x2)
+   printf("%s Extended: 100G Base-SR4 or 25GBase-SR\n", pfx);
+   if (id[36] == 0x3)
+   printf("%s Extended: 100G Base-LR4 or 25GBase-LR\n", pfx);
+   if (id[36] == 0x4)
+   printf("%s Extended: 100G Base-ER4 or 25GBase-ER\n", pfx);
+   if (id[36] == 0x8)
+   printf("%s Extended: 100G ACC or 25GAUI C2M ACC with worst BER 
of 5x10^(-5)\n", pfx);
+   if (id[36] == 0xb)
+   printf("%s Extended: 100G Base-CR4 or 25G Base-CR CA-L\n", pfx);
+   if (id[36] == 0xc)
+   printf("%s Extended: 25G Base-CR CA-S\n", pfx);
+   if (id[36] == 0xd)
+   printf("%s Extended: 25G Base-CR CA-N\n", pfx);
+   if (id[36] == 0x18)
+   printf("%s Extended: 100G AOC or 25GAUI C2M AOC with worst BER 
of 10^(-12)\n", pfx);
+   if (id[36] == 0x19)
+   printf("%s Extended: 100G ACC or 25GAUI C2M ACC with worst BER 
of 10^(-12)\n", pfx);
 }
 
 static void sff8079_show_encoding(const __u8 *id)
-- 
2.7.4



general protection fault in dst_destroy() - 4.13.9

2017-11-19 Thread Anders K. Pedersen | Cohaesio
Hello,

A few days ago, one of our routers (running Linux 4.13.9) crashed due
to a general protection fault in dst_destroy(). At the time, it had run
for several weeks without any problems, but then crashed three times in
a row within a few minutes - all due to a general protection fault at
dst_destroy()+0x35. Since then, it has run for several days without any
further problems, so I suspect that this was triggered by a traffic
pattern in the routed packets, but I don't have a way to reproduce it.

Disassembly shows that this is in the inlined dev_put(), which does
this_cpu_dec(*dev->pcpu_refcnt). As far as I can tell there haven't
been any fixes in this area since 4.13, and a Google search didn't find
anything recent, so I'm guessing this is not a known problem.

I have included the kernel output via serial console below as well as
gdb and objdump information. Please let me know, if I can provide any
additional information.


[2024260.461401] general protection fault:  [#1] SMP
[2024260.467193] Modules linked in:
[2024260.470897] CPU: 15 PID: 0 Comm: swapper/15 Tainted: GW   
4.13.9 #2
[2024260.479488] Hardware name: Dell Inc. PowerEdge R730/0H21J3, BIOS 2.5.5 
08/16/2017
[2024260.488279] task: 88085b625cc0 task.stack: c90e4000
[2024260.495277] RIP: 0010:dst_destroy+0x35/0xa0
[2024260.500277] RSP: 0018:88085f5c3f08 EFLAGS: 00010286
[2024260.506474] RAX: 88085ac0e880 RBX: 88082cf9fb00 RCX: 
0020
[2024260.514868] RDX: 88082cf9fbc0 RSI:  RDI: 
816786c0
[2024260.523258] RBP:  R08: ff00 R09: 

[2024260.531649] R10:  R11:  R12: 
88085f5da678
[2024260.540040] R13: 000a R14: 88085b625cc0 R15: 
88085b625cc0
[2024260.548431] FS:  () GS:88085f5c() 
knlGS:
[2024260.557924] CS:  0010 DS:  ES:  CR0: 80050033
[2024260.564719] CR2: 7fc800e48e88 CR3: 01809000 CR4: 
001406e0
[2024260.573112] Call Trace:
[2024260.576113]  
[2024260.578618]  ? rcu_process_callbacks+0x18f/0x460
[2024260.584126]  ? rebalance_domains+0xe2/0x290
[2024260.589128]  ? __do_softirq+0x100/0x292
[2024260.593727]  ? irq_exit+0x92/0xa0
[2024260.597729]  ? smp_apic_timer_interrupt+0x39/0x50
[2024260.603328]  ? apic_timer_interrupt+0x7c/0x90
[2024260.608528]  
[2024260.611134]  ? cpuidle_enter_state+0x14c/0x2b0
[2024260.616432]  ? cpuidle_enter_state+0x128/0x2b0
[2024260.621731]  ? do_idle+0xf9/0x190
[2024260.625733]  ? cpu_startup_entry+0x5f/0x70
[2024260.630636]  ? start_secondary+0x12a/0x130
[2024260.635536]  ? secondary_startup_64+0x9f/0x9f
[2024260.640731] Code: f6 47 60 08 48 8b 6f 18 74 62 48 8b 43 20 48 8b 40 30 48 
85 c0 74 05 48
89 df ff d0 48 8b 03 48 85 c0 74 0a 48 8b 80 e0 03 00 00 <65> ff 08 f6 43 60 80 
74 26 48 8d bb
e0 00 00 00 e8 e6 7f 01 00
[2024260.662626] RIP: dst_destroy+0x35/0xa0 RSP: 88085f5c3f08
[2024260.669333] ---[ end trace 3c1827251806827c ]---
[2024260.724173] Kernel panic - not syncing: Fatal exception in interrupt
[2024261.102792] Kernel Offset: disabled
[2024261.156022] Rebooting in 60 seconds..
[2024321.167958] ACPI MEMORY or I/O RESET_REG.


[   36.620034] general protection fault:  [#1] SMP
[   36.625637] Modules linked in:
[   36.629141] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.13.9 #2
[   36.635938] Hardware name: Dell Inc. PowerEdge R730/0H21J3, BIOS 2.5.5 
08/16/2017
[   36.644532] task: 88085b46a7c0 task.stack: c907c000
[   36.651333] RIP: 0010:dst_destroy+0x35/0xa0
[   36.656133] RSP: 0018:88085f283f08 EFLAGS: 00010286
[   36.662133] RAX: 2e37307830203a65 RBX: 88082ac1 RCX: 0020
[   36.670326] RDX: 88082ac100c0 RSI:  RDI: 816786c0
[   36.678521] RBP:  R08: 30e3e201 R09: 00010080007a
[   36.686714] R10: 88085f283e20 R11: ea0020c38e00 R12: 88085f29a678
[   36.694906] R13: 000a R14: 88085b46a7c0 R15: 88085b46a7c0
[   36.703102] FS:  () GS:88085f28() 
knlGS:
[   36.712395] CS:  0010 DS:  ES:  CR0: 80050033
[   36.718992] CR2: 55568c725558 CR3: 01809000 CR4: 001406e0
[   36.727184] Call Trace:
[   36.729987]  
[   36.732287]  ? rcu_process_callbacks+0x18f/0x460
[   36.737588]  ? rebalance_domains+0xe2/0x290
[   36.742388]  ? __do_softirq+0x100/0x292
[   36.746790]  ? irq_exit+0x92/0xa0
[   36.750590]  ? smp_apic_timer_interrupt+0x39/0x50
[   36.755990]  ? apic_timer_interrupt+0x7c/0x90
[   36.760987]  
[   36.763392]  ? poll_idle+0x46/0x7a
[   36.767295]  ? cpuidle_enter_state+0x102/0x2b0
[   36.772396]  ? do_idle+0xf9/0x190
[   36.776197]  ? cpu_startup_entry+0x5f/0x70
[   36.780892]  ? start_secondary+0x12a/0x130
[   36.785592]  ? secondary_startup_64+0x9f/0x9f
[   36.790590] Code: f6 47 60 08 48 8b 6f 18 74 62 48 8b 43 20 48 8b 40 30 48 
85 

[PATCH] net: vxge: Fix some indentation issues

2017-11-19 Thread Christophe JAILLET
Some statements are not indented enough or are indented too much.
Fix them to improve readability.

Signed-off-by: Christophe JAILLET 
---
 drivers/net/ethernet/neterion/vxge/vxge-main.c | 37 +-
 1 file changed, 18 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/neterion/vxge/vxge-main.c 
b/drivers/net/ethernet/neterion/vxge/vxge-main.c
index fe7e0e1dd01d..b2299f2b2155 100644
--- a/drivers/net/ethernet/neterion/vxge/vxge-main.c
+++ b/drivers/net/ethernet/neterion/vxge/vxge-main.c
@@ -1530,7 +1530,7 @@ static int vxge_reset_vpath(struct vxgedev *vdev, int 
vp_id)
vxge_debug_init(VXGE_ERR,
"vxge_hw_vpath_reset failed for"
"vpath:%d", vp_id);
-   return status;
+   return status;
}
} else
return VXGE_HW_FAIL;
@@ -1950,19 +1950,19 @@ static enum vxge_hw_status vxge_rth_configure(struct 
vxgedev *vdev)
 * for all VPATHs. The h/w only uses the lowest numbered VPATH
 * when steering frames.
 */
-for (index = 0; index < vdev->no_of_vpath; index++) {
+   for (index = 0; index < vdev->no_of_vpath; index++) {
status = vxge_hw_vpath_rts_rth_set(
vdev->vpaths[index].handle,
vdev->config.rth_algorithm,
_types,
vdev->config.rth_bkt_sz);
-if (status != VXGE_HW_OK) {
+   if (status != VXGE_HW_OK) {
vxge_debug_init(VXGE_ERR,
"RTH configuration failed for vpath:%d",
vdev->vpaths[index].device_id);
return status;
-}
-}
+   }
+   }
 
return status;
 }
@@ -1991,7 +1991,7 @@ static enum vxge_hw_status vxge_reset_all_vpaths(struct 
vxgedev *vdev)
vxge_debug_init(VXGE_ERR,
"vxge_hw_vpath_reset failed for "
"vpath:%d", i);
-   return status;
+   return status;
}
}
}
@@ -2474,32 +2474,31 @@ static int vxge_add_isr(struct vxgedev *vdev)
switch (msix_idx) {
case 0:
snprintf(vdev->desc[intr_cnt], VXGE_INTR_STRLEN,
-   "%s:vxge:MSI-X %d - Tx - fn:%d vpath:%d",
+   "%s:vxge:MSI-X %d - Tx - fn:%d 
vpath:%d",
vdev->ndev->name,
vdev->entries[intr_cnt].entry,
pci_fun, vp_idx);
ret = request_irq(
-   vdev->entries[intr_cnt].vector,
+   vdev->entries[intr_cnt].vector,
vxge_tx_msix_handle, 0,
vdev->desc[intr_cnt],
>vpaths[vp_idx].fifo);
-   vdev->vxge_entries[intr_cnt].arg =
+   vdev->vxge_entries[intr_cnt].arg =
>vpaths[vp_idx].fifo;
irq_req = 1;
break;
case 1:
snprintf(vdev->desc[intr_cnt], VXGE_INTR_STRLEN,
-   "%s:vxge:MSI-X %d - Rx - fn:%d vpath:%d",
+   "%s:vxge:MSI-X %d - Rx - fn:%d 
vpath:%d",
vdev->ndev->name,
vdev->entries[intr_cnt].entry,
pci_fun, vp_idx);
ret = request_irq(
-   vdev->entries[intr_cnt].vector,
-   vxge_rx_msix_napi_handle,
-   0,
+   vdev->entries[intr_cnt].vector,
+   vxge_rx_msix_napi_handle, 0,
vdev->desc[intr_cnt],
>vpaths[vp_idx].ring);
-   vdev->vxge_entries[intr_cnt].arg =
+   vdev->vxge_entries[intr_cnt].arg =
>vpaths[vp_idx].ring;
irq_req = 1;
break;
@@ -2512,9 +2511,9 @@ static int vxge_add_isr(struct vxgedev *vdev)

Re: [PATCH v2 1/2] r8169: fix RTL8111EVL EEE and green settings

2017-11-19 Thread David Miller
From: Heiner Kallweit 
Date: Sun, 19 Nov 2017 11:09:58 +0100

> Name of functions rtl_w0w1_eri and rtl_w0w1_phy is somewhat misleading
> regarding order of arguments. One could assume that w0w1 means
> argument with bits to be reset comes before argument with bits to set.
> However this is not the case.
> So fix the order of arguments in several statements.
> 
> In addition, fix EEE advertisement. The current code resets the bits
> for 100BaseT and 1000BaseT EEE advertisement, which is not what we want.
> 
> I have a little of a hard time to find a proper "Fixes" line as the
> issue seems to have been there forever (at least it existed already
> when the driver was moved to the current place in 2011).
> 
> The patch was tested on a Zotac Mini-PC with a RTL8111E-VL chip.
> Before the patch EEE was disabled, now it's properly advertised and
> works fine.
> 
> Signed-off-by: Heiner Kallweit 

Applied.


Re: [PATCH v2 2/2] r8169: use same RTL8111EVL green settings as in vendor driver

2017-11-19 Thread David Miller
From: Heiner Kallweit 
Date: Sun, 19 Nov 2017 11:15:46 +0100

> Adjust the code to use the same green settings as in the latest
> vendor driver.
> 
> Signed-off-by: Heiner Kallweit 

Applied.


Re: [PATCH net] tun: fix rcu_read_lock imbalance in tun_build_skb

2017-11-19 Thread David Miller
From: Xin Long 
Date: Sun, 19 Nov 2017 19:31:04 +0800

> rcu_read_lock in tun_build_skb is used to rcu_dereference tun->xdp_prog
> safely, rcu_read_unlock should be done in every return path.
> 
> Now I could see one place missing it, where it returns NULL in switch-case
> XDP_REDIRECT, and another place using rcu_read_lock wrongly, where it returns
> NULL in the if (xdp_xmit) chunk.
> 
> So fix both in this patch.
> 
> Fixes: 761876c857cb ("tap: XDP support")
> Signed-off-by: Xin Long 

Good catch, applied, thanks!


[PATCH net] tun: fix rcu_read_lock imbalance in tun_build_skb

2017-11-19 Thread Xin Long
rcu_read_lock in tun_build_skb is used to rcu_dereference tun->xdp_prog
safely, rcu_read_unlock should be done in every return path.

Now I could see one place missing it, where it returns NULL in switch-case
XDP_REDIRECT, and another place using rcu_read_lock wrongly, where it returns
NULL in the if (xdp_xmit) chunk.

So fix both in this patch.

Fixes: 761876c857cb ("tap: XDP support")
Signed-off-by: Xin Long 
---
 drivers/net/tun.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 6bb1e60..5a2ea78 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1485,6 +1485,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct 
*tun,
err = xdp_do_redirect(tun->dev, , xdp_prog);
if (err)
goto err_redirect;
+   rcu_read_unlock();
return NULL;
case XDP_TX:
xdp_xmit = true;
@@ -1517,7 +1518,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct 
*tun,
if (xdp_xmit) {
skb->dev = tun->dev;
generic_xdp_tx(skb, xdp_prog);
-   rcu_read_lock();
+   rcu_read_unlock();
return NULL;
}
 
-- 
2.1.0



[PATCH v2 2/2] r8169: use same RTL8111EVL green settings as in vendor driver

2017-11-19 Thread Heiner Kallweit
Adjust the code to use the same green settings as in the latest
vendor driver.

Signed-off-by: Heiner Kallweit 
---
v2:
- replace magic numbers with constants
---
 drivers/net/ethernet/realtek/r8169.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/ethernet/realtek/r8169.c 
b/drivers/net/ethernet/realtek/r8169.c
index 19f3074a0..2cb3622c4 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -3810,6 +3810,11 @@ static void rtl8168e_2_hw_phy_config(struct 
rtl8169_private *tp)
rtl_w0w1_phy(tp, 0x19, 0x0001, 0x);
rtl_w0w1_phy(tp, 0x10, 0x0400, 0x);
rtl_writephy(tp, 0x1f, 0x);
+   rtl_writephy(tp, 0x1f, 0x0005);
+   rtl_w0w1_phy(tp, 0x01, 0x0100, 0x);
+   rtl_writephy(tp, 0x1f, 0x);
+   /* soft-reset phy */
+   rtl_writephy(tp, MII_BMCR, BMCR_RESET | BMCR_ANENABLE | BMCR_ANRESTART);
 
/* Broken BIOS workaround: feed GigaMAC registers with MAC address. */
rtl_rar_exgmac_set(tp, tp->dev->dev_addr);
-- 
2.15.0




[PATCH v2 1/2] r8169: fix RTL8111EVL EEE and green settings

2017-11-19 Thread Heiner Kallweit
Name of functions rtl_w0w1_eri and rtl_w0w1_phy is somewhat misleading
regarding order of arguments. One could assume that w0w1 means
argument with bits to be reset comes before argument with bits to set.
However this is not the case.
So fix the order of arguments in several statements.

In addition, fix EEE advertisement. The current code resets the bits
for 100BaseT and 1000BaseT EEE advertisement, which is not what we want.

I have a little of a hard time to find a proper "Fixes" line as the
issue seems to have been there forever (at least it existed already
when the driver was moved to the current place in 2011).

The patch was tested on a Zotac Mini-PC with a RTL8111E-VL chip.
Before the patch EEE was disabled, now it's properly advertised and
works fine.

Signed-off-by: Heiner Kallweit 
---
v2:
- no changes
---
 drivers/net/ethernet/realtek/r8169.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c 
b/drivers/net/ethernet/realtek/r8169.c
index dcb8c3938..19f3074a0 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -3789,26 +3789,26 @@ static void rtl8168e_2_hw_phy_config(struct 
rtl8169_private *tp)
rtl_writephy(tp, 0x1f, 0x);
 
/* EEE setting */
-   rtl_w0w1_eri(tp, 0x1b0, ERIAR_MASK_, 0x, 0x0003, ERIAR_EXGMAC);
+   rtl_w0w1_eri(tp, 0x1b0, ERIAR_MASK_, 0x0003, 0x, ERIAR_EXGMAC);
rtl_writephy(tp, 0x1f, 0x0005);
rtl_writephy(tp, 0x05, 0x8b85);
-   rtl_w0w1_phy(tp, 0x06, 0x, 0x2000);
+   rtl_w0w1_phy(tp, 0x06, 0x2000, 0x);
rtl_writephy(tp, 0x1f, 0x0004);
rtl_writephy(tp, 0x1f, 0x0007);
rtl_writephy(tp, 0x1e, 0x0020);
-   rtl_w0w1_phy(tp, 0x15, 0x, 0x0100);
+   rtl_w0w1_phy(tp, 0x15, 0x0100, 0x);
rtl_writephy(tp, 0x1f, 0x0002);
rtl_writephy(tp, 0x1f, 0x);
rtl_writephy(tp, 0x0d, 0x0007);
rtl_writephy(tp, 0x0e, 0x003c);
rtl_writephy(tp, 0x0d, 0x4007);
-   rtl_writephy(tp, 0x0e, 0x);
+   rtl_writephy(tp, 0x0e, 0x0006);
rtl_writephy(tp, 0x0d, 0x);
 
/* Green feature */
rtl_writephy(tp, 0x1f, 0x0003);
-   rtl_w0w1_phy(tp, 0x19, 0x, 0x0001);
-   rtl_w0w1_phy(tp, 0x10, 0x, 0x0400);
+   rtl_w0w1_phy(tp, 0x19, 0x0001, 0x);
+   rtl_w0w1_phy(tp, 0x10, 0x0400, 0x);
rtl_writephy(tp, 0x1f, 0x);
 
/* Broken BIOS workaround: feed GigaMAC registers with MAC address. */
-- 
2.15.0



Re: [patch net-next RFC v2 09/11] mlxsw: spectrum: Add support for getting kvdl occupancy

2017-11-19 Thread Arkadi Sharshevsky


On 11/18/2017 09:21 PM, David Ahern wrote:
> On 11/14/17 9:18 AM, Jiri Pirko wrote:
>> From: Arkadi Sharshevsky 
>>
>> Add support for getting the kvdl occupancy through the resource interface.
>>
> 
> Do you intend to add occ_get for the other kvd partitions?
> 

Yes, of course; it's a separate patchset due to its complexity.


Re: [patch net-next RFC v2 08/11] mlxsw: spectrum_dpipe: Connect dpipe tables to resources

2017-11-19 Thread Arkadi Sharshevsky


On 11/18/2017 09:19 PM, David Ahern wrote:
> On 11/14/17 9:18 AM, Jiri Pirko wrote:
>> From: Arkadi Sharshevsky 
>>
>> Connect current dpipe tables to resources. The tables are connected
>> in the following fashion:
>> 1. IPv4 host - KVD hash single
>> 2. IPv6 host - KVD hash double
>> 3. Adjacency - KVD linear
> 
> Those descriptions would be helpful to the user. A description attribute
> for the resources?
> 

As described in the cover letter, these resources are used by the
majority of the ASIC's lookup processes. So currently there is a
one-to-one mapping, but it should increase as more tables are exposed,
so I don't think it's a good idea to maintain such an attribute.



Re: [patch net-next RFC v2 07/11] mlxsw: spectrum: Register KVD resources with devlink

2017-11-19 Thread Arkadi Sharshevsky


On 11/18/2017 09:18 PM, David Ahern wrote:
> On 11/14/17 9:18 AM, Jiri Pirko wrote:
>> diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c 
>> b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
>> index d02c130..f0cbd67 100644
>> --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
>> +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
>> @@ -3927,6 +3927,173 @@ static const struct mlxsw_config_profile 
>> mlxsw_sp_config_profile = {
>>  .resource_query_enable  = 1,
>>  };
>>  
>> +static bool
>> +mlxsw_sp_resource_kvd_granularity_validate(struct netlink_ext_ack *extack,
>> +   u64 size)
>> +{
>> +const struct mlxsw_config_profile *profile;
>> +
>> +profile = &mlxsw_sp_config_profile;
>> +if (size % profile->kvd_hash_granularity) {
>> +NL_SET_ERR_MSG(extack, MLXSW_SP_PREFIX "resource set with wrong 
>> granularity");
>> +return false;
>> +}
>> +return true;
>> +}
>> +
>> +static int
>> +mlxsw_sp_resource_kvd_size_validate(struct devlink *devlink, u64 size,
>> +struct list_head *resource_list,
>> +struct netlink_ext_ack *extack)
>> +{
>> +struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
>> +u32 kvd_size, single_size, double_size, linear_size;
>> +struct devlink_resource *resource;
>> +
>> +kvd_size = MLXSW_CORE_RES_GET(mlxsw_core, KVD_SIZE);
>> +if (kvd_size != size) {
>> +NL_SET_ERR_MSG(extack, MLXSW_SP_PREFIX "kvd size cannot be 
>> chagned");
> 
> s/chagned/changed/
> 
>> +return -EINVAL;
>> +}
>> +
>> +list_for_each_entry(resource, resource_list, list) {
>> +switch (resource->id) {
>> +case MLXSW_SP_RESOURCE_KVD_LINEAR:
>> +linear_size = resource->size_new;
>> +break;
>> +case MLXSW_SP_RESOURCE_KVD_HASH_SINGLE:
>> +single_size = resource->size_new;
>> +break;
>> +case MLXSW_SP_RESOURCE_KVD_HASH_DOUBLE:
>> +double_size = resource->size_new;
>> +break;
>> +}
>> +}
>> +
>> +/* Overlap is not supported */
>> +if (linear_size + single_size + double_size > kvd_size) {
>> +NL_SET_ERR_MSG(extack, MLXSW_SP_PREFIX "Overlap is not 
>> supported");
> 
> Overlap? Isn't that the sum of the partitions being greater than the total size?
> 

In case the sum of the partitions is greater than the kvd total size, the
hash single/double will be set in an overlapping state, which we do
not support currently.

> 
>> +return -EINVAL;
>> +}
>> +
>> +return 0;
>> +}
>> +
>> +static int
>> +mlxsw_sp_resource_kvd_linear_size_validate(struct devlink *devlink, u64 
>> size,
>> +   struct list_head *resource_list,
>> +   struct netlink_ext_ack *extack)
>> +{
>> +if (!mlxsw_sp_resource_kvd_granularity_validate(extack, size))
>> +return -EINVAL;
>> +
>> +return 0;
>> +}
>> +
>> +static int
>> +mlxsw_sp_resource_kvd_hash_single_size_validate(struct devlink *devlink, 
>> u64 size,
>> +struct list_head *resource_list,
>> +struct netlink_ext_ack *extack)
>> +{
>> +struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
>> +
>> +if (!mlxsw_sp_resource_kvd_granularity_validate(extack, size))
>> +return -EINVAL;
>> +
>> +if (size < MLXSW_CORE_RES_GET(mlxsw_core, KVD_SINGLE_MIN_SIZE)) {
>> +NL_SET_ERR_MSG(extack, MLXSW_SP_PREFIX "hash single size is 
>> smaller then min");
> 
> s/then min/than minimum/
> 
>> +return -EINVAL;
>> +}
>> +return 0;
>> +}
>> +
>> +static int
>> +mlxsw_sp_resource_kvd_hash_double_size_validate(struct devlink *devlink, 
>> u64 size,
>> +struct list_head *resource_list,
>> +struct netlink_ext_ack *extack)
>> +{
>> +struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
>> +
>> +if (!mlxsw_sp_resource_kvd_granularity_validate(extack, size))
>> +return -EINVAL;
>> +
>> +if (size < MLXSW_CORE_RES_GET(mlxsw_core, KVD_DOUBLE_MIN_SIZE)) {
>> +NL_SET_ERR_MSG(extack, MLXSW_SP_PREFIX "hash double size is 
>> smaller then min");
> 
> s/then min/than minimum/
> 
> How does the user learn the minimum size and the granularity for the KVD
> resources? Seems like those could be read-only attributes in the
> resource dump to make it easier for the user.
> 

This seems too case-specific to me, and I didn't want to add
UAPI attributes for this stuff.

Resources shouldn't be defined as only memory-based hardware blocks.
I actually plan to expose the rifs as a resource as well.

I think that if the user tries to configure it and receives such an error,
it is very clear

Re: [patch net-next RFC v2 02/11] devlink: Add support for resource abstraction

2017-11-19 Thread Arkadi Sharshevsky


On 11/18/2017 08:34 PM, David Ahern wrote:
> On 11/14/17 9:18 AM, Jiri Pirko wrote:
>> diff --git a/include/net/devlink.h b/include/net/devlink.h
>> index 4d2c6fc..960e80a 100644
>> --- a/include/net/devlink.h
>> +++ b/include/net/devlink.h
> ...
> 
>> @@ -469,6 +523,32 @@ devlink_dpipe_match_put(struct sk_buff *skb,
>>  return 0;
>>  }
>>  
>> +static inline int
>> +devlink_resource_register(struct devlink *devlink,
>> +  const char *resource_name,
>> +  bool top_hierarchy,
>> +  bool reload_required,
>> +  u64 resource_size,
>> +  u64 resource_id,
>> +  u64 parent_resource_id,
>> +  struct devlink_resource_ops *resource_ops)
>> +{
>> +return 0;
>> +}
>> +
>> +static inline void
>> +devlink_resources_unregister(struct devlink *devlink,
>> + struct devlink_resource *resource)
>> +{
>> +}
>> +
>> +static inline int
>> +devlink_resource_size_get(struct devlink *devlink, u64 resource_id,
>> +  u64 *p_resource_size)
>> +{
>> +return -EINVAL;
> 
> It's compiled out so -EOPNOTSUPP seems more appropriate.
> 

will fix

> 
> 
>> diff --git a/net/core/devlink.c b/net/core/devlink.c
>> index 0114dfc..6ae644f 100644
>> --- a/net/core/devlink.c
>> +++ b/net/core/devlink.c
>> +static int devlink_nl_cmd_resource_set(struct sk_buff *skb,
>> +   struct genl_info *info)
>> +{
>> +struct devlink *devlink = info->user_ptr[0];
>> +struct devlink_resource *resource;
>> +u64 resource_id;
>> +u64 size;
>> +int err;
>> +
>> +if (!info->attrs[DEVLINK_ATTR_RESOURCE_ID] ||
>> +!info->attrs[DEVLINK_ATTR_RESOURCE_SIZE])
>> +return -EINVAL;
> 
> several of the DEVLINK_ATTR_RESOURCE attributes are kernel to
> user only (e.g., DEVLINK_ATTR_RESOURCE_SIZE_NEW and
> DEVLINK_ATTR_RESOURCE_RELOAD_REQUIRED), so if they are given by the user
> that should be an error too right?
> 

Not sure I understood. As you can see, I only check for the mandatory
attributes; if the user provides irrelevant data, it's ignored.

We use one single nla_policy for all the commands (devlink_nl_policy)

> 
>> +resource_id = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_ID]);
> 
> I don't see where these attributes are validated for proper size.
> 

right, forgot to update the policy.

>> +
>> +resource = devlink_resource_find(devlink, NULL, resource_id);
>> +if (!resource)
>> +return -EINVAL;
>> +
>> +if (!resource->resource_ops->size_validate)
>> +return -EINVAL;
> 
> genl_info has extack; please add user messages for the above failures.
> 

Isn't EOPNOTSUPP enough?

> 
>> +
>> +size = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_SIZE]);
>> +err = resource->resource_ops->size_validate(devlink, size,
>> +&devlink->resource_list,
>> +info->extack);
>> +if (err)
>> +return err;
>> +
>> +resource->size_new = size;
>> +return 0;
>> +}
>> +
>