Re: [PATCH bpf-next] xdp: sample code for redirecting vlan packets to specific cpus

2018-11-07 Thread Shannon Nelson
On Wed, Nov 7, 2018 at 1:44 PM Daniel Borkmann  wrote:
>
> On 10/29/2018 11:11 PM, John Fastabend wrote:
> > On 10/29/2018 02:19 PM, Shannon Nelson wrote:
> >> This is an example of using XDP to redirect the processing of
> >> particular vlan packets to specific CPUs.  This is in response
> >> to comments received on a kernel patch put forth previously
> >> to do something similar using RPS.
> >>  https://www.spinics.net/lists/netdev/msg528210.html
> >>  [PATCH net-next] net: enable RPS on vlan devices
> >>
> >> This XDP application watches for inbound vlan-tagged packets
> >> and redirects those packets to be processed on a specific CPU
> >> as configured in a BPF map.  The BPF map can be modified by
> >> this user program, which can also load and unload the kernel
> >> XDP code.
> >>
> >> One example use is for supporting VMs where we can't control the
> >> OS being used: we'd like to separate the VM CPU processing from
> >> the host's CPUs as a way to help mitigate L1TF related issues.
> >> When running the VM's traffic on a vlan we can stick the host's
> >> Rx processing on one set of CPUs separate from the VM's CPUs.
> >>
> >> This example currently uses a vlan key and cpu value in the
> >> BPF map, so only can do one CPU per vlan.  This could easily
> >> be modified to use a bitpattern of CPUs rather than a CPU id
> >> to allow multiple CPUs per vlan.
> >
> > Great, so does this solve your use case then? At least on drivers
> > with XDP support?
> >
> >>
> >> Signed-off-by: Shannon Nelson 
> >> ---
> >
> > Some really small and trivial nits below.
> >
> > Acked-by: John Fastabend 
> >
> > [...]
> >
> >> +if (install) {
> >> +
> >
> > new line probably not needed.
> >
> >> +/* check to see if already installed */
> >> +errno = 0;
> >> +access(pin_prog_name, R_OK);
> >> +if (errno != ENOENT) {
> >> +fprintf(stderr, "ERR: %s is already installed\n", 
> >> argv[0]);
> >> +return -1;
> >> +}
> >> +
> >> +/* load the XDP program and maps with the convenient library 
> >> */
> >> +if (load_bpf_file(filename)) {
> >> +fprintf(stderr, "ERR: load_bpf_file(%s): \n%s",
> >> +filename, bpf_log_buf);
> >> +return -1;
> >> +}
> >> +if (!prog_fd[0]) {
> >> +fprintf(stderr, "ERR: load_bpf_file(%s): %d %s\n",
> >> +filename, errno, strerror(errno));
> >> +return -1;
> >> +}
> >> +
> >> +/* pin the XDP program and maps */
> >> +if (bpf_obj_pin(prog_fd[0], pin_prog_name) < 0) {
> >> +fprintf(stderr, "ERR: bpf_obj_pin(%s): %d %s\n",
> >> +pin_prog_name, errno, strerror(errno));
> >> +if (errno == 2)
> >> +fprintf(stderr, " (is the BPF fs mounted 
> >> on /sys/fs/bpf?)\n");
> >> +return -1;
> >> +}
> >> +if (bpf_obj_pin(map_fd[0], pin_vlanmap_name) < 0) {
> >> +fprintf(stderr, "ERR: bpf_obj_pin(%s): %d %s\n",
> >> +pin_vlanmap_name, errno, strerror(errno));
> >> +return -1;
> >> +}
> >> +if (bpf_obj_pin(map_fd[2], pin_countermap_name) < 0) {
> >> +fprintf(stderr, "ERR: bpf_obj_pin(%s): %d %s\n",
> >> +pin_countermap_name, errno, strerror(errno));
> >> +return -1;
> >> +}
> >> +
> >> +/* prep the vlan map with "not used" values */
> >> +c64 = UNDEF_CPU;
> >> +for (v64 = 0; v64 < 4096; v64++) {
> >
> > maybe #define MAX_VLANS 4096 just to avoid constants.
> >
> >> +if (bpf_map_update_elem(map_fd[0], , , 0)) {
> >> +fprintf(stderr, "ERR: preping vlan map failed 
> >> on v=%l

[RFC PATCH] net: vlan ipsec-xfrm offload

2018-10-30 Thread Shannon Nelson
This is an RFC post - not working code.  I welcome your comments.

If the real device offers IPsec offload, why shouldn't the VLAN device
also offer the offload?  This essentially becomes a passthru interface
to the real device by swapping out the xfrm_state's netdevice reference
before calling to the real_dev.

Signed-off-by: Shannon Nelson 
---
 net/8021q/vlan_dev.c | 98 
 1 file changed, 98 insertions(+)

diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index ff720f1..a46e056 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -32,6 +32,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "vlan.h"
 #include "vlanproc.h"
@@ -546,6 +547,7 @@ static struct device_type vlan_type = {
 };
 
 static const struct net_device_ops vlan_netdev_ops;
+static const struct xfrmdev_ops vlan_xfrmdev_ops;
 
 static int vlan_dev_init(struct net_device *dev)
 {
@@ -553,6 +555,7 @@ static int vlan_dev_init(struct net_device *dev)
 
netif_carrier_off(dev);
 
+   /* copy netdev features into list of user selectable features */
/* IFF_BROADCAST|IFF_MULTICAST; ??? */
dev->flags  = real_dev->flags & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
  IFF_MASTER | IFF_SLAVE);
@@ -564,6 +567,12 @@ static int vlan_dev_init(struct net_device *dev)
   NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE |
   NETIF_F_HIGHDMA | NETIF_F_SCTP_CRC |
   NETIF_F_ALL_FCOE;
+#ifdef CONFIG_XFRM_OFFLOAD
+   dev->hw_features |= real_dev->hw_features & (NETIF_F_HW_ESP |
+NETIF_F_HW_ESP_TX_CSUM |
+NETIF_F_GSO_ESP);
+   dev->xfrmdev_ops = _xfrmdev_ops;
+#endif
 
dev->features |= dev->hw_features | NETIF_F_LLTX;
dev->gso_max_size = real_dev->gso_max_size;
@@ -767,6 +776,95 @@ static int vlan_dev_get_iflink(const struct net_device 
*dev)
return real_dev->ifindex;
 }
 
+static int vlan_xdo_state_add(struct xfrm_state *xs)
+{
+   struct net_device *dev = xs->xso.dev;
+   struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+   int ret;
+
+   /* swap out the vlan dev and put a hold on the real device */
+   xs->xso.dev = real_dev;
+   dev_hold(real_dev);
+
+   ret = real_dev->xfrmdev_ops->xdo_dev_state_add(xs);
+
+   /* if it wasn't successful, release the real_dev */
+   if (ret)
+   dev_put(real_dev);
+
+   /* put dev back before we leave */
+   xs->xso.dev = dev;
+
+   return ret;
+}
+
+static void vlan_xdo_state_delete(struct xfrm_state *xs)
+{
+   struct net_device *dev = xs->xso.dev;
+   struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+
+   xs->xso.dev = real_dev;
+   real_dev->xfrmdev_ops->xdo_dev_state_delete(xs);
+   xs->xso.dev = dev;
+}
+
+static void vlan_xdo_state_free(struct xfrm_state *xs)
+{
+   struct net_device *dev = xs->xso.dev;
+   struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+
+   /* check for optional interface */
+   if (real_dev->xfrmdev_ops->xdo_dev_state_free) {
+   xs->xso.dev = real_dev;
+   real_dev->xfrmdev_ops->xdo_dev_state_free(xs);
+   xs->xso.dev = dev;
+   }
+
+   /* let go of the real device, we're done with it */
+   dev_put(real_dev);
+}
+
+static bool vlan_xdo_offload_ok(struct sk_buff *skb, struct xfrm_state *xs)
+{
+   struct net_device *dev = xs->xso.dev;
+   struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+   bool ret = true;
+
+   /* check for optional interface */
+   if (likely(real_dev->xfrmdev_ops->xdo_dev_offload_ok)) {
+   xs->xso.dev = real_dev;
+   ret = real_dev->xfrmdev_ops->xdo_dev_offload_ok(skb, xs);
+   xs->xso.dev = dev;
+   }
+
+   if (!ret)
+   netdev_err(dev, "%s: real_dev %s NAKd the offload\n",
+  __func__, real_dev->name);
+
+   return ret;
+}
+
+static void vlan_xdo_advance_esn(struct xfrm_state *xs)
+{
+   struct net_device *dev = xs->xso.dev;
+   struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+
+   /* check for optional interface */
+   if (real_dev->xfrmdev_ops->xdo_dev_state_advance_esn) {
+   xs->xso.dev = real_dev;
+   real_dev->xfrmdev_ops->xdo_dev_state_advance_esn(xs);
+   xs->xso.dev = dev;
+   }
+}
+
+static const struct xfrmdev_ops vlan_xfrmdev_ops = {
+   .xdo_dev_state_add = vlan_xdo_state_add,
+   .xdo_dev_state_delete = vlan_xdo_state_delete,
+   .xdo_dev_state_free = vlan_xd

Re: [PATCH bpf-next] xdp: sample code for redirecting vlan packets to specific cpus

2018-10-29 Thread Shannon Nelson

On 10/29/2018 3:11 PM, John Fastabend wrote:

On 10/29/2018 02:19 PM, Shannon Nelson wrote:

This is an example of using XDP to redirect the processing of
particular vlan packets to specific CPUs.  This is in response
to comments received on a kernel patch put forth previously
to do something similar using RPS.
  https://www.spinics.net/lists/netdev/msg528210.html
  [PATCH net-next] net: enable RPS on vlan devices

This XDP application watches for inbound vlan-tagged packets
and redirects those packets to be processed on a specific CPU
as configured in a BPF map.  The BPF map can be modified by
this user program, which can also load and unload the kernel
XDP code.

One example use is for supporting VMs where we can't control the
OS being used: we'd like to separate the VM CPU processing from
the host's CPUs as a way to help mitigate L1TF related issues.
When running the VM's traffic on a vlan we can stick the host's
Rx processing on one set of CPUs separate from the VM's CPUs.

This example currently uses a vlan key and cpu value in the
BPF map, so only can do one CPU per vlan.  This could easily
be modified to use a bitpattern of CPUs rather than a CPU id
to allow multiple CPUs per vlan.


Great, so does this solve your use case then? At least on drivers
with XDP support?


Well, more or less... the actual issue was a request for our UEK5 
distribution, based on v4.14, which doesn't have support for the CPU 
redirect.  Internal discussion continues.






Signed-off-by: Shannon Nelson 
---


Some really small and trivial nits below.

Acked-by: John Fastabend 



Thanks,
sln



[PATCH bpf-next] xdp: sample code for redirecting vlan packets to specific cpus

2018-10-29 Thread Shannon Nelson
This is an example of using XDP to redirect the processing of
particular vlan packets to specific CPUs.  This is in response
to comments received on a kernel patch put forth previously
to do something similar using RPS.
 https://www.spinics.net/lists/netdev/msg528210.html
 [PATCH net-next] net: enable RPS on vlan devices

This XDP application watches for inbound vlan-tagged packets
and redirects those packets to be processed on a specific CPU
as configured in a BPF map.  The BPF map can be modified by
this user program, which can also load and unload the kernel
XDP code.

One example use is for supporting VMs where we can't control the
OS being used: we'd like to separate the VM CPU processing from
the host's CPUs as a way to help mitigate L1TF related issues.
When running the VM's traffic on a vlan we can stick the host's
Rx processing on one set of CPUs separate from the VM's CPUs.

This example currently uses a vlan key and cpu value in the
BPF map, so only can do one CPU per vlan.  This could easily
be modified to use a bitpattern of CPUs rather than a CPU id
to allow multiple CPUs per vlan.

Signed-off-by: Shannon Nelson 
---
 samples/bpf/Makefile |   3 +
 samples/bpf/xdp_vlan_redirect_kern.c | 118 +++
 samples/bpf/xdp_vlan_redirect_user.c | 393 +++
 3 files changed, 514 insertions(+)
 create mode 100644 samples/bpf/xdp_vlan_redirect_kern.c
 create mode 100644 samples/bpf/xdp_vlan_redirect_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 1303af1..a9746ce 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -44,6 +44,7 @@ hostprogs-y += load_sock_ops
 hostprogs-y += xdp_redirect
 hostprogs-y += xdp_redirect_map
 hostprogs-y += xdp_redirect_cpu
+hostprogs-y += xdp_vlan_redirect
 hostprogs-y += xdp_monitor
 hostprogs-y += xdp_rxq_info
 hostprogs-y += syscall_tp
@@ -99,6 +100,7 @@ per_socket_stats_example-objs := cookie_uid_helper_example.o
 xdp_redirect-objs := bpf_load.o xdp_redirect_user.o
 xdp_redirect_map-objs := bpf_load.o xdp_redirect_map_user.o
 xdp_redirect_cpu-objs := bpf_load.o xdp_redirect_cpu_user.o
+xdp_vlan_redirect-objs := bpf_load.o xdp_vlan_redirect_user.o
 xdp_monitor-objs := bpf_load.o xdp_monitor_user.o
 xdp_rxq_info-objs := xdp_rxq_info_user.o
 syscall_tp-objs := bpf_load.o syscall_tp_user.o
@@ -154,6 +156,7 @@ always += tcp_basertt_kern.o
 always += xdp_redirect_kern.o
 always += xdp_redirect_map_kern.o
 always += xdp_redirect_cpu_kern.o
+always += xdp_vlan_redirect_kern.o
 always += xdp_monitor_kern.o
 always += xdp_rxq_info_kern.o
 always += xdp2skb_meta_kern.o
diff --git a/samples/bpf/xdp_vlan_redirect_kern.c 
b/samples/bpf/xdp_vlan_redirect_kern.c
new file mode 100644
index 000..3d561a0
--- /dev/null
+++ b/samples/bpf/xdp_vlan_redirect_kern.c
@@ -0,0 +1,118 @@
+// SPDX-License-Identifier: GPL-2.0
+/*  XDP redirect vlans to CPUs - kernel code
+ *
+ *  Copyright(c) 2018 Oracle Corp.
+ */
+
+#include 
+#include 
+
+#include 
+#include "bpf_helpers.h"
+
+char _license[] SEC("license") = "GPL";
+
+#define MAX_CPUS 64   /* WARNING - sync with _user.c */
+#define UNDEF_CPU 0xff00  /* WARNING - sync with _user.c */
+
+/* The vlan index finds cpu(s) for processing a packet */
+struct bpf_map_def SEC("maps") vlan_redirect_map = {
+   .type = BPF_MAP_TYPE_ARRAY,
+   .key_size = sizeof(u32),/* vlan tag */
+   .value_size = sizeof(u64),  /* cpu bitpattern */
+   .max_entries = 4096,
+};
+
+/* List of cpus that can participate in the vlan redirect */
+struct bpf_map_def SEC("maps") vlan_redirect_cpus_map = {
+   .type   = BPF_MAP_TYPE_CPUMAP,
+   .key_size   = sizeof(u32),  /* cpu id */
+   .value_size = sizeof(u32),  /* queue size */
+   .max_entries= MAX_CPUS,
+};
+
+/* Counters for debug */
+#define VRC_CALLS  0/* number of calls to this program */
+#define VRC_VLANS  1/* number of vlan packets seen */
+#define VRC_HITS   2/* number of redirects attempted */
+#define CPU_COUNT  3/* number of CPUs found */
+struct bpf_map_def SEC("maps") vlan_redirect_counters_map = {
+   .type = BPF_MAP_TYPE_ARRAY,
+   .key_size = sizeof(u32),/* vlan tag */
+   .value_size = sizeof(u64),  /* cpu bitpattern */
+   .max_entries = 4,
+};
+
+SEC("xdp_vlan_redirect")
+int xdp_vlan_redirect(struct xdp_md *ctx)
+{
+   void *data_end = (void *)(long)ctx->data_end;
+   void *data = (void *)(long)ctx->data;
+   struct ethhdr *eth = data;
+   uint64_t h_proto, nh_off;
+   struct vlan_hdr *vhdr;
+   uint64_t *cpu_value;
+   uint32_t num_cpus;
+   uint32_t minlen;
+   uint64_t *vp;
+   uint64_t v, k;
+   uint64_t vlan;
+
+   /* count packets processed */
+   k = VRC_CALLS;
+   vp = bpf_map_lookup_ele

[PATCH bpf-next] bpf_load: add map name to load_maps error message

2018-10-29 Thread Shannon Nelson
To help when debugging bpf/xdp load issues, have the load_map()
error message include the number and name of the map that
failed.

Signed-off-by: Shannon Nelson 
---
 samples/bpf/bpf_load.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 89161c9..5de0357 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -282,8 +282,8 @@ static int load_maps(struct bpf_map_data *maps, int nr_maps,
numa_node);
}
if (map_fd[i] < 0) {
-   printf("failed to create a map: %d %s\n",
-  errno, strerror(errno));
+   printf("failed to create map %d (%s): %d %s\n",
+  i, maps[i].name, errno, strerror(errno));
return 1;
}
maps[i].fd = map_fd[i];
-- 
2.7.4



Re: [PATCH net-next] net: enable RPS on vlan devices

2018-10-10 Thread Shannon Nelson

On 10/10/2018 10:37 AM, John Fastabend wrote:

Latest tree has a sk_lookup() helper supported in 'tc' layer now
to lookup the socket. And XDP has support for a "cpumap" object
that allows redirect to remote CPUs. Neither was specifically
designed for this but I suspect with some extra work these might
be what is needed.

I would start by looking at bpf_sk_lookup() in filter.c and the
cpumap type in ./kernel/bpf/cpumap.c, also in general sk_lookup
from XDP layer will likely be needed shortly anyways.


Thanks, John, for pointing to something I can start looking at.  My 
customer still wants to use the RPS that they already know, but I'll 
start looking into how this also might work for us.


sln


Re: [PATCH net-next] net: enable RPS on vlan devices

2018-10-10 Thread Shannon Nelson

On 10/10/2018 10:14 AM, Eric Dumazet wrote:


On 10/10/2018 09:18 AM, Shannon Nelson wrote:

On 10/9/2018 7:17 PM, Eric Dumazet wrote:



On 10/09/2018 07:11 PM, Shannon Nelson wrote:


Hence the reason we sent this as an RFC a couple of weeks ago.  We got no 
response, so followed up with this patch in order to get some input. Do you 
have any suggestions for how we might accomplish this in a less ugly way?


I dunno, maybe a modern way for all these very specific needs would be to use 
an eBPF
hook to implement whatever combination of RPS/RFS/what_have_you

Then, we no longer have to review what various strategies are used by users.


We're trying to make use of an existing useful feature that was designed for exactly 
this kind of problem.  It is already there and no new user training is needed.  We're 
actually fixing what could arguably be called a bug since the 
/sys/class/net//queues/rx-0/rps_cpus entry exists for vlan devices but 
currently doesn't do anything.  We're also addressing a security concern related to 
the recent L1TF excitement.

For this case, we want to target the network stack processing to happen on a 
certain subset of CPUs.  With admittedly only a cursory look through eBPF, I 
don't see an obvious way to target the packet processing to an alternative CPU, 
unless we add yet another field to the skb that eBPF/XDP could fill and then 
query that field in the same time as we currently check get_rps_cpu().  But 
adding to the skb is usually frowned upon unless absolutely necessary, and this 
seems like a duplication of what we already have with RPS, so why add a 
competing feature?

Back to my earlier question: are there any suggestions for how we might 
accomplish this in a less ugly way?



What if you want to have efficient multi queue processing ?
The Vlan device could have multiple RX queues, but you forced queue_mapping=0


This would be easy enough to change to a simple modulus of a recorded 
queue mapping with the number of queues in the vlan device.  We can fix 
that.




Honestly, RPS & RFS show their age and complexity (look at net/core/net-sysfs.c 
...)

We should not expand it, we should put in place a new infrastructure, fully 
expandable.
With socket lookups, we even can avoid having a hashtable for flow information, 
removing
one cache miss, and removing flow collisions.

eBPF seems perfect to me.


And yet specifying CPUs for processing is exactly what RPS was designed for.



It is time that we stop adding core infra that most users do not need/use.
(RPS and RFS are default off)


For that matter, lots of features are "default off" until someone 
configures them, and there are lots of features that are only used by a 
subset of users.  In this case, we're trying to use something that 
already exists and arguably is broken for the vlan case.  Are there some 
technical reasons why this is not a good solution?


sln





Re: [PATCH net-next] net: enable RPS on vlan devices

2018-10-10 Thread Shannon Nelson

On 10/9/2018 7:17 PM, Eric Dumazet wrote:



On 10/09/2018 07:11 PM, Shannon Nelson wrote:


Hence the reason we sent this as an RFC a couple of weeks ago.  We got no 
response, so followed up with this patch in order to get some input. Do you 
have any suggestions for how we might accomplish this in a less ugly way?


I dunno, maybe a modern way for all these very specific needs would be to use 
an eBPF
hook to implement whatever combination of RPS/RFS/what_have_you

Then, we no longer have to review what various strategies are used by users.


We're trying to make use of an existing useful feature that was designed 
for exactly this kind of problem.  It is already there and no new user 
training is needed.  We're actually fixing what could arguably be called 
a bug since the /sys/class/net//queues/rx-0/rps_cpus entry exists 
for vlan devices but currently doesn't do anything.  We're also 
addressing a security concern related to the recent L1TF excitement.


For this case, we want to target the network stack processing to happen 
on a certain subset of CPUs.  With admittedly only a cursory look 
through eBPF, I don't see an obvious way to target the packet processing 
to an alternative CPU, unless we add yet another field to the skb that 
eBPF/XDP could fill and then query that field in the same time as we 
currently check get_rps_cpu().  But adding to the skb is usually frowned 
upon unless absolutely necessary, and this seems like a duplication of 
what we already have with RPS, so why add a competing feature?


Back to my earlier question: are there any suggestions for how we might 
accomplish this in a less ugly way?


sln


Re: [PATCH net-next] net: enable RPS on vlan devices

2018-10-09 Thread Shannon Nelson

On 10/9/2018 6:04 PM, Eric Dumazet wrote:


On 10/09/2018 05:41 PM, Shannon Nelson wrote:

From: Silviu Smarandache 

This patch modifies the RPS processing code so that it searches
for a matching vlan interface on the packet and then uses the
RPS settings of the vlan interface.  If no vlan interface
is found or the vlan interface does not have RPS enabled,
it will fall back to the RPS settings of the underlying device.

In supporting VMs where we can't control the OS being used,
we'd like to separate the VM cpu processing from the host's
cpus as a way to help mitigate the impact of the L1TF issue.
When running the VM's traffic on a vlan we can stick the Rx
processing on one set of cpus separate from the VM's cpus.
Yes, choosing to use this may cause a bit of throughput pain
when the packets are actually passed into the VM and have to
move from one cache to another.

Orabug: 28645929

Signed-off-by: Silviu Smarandache 
Signed-off-by: Shannon Nelson 
---
  net/core/dev.c | 59 +++---
  1 file changed, 56 insertions(+), 3 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 0b2d777..1da3f63 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3971,8 +3971,8 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
   * CPU from the RPS map of the receiving queue for a given skb.
   * rcu_read_lock must be held on entry.
   */
-static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
-  struct rps_dev_flow **rflowp)
+static int __get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+struct rps_dev_flow **rflowp)
  {
const struct rps_sock_flow_table *sock_flow_table;
struct netdev_rx_queue *rxqueue = dev->_rx;
@@ -4066,6 +4066,35 @@ static int get_rps_cpu(struct net_device *dev, struct 
sk_buff *skb,
return cpu;
  }
  
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,

+  struct rps_dev_flow **rflowp)
+{
+   /* Check for a vlan device with RPS settings */
+   if (skb_vlan_tag_present(skb)) {
+   struct net_device *vdev;
+   u16 vid;
+
+   vid = skb_vlan_tag_get_id(skb);
+   vdev = __vlan_find_dev_deep_rcu(dev, skb->vlan_proto, vid);
+   if (vdev) {
+   /* recorded queue is not referring to the vlan device.
+* Save and restore it
+*/
+   int cpu;
+   u16 queue_mapping = skb_get_queue_mapping(skb);
+
+   skb_set_queue_mapping(skb, 0);
+   cpu = __get_rps_cpu(vdev, skb, rflowp);
+   skb_set_queue_mapping(skb, queue_mapping);


This is really ugly :/


Hence the reason we sent this as an RFC a couple of weeks ago.  We got 
no response, so followed up with this patch in order to get some input. 
Do you have any suggestions for how we might accomplish this in a less 
ugly way?




Also what makes vlan so special compared to say macvlan ?


Only that vlan was the itch that Silviu needed to scratch.  If we can 
solve this for vlan, then perhaps we'll have a template to follow for 
other upper devices.


sln



[PATCH net-next] net: enable RPS on vlan devices

2018-10-09 Thread Shannon Nelson
From: Silviu Smarandache 

This patch modifies the RPS processing code so that it searches
for a matching vlan interface on the packet and then uses the
RPS settings of the vlan interface.  If no vlan interface
is found or the vlan interface does not have RPS enabled,
it will fall back to the RPS settings of the underlying device.

In supporting VMs where we can't control the OS being used,
we'd like to separate the VM cpu processing from the host's
cpus as a way to help mitigate the impact of the L1TF issue.
When running the VM's traffic on a vlan we can stick the Rx
processing on one set of cpus separate from the VM's cpus.
Yes, choosing to use this may cause a bit of throughput pain
when the packets are actually passed into the VM and have to
move from one cache to another.

Orabug: 28645929

Signed-off-by: Silviu Smarandache 
Signed-off-by: Shannon Nelson 
---
 net/core/dev.c | 59 +++---
 1 file changed, 56 insertions(+), 3 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 0b2d777..1da3f63 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3971,8 +3971,8 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
  * CPU from the RPS map of the receiving queue for a given skb.
  * rcu_read_lock must be held on entry.
  */
-static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
-  struct rps_dev_flow **rflowp)
+static int __get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+struct rps_dev_flow **rflowp)
 {
const struct rps_sock_flow_table *sock_flow_table;
struct netdev_rx_queue *rxqueue = dev->_rx;
@@ -4066,6 +4066,35 @@ static int get_rps_cpu(struct net_device *dev, struct 
sk_buff *skb,
return cpu;
 }
 
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+  struct rps_dev_flow **rflowp)
+{
+   /* Check for a vlan device with RPS settings */
+   if (skb_vlan_tag_present(skb)) {
+   struct net_device *vdev;
+   u16 vid;
+
+   vid = skb_vlan_tag_get_id(skb);
+   vdev = __vlan_find_dev_deep_rcu(dev, skb->vlan_proto, vid);
+   if (vdev) {
+   /* recorded queue is not referring to the vlan device.
+* Save and restore it
+*/
+   int cpu;
+   u16 queue_mapping = skb_get_queue_mapping(skb);
+
+   skb_set_queue_mapping(skb, 0);
+   cpu = __get_rps_cpu(vdev, skb, rflowp);
+   skb_set_queue_mapping(skb, queue_mapping);
+   if (cpu != -1)
+   return cpu;
+   }
+   }
+
+   /* Fall back to RPS settings of original device */
+   return __get_rps_cpu(dev, skb, rflowp);
+}
+
 #ifdef CONFIG_RFS_ACCEL
 
 /**
@@ -4437,12 +4466,23 @@ static int netif_rx_internal(struct sk_buff *skb)
preempt_disable();
rcu_read_lock();
 
+   /* strip any vlan tag before calling get_rps_cpu() */
+   if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
+   skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
+   skb = skb_vlan_untag(skb);
+   if (unlikely(!skb)) {
+   ret = NET_RX_DROP;
+   goto unlock;
+   }
+   }
+
cpu = get_rps_cpu(skb->dev, skb, );
if (cpu < 0)
cpu = smp_processor_id();
 
ret = enqueue_to_backlog(skb, cpu, >last_qtail);
 
+unlock:
rcu_read_unlock();
preempt_enable();
} else
@@ -5095,8 +5135,19 @@ static int netif_receive_skb_internal(struct sk_buff 
*skb)
 #ifdef CONFIG_RPS
if (static_key_false(_needed)) {
struct rps_dev_flow voidflow, *rflow = 
-   int cpu = get_rps_cpu(skb->dev, skb, );
+   int cpu;
+
+   /* strip any vlan tag before calling get_rps_cpu() */
+   if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
+   skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
+   skb = skb_vlan_untag(skb);
+   if (unlikely(!skb)) {
+   ret = NET_RX_DROP;
+   goto out;
+   }
+   }
 
+   cpu = get_rps_cpu(skb->dev, skb, );
if (cpu >= 0) {
ret = enqueue_to_backlog(skb, cpu, >last_qtail);
rcu_read_unlock();
@@ -5105,6 +5156,8 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
}
 #endif
ret = __netif_receive_skb(skb);
+
+out:
rcu_read_unlock();
return ret;
 }
-- 
2.7.4



[PATCH] ixgbe: allow IPsec Tx offload in VEPA mode

2018-10-04 Thread Shannon Nelson
When it's possible that the PF might end up trying to send a
packet to one of its own VFs, we have to forbid IPsec offload
because the device drops the packets into a black hole.
See commit 47b6f50077e6 ("ixgbe: disallow IPsec Tx offload
when in SR-IOV mode") for more info.

This really is only necessary when the device is in the default
VEB mode.  If instead the device is running in VEPA mode,
the packets will go through the encryption engine and out the
MAC/PHY as normal, and get "hairpinned" as needed by the switch.

So let's not block IPsec offload when in VEPA mode.  To get
there with the ixgbe device, use the handy 'bridge' command:
bridge link set dev eth1 hwmode vepa

Signed-off-by: Shannon Nelson 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index fd1b054..4d77f42 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -4,6 +4,7 @@
 #include "ixgbe.h"
 #include 
 #include 
+#include 
 
 #define IXGBE_IPSEC_KEY_BITS  160
 static const char aes_gcm_name[] = "rfc4106(gcm(aes))";
@@ -693,7 +694,8 @@ static int ixgbe_ipsec_add_sa(struct xfrm_state *xs)
} else {
struct tx_sa tsa;
 
-   if (adapter->num_vfs)
+   if (adapter->num_vfs &&
+   adapter->bridge_mode != BRIDGE_MODE_VEPA)
return -EOPNOTSUPP;
 
/* find the first unused index */
-- 
2.7.4



[PATCH RFC net-next] net: enable RPS on vlan devices

2018-09-24 Thread Shannon Nelson
This patch modifies the RPS processing code so that it searches
for a matching vlan interface on the packet and then uses the
RPS settings of the vlan interface.  If no vlan interface
is found or the vlan interface does not have RPS enabled,
it will fall back to the RPS settings of the underlying device.

In supporting VMs where we can't control the OS being used,
we'd like to separate the VM cpu processing from the host's
cpus as a way to help mitigate the impact of the L1TF issue.
When running the VM's traffic on a vlan we can stick the Rx
processing on one set of cpus separate from the VM's cpus.
Yes, choosing to use this may cause a bit of throughput pain
when the packets are actually passed into the VM and have to
move from one cache to another.

Orabug: 28645929

Signed-off-by: Silviu Smarandache 
Signed-off-by: Shannon Nelson 
---
 net/core/dev.c | 59 +++---
 1 file changed, 56 insertions(+), 3 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 559a912..28b12d1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3735,8 +3735,8 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
  * CPU from the RPS map of the receiving queue for a given skb.
  * rcu_read_lock must be held on entry.
  */
-static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
-  struct rps_dev_flow **rflowp)
+static int __get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+struct rps_dev_flow **rflowp)
 {
const struct rps_sock_flow_table *sock_flow_table;
struct netdev_rx_queue *rxqueue = dev->_rx;
@@ -3830,6 +3830,35 @@ static int get_rps_cpu(struct net_device *dev, struct 
sk_buff *skb,
return cpu;
 }
 
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+  struct rps_dev_flow **rflowp)
+{
+   /* Check for a vlan device with RPS settings */
+   if (skb_vlan_tag_present(skb)) {
+   struct net_device *vdev;
+   u16 vid;
+
+   vid = skb_vlan_tag_get_id(skb);
+   vdev = __vlan_find_dev_deep_rcu(dev, skb->vlan_proto, vid);
+   if (vdev) {
+   /* recorded queue is not referring to the vlan device.
+* Save and restore it
+*/
+   int cpu;
+   u16 queue_mapping = skb_get_queue_mapping(skb);
+
+   skb_set_queue_mapping(skb, 0);
+   cpu = __get_rps_cpu(vdev, skb, rflowp);
+   skb_set_queue_mapping(skb, queue_mapping);
+   if (cpu != -1)
+   return cpu;
+   }
+   }
+
+   /* Fall back to RPS settings of original device */
+   return __get_rps_cpu(dev, skb, rflowp);
+}
+
 #ifdef CONFIG_RFS_ACCEL
 
 /**
@@ -4201,12 +4230,23 @@ static int netif_rx_internal(struct sk_buff *skb)
preempt_disable();
rcu_read_lock();
 
+   /* strip any vlan tag before calling get_rps_cpu() */
+   if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
+   skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
+   skb = skb_vlan_untag(skb);
+   if (unlikely(!skb)) {
+   ret = NET_RX_DROP;
+   goto unlock;
+   }
+   }
+
cpu = get_rps_cpu(skb->dev, skb, );
if (cpu < 0)
cpu = smp_processor_id();
 
ret = enqueue_to_backlog(skb, cpu, >last_qtail);
 
+unlock:
rcu_read_unlock();
preempt_enable();
} else
@@ -4755,8 +4795,19 @@ static int netif_receive_skb_internal(struct sk_buff 
*skb)
 #ifdef CONFIG_RPS
if (static_key_false(_needed)) {
struct rps_dev_flow voidflow, *rflow = 
-   int cpu = get_rps_cpu(skb->dev, skb, );
+   int cpu;
+
+   /* strip any vlan tag before calling get_rps_cpu() */
+   if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
+   skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
+   skb = skb_vlan_untag(skb);
+   if (unlikely(!skb)) {
+   ret = NET_RX_DROP;
+   goto out;
+   }
+   }
 
+   cpu = get_rps_cpu(skb->dev, skb, );
if (cpu >= 0) {
ret = enqueue_to_backlog(skb, cpu, >last_qtail);
rcu_read_unlock();
@@ -4765,6 +4816,8 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
}
 #endif
ret = __netif_receive_skb(skb);
+
+out:
rcu_read_unlock();
return ret;
 }
-- 
2.7.4



Re: [PATCH] ixgbevf: off by one in ixgbevf_ipsec_tx()

2018-09-19 Thread Shannon Nelson

On 9/19/2018 3:35 AM, Dan Carpenter wrote:

The ipsec->tx_tbl[] array has IXGBE_IPSEC_MAX_SA_COUNT elements so the >
should be a >=.

Fixes: 0062e7cc955e ("ixgbevf: add VF IPsec offload code")
Signed-off-by: Dan Carpenter 


Signed-off-by: Shannon Nelson 



diff --git a/drivers/net/ethernet/intel/ixgbevf/ipsec.c 
b/drivers/net/ethernet/intel/ixgbevf/ipsec.c
index 997cea675a37..4fcbeffce52b 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ipsec.c
@@ -470,7 +470,7 @@ int ixgbevf_ipsec_tx(struct ixgbevf_ring *tx_ring,
}
  
  	sa_idx = xs->xso.offload_handle - IXGBE_IPSEC_BASE_TX_INDEX;

-   if (unlikely(sa_idx > IXGBE_IPSEC_MAX_SA_COUNT)) {
+   if (unlikely(sa_idx >= IXGBE_IPSEC_MAX_SA_COUNT)) {
netdev_err(tx_ring->netdev, "%s: bad sa_idx=%d handle=%lu\n",
   __func__, sa_idx, xs->xso.offload_handle);
return 0;



[PATCH next-queue 1/2] ixgbe: disallow ipsec tx offload when in sr-iov mode

2018-08-22 Thread Shannon Nelson
There seems to be a problem in the x540's internal switch wherein if SR/IOV
mode is enabled and an offloaded IPsec packet is sent to a local VF,
the packet is silently dropped.  This might never be a problem as it is
somewhat a corner case, but if someone happens to be using IPsec offload
from the PF to a VF that just happens to get migrated to the local box,
communication will mysteriously fail.

Not good.

A simple way to protect from this is to simply not allow any IPsec offloads
for outgoing packets when num_vfs != 0.  This doesn't help any offloads that
were created before SR/IOV was enabled, but we'll get to that later.

Signed-off-by: Shannon Nelson 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index 68395ab..24076b4 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -697,6 +697,9 @@ static int ixgbe_ipsec_add_sa(struct xfrm_state *xs)
} else {
struct tx_sa tsa;
 
+   if (adapter->num_vfs)
+   return -EOPNOTSUPP;
+
/* find the first unused index */
ret = ixgbe_ipsec_find_empty_idx(ipsec, false);
if (ret < 0) {
-- 
2.7.4



[PATCH next-queue 2/2] ixgbe: fix the return value for unsupported VF offload

2018-08-22 Thread Shannon Nelson
When failing the request because we can't support that offload,
reporting EOPNOTSUPP makes much more sense than ENXIO.

Signed-off-by: Shannon Nelson 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index 24076b4..7890f4a 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -898,7 +898,7 @@ int ixgbe_ipsec_vf_add_sa(struct ixgbe_adapter *adapter, 
u32 *msgbuf, u32 vf)
 * device, so block these requests for now.
 */
if (!(sam->flags & XFRM_OFFLOAD_INBOUND)) {
-   err = -ENXIO;
+   err = -EOPNOTSUPP;
goto err_out;
}
 
-- 
2.7.4



[PATCH ipsec-next] xfrm: allow driver to quietly refuse offload

2018-08-22 Thread Shannon Nelson
If the "offload" attribute is used to create an IPsec SA
and the .xdo_dev_state_add() fails, the SA creation fails.
However, if the "offload" attribute is used on a device that
doesn't offer it, the attribute is quietly ignored and the SA
is created without an offload.

Along the same line of that second case, it would be good to
have a way for the device to refuse to offload an SA without
failing the whole SA creation.  This patch adds that feature
by allowing the driver to return -EOPNOTSUPP as a signal that
the SA may be fine, it just can't be offloaded.

This allows the user a little more flexibility in requesting
offloads and not needing to know every detail at all times about
each specific NIC when trying to create SAs.

Signed-off-by: Shannon Nelson 
---

More specifically, this will help one user experience issue
with the coming ixgbevf IPsec offload.

 Documentation/networking/xfrm_device.txt | 4 
 net/xfrm/xfrm_device.c   | 6 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/Documentation/networking/xfrm_device.txt 
b/Documentation/networking/xfrm_device.txt
index 50c34ca..267f55b 100644
--- a/Documentation/networking/xfrm_device.txt
+++ b/Documentation/networking/xfrm_device.txt
@@ -68,6 +68,10 @@ and an indication of whether it is for Rx or Tx.  The driver 
should
- verify the algorithm is supported for offloads
- store the SA information (key, salt, target-ip, protocol, etc)
- enable the HW offload of the SA
+   - return status value:
+   0 success
+   -EOPNETSUPP   offload not supported, try SW IPsec
+   other fail the request
 
 The driver can also set an offload_handle in the SA, an opaque void pointer
 that can be used to convey context into the fast-path offload requests.
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 5611b75..3a1d9d6 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -192,9 +192,13 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state 
*x,
 
err = dev->xfrmdev_ops->xdo_dev_state_add(x);
if (err) {
+   xso->num_exthdrs = 0;
+   xso->flags = 0;
xso->dev = NULL;
dev_put(dev);
-   return err;
+
+   if (err != -EOPNOTSUPP)
+   return err;
}
 
return 0;
-- 
2.7.4



Re: [Intel-wired-lan] [PATCH next-queue 0/8] ixgbe/ixgbevf: IPsec offload support for VFs

2018-08-17 Thread Shannon Nelson



On 8/16/2018 2:36 PM, Shannon Nelson wrote:

On 8/16/2018 2:15 PM, Alexander Duyck wrote:

On Tue, Aug 14, 2018 at 10:10 AM Shannon Nelson
 wrote:


On 8/14/2018 8:30 AM, Alexander Duyck wrote:

On Mon, Aug 13, 2018 at 11:43 AM Shannon Nelson
 wrote:


This set of patches implements IPsec hardware offload for VF 
devices in

Intel's 10Gbe x540 family of Ethernet devices.


[...]



So the one question I would have about this patch set is what happens
if you are setting up a ipsec connection between the PF and one of the
VFs on the same port/function? Do the ipsec offloads get translated
across the Tx loopback or do they end up causing issues? Specifically
I would be interested in seeing the results of a test either between
two VFs, or the PF and one of the VFs on the same port.

- Alex



There is definitely something funky in the internal switch connection,
as messages going from PF to VF with an offloaded encryption don't seem
to get received by the VF, at least when in a VEB setup.  If I only set
up offloads on the Rx on both PF and VF, and don't offload the Tx, then
things work.

I don't have a setup to test this, but I suspect that in a VEPA
configuration, with packets going out to the switch and turned around
back in, the Tx encryption offload would happen as expected.

sln


We should probably look at adding at least one patch to the set then
that disables IPsec Tx offload if SR-IOV is enabled with VEB so that
we don't end up breaking connections should a VF be migrated from a
remote system to a local one that it is connected to.

- Alex



The problem with this is that someone could set up an IPsec connection 
on the PF for Tx and Rx use, then set num_vfs, start some VFs, and we 
still can end up in the same place.  I don't think we want to disallow 
all Tx IPsec offload.


Maybe we can catch it in ixgbe_ipsec_offload_ok()?  If it can find that 
the dest mac is on the internal switch, perhaps it can NAK the Tx 
offload?  That would force the XFRM xmit code to do a regular SW encrypt 
before sending the packet.  I'll look into this.


sln


This would be a great idea, but the xdo_state_offload_ok() callback 
happens in the network stack before routing has happened, so there is no 
mac address yet in the skb.  We may be stuck with NAKing *all* Tx 
offloads when num_vfs != 0.  It works, and it is better than no offload 
at all, but it sure harshes the vibe.  Blech.


sln



Re: [Intel-wired-lan] [PATCH next-queue 0/8] ixgbe/ixgbevf: IPsec offload support for VFs

2018-08-16 Thread Shannon Nelson

On 8/16/2018 2:15 PM, Alexander Duyck wrote:

On Tue, Aug 14, 2018 at 10:10 AM Shannon Nelson
 wrote:


On 8/14/2018 8:30 AM, Alexander Duyck wrote:

On Mon, Aug 13, 2018 at 11:43 AM Shannon Nelson
 wrote:


This set of patches implements IPsec hardware offload for VF devices in
Intel's 10Gbe x540 family of Ethernet devices.


[...]



So the one question I would have about this patch set is what happens
if you are setting up a ipsec connection between the PF and one of the
VFs on the same port/function? Do the ipsec offloads get translated
across the Tx loopback or do they end up causing issues? Specifically
I would be interested in seeing the results of a test either between
two VFs, or the PF and one of the VFs on the same port.

- Alex



There is definitely something funky in the internal switch connection,
as messages going from PF to VF with an offloaded encryption don't seem
to get received by the VF, at least when in a VEB setup.  If I only set
up offloads on the Rx on both PF and VF, and don't offload the Tx, then
things work.

I don't have a setup to test this, but I suspect that in a VEPA
configuration, with packets going out to the switch and turned around
back in, the Tx encryption offload would happen as expected.

sln


We should probably look at adding at least one patch to the set then
that disables IPsec Tx offload if SR-IOV is enabled with VEB so that
we don't end up breaking connections should a VF be migrated from a
remote system to a local one that it is connected to.

- Alex



The problem with this is that someone could set up an IPsec connection 
on the PF for Tx and Rx use, then set num_vfs, start some VFs, and we 
still can end up in the same place.  I don't think we want to disallow 
all Tx IPsec offload.


Maybe we can catch it in ixgbe_ipsec_offload_ok()?  If it can find that 
the dest mac is on the internal switch, perhaps it can NAK the Tx 
offload?  That would force the XFRM xmit code to do a regular SW encrypt 
before sending the packet.  I'll look into this.


sln







Re: [PATCH next-queue 3/8] ixgbe: add VF ipsec management

2018-08-14 Thread Shannon Nelson

On 8/13/2018 10:31 PM, kbuild test robot wrote:

Hi Shannon,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on jkirsher-next-queue/dev-queue]
[also build test ERROR on v4.18 next-20180813]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Shannon-Nelson/ixgbe-ixgbevf-IPsec-offload-support-for-VFs/20180814-074800
base:   https://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue.git 
dev-queue
config: x86_64-randconfig-v0-08131550 (attached as .config)
compiler: gcc-7 (Debian 7.3.0-16) 7.3.0
reproduce:
 # save the attached .config to linux build tree
 make ARCH=x86_64

All errors (new ones prefixed by >>):

drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.o: In function 
`ixgbe_ipsec_vf_add_sa':

drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c:917: undefined reference to 
`xfrm_aead_get_byname'

make[1]: *** [vmlinux] Error 1
make[1]: Target '_all' not remade because of errors.



Huh, odd.  I'm not able to reproduce this error using your config file 
in a net-next tree of vintage v4.18-rc8 or in Jeff's dev-queue branch. 
It looks like I'm using an older compiler (4.8.5) but that shouldn't 
make a difference here.


sln



Re: [Intel-wired-lan] [PATCH next-queue 0/8] ixgbe/ixgbevf: IPsec offload support for VFs

2018-08-14 Thread Shannon Nelson

On 8/14/2018 8:30 AM, Alexander Duyck wrote:

On Mon, Aug 13, 2018 at 11:43 AM Shannon Nelson
 wrote:


This set of patches implements IPsec hardware offload for VF devices in
Intel's 10Gbe x540 family of Ethernet devices.


[...]



So the one question I would have about this patch set is what happens
if you are setting up a ipsec connection between the PF and one of the
VFs on the same port/function? Do the ipsec offloads get translated
across the Tx loopback or do they end up causing issues? Specifically
I would be interested in seeing the results of a test either between
two VFs, or the PF and one of the VFs on the same port.

- Alex



There is definitely something funky in the internal switch connection, 
as messages going from PF to VF with an offloaded encryption don't seem 
to get received by the VF, at least when in a VEB setup.  If I only set 
up offloads on the Rx on both PF and VF, and don't offload the Tx, then 
things work.


I don't have a setup to test this, but I suspect that in a VEPA 
configuration, with packets going out to the switch and turned around 
back in, the Tx encryption offload would happen as expected.


sln


[PATCH next-queue 0/8] ixgbe/ixgbevf: IPsec offload support for VFs

2018-08-13 Thread Shannon Nelson
This set of patches implements IPsec hardware offload for VF devices in
Intel's 10Gbe x540 family of Ethernet devices.

The IPsec HW offload feature has been in the x540/Niantic family of
network devices since their release in 2009, but there was no Linux
kernel support for the offload until 2017.  After the XFRM code added
support for the offload last year, the hw offload was added to the ixgbe
PF driver.

Since the related x540 VF device uses same setup as the PF for implementing
the offload, adding the feature to the ixgbevf seemed like a good idea.
In this case, the PF owns the device registers, so the VF simply packages
up the request information into a VF<->PF message and the PF does the
device configuration.  The resulting IPsec throughput is roughly equivalent
to what we see in the PF - nearly line-rate, with the expected drop in CPU
cycles burned.  (I'm not great at performance statistics, I'll let better
folks do the actual measurements as they pertain to their own usage)

To make use of the capability, first two things are needed: the PF must
be told to enable the offload for VFs (it is off by default) and the VF
must be trusted.  A new ethtool priv-flag for ixgbe is added to control
VF offload support.  For example:

ethtool --set-priv-flags eth0 vf-ipsec on
ip link set eth0 vf 1 trust on

Once those are set up and the VF device is UP, the user can add SAs the
same as for PFs, whether the VF is in the host or has been assigned to
a VM.

Note that the x540 chip supports a total of 1024 Rx plus 1024 Tx Security
Associations (SAs), shared among the PF and VFs that might request them.
It is entirely possible for a single VF to soak up all the offload
capability, which would likely annoy some people.  It seems rather
arbitrary to try to set a limit for how many a VF could be allowed,
but this is mitigated somewhat by the need for "trust" and "vf-ipsec"
to be enabled.  I suppose we could come up with a way to make a limit
configurable, but there is no existing method for adding that kind
configuration so I'll leave that to a future discussion.

Currently this doesn't support Tx offload as the hardware encryption
engine doesn't seem to engage on the Tx packets.  This may be a lingering
driver bug, more investigation is needed.  Until then, requests for a Tx
offload are failed and the userland requester will need to add Tx SAs
without the offload attribute.

Given that we don't have Tx offload support, the benefit here is less
than it could be, but is definitely still noticeable.  For example, with
informal iperf testing over a 10Gbps link, with full offload in a PF on
one side and a VF in a VM on the other side on a CPU with AES instructions:

Reference:
No IPsec: 9.4 Gbps
IPsec offload btwn two PFs:   9.2 Gbps
VF as the iperf receiver:
IPsec offload on PF, none on VF:  6.8 Gbps
IPsec offload on PF and VF:   9.2 Gbps   << biggest benefit
VF as the iperf sender:
IPsec offload on PF, none on VF:  4.8 Gbps
IPsec offload on PF and VF:   4.8 Gbps

The iperf traffic is primarily uni-directional, and we can see the most
benefit when VF is the iperf server and is receiving the test traffic.
Watching output from sar also shows a nice decrease in CPU utilization.


Shannon Nelson (8):
  ixgbe: reload ipsec ip table after sa tables
  ixgbe: prep ipsec constants for later use
  ixgbe: add VF ipsec management
  ixgbe: add VF IPsec offload enable flag
  ixgbe: add VF IPsec offload request message handling
  ixgbevf: add defines for IPsec offload request
  ixgbevf: add VF ipsec offload code
  ixgbevf: enable VF ipsec offload operations

 drivers/net/ethernet/intel/ixgbe/ixgbe.h  |  20 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c  |   9 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c| 275 -
 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.h|  13 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.h  |   5 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c|  17 +-
 drivers/net/ethernet/intel/ixgbevf/Makefile   |   1 +
 drivers/net/ethernet/intel/ixgbevf/defines.h  |  10 +-
 drivers/net/ethernet/intel/ixgbevf/ethtool.c  |   2 +
 drivers/net/ethernet/intel/ixgbevf/ipsec.c| 673 ++
 drivers/net/ethernet/intel/ixgbevf/ipsec.h|  66 +++
 drivers/net/ethernet/intel/ixgbevf/ixgbevf.h  |  33 ++
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c |  74 ++-
 drivers/net/ethernet/intel/ixgbevf/mbx.h  |   5 +
 drivers/net/ethernet/intel/ixgbevf/vf.c   |   4 +
 15 files changed, 1163 insertions(+), 44 deletions(-)
 create mode 100644 drivers/net/ethernet/intel/ixgbevf/ipsec.c
 create mode 100644 drivers/net/ethernet/intel/ixgbevf/ipsec.h

-- 
2.7.4



[PATCH next-queue 3/8] ixgbe: add VF ipsec management

2018-08-13 Thread Shannon Nelson
Add functions to translate VF IPsec offload add and delete requests
into something the existing code can work with.

Signed-off-by: Shannon Nelson 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 256 -
 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.h |  13 ++
 2 files changed, 260 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index 3afb1fe..80108e1 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -8,6 +8,8 @@
 #define IXGBE_IPSEC_KEY_BITS  160
 static const char aes_gcm_name[] = "rfc4106(gcm(aes))";
 
+static void ixgbe_ipsec_del_sa(struct xfrm_state *xs);
+
 /**
  * ixgbe_ipsec_set_tx_sa - set the Tx SA registers
  * @hw: hw specific details
@@ -289,6 +291,13 @@ static void ixgbe_ipsec_start_engine(struct ixgbe_adapter 
*adapter)
 /**
  * ixgbe_ipsec_restore - restore the ipsec HW settings after a reset
  * @adapter: board private structure
+ *
+ * Reload the HW tables from the SW tables after they've been bashed
+ * by a chip reset.
+ *
+ * Any VF entries are removed from the SW and HW tables since either
+ * (a) the VF also gets reset on PF reset and will ask again for the
+ * offloads, or (b) the VF has been removed by a change in the num_vfs.
  **/
 void ixgbe_ipsec_restore(struct ixgbe_adapter *adapter)
 {
@@ -306,16 +315,24 @@ void ixgbe_ipsec_restore(struct ixgbe_adapter *adapter)
 
/* reload the Rx and Tx keys */
for (i = 0; i < IXGBE_IPSEC_MAX_SA_COUNT; i++) {
-   struct rx_sa *rsa = >rx_tbl[i];
-   struct tx_sa *tsa = >tx_tbl[i];
-
-   if (rsa->used)
-   ixgbe_ipsec_set_rx_sa(hw, i, rsa->xs->id.spi,
- rsa->key, rsa->salt,
- rsa->mode, rsa->iptbl_ind);
+   struct rx_sa *r = >rx_tbl[i];
+   struct tx_sa *t = >tx_tbl[i];
+
+   if (r->used) {
+   if (r->mode & IXGBE_RXTXMOD_VF)
+   ixgbe_ipsec_del_sa(r->xs);
+   else
+   ixgbe_ipsec_set_rx_sa(hw, i, r->xs->id.spi,
+ r->key, r->salt,
+ r->mode, r->iptbl_ind);
+   }
 
-   if (tsa->used)
-   ixgbe_ipsec_set_tx_sa(hw, i, tsa->key, tsa->salt);
+   if (t->used) {
+   if (t->mode & IXGBE_RXTXMOD_VF)
+   ixgbe_ipsec_del_sa(t->xs);
+   else
+   ixgbe_ipsec_set_tx_sa(hw, i, t->key, t->salt);
+   }
}
 
/* reload the IP addrs */
@@ -381,6 +398,8 @@ static struct xfrm_state *ixgbe_ipsec_find_rx_state(struct 
ixgbe_ipsec *ipsec,
rcu_read_lock();
hash_for_each_possible_rcu(ipsec->rx_sa_list, rsa, hlist,
   (__force u32)spi) {
+   if (rsa->mode & IXGBE_RXTXMOD_VF)
+   continue;
if (spi == rsa->xs->id.spi &&
((ip4 && *daddr == rsa->xs->id.daddr.a4) ||
  (!ip4 && !memcmp(daddr, >xs->id.daddr.a6,
@@ -809,6 +828,225 @@ static const struct xfrmdev_ops ixgbe_xfrmdev_ops = {
 };
 
 /**
+ * ixgbe_ipsec_vf_clear - clear the tables of data for a VF
+ * @adapter: board private structure
+ * @vf: VF id to be removed
+ **/
+void ixgbe_ipsec_vf_clear(struct ixgbe_adapter *adapter, u32 vf)
+{
+   struct ixgbe_ipsec *ipsec = adapter->ipsec;
+   int i;
+
+   /* search rx sa table */
+   for (i = 0; i < IXGBE_IPSEC_MAX_SA_COUNT && ipsec->num_rx_sa; i++) {
+   if (!ipsec->rx_tbl[i].used)
+   continue;
+   if (ipsec->rx_tbl[i].mode & IXGBE_RXTXMOD_VF &&
+   ipsec->rx_tbl[i].vf == vf)
+   ixgbe_ipsec_del_sa(ipsec->rx_tbl[i].xs);
+   }
+
+   /* search tx sa table */
+   for (i = 0; i < IXGBE_IPSEC_MAX_SA_COUNT && ipsec->num_tx_sa; i++) {
+   if (!ipsec->tx_tbl[i].used)
+   continue;
+   if (ipsec->tx_tbl[i].mode & IXGBE_RXTXMOD_VF &&
+   ipsec->tx_tbl[i].vf == vf)
+   ixgbe_ipsec_del_sa(ipsec->tx_tbl[i].xs);
+   }
+}
+
+/**
+ * ixgbe_ipsec_vf_add_sa - translate VF request to SA add
+ * @adapter: board private structure
+ * @msgbuf: The message buffer
+ * @vf: the VF index
+ *
+ * Make up a new xs and algorithm info from the data sent by the VF.
+ * We only need to 

[PATCH next-queue 1/8] ixgbe: reload ipsec ip table after sa tables

2018-08-13 Thread Shannon Nelson
Restore the ipsec hardware IP table after reloading the SA tables.
This doesn't make much difference now, but will matter when we add
support for VF ipsec offloads.

Signed-off-by: Shannon Nelson 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index e515246..4340651 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -301,14 +301,6 @@ void ixgbe_ipsec_restore(struct ixgbe_adapter *adapter)
ixgbe_ipsec_clear_hw_tables(adapter);
ixgbe_ipsec_start_engine(adapter);
 
-   /* reload the IP addrs */
-   for (i = 0; i < IXGBE_IPSEC_MAX_RX_IP_COUNT; i++) {
-   struct rx_ip_sa *ipsa = >ip_tbl[i];
-
-   if (ipsa->used)
-   ixgbe_ipsec_set_rx_ip(hw, i, ipsa->ipaddr);
-   }
-
/* reload the Rx and Tx keys */
for (i = 0; i < IXGBE_IPSEC_MAX_SA_COUNT; i++) {
struct rx_sa *rsa = >rx_tbl[i];
@@ -322,6 +314,14 @@ void ixgbe_ipsec_restore(struct ixgbe_adapter *adapter)
if (tsa->used)
ixgbe_ipsec_set_tx_sa(hw, i, tsa->key, tsa->salt);
}
+
+   /* reload the IP addrs */
+   for (i = 0; i < IXGBE_IPSEC_MAX_RX_IP_COUNT; i++) {
+   struct rx_ip_sa *ipsa = >ip_tbl[i];
+
+   if (ipsa->used)
+   ixgbe_ipsec_set_rx_ip(hw, i, ipsa->ipaddr);
+   }
 }
 
 /**
-- 
2.7.4



[PATCH next-queue 2/8] ixgbe: prep ipsec constants for later use

2018-08-13 Thread Shannon Nelson
Pull out a couple of values from a function so they can be used
later elsewhere.

Signed-off-by: Shannon Nelson 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index 4340651..3afb1fe 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -5,6 +5,9 @@
 #include 
 #include 
 
+#define IXGBE_IPSEC_KEY_BITS  160
+static const char aes_gcm_name[] = "rfc4106(gcm(aes))";
+
 /**
  * ixgbe_ipsec_set_tx_sa - set the Tx SA registers
  * @hw: hw specific details
@@ -407,7 +410,6 @@ static int ixgbe_ipsec_parse_proto_keys(struct xfrm_state 
*xs,
struct net_device *dev = xs->xso.dev;
unsigned char *key_data;
char *alg_name = NULL;
-   const char aes_gcm_name[] = "rfc4106(gcm(aes))";
int key_len;
 
if (!xs->aead) {
@@ -435,9 +437,9 @@ static int ixgbe_ipsec_parse_proto_keys(struct xfrm_state 
*xs,
 * we don't need to do any byteswapping.
 * 160 accounts for 16 byte key and 4 byte salt
 */
-   if (key_len == 160) {
+   if (key_len == IXGBE_IPSEC_KEY_BITS) {
*mysalt = ((u32 *)key_data)[4];
-   } else if (key_len != 128) {
+   } else if (key_len != (IXGBE_IPSEC_KEY_BITS - (sizeof(*mysalt) * 8))) {
netdev_err(dev, "IPsec hw offload only supports keys up to 128 
bits with a 32 bit salt\n");
return -EINVAL;
} else {
-- 
2.7.4



[PATCH next-queue 6/8] ixgbevf: add defines for IPsec offload request

2018-08-13 Thread Shannon Nelson
Fix up the register definitions for using IPsec offloads and
add the new mailbox message IDs.

Signed-off-by: Shannon Nelson 
---
 drivers/net/ethernet/intel/ixgbevf/defines.h | 8 
 drivers/net/ethernet/intel/ixgbevf/mbx.h | 5 +
 2 files changed, 13 insertions(+)

diff --git a/drivers/net/ethernet/intel/ixgbevf/defines.h 
b/drivers/net/ethernet/intel/ixgbevf/defines.h
index 700d8eb..8b627b6 100644
--- a/drivers/net/ethernet/intel/ixgbevf/defines.h
+++ b/drivers/net/ethernet/intel/ixgbevf/defines.h
@@ -133,9 +133,14 @@ typedef u32 ixgbe_link_speed;
 #define IXGBE_RXDADV_STAT_FCSTAT_NODDP 0x0010 /* 01: Ctxt w/o DDP */
 #define IXGBE_RXDADV_STAT_FCSTAT_FCPRSP0x0020 /* 10: Recv. FCP_RSP 
*/
 #define IXGBE_RXDADV_STAT_FCSTAT_DDP   0x0030 /* 11: Ctxt w/ DDP */
+#define IXGBE_RXDADV_STAT_SECP 0x0002 /* IPsec/MACsec pkt found */
 
 #define IXGBE_RXDADV_RSSTYPE_MASK  0x000F
 #define IXGBE_RXDADV_PKTTYPE_MASK  0xFFF0
+#define IXGBE_RXDADV_PKTTYPE_IPV4   0x0010 /* IPv4 hdr present */
+#define IXGBE_RXDADV_PKTTYPE_IPV6   0x0040 /* IPv6 hdr present */
+#define IXGBE_RXDADV_PKTTYPE_IPSEC_ESP  0x1000 /* IPSec ESP */
+#define IXGBE_RXDADV_PKTTYPE_IPSEC_AH   0x2000 /* IPSec AH */
 #define IXGBE_RXDADV_PKTTYPE_MASK_EX   0x0001FFF0
 #define IXGBE_RXDADV_HDRBUFLEN_MASK0x7FE0
 #define IXGBE_RXDADV_RSCCNT_MASK   0x001E
@@ -250,9 +255,12 @@ struct ixgbe_adv_tx_context_desc {
 #define IXGBE_ADVTXD_TUCMD_L4T_UDP 0x  /* L4 Packet TYPE of UDP */
 #define IXGBE_ADVTXD_TUCMD_L4T_TCP 0x0800  /* L4 Packet TYPE of TCP */
 #define IXGBE_ADVTXD_TUCMD_L4T_SCTP0x1000  /* L4 Packet TYPE of SCTP */
+#define IXGBE_ADVTXD_TUCMD_IPSEC_TYPE_ESP   0x2000 /* IPSec Type ESP */
+#define IXGBE_ADVTXD_TUCMD_IPSEC_ENCRYPT_EN 0x4000 /* ESP Encrypt Enable */
 #define IXGBE_ADVTXD_IDX_SHIFT 4 /* Adv desc Index shift */
 #define IXGBE_ADVTXD_CC0x0080 /* Check Context */
 #define IXGBE_ADVTXD_POPTS_SHIFT   8  /* Adv desc POPTS shift */
+#define IXGBE_ADVTXD_POPTS_IPSEC0x0400 /* IPSec offload request */
 #define IXGBE_ADVTXD_POPTS_IXSM(IXGBE_TXD_POPTS_IXSM << \
 IXGBE_ADVTXD_POPTS_SHIFT)
 #define IXGBE_ADVTXD_POPTS_TXSM(IXGBE_TXD_POPTS_TXSM << \
diff --git a/drivers/net/ethernet/intel/ixgbevf/mbx.h 
b/drivers/net/ethernet/intel/ixgbevf/mbx.h
index bfd9ae1..853796c 100644
--- a/drivers/net/ethernet/intel/ixgbevf/mbx.h
+++ b/drivers/net/ethernet/intel/ixgbevf/mbx.h
@@ -62,6 +62,7 @@ enum ixgbe_pfvf_api_rev {
ixgbe_mbox_api_11,  /* API version 1.1, linux/freebsd VF driver */
ixgbe_mbox_api_12,  /* API version 1.2, linux/freebsd VF driver */
ixgbe_mbox_api_13,  /* API version 1.3, linux/freebsd VF driver */
+   ixgbe_mbox_api_14,  /* API version 1.4, linux/freebsd VF driver */
/* This value should always be last */
ixgbe_mbox_api_unknown, /* indicates that API version is not known */
 };
@@ -92,6 +93,10 @@ enum ixgbe_pfvf_api_rev {
 
 #define IXGBE_VF_UPDATE_XCAST_MODE 0x0c
 
+/* mailbox API, version 1.4 VF requests */
+#define IXGBE_VF_IPSEC_ADD 0x0d
+#define IXGBE_VF_IPSEC_DEL 0x0e
+
 /* length of permanent address message returned from PF */
 #define IXGBE_VF_PERMADDR_MSG_LEN  4
 /* word in permanent address message with the current multicast type */
-- 
2.7.4



[PATCH next-queue 5/8] ixgbe: add VF IPsec offload request message handling

2018-08-13 Thread Shannon Nelson
Add an add and a delete message for IPsec offload requests from
the VF.  These call into the ipsec functions that can translate
the message buffer into a useful IPsec offload.

These new messages bump the mbox API version to 1.4.

Signed-off-by: Shannon Nelson 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe.h   | 19 ++-
 drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.h   |  5 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c | 17 -
 3 files changed, 35 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h 
b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index 89e709c..5c6fd42 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -1004,15 +1004,24 @@ void ixgbe_ipsec_rx(struct ixgbe_ring *rx_ring,
struct sk_buff *skb);
 int ixgbe_ipsec_tx(struct ixgbe_ring *tx_ring, struct ixgbe_tx_buffer *first,
   struct ixgbe_ipsec_tx_data *itd);
+void ixgbe_ipsec_vf_clear(struct ixgbe_adapter *adapter, u32 vf);
+int ixgbe_ipsec_vf_add_sa(struct ixgbe_adapter *adapter, u32 *mbuf, u32 vf);
+int ixgbe_ipsec_vf_del_sa(struct ixgbe_adapter *adapter, u32 *mbuf, u32 vf);
 #else
-static inline void ixgbe_init_ipsec_offload(struct ixgbe_adapter *adapter) { };
-static inline void ixgbe_stop_ipsec_offload(struct ixgbe_adapter *adapter) { };
-static inline void ixgbe_ipsec_restore(struct ixgbe_adapter *adapter) { };
+static inline void ixgbe_init_ipsec_offload(struct ixgbe_adapter *adapter) { }
+static inline void ixgbe_stop_ipsec_offload(struct ixgbe_adapter *adapter) { }
+static inline void ixgbe_ipsec_restore(struct ixgbe_adapter *adapter) { }
 static inline void ixgbe_ipsec_rx(struct ixgbe_ring *rx_ring,
  union ixgbe_adv_rx_desc *rx_desc,
- struct sk_buff *skb) { };
+ struct sk_buff *skb) { }
 static inline int ixgbe_ipsec_tx(struct ixgbe_ring *tx_ring,
 struct ixgbe_tx_buffer *first,
-struct ixgbe_ipsec_tx_data *itd) { return 0; };
+struct ixgbe_ipsec_tx_data *itd) { return 0; }
+static inline void ixgbe_ipsec_vf_clear(struct ixgbe_adapter *adapter,
+   u32 vf) { }
+static inline int ixgbe_ipsec_vf_add_sa(struct ixgbe_adapter *adapter,
+   u32 *mbuf, u32 vf) { return -EACCES; }
+static inline int ixgbe_ipsec_vf_del_sa(struct ixgbe_adapter *adapter,
+   u32 *mbuf, u32 vf) { return -EACCES; }
 #endif /* CONFIG_XFRM_OFFLOAD */
 #endif /* _IXGBE_H_ */
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.h 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.h
index e085b65..a148534 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.h
@@ -50,6 +50,7 @@ enum ixgbe_pfvf_api_rev {
ixgbe_mbox_api_11,  /* API version 1.1, linux/freebsd VF driver */
ixgbe_mbox_api_12,  /* API version 1.2, linux/freebsd VF driver */
ixgbe_mbox_api_13,  /* API version 1.3, linux/freebsd VF driver */
+   ixgbe_mbox_api_14,  /* API version 1.4, linux/freebsd VF driver */
/* This value should always be last */
ixgbe_mbox_api_unknown, /* indicates that API version is not known */
 };
@@ -80,6 +81,10 @@ enum ixgbe_pfvf_api_rev {
 
 #define IXGBE_VF_UPDATE_XCAST_MODE 0x0c
 
+/* mailbox API, version 1.4 VF requests */
+#define IXGBE_VF_IPSEC_ADD 0x0d
+#define IXGBE_VF_IPSEC_DEL 0x0e
+
 /* length of permanent address message returned from PF */
 #define IXGBE_VF_PERMADDR_MSG_LEN 4
 /* word in permanent address message with the current multicast type */
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
index 6f59933..e86a4d5 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
@@ -491,6 +491,7 @@ static s32 ixgbe_set_vf_lpe(struct ixgbe_adapter *adapter, 
u32 *msgbuf, u32 vf)
case ixgbe_mbox_api_11:
case ixgbe_mbox_api_12:
case ixgbe_mbox_api_13:
+   case ixgbe_mbox_api_14:
/* Version 1.1 supports jumbo frames on VFs if PF has
 * jumbo frames enabled which means legacy VFs are
 * disabled
@@ -718,6 +719,9 @@ static inline void ixgbe_vf_reset_event(struct 
ixgbe_adapter *adapter, u32 vf)
/* reset multicast table array for vf */
adapter->vfinfo[vf].num_vf_mc_hashes = 0;
 
+   /* clear any ipsec table info */
+   ixgbe_ipsec_vf_clear(adapter, vf);
+
/* Flush and reset the mta with the new values */
ixgbe_set_rx_mode(adapter->netdev);
 
@@ -969,6 +973,7 @@ static int ixgbe_negotiate_vf_api(struct ixgbe_adapter 
*a

[PATCH next-queue 4/8] ixgbe: add VF IPsec offload enable flag

2018-08-13 Thread Shannon Nelson
Add a private flag to expressly enable support for VF IPsec offload.
The VF will have to be "trusted" in order to use the hardware offload,
but because of the general concerns of managing VF access, we want to
be sure the user specifically is enabling the feature.

This is likely a candidate for becoming a netdev feature flag.

Signed-off-by: Shannon Nelson 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe.h | 1 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 9 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c   | 3 ++-
 3 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h 
b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index 4fc906c..89e709c 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -605,6 +605,7 @@ struct ixgbe_adapter {
 #define IXGBE_FLAG2_EEE_ENABLEDBIT(15)
 #define IXGBE_FLAG2_RX_LEGACY  BIT(16)
 #define IXGBE_FLAG2_IPSEC_ENABLED  BIT(17)
+#define IXGBE_FLAG2_VF_IPSEC_ENABLED   BIT(18)
 
/* Tx fast path data */
int num_tx_queues;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
index e5a8461..732b1e6 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
@@ -136,6 +136,8 @@ static const char ixgbe_gstrings_test[][ETH_GSTRING_LEN] = {
 static const char ixgbe_priv_flags_strings[][ETH_GSTRING_LEN] = {
 #define IXGBE_PRIV_FLAGS_LEGACY_RX BIT(0)
"legacy-rx",
+#define IXGBE_PRIV_FLAGS_VF_IPSEC_EN   BIT(1)
+   "vf-ipsec",
 };
 
 #define IXGBE_PRIV_FLAGS_STR_LEN ARRAY_SIZE(ixgbe_priv_flags_strings)
@@ -3409,6 +3411,9 @@ static u32 ixgbe_get_priv_flags(struct net_device *netdev)
if (adapter->flags2 & IXGBE_FLAG2_RX_LEGACY)
priv_flags |= IXGBE_PRIV_FLAGS_LEGACY_RX;
 
+   if (adapter->flags2 & IXGBE_FLAG2_VF_IPSEC_ENABLED)
+   priv_flags |= IXGBE_PRIV_FLAGS_VF_IPSEC_EN;
+
return priv_flags;
 }
 
@@ -3421,6 +3426,10 @@ static int ixgbe_set_priv_flags(struct net_device 
*netdev, u32 priv_flags)
if (priv_flags & IXGBE_PRIV_FLAGS_LEGACY_RX)
flags2 |= IXGBE_FLAG2_RX_LEGACY;
 
+   flags2 &= ~IXGBE_FLAG2_VF_IPSEC_ENABLED;
+   if (priv_flags & IXGBE_PRIV_FLAGS_VF_IPSEC_EN)
+   flags2 |= IXGBE_FLAG2_VF_IPSEC_ENABLED;
+
if (flags2 != adapter->flags2) {
adapter->flags2 = flags2;
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index 80108e1..ecd01fa 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -880,7 +880,8 @@ int ixgbe_ipsec_vf_add_sa(struct ixgbe_adapter *adapter, 
u32 *msgbuf, u32 vf)
int err;
 
sam = (struct sa_mbx_msg *)([1]);
-   if (!adapter->vfinfo[vf].trusted) {
+   if (!adapter->vfinfo[vf].trusted ||
+   !(adapter->flags2 & IXGBE_FLAG2_VF_IPSEC_ENABLED)) {
e_warn(drv, "VF %d attempted to add an IPsec SA\n", vf);
err = -EACCES;
goto err_out;
-- 
2.7.4



[PATCH next-queue 7/8] ixgbevf: add VF ipsec offload code

2018-08-13 Thread Shannon Nelson
Add the ipsec offload support code.  This is based off of the similar
code in ixgbe, but instead of writing the SA registers, the VF asks
the PF to setup the offload by sending the offload information to the
PF via the standard mailbox.

Signed-off-by: Shannon Nelson 
---
 drivers/net/ethernet/intel/ixgbevf/Makefile  |   1 +
 drivers/net/ethernet/intel/ixgbevf/ipsec.c   | 673 +++
 drivers/net/ethernet/intel/ixgbevf/ipsec.h   |  66 +++
 drivers/net/ethernet/intel/ixgbevf/ixgbevf.h |   8 +
 4 files changed, 748 insertions(+)
 create mode 100644 drivers/net/ethernet/intel/ixgbevf/ipsec.c
 create mode 100644 drivers/net/ethernet/intel/ixgbevf/ipsec.h

diff --git a/drivers/net/ethernet/intel/ixgbevf/Makefile 
b/drivers/net/ethernet/intel/ixgbevf/Makefile
index aba1e6a3..297d0f0 100644
--- a/drivers/net/ethernet/intel/ixgbevf/Makefile
+++ b/drivers/net/ethernet/intel/ixgbevf/Makefile
@@ -10,4 +10,5 @@ ixgbevf-objs := vf.o \
 mbx.o \
 ethtool.o \
 ixgbevf_main.o
+ixgbevf-$(CONFIG_XFRM_OFFLOAD) += ipsec.o
 
diff --git a/drivers/net/ethernet/intel/ixgbevf/ipsec.c 
b/drivers/net/ethernet/intel/ixgbevf/ipsec.c
new file mode 100644
index 000..fac91c0
--- /dev/null
+++ b/drivers/net/ethernet/intel/ixgbevf/ipsec.c
@@ -0,0 +1,673 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2018 Oracle and/or its affiliates. All rights reserved. */
+
+#include "ixgbevf.h"
+#include 
+#include 
+
+#define IXGBE_IPSEC_KEY_BITS  160
+static const char aes_gcm_name[] = "rfc4106(gcm(aes))";
+
+/**
+ * ixgbevf_ipsec_set_pf_sa - ask the PF to set up an SA
+ * @adapter: board private structure
+ * @xs: xfrm info to be sent to the PF
+ *
+ * Returns: positive offload handle from the PF, or negative error code
+ **/
+static int ixgbevf_ipsec_set_pf_sa(struct ixgbevf_adapter *adapter,
+  struct xfrm_state *xs)
+{
+   u32 msgbuf[IXGBE_VFMAILBOX_SIZE] = { 0 };
+   struct ixgbe_hw *hw = >hw;
+   struct sa_mbx_msg *sam;
+   u16 msglen;
+   int ret;
+
+   /* send the important bits to the PF */
+   sam = (struct sa_mbx_msg *)([1]);
+   sam->flags = xs->xso.flags;
+   sam->spi = xs->id.spi;
+   sam->proto = xs->id.proto;
+   sam->family = xs->props.family;
+
+   if (xs->props.family == AF_INET6)
+   memcpy(sam->addr, >id.daddr.a6, sizeof(xs->id.daddr.a6));
+   else
+   memcpy(sam->addr, >id.daddr.a4, sizeof(xs->id.daddr.a4));
+   memcpy(sam->key, xs->aead->alg_key, sizeof(sam->key));
+
+   msgbuf[0] = IXGBE_VF_IPSEC_ADD;
+   msglen = sizeof(*sam) + sizeof(msgbuf[0]);
+
+   spin_lock_bh(>mbx_lock);
+
+   ret = hw->mbx.ops.write_posted(hw, msgbuf, msglen);
+   if (ret)
+   goto out;
+
+   msglen = sizeof(msgbuf[0]) * 2;
+   ret = hw->mbx.ops.read_posted(hw, msgbuf, msglen);
+   if (ret)
+   goto out;
+
+   ret = (int)msgbuf[1];
+   if (msgbuf[0] & IXGBE_VT_MSGTYPE_NACK && ret >= 0)
+   ret = -1;
+
+out:
+   spin_unlock_bh(>mbx_lock);
+
+   return ret;
+}
+
+/**
+ * ixgbevf_ipsec_del_pf_sa - ask the PF to delete an SA
+ * @adapter: board private structure
+ * @pfsa: sa index returned from PF when created, -1 for all
+ *
+ * Returns: 0 on success, or negative error code
+ **/
+static int ixgbevf_ipsec_del_pf_sa(struct ixgbevf_adapter *adapter, int pfsa)
+{
+   struct ixgbe_hw *hw = >hw;
+   u32 msgbuf[2];
+   int err;
+
+   memset(msgbuf, 0, sizeof(msgbuf));
+   msgbuf[0] = IXGBE_VF_IPSEC_DEL;
+   msgbuf[1] = (u32)pfsa;
+
+   spin_lock_bh(>mbx_lock);
+
+   err = hw->mbx.ops.write_posted(hw, msgbuf, sizeof(msgbuf));
+   if (err)
+   goto out;
+
+   err = hw->mbx.ops.read_posted(hw, msgbuf, sizeof(msgbuf));
+   if (err)
+   goto out;
+
+out:
+   spin_unlock_bh(>mbx_lock);
+   return err;
+}
+
+/**
+ * ixgbevf_ipsec_restore - restore the ipsec HW settings after a reset
+ * @adapter: board private structure
+ *
+ * Reload the HW tables from the SW tables after they've been bashed
+ * by a chip reset.  While we're here, make sure any stale VF data is
+ * removed, since we go through reset when num_vfs changes.
+ **/
+void ixgbevf_ipsec_restore(struct ixgbevf_adapter *adapter)
+{
+   struct ixgbevf_ipsec *ipsec = adapter->ipsec;
+   struct net_device *netdev = adapter->netdev;
+   int i;
+
+   if (!(adapter->netdev->features & NETIF_F_HW_ESP))
+   return;
+
+   /* reload the Rx and Tx keys */
+   for (i = 0; i < IXGBE_IPSEC_MAX_SA_COUNT; i++) {
+   struct rx_sa *r = >rx_tbl[i];
+   struct tx_sa *t = >tx_tbl[i];
+   int ret;
+
+   if (r->used) {
+   ret = ixgb

[PATCH next-queue 8/8] ixgbevf: enable VF ipsec offload operations

2018-08-13 Thread Shannon Nelson
Add the ipsec initialization into the driver startup and
add the Rx and Tx processing hooks.

Signed-off-by: Shannon Nelson 
---
 drivers/net/ethernet/intel/ixgbevf/defines.h  |  2 +-
 drivers/net/ethernet/intel/ixgbevf/ethtool.c  |  2 +
 drivers/net/ethernet/intel/ixgbevf/ixgbevf.h  | 25 
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 74 +--
 drivers/net/ethernet/intel/ixgbevf/vf.c   |  4 ++
 5 files changed, 86 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbevf/defines.h 
b/drivers/net/ethernet/intel/ixgbevf/defines.h
index 8b627b6..6c3d6bf 100644
--- a/drivers/net/ethernet/intel/ixgbevf/defines.h
+++ b/drivers/net/ethernet/intel/ixgbevf/defines.h
@@ -234,7 +234,7 @@ union ixgbe_adv_rx_desc {
 /* Context descriptors */
 struct ixgbe_adv_tx_context_desc {
__le32 vlan_macip_lens;
-   __le32 seqnum_seed;
+   __le32 fceof_saidx;
__le32 type_tucmd_mlhl;
__le32 mss_l4len_idx;
 };
diff --git a/drivers/net/ethernet/intel/ixgbevf/ethtool.c 
b/drivers/net/ethernet/intel/ixgbevf/ethtool.c
index 631c910..5399787 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ethtool.c
@@ -55,6 +55,8 @@ static struct ixgbe_stats ixgbevf_gstrings_stats[] = {
IXGBEVF_STAT("alloc_rx_page", alloc_rx_page),
IXGBEVF_STAT("alloc_rx_page_failed", alloc_rx_page_failed),
IXGBEVF_STAT("alloc_rx_buff_failed", alloc_rx_buff_failed),
+   IXGBEVF_STAT("tx_ipsec", tx_ipsec),
+   IXGBEVF_STAT("rx_ipsec", rx_ipsec),
 };
 
 #define IXGBEVF_QUEUE_STATS_LEN ( \
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h 
b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
index 172637e..e399e1c 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
@@ -459,6 +459,31 @@ int ethtool_ioctl(struct ifreq *ifr);
 
 extern void ixgbevf_write_eitr(struct ixgbevf_q_vector *q_vector);
 
+#ifdef CONFIG_XFRM_OFFLOAD
+void ixgbevf_init_ipsec_offload(struct ixgbevf_adapter *adapter);
+void ixgbevf_stop_ipsec_offload(struct ixgbevf_adapter *adapter);
+void ixgbevf_ipsec_restore(struct ixgbevf_adapter *adapter);
+void ixgbevf_ipsec_rx(struct ixgbevf_ring *rx_ring,
+ union ixgbe_adv_rx_desc *rx_desc,
+ struct sk_buff *skb);
+int ixgbevf_ipsec_tx(struct ixgbevf_ring *tx_ring,
+struct ixgbevf_tx_buffer *first,
+struct ixgbevf_ipsec_tx_data *itd);
+#else
+static inline void ixgbevf_init_ipsec_offload(struct ixgbevf_adapter *adapter)
+{ }
+static inline void ixgbevf_stop_ipsec_offload(struct ixgbevf_adapter *adapter)
+{ }
+static inline void ixgbevf_ipsec_restore(struct ixgbevf_adapter *adapter) { }
+static inline void ixgbevf_ipsec_rx(struct ixgbevf_ring *rx_ring,
+   union ixgbe_adv_rx_desc *rx_desc,
+   struct sk_buff *skb) { }
+static inline int ixgbevf_ipsec_tx(struct ixgbevf_ring *tx_ring,
+  struct ixgbevf_tx_buffer *first,
+  struct ixgbevf_ipsec_tx_data *itd)
+{ return 0; }
+#endif /* CONFIG_XFRM_OFFLOAD */
+
 void ixgbe_napi_add_all(struct ixgbevf_adapter *adapter);
 void ixgbe_napi_del_all(struct ixgbevf_adapter *adapter);
 
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c 
b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index d86446d..f745a52 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -40,7 +40,7 @@ static const char ixgbevf_driver_string[] =
 #define DRV_VERSION "4.1.0-k"
 const char ixgbevf_driver_version[] = DRV_VERSION;
 static char ixgbevf_copyright[] =
-   "Copyright (c) 2009 - 2015 Intel Corporation.";
+   "Copyright (c) 2009 - 2018 Intel Corporation.";
 
 static const struct ixgbevf_info *ixgbevf_info_tbl[] = {
[board_82599_vf]= _82599_vf_info,
@@ -268,7 +268,7 @@ static bool ixgbevf_clean_tx_irq(struct ixgbevf_q_vector 
*q_vector,
struct ixgbevf_adapter *adapter = q_vector->adapter;
struct ixgbevf_tx_buffer *tx_buffer;
union ixgbe_adv_tx_desc *tx_desc;
-   unsigned int total_bytes = 0, total_packets = 0;
+   unsigned int total_bytes = 0, total_packets = 0, total_ipsec = 0;
unsigned int budget = tx_ring->count / 2;
unsigned int i = tx_ring->next_to_clean;
 
@@ -299,6 +299,8 @@ static bool ixgbevf_clean_tx_irq(struct ixgbevf_q_vector 
*q_vector,
/* update the statistics for this packet */
total_bytes += tx_buffer->bytecount;
total_packets += tx_buffer->gso_segs;
+   if (tx_buffer->tx_flags & IXGBE_TX_FLAGS_IPSEC)
+   total_ipsec++;
 
/* free the

[PATCH next-queue] ixgbe: don't clear ipsec sa counters on hw clearing

2018-08-10 Thread Shannon Nelson
The software SA record counters should not be cleared when clearing
the hardware tables.  This causes the counters to be out of sync
after a driver reset.

Fixes: 63a67fe229ea ("ixgbe: add ipsec offload add and remove SA")
Signed-off-by: Shannon Nelson 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index c116f45..df2f997 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -113,7 +113,6 @@ static void ixgbe_ipsec_set_rx_ip(struct ixgbe_hw *hw, u16 
idx, __be32 addr[])
  **/
 static void ixgbe_ipsec_clear_hw_tables(struct ixgbe_adapter *adapter)
 {
-   struct ixgbe_ipsec *ipsec = adapter->ipsec;
struct ixgbe_hw *hw = >hw;
u32 buf[4] = {0, 0, 0, 0};
u16 idx;
@@ -132,9 +131,6 @@ static void ixgbe_ipsec_clear_hw_tables(struct 
ixgbe_adapter *adapter)
ixgbe_ipsec_set_tx_sa(hw, idx, buf, 0);
ixgbe_ipsec_set_rx_sa(hw, idx, 0, buf, 0, 0, 0);
}
-
-   ipsec->num_rx_sa = 0;
-   ipsec->num_tx_sa = 0;
 }
 
 /**
-- 
2.7.4



Re: [PATCH net] ixgbe: Off by one in ixgbe_ipsec_tx()

2018-07-09 Thread Shannon Nelson

On 7/4/2018 2:53 AM, Dan Carpenter wrote:

The ipsec->tx_tbl[] has IXGBE_IPSEC_MAX_SA_COUNT elements so the > needs
to be changed to >= so we don't read one element beyond the end of the
array.

Fixes: 592594704761 ("ixgbe: process the Tx ipsec offload")
Signed-off-by: Dan Carpenter 


Acked-by: Shannon Nelson 



diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index c116f459945d..da4322e4daed 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -839,7 +839,7 @@ int ixgbe_ipsec_tx(struct ixgbe_ring *tx_ring,
}
  
  	itd->sa_idx = xs->xso.offload_handle - IXGBE_IPSEC_BASE_TX_INDEX;

-   if (unlikely(itd->sa_idx > IXGBE_IPSEC_MAX_SA_COUNT)) {
+   if (unlikely(itd->sa_idx >= IXGBE_IPSEC_MAX_SA_COUNT)) {
netdev_err(tx_ring->netdev, "%s: bad sa_idx=%d handle=%lu\n",
   __func__, itd->sa_idx, xs->xso.offload_handle);
return 0;



[PATCH ethtool] ethtool: ixgbe dump strings for security registers

2018-07-02 Thread Shannon Nelson
Add the ixgbe's security configuration registers into
the register dump.

Signed-off-by: Shannon Nelson 
---
 ixgbe.c | 26 ++
 1 file changed, 26 insertions(+)

NOTE: Obviously this should wait until Intel accepts and pushes the related
  ixgbe patch and Dave accepts it into net-next, but I figured I may
  as well send them out together.


diff --git a/ixgbe.c b/ixgbe.c
index c632137..6779402 100644
--- a/ixgbe.c
+++ b/ixgbe.c
@@ -1265,5 +1265,31 @@ ixgbe_dump_regs(struct ethtool_drvinfo *info, struct 
ethtool_regs *regs)
regs_buff[1127]);
}
 
+   if (regs_buff_len > 1139 && mac_type != ixgbe_mac_82598EB) {
+   fprintf(stdout,
+   "0x08800: SECTXCTRL   (Security Tx Control)
0x%08X\n",
+   regs_buff[1139]);
+
+   fprintf(stdout,
+   "0x08804: SECTXSTAT   (Security Tx Status) 
0x%08X\n",
+   regs_buff[1140]);
+
+   fprintf(stdout,
+   "0x08808: SECTXBUFFAF (Security Tx Buffer Almost Full) 
0x%08X\n",
+   regs_buff[1141]);
+
+   fprintf(stdout,
+   "0x08800: SECTXMINIFG (Security Tx Buffer Minimum IFG) 
0x%08X\n",
+   regs_buff[1142]);
+
+   fprintf(stdout,
+   "0x08800: SECRXCTRL   (Security Rx Control)
0x%08X\n",
+   regs_buff[1143]);
+
+   fprintf(stdout,
+   "0x08800: SECRXSTAT   (Security Rx Status) 
0x%08X\n",
+   regs_buff[1144]);
+   }
+
return 0;
 }
-- 
2.7.4



[PATCH next-queue] ixgbe: add ipsec security registers into ethtool register dump

2018-07-02 Thread Shannon Nelson
Add the ixgbe's security configuration registers into
the register dump.

Signed-off-by: Shannon Nelson 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
index bd1ba88..1d68884 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
@@ -511,7 +511,7 @@ static void ixgbe_set_msglevel(struct net_device *netdev, 
u32 data)
 
 static int ixgbe_get_regs_len(struct net_device *netdev)
 {
-#define IXGBE_REGS_LEN  1139
+#define IXGBE_REGS_LEN  1145
return IXGBE_REGS_LEN * sizeof(u32);
 }
 
@@ -874,6 +874,14 @@ static void ixgbe_get_regs(struct net_device *netdev,
/* X540 specific DCB registers  */
regs_buff[1137] = IXGBE_READ_REG(hw, IXGBE_RTTQCNCR);
regs_buff[1138] = IXGBE_READ_REG(hw, IXGBE_RTTQCNTG);
+
+   /* Security config registers */
+   regs_buff[1139] = IXGBE_READ_REG(hw, IXGBE_SECTXCTRL);
+   regs_buff[1140] = IXGBE_READ_REG(hw, IXGBE_SECTXSTAT);
+   regs_buff[1141] = IXGBE_READ_REG(hw, IXGBE_SECTXBUFFAF);
+   regs_buff[1142] = IXGBE_READ_REG(hw, IXGBE_SECTXMINIFG);
+   regs_buff[1143] = IXGBE_READ_REG(hw, IXGBE_SECRXCTRL);
+   regs_buff[1144] = IXGBE_READ_REG(hw, IXGBE_SECRXSTAT);
 }
 
 static int ixgbe_get_eeprom_len(struct net_device *netdev)
-- 
2.7.4



[PATCH ipsec-next 1/1] xfrm: don't check offload_handle for nonzero

2018-06-26 Thread Shannon Nelson
The offload_handle should be an opaque data cookie for the driver
to use, much like the data cookie for a timer or alarm callback.
Thus, the XFRM stack should not be checking for non-zero, because
the driver might use that to store an array reference, which could
be zero, or some other zero but meaningful value.

We can remove the checks for non-zero because there are plenty
other attributes also being checked to see if there is an offload
in place for the SA in question.

Signed-off-by: Shannon Nelson 
---
 net/ipv4/esp4_offload.c | 6 ++
 net/ipv6/esp6_offload.c | 6 ++
 net/xfrm/xfrm_device.c  | 6 +++---
 3 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index bbeecd1..58834a1 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -135,8 +135,7 @@ static struct sk_buff *esp4_gso_segment(struct sk_buff *skb,
 
skb->encap_hdr_csum = 1;
 
-   if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle ||
-   (x->xso.dev != skb->dev))
+   if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev)
esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK);
else if (!(features & NETIF_F_HW_ESP_TX_CSUM))
esp_features = features & ~NETIF_F_CSUM_MASK;
@@ -179,8 +178,7 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff 
*skb,  netdev_features_
if (!xo)
return -EINVAL;
 
-   if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle ||
-   (x->xso.dev != skb->dev)) {
+   if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev) {
xo->flags |= CRYPTO_FALLBACK;
hw_offload = false;
}
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
index ddfa533..6177e21 100644
--- a/net/ipv6/esp6_offload.c
+++ b/net/ipv6/esp6_offload.c
@@ -162,8 +162,7 @@ static struct sk_buff *esp6_gso_segment(struct sk_buff *skb,
 
skb->encap_hdr_csum = 1;
 
-   if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle ||
-   (x->xso.dev != skb->dev))
+   if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev)
esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK);
else if (!(features & NETIF_F_HW_ESP_TX_CSUM))
esp_features = features & ~NETIF_F_CSUM_MASK;
@@ -207,8 +206,7 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff 
*skb,  netdev_features
if (!xo)
return -EINVAL;
 
-   if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle ||
-   (x->xso.dev != skb->dev)) {
+   if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev) {
xo->flags |= CRYPTO_FALLBACK;
hw_offload = false;
}
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 175941e..9265dd6 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -56,7 +56,7 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, 
netdev_features_t featur
if (skb_is_gso(skb)) {
struct net_device *dev = skb->dev;
 
-   if (unlikely(!x->xso.offload_handle || (x->xso.dev != dev))) {
+   if (unlikely(x->xso.dev != dev)) {
struct sk_buff *segs;
 
/* Packet got rerouted, fixup features and segment it. 
*/
@@ -210,8 +210,8 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct 
xfrm_state *x)
if (!x->type_offload || x->encap)
return false;
 
-   if ((!dev || (x->xso.offload_handle && (dev == 
xfrm_dst_path(dst)->dev))) &&
-(!xdst->child->xfrm && x->type->get_mtu)) {
+   if ((!dev || (dev == xfrm_dst_path(dst)->dev)) &&
+   (!xdst->child->xfrm && x->type->get_mtu)) {
mtu = x->type->get_mtu(x, xdst->child_mtu_cached);
 
if (skb->len <= mtu)
-- 
2.7.4



[PATCH v3 net-next 1/4] selftests: rtnetlink: clear the return code at start of ipsec test

2018-06-26 Thread Shannon Nelson
Following the custom from the other functions, clear the global
ret code before starting the test so as to not have previously
failed tests cause us to thing this test has failed.

Reported-by: Anders Roxell 
Signed-off-by: Shannon Nelson 
---
 tools/testing/selftests/net/rtnetlink.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/testing/selftests/net/rtnetlink.sh 
b/tools/testing/selftests/net/rtnetlink.sh
index b33a371..261a981 100755
--- a/tools/testing/selftests/net/rtnetlink.sh
+++ b/tools/testing/selftests/net/rtnetlink.sh
@@ -522,6 +522,8 @@ kci_test_macsec()
 #---
 kci_test_ipsec()
 {
+   ret=0
+
# find an ip address on this machine and make up a destination
srcip=`ip -o addr | awk '/inet / { print $4; }' | grep -v "^127" | head 
-1 | cut -f1 -d/`
net=`echo $srcip | cut -f1-3 -d.`
-- 
2.7.4



[PATCH v3 net-next 0/4] Updates for ipsec selftests

2018-06-26 Thread Shannon Nelson
Fix up the existing ipsec selftest and add tests for
the ipsec offload driver API.

v2: addressed formatting nits in netdevsim from Jakub Kicinski
v3: a couple more nits from Jakub

Shannon Nelson (4):
  selftests: rtnetlink: clear the return code at start of ipsec test
  selftests: rtnetlink: use dummydev as a test device
  netdevsim: add ipsec offload testing
  selftests: rtnetlink: add ipsec offload API test

 drivers/net/netdevsim/Makefile   |   4 +
 drivers/net/netdevsim/ipsec.c| 345 +++
 drivers/net/netdevsim/netdev.c   |   7 +
 drivers/net/netdevsim/netdevsim.h|  37 
 tools/testing/selftests/net/rtnetlink.sh | 132 +++-
 5 files changed, 518 insertions(+), 7 deletions(-)
 create mode 100644 drivers/net/netdevsim/ipsec.c

-- 
2.7.4



[PATCH v3 net-next 3/4] netdevsim: add ipsec offload testing

2018-06-26 Thread Shannon Nelson
Implement the IPsec/XFRM offload API for testing.

Signed-off-by: Shannon Nelson 
---
V2 - addressed formatting comments from Jakub Kicinski
V3 - a couple more little xmas tree nits

 drivers/net/netdevsim/Makefile|   4 +
 drivers/net/netdevsim/ipsec.c | 297 ++
 drivers/net/netdevsim/netdev.c|   7 +
 drivers/net/netdevsim/netdevsim.h |  41 ++
 4 files changed, 349 insertions(+)
 create mode 100644 drivers/net/netdevsim/ipsec.c

diff --git a/drivers/net/netdevsim/Makefile b/drivers/net/netdevsim/Makefile
index 449b2a1..0fee1d0 100644
--- a/drivers/net/netdevsim/Makefile
+++ b/drivers/net/netdevsim/Makefile
@@ -13,3 +13,7 @@ endif
 ifneq ($(CONFIG_NET_DEVLINK),)
 netdevsim-objs += devlink.o fib.o
 endif
+
+ifneq ($(CONFIG_XFRM_OFFLOAD),)
+netdevsim-objs += ipsec.o
+endif
diff --git a/drivers/net/netdevsim/ipsec.c b/drivers/net/netdevsim/ipsec.c
new file mode 100644
index 000..ceff544
--- /dev/null
+++ b/drivers/net/netdevsim/ipsec.c
@@ -0,0 +1,297 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2018 Oracle and/or its affiliates. All rights reserved. */
+
+#include 
+#include 
+#include 
+
+#include "netdevsim.h"
+
+#define NSIM_IPSEC_AUTH_BITS   128
+
+static ssize_t nsim_dbg_netdev_ops_read(struct file *filp,
+   char __user *buffer,
+   size_t count, loff_t *ppos)
+{
+   struct netdevsim *ns = filp->private_data;
+   struct nsim_ipsec *ipsec = >ipsec;
+   size_t bufsize;
+   char *buf, *p;
+   int len;
+   int i;
+
+   /* the buffer needed is
+* (num SAs * 3 lines each * ~60 bytes per line) + one more line
+*/
+   bufsize = (ipsec->count * 4 * 60) + 60;
+   buf = kzalloc(bufsize, GFP_KERNEL);
+   if (!buf)
+   return -ENOMEM;
+
+   p = buf;
+   p += snprintf(p, bufsize - (p - buf),
+ "SA count=%u tx=%u\n",
+ ipsec->count, ipsec->tx);
+
+   for (i = 0; i < NSIM_IPSEC_MAX_SA_COUNT; i++) {
+   struct nsim_sa *sap = >sa[i];
+
+   if (!sap->used)
+   continue;
+
+   p += snprintf(p, bufsize - (p - buf),
+ "sa[%i] %cx ipaddr=0x%08x %08x %08x %08x\n",
+ i, (sap->rx ? 'r' : 't'), sap->ipaddr[0],
+ sap->ipaddr[1], sap->ipaddr[2], sap->ipaddr[3]);
+   p += snprintf(p, bufsize - (p - buf),
+ "sa[%i]spi=0x%08x proto=0x%x salt=0x%08x 
crypt=%d\n",
+ i, be32_to_cpu(sap->xs->id.spi),
+ sap->xs->id.proto, sap->salt, sap->crypt);
+   p += snprintf(p, bufsize - (p - buf),
+ "sa[%i]key=0x%08x %08x %08x %08x\n",
+ i, sap->key[0], sap->key[1],
+ sap->key[2], sap->key[3]);
+   }
+
+   len = simple_read_from_buffer(buffer, count, ppos, buf, p - buf);
+
+   kfree(buf);
+   return len;
+}
+
+static const struct file_operations ipsec_dbg_fops = {
+   .owner = THIS_MODULE,
+   .open = simple_open,
+   .read = nsim_dbg_netdev_ops_read,
+};
+
+static int nsim_ipsec_find_empty_idx(struct nsim_ipsec *ipsec)
+{
+   u32 i;
+
+   if (ipsec->count == NSIM_IPSEC_MAX_SA_COUNT)
+   return -ENOSPC;
+
+   /* search sa table */
+   for (i = 0; i < NSIM_IPSEC_MAX_SA_COUNT; i++) {
+   if (!ipsec->sa[i].used)
+   return i;
+   }
+
+   return -ENOSPC;
+}
+
+static int nsim_ipsec_parse_proto_keys(struct xfrm_state *xs,
+  u32 *mykey, u32 *mysalt)
+{
+   const char aes_gcm_name[] = "rfc4106(gcm(aes))";
+   struct net_device *dev = xs->xso.dev;
+   unsigned char *key_data;
+   char *alg_name = NULL;
+   int key_len;
+
+   if (!xs->aead) {
+   netdev_err(dev, "Unsupported IPsec algorithm\n");
+   return -EINVAL;
+   }
+
+   if (xs->aead->alg_icv_len != NSIM_IPSEC_AUTH_BITS) {
+   netdev_err(dev, "IPsec offload requires %d bit 
authentication\n",
+  NSIM_IPSEC_AUTH_BITS);
+   return -EINVAL;
+   }
+
+   key_data = >aead->alg_key[0];
+   key_len = xs->aead->alg_key_len;
+   alg_name = xs->aead->alg_name;
+
+   if (strcmp(alg_name, aes_gcm_name)) {
+   netdev_err(dev, "Unsupported IPsec algorithm - please use %s\n",
+  aes_gcm_name);
+   return -EINVAL;
+   }
+
+   /* 160 accounts for 16 byte key and 4 byte salt */
+   if (key_len > NSIM_IPSEC_AUTH_BITS) {
+

[PATCH v3 net-next 4/4] selftests: rtnetlink: add ipsec offload API test

2018-06-26 Thread Shannon Nelson
Using the netdevsim as a device for testing, try out the XFRM commands
for setting up IPsec hardware offloads.

Signed-off-by: Shannon Nelson 
---
 tools/testing/selftests/net/rtnetlink.sh | 114 +++
 1 file changed, 114 insertions(+)

diff --git a/tools/testing/selftests/net/rtnetlink.sh 
b/tools/testing/selftests/net/rtnetlink.sh
index 15948cf..9e1a82e 100755
--- a/tools/testing/selftests/net/rtnetlink.sh
+++ b/tools/testing/selftests/net/rtnetlink.sh
@@ -608,6 +608,119 @@ kci_test_ipsec()
echo "PASS: ipsec"
 }
 
+#---
+# Example commands
+#   ip x s add proto esp src 14.0.0.52 dst 14.0.0.70 \
+#spi 0x07 mode transport reqid 0x07 replay-window 32 \
+#aead 'rfc4106(gcm(aes))' 1234567890123456dcba 128 \
+#sel src 14.0.0.52/24 dst 14.0.0.70/24
+#offload dev sim1 dir out
+#   ip x p add dir out src 14.0.0.52/24 dst 14.0.0.70/24 \
+#tmpl proto esp src 14.0.0.52 dst 14.0.0.70 \
+#spi 0x07 mode transport reqid 0x07
+#
+#---
+kci_test_ipsec_offload()
+{
+   ret=0
+   algo="aead rfc4106(gcm(aes)) 0x3132333435363738393031323334353664636261 
128"
+   srcip=192.168.123.3
+   dstip=192.168.123.4
+   dev=simx1
+   sysfsd=/sys/kernel/debug/netdevsim/$dev
+   sysfsf=$sysfsd/ipsec
+
+   # setup netdevsim since dummydev doesn't have offload support
+   modprobe netdevsim
+   check_err $?
+   if [ $ret -ne 0 ]; then
+   echo "FAIL: ipsec_offload can't load netdevsim"
+   return 1
+   fi
+
+   ip link add $dev type netdevsim
+   ip addr add $srcip dev $dev
+   ip link set $dev up
+   if [ ! -d $sysfsd ] ; then
+   echo "FAIL: ipsec_offload can't create device $dev"
+   return 1
+   fi
+   if [ ! -f $sysfsf ] ; then
+   echo "FAIL: ipsec_offload netdevsim doesn't support IPsec 
offload"
+   return 1
+   fi
+
+   # flush to be sure there's nothing configured
+   ip x s flush ; ip x p flush
+
+   # create offloaded SAs, both in and out
+   ip x p add dir out src $srcip/24 dst $dstip/24 \
+   tmpl proto esp src $srcip dst $dstip spi 9 \
+   mode transport reqid 42
+   check_err $?
+   ip x p add dir out src $dstip/24 dst $srcip/24 \
+   tmpl proto esp src $dstip dst $srcip spi 9 \
+   mode transport reqid 42
+   check_err $?
+
+   ip x s add proto esp src $srcip dst $dstip spi 9 \
+   mode transport reqid 42 $algo sel src $srcip/24 dst $dstip/24 \
+   offload dev $dev dir out
+   check_err $?
+   ip x s add proto esp src $dstip dst $srcip spi 9 \
+   mode transport reqid 42 $algo sel src $dstip/24 dst $srcip/24 \
+   offload dev $dev dir in
+   check_err $?
+   if [ $ret -ne 0 ]; then
+   echo "FAIL: ipsec_offload can't create SA"
+   return 1
+   fi
+
+   # does offload show up in ip output
+   lines=`ip x s list | grep -c "crypto offload parameters: dev $dev dir"`
+   if [ $lines -ne 2 ] ; then
+   echo "FAIL: ipsec_offload SA offload missing from list output"
+   check_err 1
+   fi
+
+   # use ping to exercise the Tx path
+   ping -I $dev -c 3 -W 1 -i 0 $dstip >/dev/null
+
+   # does driver have correct offload info
+   diff $sysfsf - << EOF
+SA count=2 tx=3
+sa[0] tx ipaddr=0x   
+sa[0]spi=0x0009 proto=0x32 salt=0x61626364 crypt=1
+sa[0]key=0x34333231 38373635 32313039 36353433
+sa[1] rx ipaddr=0x   037ba8c0
+sa[1]spi=0x0009 proto=0x32 salt=0x61626364 crypt=1
+sa[1]key=0x34333231 38373635 32313039 36353433
+EOF
+   if [ $? -ne 0 ] ; then
+   echo "FAIL: ipsec_offload incorrect driver data"
+   check_err 1
+   fi
+
+   # does offload get removed from driver
+   ip x s flush
+   ip x p flush
+   lines=`grep -c "SA count=0" $sysfsf`
+   if [ $lines -ne 1 ] ; then
+   echo "FAIL: ipsec_offload SA not removed from driver"
+   check_err 1
+   fi
+
+   # clean up any leftovers
+   ip link del $dev
+   rmmod netdevsim
+
+   if [ $ret -ne 0 ]; then
+   echo "FAIL: ipsec_offload"
+   return 1
+   fi
+   echo "PASS: ipsec_offload"
+}
+
 kci_test_gretap()
 {
testns="testns"
@@ -862,6 +975,7 @@ kci_test_rtnl()
kci_test_encap
kci_test_macsec
kci_test_ipsec
+   kci_test_ipsec_offload
 
kci_del_dummy
 }
-- 
2.7.4



[PATCH v3 net-next 2/4] selftests: rtnetlink: use dummydev as a test device

2018-06-26 Thread Shannon Nelson
We really shouldn't mess with local system settings, so let's
use the already created dummy device instead for ipsec testing.
Oh, and let's put the temp file into a proper directory.

Signed-off-by: Shannon Nelson 
---
 tools/testing/selftests/net/rtnetlink.sh | 15 +++
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/tools/testing/selftests/net/rtnetlink.sh 
b/tools/testing/selftests/net/rtnetlink.sh
index 261a981..15948cf 100755
--- a/tools/testing/selftests/net/rtnetlink.sh
+++ b/tools/testing/selftests/net/rtnetlink.sh
@@ -523,21 +523,19 @@ kci_test_macsec()
 kci_test_ipsec()
 {
ret=0
-
-   # find an ip address on this machine and make up a destination
-   srcip=`ip -o addr | awk '/inet / { print $4; }' | grep -v "^127" | head 
-1 | cut -f1 -d/`
-   net=`echo $srcip | cut -f1-3 -d.`
-   base=`echo $srcip | cut -f4 -d.`
-   dstip="$net."`expr $base + 1`
-
algo="aead rfc4106(gcm(aes)) 0x3132333435363738393031323334353664636261 
128"
+   srcip=192.168.123.1
+   dstip=192.168.123.2
+   spi=7
+
+   ip addr add $srcip dev $devdummy
 
# flush to be sure there's nothing configured
ip x s flush ; ip x p flush
check_err $?
 
# start the monitor in the background
-   tmpfile=`mktemp ipsectestXXX`
+   tmpfile=`mktemp /var/run/ipsectestXXX`
mpid=`(ip x m > $tmpfile & echo $!) 2>/dev/null`
sleep 0.2
 
@@ -601,6 +599,7 @@ kci_test_ipsec()
check_err $?
ip x p flush
check_err $?
+   ip addr del $srcip/32 dev $devdummy
 
if [ $ret -ne 0 ]; then
echo "FAIL: ipsec"
-- 
2.7.4



Re: [PATCH net-next] liquidio: fix kernel panic when NIC firmware is older than 1.7.2

2018-06-26 Thread Shannon Nelson

On 6/26/2018 4:58 AM, Felix Manlunas wrote:

From: Rick Farrington 

Pre-1.7.2 NIC firmware does not support (and does not respond to) the "get
speed" command which is sent by the 1.7.2 driver during modprobe.  Due to a
bug in older firmware (with respect to unknown commands), this unsupported
command causes a cascade of errors that ends in a kernel panic.

Fix it by making the sending of the "get speed" command conditional on the
firmware version.

Signed-off-by: Rick Farrington 
Acked-by: Derek Chickles 
Signed-off-by: Felix Manlunas 
---
Note: To avoid checkpatch.pl "WARNING: line over 80 characters", the comma
   that separates the arguments in the call to strcmp() was placed one
   line below the usual spot.

  drivers/net/ethernet/cavium/liquidio/lio_main.c | 11 ++-
  1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c 
b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index 7cb4e75..f83f884 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -3671,7 +3671,16 @@ static int setup_nic_devices(struct octeon_device 
*octeon_dev)
OCTEON_CN2350_25GB_SUBSYS_ID ||
octeon_dev->subsystem_id ==
OCTEON_CN2360_25GB_SUBSYS_ID) {
-   liquidio_get_speed(lio);
+   /* speed control unsupported in f/w older than 1.7.2 */
+   if (strcmp(octeon_dev->fw_info.liquidio_firmware_version
+  , "1.7.2") < 0) {


Will the liquidio_firmware_version ever end up something like 1.7.10? 
If so, this strcmp() may not do what you want.


sln


+   dev_info(_dev->pci_dev->dev,
+"speed setting not supported by f/w.");
+   octeon_dev->speed_setting = 25;
+   octeon_dev->no_speed_setting = 1;
+   } else {
+   liquidio_get_speed(lio);
+   }
  
  			if (octeon_dev->speed_setting == 0) {

octeon_dev->speed_setting = 25;



[PATCH v2 net-next 2/4] selftests: rtnetlink: use dummydev as a test device

2018-06-25 Thread Shannon Nelson
We really shouldn't mess with local system settings, so let's
use the already created dummy device instead for ipsec testing.
Oh, and let's put the temp file into a proper directory.

Signed-off-by: Shannon Nelson 
---
 tools/testing/selftests/net/rtnetlink.sh | 15 +++
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/tools/testing/selftests/net/rtnetlink.sh 
b/tools/testing/selftests/net/rtnetlink.sh
index 261a981..15948cf 100755
--- a/tools/testing/selftests/net/rtnetlink.sh
+++ b/tools/testing/selftests/net/rtnetlink.sh
@@ -523,21 +523,19 @@ kci_test_macsec()
 kci_test_ipsec()
 {
ret=0
-
-   # find an ip address on this machine and make up a destination
-   srcip=`ip -o addr | awk '/inet / { print $4; }' | grep -v "^127" | head 
-1 | cut -f1 -d/`
-   net=`echo $srcip | cut -f1-3 -d.`
-   base=`echo $srcip | cut -f4 -d.`
-   dstip="$net."`expr $base + 1`
-
algo="aead rfc4106(gcm(aes)) 0x3132333435363738393031323334353664636261 
128"
+   srcip=192.168.123.1
+   dstip=192.168.123.2
+   spi=7
+
+   ip addr add $srcip dev $devdummy
 
# flush to be sure there's nothing configured
ip x s flush ; ip x p flush
check_err $?
 
# start the monitor in the background
-   tmpfile=`mktemp ipsectestXXX`
+   tmpfile=`mktemp /var/run/ipsectestXXX`
mpid=`(ip x m > $tmpfile & echo $!) 2>/dev/null`
sleep 0.2
 
@@ -601,6 +599,7 @@ kci_test_ipsec()
check_err $?
ip x p flush
check_err $?
+   ip addr del $srcip/32 dev $devdummy
 
if [ $ret -ne 0 ]; then
echo "FAIL: ipsec"
-- 
2.7.4



[PATCH v2 net-next 4/4] selftests: rtnetlink: add ipsec offload API test

2018-06-25 Thread Shannon Nelson
Using the netdevsim as a device for testing, try out the XFRM commands
for setting up IPsec hardware offloads.

Signed-off-by: Shannon Nelson 
---
 tools/testing/selftests/net/rtnetlink.sh | 114 +++
 1 file changed, 114 insertions(+)

diff --git a/tools/testing/selftests/net/rtnetlink.sh 
b/tools/testing/selftests/net/rtnetlink.sh
index 15948cf..9e1a82e 100755
--- a/tools/testing/selftests/net/rtnetlink.sh
+++ b/tools/testing/selftests/net/rtnetlink.sh
@@ -608,6 +608,119 @@ kci_test_ipsec()
echo "PASS: ipsec"
 }
 
+#---
+# Example commands
+#   ip x s add proto esp src 14.0.0.52 dst 14.0.0.70 \
+#spi 0x07 mode transport reqid 0x07 replay-window 32 \
+#aead 'rfc4106(gcm(aes))' 1234567890123456dcba 128 \
+#sel src 14.0.0.52/24 dst 14.0.0.70/24
+#offload dev sim1 dir out
+#   ip x p add dir out src 14.0.0.52/24 dst 14.0.0.70/24 \
+#tmpl proto esp src 14.0.0.52 dst 14.0.0.70 \
+#spi 0x07 mode transport reqid 0x07
+#
+#---
+kci_test_ipsec_offload()
+{
+   ret=0
+   algo="aead rfc4106(gcm(aes)) 0x3132333435363738393031323334353664636261 
128"
+   srcip=192.168.123.3
+   dstip=192.168.123.4
+   dev=simx1
+   sysfsd=/sys/kernel/debug/netdevsim/$dev
+   sysfsf=$sysfsd/ipsec
+
+   # setup netdevsim since dummydev doesn't have offload support
+   modprobe netdevsim
+   check_err $?
+   if [ $ret -ne 0 ]; then
+   echo "FAIL: ipsec_offload can't load netdevsim"
+   return 1
+   fi
+
+   ip link add $dev type netdevsim
+   ip addr add $srcip dev $dev
+   ip link set $dev up
+   if [ ! -d $sysfsd ] ; then
+   echo "FAIL: ipsec_offload can't create device $dev"
+   return 1
+   fi
+   if [ ! -f $sysfsf ] ; then
+   echo "FAIL: ipsec_offload netdevsim doesn't support IPsec 
offload"
+   return 1
+   fi
+
+   # flush to be sure there's nothing configured
+   ip x s flush ; ip x p flush
+
+   # create offloaded SAs, both in and out
+   ip x p add dir out src $srcip/24 dst $dstip/24 \
+   tmpl proto esp src $srcip dst $dstip spi 9 \
+   mode transport reqid 42
+   check_err $?
+   ip x p add dir out src $dstip/24 dst $srcip/24 \
+   tmpl proto esp src $dstip dst $srcip spi 9 \
+   mode transport reqid 42
+   check_err $?
+
+   ip x s add proto esp src $srcip dst $dstip spi 9 \
+   mode transport reqid 42 $algo sel src $srcip/24 dst $dstip/24 \
+   offload dev $dev dir out
+   check_err $?
+   ip x s add proto esp src $dstip dst $srcip spi 9 \
+   mode transport reqid 42 $algo sel src $dstip/24 dst $srcip/24 \
+   offload dev $dev dir in
+   check_err $?
+   if [ $ret -ne 0 ]; then
+   echo "FAIL: ipsec_offload can't create SA"
+   return 1
+   fi
+
+   # does offload show up in ip output
+   lines=`ip x s list | grep -c "crypto offload parameters: dev $dev dir"`
+   if [ $lines -ne 2 ] ; then
+   echo "FAIL: ipsec_offload SA offload missing from list output"
+   check_err 1
+   fi
+
+   # use ping to exercise the Tx path
+   ping -I $dev -c 3 -W 1 -i 0 $dstip >/dev/null
+
+   # does driver have correct offload info
+   diff $sysfsf - << EOF
+SA count=2 tx=3
+sa[0] tx ipaddr=0x   
+sa[0]spi=0x0009 proto=0x32 salt=0x61626364 crypt=1
+sa[0]key=0x34333231 38373635 32313039 36353433
+sa[1] rx ipaddr=0x   037ba8c0
+sa[1]spi=0x0009 proto=0x32 salt=0x61626364 crypt=1
+sa[1]key=0x34333231 38373635 32313039 36353433
+EOF
+   if [ $? -ne 0 ] ; then
+   echo "FAIL: ipsec_offload incorrect driver data"
+   check_err 1
+   fi
+
+   # does offload get removed from driver
+   ip x s flush
+   ip x p flush
+   lines=`grep -c "SA count=0" $sysfsf`
+   if [ $lines -ne 1 ] ; then
+   echo "FAIL: ipsec_offload SA not removed from driver"
+   check_err 1
+   fi
+
+   # clean up any leftovers
+   ip link del $dev
+   rmmod netdevsim
+
+   if [ $ret -ne 0 ]; then
+   echo "FAIL: ipsec_offload"
+   return 1
+   fi
+   echo "PASS: ipsec_offload"
+}
+
 kci_test_gretap()
 {
testns="testns"
@@ -862,6 +975,7 @@ kci_test_rtnl()
kci_test_encap
kci_test_macsec
kci_test_ipsec
+   kci_test_ipsec_offload
 
kci_del_dummy
 }
-- 
2.7.4



[PATCH v2 net-next 3/4] netdevsim: add ipsec offload testing

2018-06-25 Thread Shannon Nelson
Implement the IPsec/XFRM offload API for testing.

Signed-off-by: Shannon Nelson 
---
V2 - addressed formatting comments from Jakub Kicinski

 drivers/net/netdevsim/Makefile|   4 +
 drivers/net/netdevsim/ipsec.c | 298 ++
 drivers/net/netdevsim/netdev.c|   7 +
 drivers/net/netdevsim/netdevsim.h |  41 ++
 4 files changed, 350 insertions(+)
 create mode 100644 drivers/net/netdevsim/ipsec.c

diff --git a/drivers/net/netdevsim/Makefile b/drivers/net/netdevsim/Makefile
index 449b2a1..0fee1d0 100644
--- a/drivers/net/netdevsim/Makefile
+++ b/drivers/net/netdevsim/Makefile
@@ -13,3 +13,7 @@ endif
 ifneq ($(CONFIG_NET_DEVLINK),)
 netdevsim-objs += devlink.o fib.o
 endif
+
+ifneq ($(CONFIG_XFRM_OFFLOAD),)
+netdevsim-objs += ipsec.o
+endif
diff --git a/drivers/net/netdevsim/ipsec.c b/drivers/net/netdevsim/ipsec.c
new file mode 100644
index 000..1a23426
--- /dev/null
+++ b/drivers/net/netdevsim/ipsec.c
@@ -0,0 +1,298 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2018 Oracle and/or its affiliates. All rights reserved. */
+
+#include 
+#include 
+#include 
+
+#include "netdevsim.h"
+
+#define NSIM_IPSEC_AUTH_BITS   128
+
+static ssize_t nsim_dbg_netdev_ops_read(struct file *filp,
+   char __user *buffer,
+   size_t count, loff_t *ppos)
+{
+   struct netdevsim *ns = filp->private_data;
+   struct nsim_ipsec *ipsec = >ipsec;
+   size_t bufsize;
+   char *buf, *p;
+   int len;
+   int i;
+
+   /* the buffer needed is
+* (num SAs * 3 lines each * ~60 bytes per line) + one more line
+*/
+   bufsize = (ipsec->count * 4 * 60) + 60;
+   buf = kzalloc(bufsize, GFP_KERNEL);
+   if (!buf)
+   return -ENOMEM;
+
+   p = buf;
+   p += snprintf(p, bufsize - (p - buf),
+ "SA count=%u tx=%u\n",
+ ipsec->count, ipsec->tx);
+
+   for (i = 0; i < NSIM_IPSEC_MAX_SA_COUNT; i++) {
+   struct nsim_sa *sap = >sa[i];
+
+   if (!sap->used)
+   continue;
+
+   p += snprintf(p, bufsize - (p - buf),
+ "sa[%i] %cx ipaddr=0x%08x %08x %08x %08x\n",
+ i, (sap->rx ? 'r' : 't'), sap->ipaddr[0],
+ sap->ipaddr[1], sap->ipaddr[2], sap->ipaddr[3]);
+   p += snprintf(p, bufsize - (p - buf),
+ "sa[%i]spi=0x%08x proto=0x%x salt=0x%08x 
crypt=%d\n",
+ i, be32_to_cpu(sap->xs->id.spi),
+ sap->xs->id.proto, sap->salt, sap->crypt);
+   p += snprintf(p, bufsize - (p - buf),
+ "sa[%i]key=0x%08x %08x %08x %08x\n",
+ i, sap->key[0], sap->key[1],
+ sap->key[2], sap->key[3]);
+   }
+
+   len = simple_read_from_buffer(buffer, count, ppos, buf, p - buf);
+
+   kfree(buf);
+   return len;
+}
+
+static const struct file_operations ipsec_dbg_fops = {
+   .owner = THIS_MODULE,
+   .open = simple_open,
+   .read = nsim_dbg_netdev_ops_read,
+};
+
+static int nsim_ipsec_find_empty_idx(struct nsim_ipsec *ipsec)
+{
+   u32 i;
+
+   if (ipsec->count == NSIM_IPSEC_MAX_SA_COUNT)
+   return -ENOSPC;
+
+   /* search sa table */
+   for (i = 0; i < NSIM_IPSEC_MAX_SA_COUNT; i++) {
+   if (!ipsec->sa[i].used)
+   return i;
+   }
+
+   return -ENOSPC;
+}
+
+static int nsim_ipsec_parse_proto_keys(struct xfrm_state *xs,
+  u32 *mykey, u32 *mysalt)
+{
+   const char aes_gcm_name[] = "rfc4106(gcm(aes))";
+   struct net_device *dev = xs->xso.dev;
+   unsigned char *key_data;
+   char *alg_name = NULL;
+   int key_len;
+
+   if (!xs->aead) {
+   netdev_err(dev, "Unsupported IPsec algorithm\n");
+   return -EINVAL;
+   }
+
+   if (xs->aead->alg_icv_len != NSIM_IPSEC_AUTH_BITS) {
+   netdev_err(dev, "IPsec offload requires %d bit 
authentication\n",
+  NSIM_IPSEC_AUTH_BITS);
+   return -EINVAL;
+   }
+
+   key_data = >aead->alg_key[0];
+   key_len = xs->aead->alg_key_len;
+   alg_name = xs->aead->alg_name;
+
+   if (strcmp(alg_name, aes_gcm_name)) {
+   netdev_err(dev, "Unsupported IPsec algorithm - please use %s\n",
+  aes_gcm_name);
+   return -EINVAL;
+   }
+
+   /* 160 accounts for 16 byte key and 4 byte salt */
+   if (key_len > NSIM_IPSEC_AUTH_BITS) {
+   *mysalt = ((u32 *)key_data)[4

[PATCH v2 net-next 0/4] Updates for ipsec selftests

2018-06-25 Thread Shannon Nelson
Fix up the existing ipsec selftest and add tests for
the ipsec offload driver API.

v2: addressed formatting nits in netdevsim from Jakub Kicinski

Shannon Nelson (4):
  selftests: rtnetlink: clear the return code at start of ipsec test
  selftests: rtnetlink: use dummydev as a test device
  netdevsim: add ipsec offload testing
  selftests: rtnetlink: add ipsec offload API test

 drivers/net/netdevsim/Makefile   |   4 +
 drivers/net/netdevsim/ipsec.c| 345 +++
 drivers/net/netdevsim/netdev.c   |   7 +
 drivers/net/netdevsim/netdevsim.h|  37 
 tools/testing/selftests/net/rtnetlink.sh | 132 +++-
 5 files changed, 518 insertions(+), 7 deletions(-)
 create mode 100644 drivers/net/netdevsim/ipsec.c

-- 
2.7.4



[PATCH v2 net-next 1/4] selftests: rtnetlink: clear the return code at start of ipsec test

2018-06-25 Thread Shannon Nelson
Following the custom from the other functions, clear the global
ret code before starting the test so as to not have previously
failed tests cause us to thing this test has failed.

Reported-by: Anders Roxell 
Signed-off-by: Shannon Nelson 
---
 tools/testing/selftests/net/rtnetlink.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/testing/selftests/net/rtnetlink.sh 
b/tools/testing/selftests/net/rtnetlink.sh
index b33a371..261a981 100755
--- a/tools/testing/selftests/net/rtnetlink.sh
+++ b/tools/testing/selftests/net/rtnetlink.sh
@@ -522,6 +522,8 @@ kci_test_macsec()
 #---
 kci_test_ipsec()
 {
+   ret=0
+
# find an ip address on this machine and make up a destination
srcip=`ip -o addr | awk '/inet / { print $4; }' | grep -v "^127" | head 
-1 | cut -f1 -d/`
net=`echo $srcip | cut -f1-3 -d.`
-- 
2.7.4



Re: [PATCH net-next 3/4] netdevsim: add ipsec offload testing

2018-06-25 Thread Shannon Nelson

On 6/22/2018 9:07 PM, Jakub Kicinski wrote:

On Fri, 22 Jun 2018 17:31:37 -0700, Shannon Nelson wrote:

Implement the IPsec/XFRM offload API for testing.

Signed-off-by: Shannon Nelson 


Thanks for the patch!  Just a number of stylistic nit picks.


diff --git a/drivers/net/netdevsim/ipsec.c b/drivers/net/netdevsim/ipsec.c
new file mode 100644
index 000..ad64266
--- /dev/null
+++ b/drivers/net/netdevsim/ipsec.c
@@ -0,0 +1,345 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2018 Oracle and/or its affiliates. All rights reserved. */
+
+#include 
+#include 
+#include 
+#include "netdevsim.h"


Other files in the driver sort headers alphabetically and put an empty
line between global and local headers.


Sure.




+#define NSIM_IPSEC_AUTH_BITS   128
+
+/**
+ * nsim_ipsec_dbg_read - read for ipsec data
+ * @filp: the opened file
+ * @buffer: where to write the data for the user to read
+ * @count: the size of the user's buffer
+ * @ppos: file position offset
+ **/
+static ssize_t nsim_dbg_netdev_ops_read(struct file *filp,


Doesn't match the kdoc.  Please run

./scripts/kernel-doc -none $file

if you want kdoc.  Although IMHO you may as well drop the kdoc, your
code is quite self explanatory and local.


By adding -v to that I got a couple of warnings that I didn't include 
the Return information - is that what you were commenting on?  The rest 
seems acceptable to the script I'm using from the net-next tree.





+   char __user *buffer,
+   size_t count, loff_t *ppos)
+{
+   struct netdevsim *ns = filp->private_data;
+   struct nsim_ipsec *ipsec = >ipsec;
+   size_t bufsize;
+   char *buf, *p;
+   int len;
+   int i;
+
+   /* don't allow partial reads */
+   if (*ppos != 0)
+   return 0;
+
+   /* the buffer needed is
+* (num SAs * 3 lines each * ~60 bytes per line) + one more line
+*/
+   bufsize = (ipsec->count * 4 * 60) + 60;
+   buf = kzalloc(bufsize, GFP_KERNEL);
+   if (!buf)
+   return -ENOMEM;
+
+   p = buf;
+   p += snprintf(p, bufsize - (p - buf),
+ "SA count=%u tx=%u\n",
+ ipsec->count, ipsec->tx);
+
+   for (i = 0; i < NSIM_IPSEC_MAX_SA_COUNT; i++) {
+   struct nsim_sa *sap = >sa[i];
+
+   if (!sap->used)
+   continue;
+
+   p += snprintf(p, bufsize - (p - buf),
+ "sa[%i] %cx ipaddr=0x%08x %08x %08x %08x\n",
+ i, (sap->rx ? 'r' : 't'), sap->ipaddr[0],
+ sap->ipaddr[1], sap->ipaddr[2], sap->ipaddr[3]);
+   p += snprintf(p, bufsize - (p - buf),
+ "sa[%i]spi=0x%08x proto=0x%x salt=0x%08x 
crypt=%d\n",
+ i, be32_to_cpu(sap->xs->id.spi),
+ sap->xs->id.proto, sap->salt, sap->crypt);
+   p += snprintf(p, bufsize - (p - buf),
+ "sa[%i]key=0x%08x %08x %08x %08x\n",
+ i, sap->key[0], sap->key[1],
+ sap->key[2], sap->key[3]);
+   }
+
+   len = simple_read_from_buffer(buffer, count, ppos, buf, p - buf);


Why not seq_file for this?


Why bother with more interface code?  This is useful enough to support 
the API testing needed.





+   kfree(buf);
+   return len;
+}
+
+static const struct file_operations ipsec_dbg_fops = {
+   .owner = THIS_MODULE,
+   .open = simple_open,
+   .read = nsim_dbg_netdev_ops_read,
+};
+
+/**
+ * nsim_ipsec_find_empty_idx - find the first unused security parameter index
+ * @ipsec: pointer to ipsec struct
+ **/
+static int nsim_ipsec_find_empty_idx(struct nsim_ipsec *ipsec)
+{
+   u32 i;
+
+   if (ipsec->count == NSIM_IPSEC_MAX_SA_COUNT)
+   return -ENOSPC;
+
+   /* search sa table */
+   for (i = 0; i < NSIM_IPSEC_MAX_SA_COUNT; i++) {
+   if (!ipsec->sa[i].used)
+   return i;
+   }
+
+   return -ENOSPC;


FWIW I personally find bitmaps and find_first_zero_bit() etc. nice and
concise for a small ID allocator, but no objection to open coding.


Sure, we could add a parallel bitmap data structure to track usage of 
our array elements, and probably would for a much larger array so as to 
lessen the impact of a serial search.  But, since this is a short array 
for simple testing purposes, the search time is minimal so I think the 
simple code is fine.





+}
+
+/**
+ * nsim_ipsec_parse_proto_keys - find the key and salt based on the protocol
+ * @xs: pointer to xfrm_state struct
+ * @mykey: pointer to key array to populate
+ * @mysalt: pointer to salt value to populate
+ *
+ * This copies the pr

Re: [PATCH net-next 3/4] netdevsim: add ipsec offload testing

2018-06-23 Thread Shannon Nelson

On 6/22/2018 9:07 PM, Jakub Kicinski wrote:

On Fri, 22 Jun 2018 17:31:37 -0700, Shannon Nelson wrote:

Implement the IPsec/XFRM offload API for testing.

Signed-off-by: Shannon Nelson 


Thanks for the patch!  Just a number of stylistic nit picks.


Thanks for the comments, I'll do a v2 in a couple of days.
sln




diff --git a/drivers/net/netdevsim/ipsec.c b/drivers/net/netdevsim/ipsec.c
new file mode 100644
index 000..ad64266
--- /dev/null
+++ b/drivers/net/netdevsim/ipsec.c
@@ -0,0 +1,345 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2018 Oracle and/or its affiliates. All rights reserved. */
+
+#include 
+#include 
+#include 
+#include "netdevsim.h"


Other files in the driver sort headers alphabetically and put an empty
line between global and local headers.


+#define NSIM_IPSEC_AUTH_BITS   128
+
+/**
+ * nsim_ipsec_dbg_read - read for ipsec data
+ * @filp: the opened file
+ * @buffer: where to write the data for the user to read
+ * @count: the size of the user's buffer
+ * @ppos: file position offset
+ **/
+static ssize_t nsim_dbg_netdev_ops_read(struct file *filp,


Doesn't match the kdoc.  Please run

./scripts/kernel-doc -none $file

if you want kdoc.  Although IMHO you may as well drop the kdoc, your
code is quite self explanatory and local.


+   char __user *buffer,
+   size_t count, loff_t *ppos)
+{
+   struct netdevsim *ns = filp->private_data;
+   struct nsim_ipsec *ipsec = >ipsec;
+   size_t bufsize;
+   char *buf, *p;
+   int len;
+   int i;
+
+   /* don't allow partial reads */
+   if (*ppos != 0)
+   return 0;
+
+   /* the buffer needed is
+* (num SAs * 3 lines each * ~60 bytes per line) + one more line
+*/
+   bufsize = (ipsec->count * 4 * 60) + 60;
+   buf = kzalloc(bufsize, GFP_KERNEL);
+   if (!buf)
+   return -ENOMEM;
+
+   p = buf;
+   p += snprintf(p, bufsize - (p - buf),
+ "SA count=%u tx=%u\n",
+ ipsec->count, ipsec->tx);
+
+   for (i = 0; i < NSIM_IPSEC_MAX_SA_COUNT; i++) {
+   struct nsim_sa *sap = >sa[i];
+
+   if (!sap->used)
+   continue;
+
+   p += snprintf(p, bufsize - (p - buf),
+ "sa[%i] %cx ipaddr=0x%08x %08x %08x %08x\n",
+ i, (sap->rx ? 'r' : 't'), sap->ipaddr[0],
+ sap->ipaddr[1], sap->ipaddr[2], sap->ipaddr[3]);
+   p += snprintf(p, bufsize - (p - buf),
+ "sa[%i]spi=0x%08x proto=0x%x salt=0x%08x 
crypt=%d\n",
+ i, be32_to_cpu(sap->xs->id.spi),
+ sap->xs->id.proto, sap->salt, sap->crypt);
+   p += snprintf(p, bufsize - (p - buf),
+ "sa[%i]key=0x%08x %08x %08x %08x\n",
+ i, sap->key[0], sap->key[1],
+ sap->key[2], sap->key[3]);
+   }
+
+   len = simple_read_from_buffer(buffer, count, ppos, buf, p - buf);


Why not seq_file for this?


+   kfree(buf);
+   return len;
+}
+
+static const struct file_operations ipsec_dbg_fops = {
+   .owner = THIS_MODULE,
+   .open = simple_open,
+   .read = nsim_dbg_netdev_ops_read,
+};
+
+/**
+ * nsim_ipsec_find_empty_idx - find the first unused security parameter index
+ * @ipsec: pointer to ipsec struct
+ **/
+static int nsim_ipsec_find_empty_idx(struct nsim_ipsec *ipsec)
+{
+   u32 i;
+
+   if (ipsec->count == NSIM_IPSEC_MAX_SA_COUNT)
+   return -ENOSPC;
+
+   /* search sa table */
+   for (i = 0; i < NSIM_IPSEC_MAX_SA_COUNT; i++) {
+   if (!ipsec->sa[i].used)
+   return i;
+   }
+
+   return -ENOSPC;


FWIW I personally find bitmaps and find_first_zero_bit() etc. nice and
concise for a small ID allocator, but no objection to open coding.


+}
+
+/**
+ * nsim_ipsec_parse_proto_keys - find the key and salt based on the protocol
+ * @xs: pointer to xfrm_state struct
+ * @mykey: pointer to key array to populate
+ * @mysalt: pointer to salt value to populate
+ *
+ * This copies the protocol keys and salt to our own data tables.  The
+ * 82599 family only supports the one algorithm.


82599 is a fine chip, it's not netdevsim tho? ;)


+ **/
+static int nsim_ipsec_parse_proto_keys(struct xfrm_state *xs,
+  u32 *mykey, u32 *mysalt)
+{
+   struct net_device *dev = xs->xso.dev;
+   unsigned char *key_data;
+   char *alg_name = NULL;
+   const char aes_gcm_name[] = "rfc4106(gcm(aes))";
+   int key_len;


reverse xmas tree please


+
+   if (!xs->aead) {
+   netdev_

[PATCH net-next 3/4] netdevsim: add ipsec offload testing

2018-06-22 Thread Shannon Nelson
Implement the IPsec/XFRM offload API for testing.

Signed-off-by: Shannon Nelson 
---
 drivers/net/netdevsim/Makefile|   4 +
 drivers/net/netdevsim/ipsec.c | 345 ++
 drivers/net/netdevsim/netdev.c|   7 +
 drivers/net/netdevsim/netdevsim.h |  37 
 4 files changed, 393 insertions(+)
 create mode 100644 drivers/net/netdevsim/ipsec.c

diff --git a/drivers/net/netdevsim/Makefile b/drivers/net/netdevsim/Makefile
index 449b2a1..0fee1d0 100644
--- a/drivers/net/netdevsim/Makefile
+++ b/drivers/net/netdevsim/Makefile
@@ -13,3 +13,7 @@ endif
 ifneq ($(CONFIG_NET_DEVLINK),)
 netdevsim-objs += devlink.o fib.o
 endif
+
+ifneq ($(CONFIG_XFRM_OFFLOAD),)
+netdevsim-objs += ipsec.o
+endif
diff --git a/drivers/net/netdevsim/ipsec.c b/drivers/net/netdevsim/ipsec.c
new file mode 100644
index 000..ad64266
--- /dev/null
+++ b/drivers/net/netdevsim/ipsec.c
@@ -0,0 +1,345 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2018 Oracle and/or its affiliates. All rights reserved. */
+
+#include 
+#include 
+#include 
+#include "netdevsim.h"
+
+#define NSIM_IPSEC_AUTH_BITS   128
+
+/**
+ * nsim_ipsec_dbg_read - read for ipsec data
+ * @filp: the opened file
+ * @buffer: where to write the data for the user to read
+ * @count: the size of the user's buffer
+ * @ppos: file position offset
+ **/
+static ssize_t nsim_dbg_netdev_ops_read(struct file *filp,
+   char __user *buffer,
+   size_t count, loff_t *ppos)
+{
+   struct netdevsim *ns = filp->private_data;
+   struct nsim_ipsec *ipsec = >ipsec;
+   size_t bufsize;
+   char *buf, *p;
+   int len;
+   int i;
+
+   /* don't allow partial reads */
+   if (*ppos != 0)
+   return 0;
+
+   /* the buffer needed is
+* (num SAs * 3 lines each * ~60 bytes per line) + one more line
+*/
+   bufsize = (ipsec->count * 4 * 60) + 60;
+   buf = kzalloc(bufsize, GFP_KERNEL);
+   if (!buf)
+   return -ENOMEM;
+
+   p = buf;
+   p += snprintf(p, bufsize - (p - buf),
+ "SA count=%u tx=%u\n",
+ ipsec->count, ipsec->tx);
+
+   for (i = 0; i < NSIM_IPSEC_MAX_SA_COUNT; i++) {
+   struct nsim_sa *sap = >sa[i];
+
+   if (!sap->used)
+   continue;
+
+   p += snprintf(p, bufsize - (p - buf),
+ "sa[%i] %cx ipaddr=0x%08x %08x %08x %08x\n",
+ i, (sap->rx ? 'r' : 't'), sap->ipaddr[0],
+ sap->ipaddr[1], sap->ipaddr[2], sap->ipaddr[3]);
+   p += snprintf(p, bufsize - (p - buf),
+ "sa[%i]spi=0x%08x proto=0x%x salt=0x%08x 
crypt=%d\n",
+ i, be32_to_cpu(sap->xs->id.spi),
+ sap->xs->id.proto, sap->salt, sap->crypt);
+   p += snprintf(p, bufsize - (p - buf),
+ "sa[%i]key=0x%08x %08x %08x %08x\n",
+ i, sap->key[0], sap->key[1],
+ sap->key[2], sap->key[3]);
+   }
+
+   len = simple_read_from_buffer(buffer, count, ppos, buf, p - buf);
+
+   kfree(buf);
+   return len;
+}
+
+static const struct file_operations ipsec_dbg_fops = {
+   .owner = THIS_MODULE,
+   .open = simple_open,
+   .read = nsim_dbg_netdev_ops_read,
+};
+
+/**
+ * nsim_ipsec_find_empty_idx - find the first unused security parameter index
+ * @ipsec: pointer to ipsec struct
+ **/
+static int nsim_ipsec_find_empty_idx(struct nsim_ipsec *ipsec)
+{
+   u32 i;
+
+   if (ipsec->count == NSIM_IPSEC_MAX_SA_COUNT)
+   return -ENOSPC;
+
+   /* search sa table */
+   for (i = 0; i < NSIM_IPSEC_MAX_SA_COUNT; i++) {
+   if (!ipsec->sa[i].used)
+   return i;
+   }
+
+   return -ENOSPC;
+}
+
+/**
+ * nsim_ipsec_parse_proto_keys - find the key and salt based on the protocol
+ * @xs: pointer to xfrm_state struct
+ * @mykey: pointer to key array to populate
+ * @mysalt: pointer to salt value to populate
+ *
+ * This copies the protocol keys and salt to our own data tables.  The
+ * 82599 family only supports the one algorithm.
+ **/
+static int nsim_ipsec_parse_proto_keys(struct xfrm_state *xs,
+  u32 *mykey, u32 *mysalt)
+{
+   struct net_device *dev = xs->xso.dev;
+   unsigned char *key_data;
+   char *alg_name = NULL;
+   const char aes_gcm_name[] = "rfc4106(gcm(aes))";
+   int key_len;
+
+   if (!xs->aead) {
+   netdev_err(dev, "Unsupported IPsec algorithm\n");
+   return -EINVAL;
+   }
+
+   if (xs->aead->alg_icv_l

[PATCH net-next 0/4] Updates for ipsec selftests

2018-06-22 Thread Shannon Nelson
Fix up the existing ipsec selftest and add tests for
the ipsec offload driver API.

Shannon Nelson (4):
  selftests: rtnetlink: clear the return code at start of ipsec test
  selftests: rtnetlink: use dummydev as a test device
  netdevsim: add ipsec offload testing
  selftests: rtnetlink: add ipsec offload API test

 drivers/net/netdevsim/Makefile   |   4 +
 drivers/net/netdevsim/ipsec.c| 345 +++
 drivers/net/netdevsim/netdev.c   |   7 +
 drivers/net/netdevsim/netdevsim.h|  37 
 tools/testing/selftests/net/rtnetlink.sh | 132 +++-
 5 files changed, 518 insertions(+), 7 deletions(-)
 create mode 100644 drivers/net/netdevsim/ipsec.c

-- 
2.7.4



[PATCH net-next 4/4] selftests: rtnetlink: add ipsec offload API test

2018-06-22 Thread Shannon Nelson
Using the netdevsim as a device for testing, try out the XFRM commands
for setting up IPsec hardware offloads.

Signed-off-by: Shannon Nelson 
---
 tools/testing/selftests/net/rtnetlink.sh | 114 +++
 1 file changed, 114 insertions(+)

diff --git a/tools/testing/selftests/net/rtnetlink.sh 
b/tools/testing/selftests/net/rtnetlink.sh
index 15948cf..9e1a82e 100755
--- a/tools/testing/selftests/net/rtnetlink.sh
+++ b/tools/testing/selftests/net/rtnetlink.sh
@@ -608,6 +608,119 @@ kci_test_ipsec()
echo "PASS: ipsec"
 }
 
+#---
+# Example commands
+#   ip x s add proto esp src 14.0.0.52 dst 14.0.0.70 \
+#spi 0x07 mode transport reqid 0x07 replay-window 32 \
+#aead 'rfc4106(gcm(aes))' 1234567890123456dcba 128 \
+#sel src 14.0.0.52/24 dst 14.0.0.70/24
+#offload dev sim1 dir out
+#   ip x p add dir out src 14.0.0.52/24 dst 14.0.0.70/24 \
+#tmpl proto esp src 14.0.0.52 dst 14.0.0.70 \
+#spi 0x07 mode transport reqid 0x07
+#
+#---
+kci_test_ipsec_offload()
+{
+   ret=0
+   algo="aead rfc4106(gcm(aes)) 0x3132333435363738393031323334353664636261 
128"
+   srcip=192.168.123.3
+   dstip=192.168.123.4
+   dev=simx1
+   sysfsd=/sys/kernel/debug/netdevsim/$dev
+   sysfsf=$sysfsd/ipsec
+
+   # setup netdevsim since dummydev doesn't have offload support
+   modprobe netdevsim
+   check_err $?
+   if [ $ret -ne 0 ]; then
+   echo "FAIL: ipsec_offload can't load netdevsim"
+   return 1
+   fi
+
+   ip link add $dev type netdevsim
+   ip addr add $srcip dev $dev
+   ip link set $dev up
+   if [ ! -d $sysfsd ] ; then
+   echo "FAIL: ipsec_offload can't create device $dev"
+   return 1
+   fi
+   if [ ! -f $sysfsf ] ; then
+   echo "FAIL: ipsec_offload netdevsim doesn't support IPsec 
offload"
+   return 1
+   fi
+
+   # flush to be sure there's nothing configured
+   ip x s flush ; ip x p flush
+
+   # create offloaded SAs, both in and out
+   ip x p add dir out src $srcip/24 dst $dstip/24 \
+   tmpl proto esp src $srcip dst $dstip spi 9 \
+   mode transport reqid 42
+   check_err $?
+   ip x p add dir out src $dstip/24 dst $srcip/24 \
+   tmpl proto esp src $dstip dst $srcip spi 9 \
+   mode transport reqid 42
+   check_err $?
+
+   ip x s add proto esp src $srcip dst $dstip spi 9 \
+   mode transport reqid 42 $algo sel src $srcip/24 dst $dstip/24 \
+   offload dev $dev dir out
+   check_err $?
+   ip x s add proto esp src $dstip dst $srcip spi 9 \
+   mode transport reqid 42 $algo sel src $dstip/24 dst $srcip/24 \
+   offload dev $dev dir in
+   check_err $?
+   if [ $ret -ne 0 ]; then
+   echo "FAIL: ipsec_offload can't create SA"
+   return 1
+   fi
+
+   # does offload show up in ip output
+   lines=`ip x s list | grep -c "crypto offload parameters: dev $dev dir"`
+   if [ $lines -ne 2 ] ; then
+   echo "FAIL: ipsec_offload SA offload missing from list output"
+   check_err 1
+   fi
+
+   # use ping to exercise the Tx path
+   ping -I $dev -c 3 -W 1 -i 0 $dstip >/dev/null
+
+   # does driver have correct offload info
+   diff $sysfsf - << EOF
+SA count=2 tx=3
+sa[0] tx ipaddr=0x   
+sa[0]spi=0x0009 proto=0x32 salt=0x61626364 crypt=1
+sa[0]key=0x34333231 38373635 32313039 36353433
+sa[1] rx ipaddr=0x   037ba8c0
+sa[1]spi=0x0009 proto=0x32 salt=0x61626364 crypt=1
+sa[1]key=0x34333231 38373635 32313039 36353433
+EOF
+   if [ $? -ne 0 ] ; then
+   echo "FAIL: ipsec_offload incorrect driver data"
+   check_err 1
+   fi
+
+   # does offload get removed from driver
+   ip x s flush
+   ip x p flush
+   lines=`grep -c "SA count=0" $sysfsf`
+   if [ $lines -ne 1 ] ; then
+   echo "FAIL: ipsec_offload SA not removed from driver"
+   check_err 1
+   fi
+
+   # clean up any leftovers
+   ip link del $dev
+   rmmod netdevsim
+
+   if [ $ret -ne 0 ]; then
+   echo "FAIL: ipsec_offload"
+   return 1
+   fi
+   echo "PASS: ipsec_offload"
+}
+
 kci_test_gretap()
 {
testns="testns"
@@ -862,6 +975,7 @@ kci_test_rtnl()
kci_test_encap
kci_test_macsec
kci_test_ipsec
+   kci_test_ipsec_offload
 
kci_del_dummy
 }
-- 
2.7.4



[PATCH net-next 2/4] selftests: rtnetlink: use dummydev as a test device

2018-06-22 Thread Shannon Nelson
We really shouldn't mess with local system settings, so let's
use the already created dummy device instead for ipsec testing.
Oh, and let's put the temp file into a proper directory.

Signed-off-by: Shannon Nelson 
---
 tools/testing/selftests/net/rtnetlink.sh | 15 +++
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/tools/testing/selftests/net/rtnetlink.sh 
b/tools/testing/selftests/net/rtnetlink.sh
index 261a981..15948cf 100755
--- a/tools/testing/selftests/net/rtnetlink.sh
+++ b/tools/testing/selftests/net/rtnetlink.sh
@@ -523,21 +523,19 @@ kci_test_macsec()
 kci_test_ipsec()
 {
ret=0
-
-   # find an ip address on this machine and make up a destination
-   srcip=`ip -o addr | awk '/inet / { print $4; }' | grep -v "^127" | head 
-1 | cut -f1 -d/`
-   net=`echo $srcip | cut -f1-3 -d.`
-   base=`echo $srcip | cut -f4 -d.`
-   dstip="$net."`expr $base + 1`
-
algo="aead rfc4106(gcm(aes)) 0x3132333435363738393031323334353664636261 
128"
+   srcip=192.168.123.1
+   dstip=192.168.123.2
+   spi=7
+
+   ip addr add $srcip dev $devdummy
 
# flush to be sure there's nothing configured
ip x s flush ; ip x p flush
check_err $?
 
# start the monitor in the background
-   tmpfile=`mktemp ipsectestXXX`
+   tmpfile=`mktemp /var/run/ipsectestXXX`
mpid=`(ip x m > $tmpfile & echo $!) 2>/dev/null`
sleep 0.2
 
@@ -601,6 +599,7 @@ kci_test_ipsec()
check_err $?
ip x p flush
check_err $?
+   ip addr del $srcip/32 dev $devdummy
 
if [ $ret -ne 0 ]; then
echo "FAIL: ipsec"
-- 
2.7.4



[PATCH net-next 1/4] selftests: rtnetlink: clear the return code at start of ipsec test

2018-06-22 Thread Shannon Nelson
Following the custom from the other functions, clear the global
ret code before starting the test so as to not have previously
failed tests cause us to think this test has failed.

Reported-by: Anders Roxell 
Signed-off-by: Shannon Nelson 
---
 tools/testing/selftests/net/rtnetlink.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/testing/selftests/net/rtnetlink.sh 
b/tools/testing/selftests/net/rtnetlink.sh
index b33a371..261a981 100755
--- a/tools/testing/selftests/net/rtnetlink.sh
+++ b/tools/testing/selftests/net/rtnetlink.sh
@@ -522,6 +522,8 @@ kci_test_macsec()
 #---
 kci_test_ipsec()
 {
+   ret=0
+
# find an ip address on this machine and make up a destination
srcip=`ip -o addr | awk '/inet / { print $4; }' | grep -v "^127" | head 
-1 | cut -f1 -d/`
net=`echo $srcip | cut -f1-3 -d.`
-- 
2.7.4



offload_handle issue in ipsec offload

2018-06-22 Thread Shannon Nelson

Hi Steffen,

While adding the ipsec-offload API to netdevsim I ran across an issue 
with the use of x->xso.offload_handle that I think needs attention, and 
would like your opinion before I try to address it.


The offload_handle is essentially an opaque magic cookie to be used by 
the driver to help track the driver's offload information.  As such, the 
driver fills it in when an SA is setup for an offload, and then the 
offload_handle value can be used in the driver's xmit handler to quickly 
find the device specific config information for setting up each packet.


Since this is driver specific information the stack code, e.g. 
xfrm_dev_offload_ok() and validate_xmit_xfrm() and a couple of esp 
functions, really shouldn't be trying to use it.  However, since they 
are checking offload_handle for == 0 to see if it has been used, the 
drivers need to be sure they have set it to non-zero.  This requirement 
is not clear in the API description at the moment.


I ran across this because my addition to netdevsim uses offload_handle 
to store the index of the array item that has the related SA 
information.  I found in testing that the Tx offload wasn't getting 
called because the array index was 0 and so xfrm_dev_offload_ok() would 
think there was no offload and bypass the offloaded Tx.  It turns out 
that the ixgbe implementation has the same bug, but I missed this in 
earlier testing.  The Mellanox driver doesn't run into this problem 
because they are storing a struct pointer rather than an array index.


Does the stack really need to check offload_handle?  I don't think it 
does, but perhaps I'm missing something.  Here's where I found it used:

 - validate_xmit_xfrm()   - added in 3dca3f38
 - xfrm_dev_offload_ok()  - added in original patch d77e38e6
 - esp4_gso_segment() - added in 3dca3f38
 - esp_xmit() - added in fca11ebd
 - esp6_gso_segment() - added in 3dca3f38
 - esp6_xmit()- added in 3dca3f38

I think in all cases it is unnecessary and could be pulled out.

If these checks need to be left in, then any client drivers need to be 
sure this is a non-zero value, possibly by adding a VALID bit to their 
opaque value.  The ixgbe and netdevsim implementations would be fine 
with using a high bit as a VALID flag, but that might not play well with 
drivers like Mellanox that store a struct address.


So, after all that... shall we remove the offload_handle references in 
the stack code?


sln

--
==
Shannon Nelson   shannon.nel...@oracle.com
Parents can't afford to be squeamish


Re: [PATCH net-next 0/2] fixes for ipsec selftests

2018-06-22 Thread Shannon Nelson

On 6/21/2018 9:49 PM, David Miller wrote:

From: Shannon Nelson 
Date: Tue, 19 Jun 2018 22:42:41 -0700


A couple of bad behaviors in the ipsec selftest were pointed out
by Anders Roxell  and are addressed here.

Shannon Nelson (2):
   selftests: rtnetlink: hide complaint from terminated monitor
   selftests: rtnetlink: use a local IP address for IPsec tests


Series applied, but I wonder about patch #2.

The idea is that we don't make modifications to the actual system
networking configuration and therefore make changes that can't
possibly disrupt connectivity for the system under test.

Using a configured local IP address seems to subvert that.


Yeah, I'm not so thrilled with it either.  I've got a couple more 
changes coming Real Soon Now that extend netdevsim and add a couple of 
tests for ipsec-hw-offload, so while I finish those up I can change this 
again and make use of netdevsim to leave existing devices alone.


For that matter, if you want to cut down on patch thrash, just drop patch 2.

sln


Re: [PATCH net-next 0/2] fixes for ipsec selftests

2018-06-21 Thread Shannon Nelson

On 6/21/2018 9:56 AM, Anders Roxell wrote:

On Thu, 21 Jun 2018 at 02:32, Shannon Nelson  wrote:


On 6/20/2018 4:18 PM, Anders Roxell wrote:

On Thu, 21 Jun 2018 at 00:26, Shannon Nelson  wrote:


On 6/20/2018 12:09 PM, Anders Roxell wrote:

On Wed, 20 Jun 2018 at 07:42, Shannon Nelson  wrote:


A couple of bad behaviors in the ipsec selftest were pointed out
by Anders Roxell  and are addressed here.

Shannon Nelson (2):
 selftests: rtnetlink: hide complaint from terminated monitor
 selftests: rtnetlink: use a local IP address for IPsec tests

tools/testing/selftests/net/rtnetlink.sh | 11 +++
1 file changed, 7 insertions(+), 4 deletions(-)

--
2.7.4



Hi Shannon,

With this patches applied and my config patch.

I still get this error when I run the ipsec test:

FAIL: can't add fou port , skipping test
RTNETLINK answers: Operation not supported
FAIL: can't add macsec interface, skipping test
RTNETLINK answers: Protocol not supported
RTNETLINK answers: No such process
RTNETLINK answers: No such process
FAIL: ipsec


One of the odd things I noticed about this script is that there really
aren't any diagnosis messages, just PASS or FAIL.  I followed this
custom when I added the ipsec tests, but I think this is something that
should change so we can get some idea of what breaks.

I'm curious about the "RTNETLINK answers" messages and where they might
be coming from, especially "RTNETLINK answers: Protocol not supported".


I added: "set -x" in the beginning of the rtnetlink.sh script.
+ ip x s add proto esp src 10.66.17.140 dst 10.66.17.141 spi 0x07 mode
transport reqid 0x07 replay-window 32 aead 'rfc4106(gcm(aes))'
0x3132333435
363738393031323334353664636261 128 sel src 10.66.17.140/24 dst 10.66.17.141/24
RTNETLINK answers: Protocol not supported


Okay, so ip didn't like this command...


What are the XFRM and AES settings in your kernel config - what is the
output from
  egrep -i "xfrm|_aes" .config


CONFIG_XFRM=y
CONFIG_XFRM_ALGO=y
CONFIG_XFRM_USER=y
CONFIG_INET_XFRM_MODE_TUNNEL=y
CONFIG_INET6_XFRM_MODE_TRANSPORT=y
CONFIG_INET6_XFRM_MODE_TUNNEL=y
CONFIG_INET6_XFRM_MODE_BEET=y
CONFIG_CRYPTO_AES=y


And this is probably why - there seem to be a few config variables
missing, including CONFIG_INET_XFRM_MODE_TRANSPORT, which might be why
the ip command fails above.

Here's what I have in my config:
CONFIG_XFRM=y
CONFIG_XFRM_OFFLOAD=y
CONFIG_XFRM_ALGO=m
CONFIG_XFRM_USER=m
# CONFIG_XFRM_SUB_POLICY is not set
# CONFIG_XFRM_MIGRATE is not set
CONFIG_XFRM_STATISTICS=y
CONFIG_XFRM_IPCOMP=m
CONFIG_INET_XFRM_TUNNEL=m
CONFIG_INET_XFRM_MODE_TRANSPORT=m
CONFIG_INET_XFRM_MODE_TUNNEL=m
CONFIG_INET_XFRM_MODE_BEET=m
CONFIG_INET6_XFRM_TUNNEL=m
CONFIG_INET6_XFRM_MODE_TRANSPORT=m
CONFIG_INET6_XFRM_MODE_TUNNEL=m
CONFIG_INET6_XFRM_MODE_BEET=m
CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION=m
CONFIG_SECURITY_NETWORK_XFRM=y
CONFIG_CRYPTO_AES=y
# CONFIG_CRYPTO_AES_TI is not set
CONFIG_CRYPTO_AES_X86_64=m
CONFIG_CRYPTO_AES_NI_INTEL=m
CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64=m
CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64=m
CONFIG_CRYPTO_DEV_PADLOCK_AES=m

Can I talk you into adding CONFIG_INET_XFRM_MODE_TRANSPORT to your
config


Yes you can.


and trying again?


same issue with CONFIG_INET_XFRM_MODE_TRANSPORT=y


Interesting.  I took only CONFIG_INET_XFRM_MODE_TRANSPORT out of my 
config and was able to see the "Protocol not supported" message.  I'm 
not familiar enough with the crypto algorithm setup, but I suspect 
there's a combination of the other missing CONFIGs that are needed along 
with CONFIG_INET_XFRM_MODE_TRANSPORT.


My knee-jerk reaction voice wants to say this is the test working as 
expected, pointing out to us that the kernel config is not up to what it 
should be.  However, perhaps a better answer is that the test should be 
reworked to just skip the rest if it can't set up the expected test 
environment, as is done in the macsec case.


So the remaining question then is should the test be marked as failed, 
as in the macsec test if it can't set up it's interface, or just skipped?


sln



Cheers,
Anders



Re: [PATCH net-next 0/2] fixes for ipsec selftests

2018-06-20 Thread Shannon Nelson

On 6/20/2018 4:18 PM, Anders Roxell wrote:

On Thu, 21 Jun 2018 at 00:26, Shannon Nelson  wrote:


On 6/20/2018 12:09 PM, Anders Roxell wrote:

On Wed, 20 Jun 2018 at 07:42, Shannon Nelson  wrote:


A couple of bad behaviors in the ipsec selftest were pointed out
by Anders Roxell  and are addressed here.

Shannon Nelson (2):
selftests: rtnetlink: hide complaint from terminated monitor
selftests: rtnetlink: use a local IP address for IPsec tests

   tools/testing/selftests/net/rtnetlink.sh | 11 +++
   1 file changed, 7 insertions(+), 4 deletions(-)

--
2.7.4



Hi Shannon,

With this patches applied and my config patch.

I still get this error when I run the ipsec test:

FAIL: can't add fou port , skipping test
RTNETLINK answers: Operation not supported
FAIL: can't add macsec interface, skipping test
RTNETLINK answers: Protocol not supported
RTNETLINK answers: No such process
RTNETLINK answers: No such process
FAIL: ipsec


One of the odd things I noticed about this script is that there really
aren't any diagnosis messages, just PASS or FAIL.  I followed this
custom when I added the ipsec tests, but I think this is something that
should change so we can get some idea of what breaks.

I'm curious about the "RTNETLINK answers" messages and where they might
be coming from, especially "RTNETLINK answers: Protocol not supported".


I added: "set -x" in the beginning of the rtnetlink.sh script.
+ ip x s add proto esp src 10.66.17.140 dst 10.66.17.141 spi 0x07 mode
transport reqid 0x07 replay-window 32 aead 'rfc4106(gcm(aes))'
0x3132333435
363738393031323334353664636261 128 sel src 10.66.17.140/24 dst 10.66.17.141/24
RTNETLINK answers: Protocol not supported


Okay, so ip didn't like this command...


What are the XFRM and AES settings in your kernel config - what is the
output from
 egrep -i "xfrm|_aes" .config


CONFIG_XFRM=y
CONFIG_XFRM_ALGO=y
CONFIG_XFRM_USER=y
CONFIG_INET_XFRM_MODE_TUNNEL=y
CONFIG_INET6_XFRM_MODE_TRANSPORT=y
CONFIG_INET6_XFRM_MODE_TUNNEL=y
CONFIG_INET6_XFRM_MODE_BEET=y
CONFIG_CRYPTO_AES=y


And this is probably why - there seem to be a few config variables 
missing, including CONFIG_INET_XFRM_MODE_TRANSPORT, which might be why 
the ip command fails above.


Here's what I have in my config:
CONFIG_XFRM=y
CONFIG_XFRM_OFFLOAD=y
CONFIG_XFRM_ALGO=m
CONFIG_XFRM_USER=m
# CONFIG_XFRM_SUB_POLICY is not set
# CONFIG_XFRM_MIGRATE is not set
CONFIG_XFRM_STATISTICS=y
CONFIG_XFRM_IPCOMP=m
CONFIG_INET_XFRM_TUNNEL=m
CONFIG_INET_XFRM_MODE_TRANSPORT=m
CONFIG_INET_XFRM_MODE_TUNNEL=m
CONFIG_INET_XFRM_MODE_BEET=m
CONFIG_INET6_XFRM_TUNNEL=m
CONFIG_INET6_XFRM_MODE_TRANSPORT=m
CONFIG_INET6_XFRM_MODE_TUNNEL=m
CONFIG_INET6_XFRM_MODE_BEET=m
CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION=m
CONFIG_SECURITY_NETWORK_XFRM=y
CONFIG_CRYPTO_AES=y
# CONFIG_CRYPTO_AES_TI is not set
CONFIG_CRYPTO_AES_X86_64=m
CONFIG_CRYPTO_AES_NI_INTEL=m
CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64=m
CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64=m
CONFIG_CRYPTO_DEV_PADLOCK_AES=m

Can I talk you into adding CONFIG_INET_XFRM_MODE_TRANSPORT to your 
config and trying again?


sln



Re: [PATCH net-next 0/2] fixes for ipsec selftests

2018-06-20 Thread Shannon Nelson

On 6/20/2018 12:09 PM, Anders Roxell wrote:

On Wed, 20 Jun 2018 at 07:42, Shannon Nelson  wrote:


A couple of bad behaviors in the ipsec selftest were pointed out
by Anders Roxell  and are addressed here.

Shannon Nelson (2):
   selftests: rtnetlink: hide complaint from terminated monitor
   selftests: rtnetlink: use a local IP address for IPsec tests

  tools/testing/selftests/net/rtnetlink.sh | 11 +++
  1 file changed, 7 insertions(+), 4 deletions(-)

--
2.7.4



Hi Shannon,

With this patches applied and my config patch.

I still get this error when I run the ipsec test:

FAIL: can't add fou port , skipping test
RTNETLINK answers: Operation not supported
FAIL: can't add macsec interface, skipping test
RTNETLINK answers: Protocol not supported
RTNETLINK answers: No such process
RTNETLINK answers: No such process
FAIL: ipsec


One of the odd things I noticed about this script is that there really 
aren't any diagnosis messages, just PASS or FAIL.  I followed this 
custom when I added the ipsec tests, but I think this is something that 
should change so we can get some idea of what breaks.


I'm curious about the "RTNETLINK answers" messages and where they might 
be coming from, especially "RTNETLINK answers: Protocol not supported". 
What version of iproute2 are you using?  Is it older than iproute2-ss130716?


What distro and kernel are you running?

What are the XFRM and AES settings in your kernel config - what is the 
output from

egrep -i "xfrm|_aes" .config

I did also notice that the ipsec test should set ret=0 at its start. 
Can you either add this or comment out all the other tests in 
kci_test_rtnl() so that only the kci_test_ipsec is run and send me the 
output?


Thanks,
sln




Can you please cc the kselftest list when sending patches to
tools/testing/selftests/ ?

Cheers,
Anders



[PATCH net-next 2/2] selftests: rtnetlink: use a local IP address for IPsec tests

2018-06-19 Thread Shannon Nelson
Find an IP address on this machine to use as a source IP, and
make up a destination IP address based on the source IP.  No
actual messages will be sent, just a couple of IPsec rules are
created and deleted.

Fixes: 5e596ee171ba ("selftests: add xfrm state-policy-monitor to rtnetlink.sh")
Reported-by: Anders Roxell 
Signed-off-by: Shannon Nelson 
---
 tools/testing/selftests/net/rtnetlink.sh | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/net/rtnetlink.sh 
b/tools/testing/selftests/net/rtnetlink.sh
index 0a2bc6e..b33a371 100755
--- a/tools/testing/selftests/net/rtnetlink.sh
+++ b/tools/testing/selftests/net/rtnetlink.sh
@@ -522,8 +522,12 @@ kci_test_macsec()
 #---
 kci_test_ipsec()
 {
-   srcip="14.0.0.52"
-   dstip="14.0.0.70"
+   # find an ip address on this machine and make up a destination
+   srcip=`ip -o addr | awk '/inet / { print $4; }' | grep -v "^127" | head 
-1 | cut -f1 -d/`
+   net=`echo $srcip | cut -f1-3 -d.`
+   base=`echo $srcip | cut -f4 -d.`
+   dstip="$net."`expr $base + 1`
+
algo="aead rfc4106(gcm(aes)) 0x3132333435363738393031323334353664636261 
128"
 
# flush to be sure there's nothing configured
-- 
2.7.4



[PATCH net-next 0/2] fixes for ipsec selftests

2018-06-19 Thread Shannon Nelson
A couple of bad behaviors in the ipsec selftest were pointed out
by Anders Roxell  and are addressed here.

Shannon Nelson (2):
  selftests: rtnetlink: hide complaint from terminated monitor
  selftests: rtnetlink: use a local IP address for IPsec tests

 tools/testing/selftests/net/rtnetlink.sh | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

-- 
2.7.4



[PATCH net-next 1/2] selftests: rtnetlink: hide complaint from terminated monitor

2018-06-19 Thread Shannon Nelson
Set up the "ip xfrm monitor" subprogram so as to not see
a "Terminated" message when the subprogram is killed.

Fixes: 5e596ee171ba ("selftests: add xfrm state-policy-monitor to rtnetlink.sh")
Reported-by: Anders Roxell 
Signed-off-by: Shannon Nelson 
---
 tools/testing/selftests/net/rtnetlink.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tools/testing/selftests/net/rtnetlink.sh 
b/tools/testing/selftests/net/rtnetlink.sh
index 760faef..0a2bc6e 100755
--- a/tools/testing/selftests/net/rtnetlink.sh
+++ b/tools/testing/selftests/net/rtnetlink.sh
@@ -532,8 +532,7 @@ kci_test_ipsec()
 
# start the monitor in the background
tmpfile=`mktemp ipsectestXXX`
-   ip x m > $tmpfile &
-   mpid=$!
+   mpid=`(ip x m > $tmpfile & echo $!) 2>/dev/null`
sleep 0.2
 
ipsecid="proto esp src $srcip dst $dstip spi 0x07"
-- 
2.7.4



[PATCH] ixgbe: fix broken ipsec Rx with proper cast on spi

2018-05-31 Thread Shannon Nelson
Fix up a cast problem introduced by a sparse cleanup patch.  This fixes
a problem where the encrypted packets were not recognized on Rx and
subsequently dropped.

Fixes: 9cfbfa701b55 ("ixgbe: cleanup sparse warnings")
Signed-off-by: Shannon Nelson 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index e1c9762..344a1f2 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -663,7 +663,7 @@ static int ixgbe_ipsec_add_sa(struct xfrm_state *xs)
 
/* hash the new entry for faster search in Rx path */
hash_add_rcu(ipsec->rx_sa_list, >rx_tbl[sa_idx].hlist,
-(__force u64)rsa.xs->id.spi);
+(__force u32)rsa.xs->id.spi);
} else {
struct tx_sa tsa;
 
-- 
2.7.4



[PATCH v2] ixgbe: check ipsec ip addr against mgmt filters

2018-05-30 Thread Shannon Nelson
Make sure we don't try to offload the decryption of an incoming
packet that should get delivered to the management engine.  This
is a corner case that will likely be very seldom seen, but could
really confuse someone if they were to hit it.

Suggested-by: Jesse Brandeburg 
Signed-off-by: Shannon Nelson 
---
v2 - added the BMC IP check

 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 88 ++
 1 file changed, 88 insertions(+)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index 99b170f..e1c9762 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -445,6 +445,89 @@ static int ixgbe_ipsec_parse_proto_keys(struct xfrm_state 
*xs,
 }
 
 /**
+ * ixgbe_ipsec_check_mgmt_ip - make sure there is no clash with mgmt IP filters
+ * @xs: pointer to transformer state struct
+ **/
+static int ixgbe_ipsec_check_mgmt_ip(struct xfrm_state *xs)
+{
+   struct net_device *dev = xs->xso.dev;
+   struct ixgbe_adapter *adapter = netdev_priv(dev);
+   struct ixgbe_hw *hw = >hw;
+   u32 mfval, manc, reg;
+   int num_filters = 4;
+   bool manc_ipv4;
+   u32 bmcipval;
+   int i, j;
+
+#define MANC_EN_IPV4_FILTER  BIT(24)
+#define MFVAL_IPV4_FILTER_SHIFT  16
+#define MFVAL_IPV6_FILTER_SHIFT  24
+#define MIPAF_ARR(_m, _n)(IXGBE_MIPAF + ((_m) * 0x10) + ((_n) * 4))
+
+#define IXGBE_BMCIP(_n)  (0x5050 + ((_n) * 4))
+#define IXGBE_BMCIPVAL   0x5060
+#define BMCIP_V4 0x2
+#define BMCIP_V6 0x3
+#define BMCIP_MASK   0x3
+
+   manc = IXGBE_READ_REG(hw, IXGBE_MANC);
+   manc_ipv4 = !!(manc & MANC_EN_IPV4_FILTER);
+   mfval = IXGBE_READ_REG(hw, IXGBE_MFVAL);
+   bmcipval = IXGBE_READ_REG(hw, IXGBE_BMCIPVAL);
+
+   if (xs->props.family == AF_INET) {
+   /* are there any IPv4 filters to check? */
+   if (manc_ipv4) {
+   /* the 4 ipv4 filters are all in MIPAF(3, i) */
+   for (i = 0; i < num_filters; i++) {
+   if (!(mfval & BIT(MFVAL_IPV4_FILTER_SHIFT + i)))
+   continue;
+
+   reg = IXGBE_READ_REG(hw, MIPAF_ARR(3, i));
+   if (reg == xs->id.daddr.a4)
+   return 1;
+   }
+   }
+
+   if ((bmcipval & BMCIP_MASK) == BMCIP_V4) {
+   reg = IXGBE_READ_REG(hw, IXGBE_BMCIP(3));
+   if (reg == xs->id.daddr.a4)
+   return 1;
+   }
+
+   } else {
+   /* if there are ipv4 filters, they are in the last ipv6 slot */
+   if (manc_ipv4)
+   num_filters = 3;
+
+   for (i = 0; i < num_filters; i++) {
+   if (!(mfval & BIT(MFVAL_IPV6_FILTER_SHIFT + i)))
+   continue;
+
+   for (j = 0; j < 4; j++) {
+   reg = IXGBE_READ_REG(hw, MIPAF_ARR(i, j));
+   if (reg != xs->id.daddr.a6[j])
+   break;
+   }
+   if (j == 4)   /* did we match all 4 words? */
+   return 1;
+   }
+
+   if ((bmcipval & BMCIP_MASK) == BMCIP_V6) {
+   for (j = 0; j < 4; j++) {
+   reg = IXGBE_READ_REG(hw, IXGBE_BMCIP(j));
+   if (reg != xs->id.daddr.a6[j])
+   break;
+   }
+   if (j == 4)   /* did we match all 4 words? */
+   return 1;
+   }
+   }
+
+   return 0;
+}
+
+/**
  * ixgbe_ipsec_add_sa - program device with a security association
  * @xs: pointer to transformer state struct
  **/
@@ -465,6 +548,11 @@ static int ixgbe_ipsec_add_sa(struct xfrm_state *xs)
return -EINVAL;
}
 
+   if (ixgbe_ipsec_check_mgmt_ip(xs)) {
+   netdev_err(dev, "IPsec IP addr clash with mgmt filters\n");
+   return -EINVAL;
+   }
+
if (xs->xso.flags & XFRM_OFFLOAD_INBOUND) {
struct rx_sa rsa;
 
-- 
2.7.4



[PATCH] ixgbe: check ipsec ip addr against mgmt filter

2018-05-29 Thread Shannon Nelson
Make sure we don't try to offload the decryption of an incoming
packet that should get delivered to the management engine.  This
is a corner case that will likely be very seldom seen, but could
really confuse someone if they were to hit it.

Suggested-by: Jesse Brandeburg 
Signed-off-by: Shannon Nelson 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 64 ++
 1 file changed, 64 insertions(+)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index 99b170f..ea3b5fa 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -445,6 +445,65 @@ static int ixgbe_ipsec_parse_proto_keys(struct xfrm_state 
*xs,
 }
 
 /**
+ * ixgbe_ipsec_check_mgmt_ip - make sure there is no clash with mgmt IP filters
+ * @xs: pointer to transformer state struct
+ **/
+static int ixgbe_ipsec_check_mgmt_ip(struct xfrm_state *xs)
+{
+   struct net_device *dev = xs->xso.dev;
+   struct ixgbe_adapter *adapter = netdev_priv(dev);
+   struct ixgbe_hw *hw = >hw;
+   u32 mfval, manc, reg;
+   int num_filters = 4;
+   bool manc_ipv4;
+   int i, j;
+
+#define MANC_EN_IPV4_FILTER  BIT(24)
+#define MFVAL_IPV4_FILTER_SHIFT  16
+#define MFVAL_IPV6_FILTER_SHIFT  24
+#define MIPAF_ARR(_m, _n)(IXGBE_MIPAF + ((_m) * 0x10) + ((_n) * 4))
+
+   manc = IXGBE_READ_REG(hw, IXGBE_MANC);
+   manc_ipv4 = !!(manc & MANC_EN_IPV4_FILTER);
+   mfval = IXGBE_READ_REG(hw, IXGBE_MFVAL);
+
+   if (xs->props.family == AF_INET) {
+   /* are there any IPv4 filters to check? */
+   if (!manc_ipv4)
+   return 0;
+
+   /* the 4 ipv4 filters are all in MIPAF(3, i) */
+   for (i = 0; i < num_filters; i++) {
+   if (!(mfval & BIT(MFVAL_IPV4_FILTER_SHIFT + i)))
+   continue;
+
+   reg = IXGBE_READ_REG(hw, MIPAF_ARR(3, i));
+   if (reg == xs->id.daddr.a4)
+   return 1;
+   }
+   } else {
+   /* if there are ipv4 filters, they are in the last ipv6 slot */
+   if (manc_ipv4)
+   num_filters = 3;
+
+   for (i = 0; i < num_filters; i++) {
+   if (!(mfval & BIT(MFVAL_IPV6_FILTER_SHIFT + i)))
+   continue;
+
+   for (j = 0; j < 4; j++) {
+   reg = IXGBE_READ_REG(hw, MIPAF_ARR(i, j));
+   if (reg != xs->id.daddr.a6[j])
+   break;
+   }
+   if (j == 4)   /* did we match all 4 words? */
+   return 1;
+   }
+   }
+
+   return 0;
+}
+
+/**
  * ixgbe_ipsec_add_sa - program device with a security association
  * @xs: pointer to transformer state struct
  **/
@@ -465,6 +524,11 @@ static int ixgbe_ipsec_add_sa(struct xfrm_state *xs)
return -EINVAL;
}
 
+   if (ixgbe_ipsec_check_mgmt_ip(xs)) {
+   netdev_err(dev, "IPsec IP addr clash with mgmt filters\n");
+   return -EINVAL;
+   }
+
if (xs->xso.flags & XFRM_OFFLOAD_INBOUND) {
struct rx_sa rsa;
 
-- 
2.7.4



Re: [PATCH] ixgbe: fix memory leak on ipsec allocation

2018-05-09 Thread Shannon Nelson

On 5/9/2018 6:58 AM, Colin King wrote:

From: Colin Ian King <colin.k...@canonical.com>

The error clean up path kfree's adapter->ipsec and should be
instead kfree'ing ipsec. Fix this.  Also, the err1 error exit path
does not need to kfree ipsec because this failure path was for
the failed allocation of ipsec.

Detected by CoverityScan, CID#146424 ("Resource Leak")

Fixes: 63a67fe229ea ("ixgbe: add ipsec offload add and remove SA")
Signed-off-by: Colin Ian King <colin.k...@canonical.com>


Yep, thanks, good catch.

Acked-by: Shannon Nelson <shannon.nel...@oracle.com>



---
  drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index 41af2b81e960..195c0b65eee2 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -919,8 +919,8 @@ void ixgbe_init_ipsec_offload(struct ixgbe_adapter *adapter)
kfree(ipsec->ip_tbl);
kfree(ipsec->rx_tbl);
kfree(ipsec->tx_tbl);
+   kfree(ipsec);
  err1:
-   kfree(adapter->ipsec);
netdev_err(adapter->netdev, "Unable to allocate memory for SA tables");
  }
  



Re: [PATCH] ethtool: fix a potential missing-check bug

2018-04-30 Thread Shannon Nelson

On 4/29/2018 6:31 PM, Wenwen Wang wrote:

In ethtool_get_rxnfc(), the object "info" is firstly copied from
user-space. If the FLOW_RSS flag is set in the member field flow_type of
"info" (and cmd is ETHTOOL_GRXFH), info needs to be copied again from
user-space because FLOW_RSS is newer and has new definition, as mentioned
in the comment. However, given that the user data resides in user-space, a
malicious user can race to change the data after the first copy. By doing
so, the user can inject inconsistent data. For example, in the second
copy, the FLOW_RSS flag could be cleared in the field flow_type of "info".
In the following execution, "info" will be used in the function
ops->get_rxnfc(). Such inconsistent data can potentially lead to unexpected
information leakage since ops->get_rxnfc() will prepare various types of
data according to flow_type, and the prepared data will be eventually
copied to user-space. This inconsistent data may also cause undefined
behaviors based on how ops->get_rxnfc() is implemented.

This patch re-verifies the flow_type field of "info" after the second copy.
If the value is not as expected, an error code will be returned.

Signed-off-by: Wenwen Wang 
---
  net/core/ethtool.c | 2 ++
  1 file changed, 2 insertions(+)

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 03416e6..a121034 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1032,6 +1032,8 @@ static noinline_for_stack int ethtool_get_rxnfc(struct 
net_device *dev,
info_size = sizeof(info);
if (copy_from_user(, useraddr, info_size))
return -EFAULT;


You might add a comment here to explain why the second check; otherwise 
someone might come along later and remove this check as redundant code.


sln


+   if (!(info.flow_type & FLOW_RSS))
+   return -EINVAL;
}
  
  	if (info.cmd == ETHTOOL_GRXCLSRLALL) {




[PATCH ipsec-next] selftests: add xfrm state-policy-monitor to rtnetlink.sh

2018-04-12 Thread Shannon Nelson
Add a simple set of tests for the IPsec xfrm commands.

Signed-off-by: Shannon Nelson <shannon.nel...@oracle.com>
---
 tools/testing/selftests/net/rtnetlink.sh | 103 +++
 1 file changed, 103 insertions(+)

diff --git a/tools/testing/selftests/net/rtnetlink.sh 
b/tools/testing/selftests/net/rtnetlink.sh
index e6f4852..760faef 100755
--- a/tools/testing/selftests/net/rtnetlink.sh
+++ b/tools/testing/selftests/net/rtnetlink.sh
@@ -502,6 +502,108 @@ kci_test_macsec()
echo "PASS: macsec"
 }
 
+#---
+# Example commands
+#   ip x s add proto esp src 14.0.0.52 dst 14.0.0.70 \
+#spi 0x07 mode transport reqid 0x07 replay-window 32 \
+#aead 'rfc4106(gcm(aes))' 1234567890123456dcba 128 \
+#sel src 14.0.0.52/24 dst 14.0.0.70/24
+#   ip x p add dir out src 14.0.0.52/24 dst 14.0.0.70/24 \
+#tmpl proto esp src 14.0.0.52 dst 14.0.0.70 \
+#spi 0x07 mode transport reqid 0x07
+#
+# Subcommands not tested
+#ip x s update
+#ip x s allocspi
+#ip x s deleteall
+#ip x p update
+#ip x p deleteall
+#ip x p set
+#---
+kci_test_ipsec()
+{
+   srcip="14.0.0.52"
+   dstip="14.0.0.70"
+   algo="aead rfc4106(gcm(aes)) 0x3132333435363738393031323334353664636261 
128"
+
+   # flush to be sure there's nothing configured
+   ip x s flush ; ip x p flush
+   check_err $?
+
+   # start the monitor in the background
+   tmpfile=`mktemp ipsectestXXX`
+   ip x m > $tmpfile &
+   mpid=$!
+   sleep 0.2
+
+   ipsecid="proto esp src $srcip dst $dstip spi 0x07"
+   ip x s add $ipsecid \
+mode transport reqid 0x07 replay-window 32 \
+$algo sel src $srcip/24 dst $dstip/24
+   check_err $?
+
+   lines=`ip x s list | grep $srcip | grep $dstip | wc -l`
+   test $lines -eq 2
+   check_err $?
+
+   ip x s count | grep -q "SAD count 1"
+   check_err $?
+
+   lines=`ip x s get $ipsecid | grep $srcip | grep $dstip | wc -l`
+   test $lines -eq 2
+   check_err $?
+
+   ip x s delete $ipsecid
+   check_err $?
+
+   lines=`ip x s list | wc -l`
+   test $lines -eq 0
+   check_err $?
+
+   ipsecsel="dir out src $srcip/24 dst $dstip/24"
+   ip x p add $ipsecsel \
+   tmpl proto esp src $srcip dst $dstip \
+   spi 0x07 mode transport reqid 0x07
+   check_err $?
+
+   lines=`ip x p list | grep $srcip | grep $dstip | wc -l`
+   test $lines -eq 2
+   check_err $?
+
+   ip x p count | grep -q "SPD IN  0 OUT 1 FWD 0"
+   check_err $?
+
+   lines=`ip x p get $ipsecsel | grep $srcip | grep $dstip | wc -l`
+   test $lines -eq 2
+   check_err $?
+
+   ip x p delete $ipsecsel
+   check_err $?
+
+   lines=`ip x p list | wc -l`
+   test $lines -eq 0
+   check_err $?
+
+   # check the monitor results
+   kill $mpid
+   lines=`wc -l $tmpfile | cut "-d " -f1`
+   test $lines -eq 20
+   check_err $?
+   rm -rf $tmpfile
+
+   # clean up any leftovers
+   ip x s flush
+   check_err $?
+   ip x p flush
+   check_err $?
+
+   if [ $ret -ne 0 ]; then
+   echo "FAIL: ipsec"
+   return 1
+   fi
+   echo "PASS: ipsec"
+}
+
 kci_test_gretap()
 {
testns="testns"
@@ -755,6 +857,7 @@ kci_test_rtnl()
kci_test_vrf
kci_test_encap
kci_test_macsec
+   kci_test_ipsec
 
kci_del_dummy
 }
-- 
2.7.4



Re: [Intel-wired-lan] [iwl next-queue PATCH 02/10] macvlan: Rename fwd_priv to accel_priv and add accessor function

2018-04-04 Thread Shannon Nelson

On 4/3/2018 2:16 PM, Alexander Duyck wrote:

[...]
  
+static inline void *macvlan_accel_priv(struct net_device *dev)

+{
+   struct macvlan_dev *macvlan = netdev_priv(dev);
+
+   return macvlan->accel_priv;


Perhaps a check for (macvlan == NULL) before using it?
sln


+}
  #endif /* _LINUX_IF_MACVLAN_H */

___
Intel-wired-lan mailing list
intel-wired-...@osuosl.org
https://lists.osuosl.org/mailman/listinfo/intel-wired-lan



Re: [Intel-wired-lan] [iwl next-queue PATCH 03/10] macvlan: Use software path for offloaded local, broadcast, and multicast traffic

2018-04-04 Thread Shannon Nelson

On 4/3/2018 2:16 PM, Alexander Duyck wrote:

This change makes it so that we use a software path for packets that are
going to be locally switched between two macvlan interfaces on the same
device. In addition we resort to software replication of broadcast and
multicast packets instead of offloading that to hardware.

The general idea is that using the device for east/west traffic local to
the system is extremely inefficient. We can only support up to whatever the
PCIe limit is for any given device so this caps us at somewhere around 20G
for devices supported by ixgbe. This is compounded even further when you
take broadcast and multicast into account as a single 10G port can come to
a crawl as a packet is replicated up to 60+ times in some cases. In order
to get away from that I am implementing changes so that we handle
broadcast/multicast replication and east/west local traffic all in
software.

Signed-off-by: Alexander Duyck 
---


[...]

  
@@ -4225,6 +4226,13 @@ static void ixgbe_configure_virtualization(struct ixgbe_adapter *adapter)

vmdctl |= IXGBE_VT_CTL_REPLEN;
IXGBE_WRITE_REG(hw, IXGBE_VT_CTL, vmdctl);
  
+	/* accept untagged packets until a vlan tag is

+* specifically set for the VMDQ queue/pool
+*/
+   vmolr = IXGBE_VMOLR_AUPE;
+   while (pool--)
+   IXGBE_WRITE_REG(hw, IXGBE_VMOLR(VMDQ_P(pool)), vmolr);
+


It took me a bit to figure out what untagged packets have to do with 
macvlan config.  You might add to the comment that "no multicast or 
broadcast is configured for these queues".


The driver part of this might be separated into a follow-on patch to 
make it clearer that this is done as a consequence of the macvlan.c 
changes.  Or just roll them into your 4/10 patch.


sln


Re: [PATCH V5 net-next 06/14] net/tls: Add generic NIC offload infrastructure

2018-03-27 Thread Shannon Nelson

On 3/27/2018 4:56 PM, Saeed Mahameed wrote:

From: Ilya Lesokhin <il...@mellanox.com>

This patch adds a generic infrastructure to offload TLS crypto to a
network device. It enables the kernel TLS socket to skip encryption
and authentication operations on the transmit side of the data path.
Leaving those computationally expensive operations to the NIC.

The NIC offload infrastructure builds TLS records and pushes them to
the TCP layer just like the SW KTLS implementation and using the same API.
TCP segmentation is mostly unaffected. Currently the only exception is
that we prevent mixed SKBs where only part of the payload requires
offload. In the future we are likely to add a similar restriction
following a change cipher spec record.

The notable differences between SW KTLS and NIC offloaded TLS
implementations are as follows:
1. The offloaded implementation builds "plaintext TLS record", those
records contain plaintext instead of ciphertext and place holder bytes
instead of authentication tags.
2. The offloaded implementation maintains a mapping from TCP sequence
number to TLS records. Thus given a TCP SKB sent from a NIC offloaded
TLS socket, we can use the tls NIC offload infrastructure to obtain
enough context to encrypt the payload of the SKB.
A TLS record is released when the last byte of the record is ack'ed,
this is done through the new icsk_clean_acked callback.

The infrastructure should be extendable to support various NIC offload
implementations.  However it is currently written with the
implementation below in mind:
The NIC assumes that packets from each offloaded stream are sent as
plaintext and in-order. It keeps track of the TLS records in the TCP
stream. When a packet marked for offload is transmitted, the NIC
encrypts the payload in-place and puts authentication tags in the
relevant place holders.

The responsibility for handling out-of-order packets (i.e. TCP
retransmission, qdisc drops) falls on the netdev driver.

The netdev driver keeps track of the expected TCP SN from the NIC's
perspective.  If the next packet to transmit matches the expected TCP
SN, the driver advances the expected TCP SN, and transmits the packet
with TLS offload indication.

If the next packet to transmit does not match the expected TCP SN. The
driver calls the TLS layer to obtain the TLS record that includes the
TCP of the packet for transmission. Using this TLS record, the driver
posts a work entry on the transmit queue to reconstruct the NIC TLS
state required for the offload of the out-of-order packet. It updates
the expected TCP SN accordingly and transmits the now in-order packet.
The same queue is used for packet transmission and TLS context
reconstruction to avoid the need for flushing the transmit queue before
issuing the context reconstruction request.

Signed-off-by: Ilya Lesokhin <il...@mellanox.com>
Signed-off-by: Boris Pismenny <bor...@mellanox.com>
Signed-off-by: Aviad Yehezkel <avia...@mellanox.com>
Signed-off-by: Saeed Mahameed <sae...@mellanox.com>


Acked-by: Shannon Nelson <shannon.nel...@oracle.com>


---
  include/net/tls.h | 120 +--
  net/tls/Kconfig   |  10 +
  net/tls/Makefile  |   2 +
  net/tls/tls_device.c  | 759 ++
  net/tls/tls_device_fallback.c | 454 +
  net/tls/tls_main.c| 120 ---
  net/tls/tls_sw.c  | 132 
  7 files changed, 1476 insertions(+), 121 deletions(-)
  create mode 100644 net/tls/tls_device.c
  create mode 100644 net/tls/tls_device_fallback.c

diff --git a/include/net/tls.h b/include/net/tls.h
index 437a746300bf..0a8529e9ec21 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -57,21 +57,10 @@
  
  #define TLS_AAD_SPACE_SIZE		13
  
-struct tls_sw_context {

+struct tls_sw_context_tx {
struct crypto_aead *aead_send;
-   struct crypto_aead *aead_recv;
struct crypto_wait async_wait;
  
-	/* Receive context */

-   struct strparser strp;
-   void (*saved_data_ready)(struct sock *sk);
-   unsigned int (*sk_poll)(struct file *file, struct socket *sock,
-   struct poll_table_struct *wait);
-   struct sk_buff *recv_pkt;
-   u8 control;
-   bool decrypted;
-
-   /* Sending context */
char aad_space[TLS_AAD_SPACE_SIZE];
  
  	unsigned int sg_plaintext_size;

@@ -88,6 +77,50 @@ struct tls_sw_context {
struct scatterlist sg_aead_out[2];
  };
  
+struct tls_sw_context_rx {

+   struct crypto_aead *aead_recv;
+   struct crypto_wait async_wait;
+
+   struct strparser strp;
+   void (*saved_data_ready)(struct sock *sk);
+   unsigned int (*sk_poll)(struct file *file, struct socket *sock,
+   struct poll_table_struct *wait);
+   struct sk_buff *recv_pkt;
+   u8 control;
+   bool decrypted;
+};
+
+struct tls_record_info {
+   struct list_h

Re: [PATCH V3 net-next 06/14] net/tls: Add generic NIC offload infrastructure

2018-03-26 Thread Shannon Nelson

On 3/26/2018 12:10 AM, Boris Pismenny wrote:

Hi Shannon,

On 3/23/2018 11:21 PM, Shannon Nelson wrote:

On 3/22/2018 3:33 PM, Saeed Mahameed wrote:

From: Ilya Lesokhin <il...@mellanox.com>



Thanks, Boris, these sound fine.
sln



Re: [PATCH V3 net-next 06/14] net/tls: Add generic NIC offload infrastructure

2018-03-23 Thread Shannon Nelson

On 3/22/2018 3:33 PM, Saeed Mahameed wrote:

From: Ilya Lesokhin 

This patch adds a generic infrastructure to offload TLS crypto to a
network devices. It enables the kernel TLS socket to skip encryption


s/devices/device/


and authentication operations on the transmit side of the data path.
Leaving those computationally expensive operations to the NIC.

The NIC offload infrastructure builds TLS records and pushes them to
the TCP layer just like the SW KTLS implementation and using the same API.
TCP segmentation is mostly unaffected. Currently the only exception is
that we prevent mixed SKBs where only part of the payload requires
offload. In the future we are likely to add a similar restriction
following a change cipher spec record.

The notable differences between SW KTLS and NIC offloaded TLS
implementations are as follows:
1. The offloaded implementation builds "plaintext TLS record", those
records contain plaintext instead of ciphertext and place holder bytes
instead of authentication tags.
2. The offloaded implementation maintains a mapping from TCP sequence
number to TLS records. Thus given a TCP SKB sent from a NIC offloaded
TLS socket, we can use the tls NIC offload infrastructure to obtain
enough context to encrypt the payload of the SKB.
A TLS record is released when the last byte of the record is ack'ed,
this is done through the new icsk_clean_acked callback.

The infrastructure should be extendable to support various NIC offload
implementations.  However it is currently written with the
implementation below in mind:
The NIC assumes that packets from each offloaded stream are sent as
plaintext and in-order. It keeps track of the TLS records in the TCP
stream. When a packet marked for offload is transmitted, the NIC
encrypts the payload in-place and puts authentication tags in the
relevant place holders.

The responsibility for handling out-of-order packets (i.e. TCP
retransmission, qdisc drops) falls on the netdev driver.

The netdev driver keeps track of the expected TCP SN from the NIC's
perspective.  If the next packet to transmit matches the expected TCP
SN, the driver advances the expected TCP SN, and transmits the packet
with TLS offload indication.

If the next packet to transmit does not match the expected TCP SN. The
driver calls the TLS layer to obtain the TLS record that includes the
TCP of the packet for transmission. Using this TLS record, the driver
posts a work entry on the transmit queue to reconstruct the NIC TLS
state required for the offload of the out-of-order packet. It updates
the expected TCP SN accordingly and transmit the now in-order packet.


s/transmit/transmits/


The same queue is used for packet transmission and TLS context
reconstruction to avoid the need for flushing the transmit queue before
issuing the context reconstruction request.

Signed-off-by: Ilya Lesokhin 
Signed-off-by: Boris Pismenny 
Signed-off-by: Aviad Yehezkel 
Signed-off-by: Saeed Mahameed 
---
  include/net/tls.h |  73 +++-
  net/tls/Kconfig   |  10 +
  net/tls/Makefile  |   2 +
  net/tls/tls_device.c  | 756 ++
  net/tls/tls_device_fallback.c | 412 +++
  net/tls/tls_main.c|  33 +-
  6 files changed, 1279 insertions(+), 7 deletions(-)
  create mode 100644 net/tls/tls_device.c
  create mode 100644 net/tls/tls_device_fallback.c

diff --git a/include/net/tls.h b/include/net/tls.h
index 4913430ab807..4f6a6f98d62b 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -77,6 +77,37 @@ struct tls_sw_context {
struct scatterlist sg_aead_out[2];
  };
  
+struct tls_record_info {

+   struct list_head list;
+   u32 end_seq;
+   int len;
+   int num_frags;
+   skb_frag_t frags[MAX_SKB_FRAGS];
+};
+
+struct tls_offload_context {
+   struct crypto_aead *aead_send;
+   spinlock_t lock;/* protects records list */
+   struct list_head records_list;
+   struct tls_record_info *open_record;
+   struct tls_record_info *retransmit_hint;
+   u64 hint_record_sn;
+   u64 unacked_record_sn;
+
+   struct scatterlist sg_tx_data[MAX_SKB_FRAGS];
+   void (*sk_destruct)(struct sock *sk);
+   u8 driver_state[];
+   /* The TLS layer reserves room for driver specific state
+* Currently the belief is that there is not enough
+* driver specific state to justify another layer of indirection
+*/
+#define TLS_DRIVER_STATE_SIZE (max_t(size_t, 8, sizeof(void *)))
+};
+
+#define TLS_OFFLOAD_CONTEXT_SIZE   
\
+   (ALIGN(sizeof(struct tls_offload_context), sizeof(void *)) +   \
+TLS_DRIVER_STATE_SIZE)
+
  enum {
TLS_PENDING_CLOSED_RECORD
  };
@@ -87,6 +118,10 @@ struct tls_context {
struct tls12_crypto_info_aes_gcm_128 

[next-queue v2 3/4] ixgbe: no need for esp trailer if gso

2018-03-16 Thread Shannon Nelson
There is no need to calculate the trailer length if we're doing
a GSO/TSO, as there is no trailer added to the packet data.
Also, don't bother clearing the flags field as it was already
cleared earlier.

Signed-off-by: Shannon Nelson <shannon.nel...@oracle.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 37 +++---
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index f225452..5ddea43 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -774,11 +774,7 @@ int ixgbe_ipsec_tx(struct ixgbe_ring *tx_ring,
 
first->tx_flags |= IXGBE_TX_FLAGS_IPSEC | IXGBE_TX_FLAGS_CC;
 
-   itd->flags = 0;
if (xs->id.proto == IPPROTO_ESP) {
-   struct sk_buff *skb = first->skb;
-   int ret, authlen, trailerlen;
-   u8 padlen;
 
itd->flags |= IXGBE_ADVTXD_TUCMD_IPSEC_TYPE_ESP |
  IXGBE_ADVTXD_TUCMD_L4T_TCP;
@@ -790,19 +786,28 @@ int ixgbe_ipsec_tx(struct ixgbe_ring *tx_ring,
 * padlen bytes of padding.  This ends up not the same
 * as the static value found in xs->props.trailer_len (21).
 *
-* The "correct" way to get the auth length would be to use
-*authlen = crypto_aead_authsize(xs->data);
-* but since we know we only have one size to worry about
-* we can let the compiler use the constant and save us a
-* few CPU cycles.
+* ... but if we're doing GSO, don't bother as the stack
+* doesn't add a trailer for those.
 */
-   authlen = IXGBE_IPSEC_AUTH_BITS / 8;
-
-   ret = skb_copy_bits(skb, skb->len - (authlen + 2), , 1);
-   if (unlikely(ret))
-   return 0;
-   trailerlen = authlen + 2 + padlen;
-   itd->trailer_len = trailerlen;
+   if (!skb_is_gso(first->skb)) {
+   /* The "correct" way to get the auth length would be
+* to use
+*authlen = crypto_aead_authsize(xs->data);
+* but since we know we only have one size to worry
+* about * we can let the compiler use the constant
+* and save us a few CPU cycles.
+*/
+   const int authlen = IXGBE_IPSEC_AUTH_BITS / 8;
+   struct sk_buff *skb = first->skb;
+   u8 padlen;
+   int ret;
+
+   ret = skb_copy_bits(skb, skb->len - (authlen + 2),
+   , 1);
+   if (unlikely(ret))
+   return 0;
+   itd->trailer_len = authlen + 2 + padlen;
+   }
}
if (tsa->encrypt)
itd->flags |= IXGBE_ADVTXD_TUCMD_IPSEC_ENCRYPT_EN;
-- 
2.7.4



[next-queue v2 4/4] ixgbe: enable tso with ipsec offload

2018-03-16 Thread Shannon Nelson
Fix things up to support TSO offload in conjunction
with IPsec hw offload.  This raises throughput with
IPsec offload on to nearly line rate.

Signed-off-by: Shannon Nelson <shannon.nel...@oracle.com>
---
v2 updates from Alex's comments:
 - changed feature add from variable to #define
 - fixed a reverse christmas tree miss
 - GSO_PARTIAL'd the ipv4 header checksum
 - dropped an extra blank line


 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c |  9 +++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c  | 26 ++
 2 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index 5ddea43..68af127 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -929,8 +929,13 @@ void ixgbe_init_ipsec_offload(struct ixgbe_adapter 
*adapter)
ixgbe_ipsec_clear_hw_tables(adapter);
 
adapter->netdev->xfrmdev_ops = _xfrmdev_ops;
-   adapter->netdev->features |= NETIF_F_HW_ESP;
-   adapter->netdev->hw_enc_features |= NETIF_F_HW_ESP;
+
+#define IXGBE_ESP_FEATURES (NETIF_F_HW_ESP | \
+NETIF_F_HW_ESP_TX_CSUM | \
+NETIF_F_GSO_ESP)
+
+   adapter->netdev->features |= IXGBE_ESP_FEATURES;
+   adapter->netdev->hw_enc_features |= IXGBE_ESP_FEATURES;
 
return;
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index a54f3d8..b947435 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -7721,7 +7721,8 @@ static void ixgbe_service_task(struct work_struct *work)
 
 static int ixgbe_tso(struct ixgbe_ring *tx_ring,
 struct ixgbe_tx_buffer *first,
-u8 *hdr_len)
+u8 *hdr_len,
+struct ixgbe_ipsec_tx_data *itd)
 {
u32 vlan_macip_lens, type_tucmd, mss_l4len_idx;
struct sk_buff *skb = first->skb;
@@ -7735,6 +7736,7 @@ static int ixgbe_tso(struct ixgbe_ring *tx_ring,
unsigned char *hdr;
} l4;
u32 paylen, l4_offset;
+   u32 fceof_saidx = 0;
int err;
 
if (skb->ip_summed != CHECKSUM_PARTIAL)
@@ -7760,13 +7762,15 @@ static int ixgbe_tso(struct ixgbe_ring *tx_ring,
if (ip.v4->version == 4) {
unsigned char *csum_start = skb_checksum_start(skb);
unsigned char *trans_start = ip.hdr + (ip.v4->ihl * 4);
+   int len = csum_start - trans_start;
 
/* IP header will have to cancel out any data that
-* is not a part of the outer IP header
+* is not a part of the outer IP header, so set to
+* a reverse csum if needed, else init check to 0.
 */
-   ip.v4->check = csum_fold(csum_partial(trans_start,
- csum_start - trans_start,
- 0));
+   ip.v4->check = (skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) ?
+  csum_fold(csum_partial(trans_start,
+ len, 0)) : 0;
type_tucmd |= IXGBE_ADVTXD_TUCMD_IPV4;
 
ip.v4->tot_len = 0;
@@ -7797,12 +7801,15 @@ static int ixgbe_tso(struct ixgbe_ring *tx_ring,
mss_l4len_idx = (*hdr_len - l4_offset) << IXGBE_ADVTXD_L4LEN_SHIFT;
mss_l4len_idx |= skb_shinfo(skb)->gso_size << IXGBE_ADVTXD_MSS_SHIFT;
 
+   fceof_saidx |= itd->sa_idx;
+   type_tucmd |= itd->flags | itd->trailer_len;
+
/* vlan_macip_lens: HEADLEN, MACLEN, VLAN tag */
vlan_macip_lens = l4.hdr - ip.hdr;
vlan_macip_lens |= (ip.hdr - skb->data) << IXGBE_ADVTXD_MACLEN_SHIFT;
vlan_macip_lens |= first->tx_flags & IXGBE_TX_FLAGS_VLAN_MASK;
 
-   ixgbe_tx_ctxtdesc(tx_ring, vlan_macip_lens, 0, type_tucmd,
+   ixgbe_tx_ctxtdesc(tx_ring, vlan_macip_lens, fceof_saidx, type_tucmd,
  mss_l4len_idx);
 
return 1;
@@ -8493,7 +8500,7 @@ netdev_tx_t ixgbe_xmit_frame_ring(struct sk_buff *skb,
if (skb->sp && !ixgbe_ipsec_tx(tx_ring, first, _tx))
goto out_drop;
 #endif
-   tso = ixgbe_tso(tx_ring, first, _len);
+   tso = ixgbe_tso(tx_ring, first, _len, _tx);
if (tso < 0)
goto out_drop;
else if (!tso)
@@ -9902,8 +9909,11 @@ ixgbe_features_check(struct sk_buff *skb, struct 
net_device *dev,
 
/* We can only support IPV4 TSO in tunnels if we can mangle the
 * inner IP ID field, so strip TSO if MANGLEID is not supported.
+* IPsec offoad sets skb->encapsulation

[next-queue v2 1/4] ixgbe: no need for ipsec csum feature check

2018-03-16 Thread Shannon Nelson
With the patch
commit f8aa2696b4af ("esp: check the NETIF_F_HW_ESP_TX_CSUM bit before 
segmenting")
we no longer need to protect ourself from checksum
offload requests on IPsec packets, so we can remove
the check in our .ndo_features_check callback.

Signed-off-by: Shannon Nelson <shannon.nel...@oracle.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 8536942..153cd9e 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -9908,12 +9908,6 @@ ixgbe_features_check(struct sk_buff *skb, struct 
net_device *dev,
if (skb->encapsulation && !(features & NETIF_F_TSO_MANGLEID))
features &= ~NETIF_F_TSO;
 
-#ifdef CONFIG_XFRM_OFFLOAD
-   /* IPsec offload doesn't get along well with others *yet* */
-   if (skb->sp)
-   features &= ~(NETIF_F_TSO | NETIF_F_HW_CSUM);
-#endif
-
return features;
 }
 
-- 
2.7.4



[next-queue v2 0/4] ixgbe: Enable tso and checksum offload with ipsec

2018-03-16 Thread Shannon Nelson
This patchset fixes up the bits for supporting TSO and checksum
offload in conjunction with IPsec offload.  This brings the
throughput of a simple iperf test back up to nearly line rate.

v2: fixes in patch 4 from Alex's comments

Shannon Nelson (4):
  ixgbe: no need for ipsec csum feature check
  ixgbe: remove unneeded ipsec test in Tx path
  ixgbe: no need for esp trailer if gso
  ixgbe: enable tso with ipsec offload

 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 45 +++---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c  | 30 -
 2 files changed, 42 insertions(+), 33 deletions(-)

-- 
2.7.4



[next-queue v2 2/4] ixgbe: remove unneeded ipsec test in TX path

2018-03-16 Thread Shannon Nelson
Since the ipsec data fields will be zero anyway in the non-ipsec
case, we can remove the conditional jump.

Suggested-by: Alexander Duyck <alexander.du...@gmail.com>
Signed-off-by: Shannon Nelson <shannon.nel...@oracle.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 153cd9e..a54f3d8 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -7864,10 +7864,8 @@ static void ixgbe_tx_csum(struct ixgbe_ring *tx_ring,
vlan_macip_lens |= skb_network_offset(skb) << IXGBE_ADVTXD_MACLEN_SHIFT;
vlan_macip_lens |= first->tx_flags & IXGBE_TX_FLAGS_VLAN_MASK;
 
-   if (first->tx_flags & IXGBE_TX_FLAGS_IPSEC) {
-   fceof_saidx |= itd->sa_idx;
-   type_tucmd |= itd->flags | itd->trailer_len;
-   }
+   fceof_saidx |= itd->sa_idx;
+   type_tucmd |= itd->flags | itd->trailer_len;
 
ixgbe_tx_ctxtdesc(tx_ring, vlan_macip_lens, fceof_saidx, type_tucmd, 0);
 }
-- 
2.7.4



Re: [PATCH v2 06/15] ice: Initialize PF and setup miscellaneous interrupt

2018-03-16 Thread Shannon Nelson

On 3/15/2018 4:47 PM, Anirudh Venkataramanan wrote:

This patch continues the initialization flow as follows:

1) Allocate and initialize necessary fields (like vsi, num_alloc_vsi,
irq_tracker, etc) in the ice_pf instance.

2) Setup the miscellaneous interrupt handler. This also known as the
"other interrupt causes" (OIC) handler and is used to handle non
hotpath interrupts (like control queue events, link events,
exceptions, etc.

3) Implement a background task to process admin queue receive (ARQ)
events received by the driver.

CC: Shannon Nelson <shannon.nel...@oracle.com>
Signed-off-by: Anirudh Venkataramanan <anirudh.venkatarama...@intel.com>
---
v2: Removed reference to "lump" as suggested by Shannon Nelson.


Yep,
sln
Acked-by: Shannon Nelson <shannon.nel...@oracle.com>



Re: [PATCH v2 04/15] ice: Get switch config, scheduler config and device capabilities

2018-03-16 Thread Shannon Nelson

On 3/15/2018 4:47 PM, Anirudh Venkataramanan wrote:

This patch adds to the initialization flow by getting switch
configuration, scheduler configuration and device capabilities.

Switch configuration:
On boot, an L2 switch element is created in the firmware per physical
function. Each physical function is also mapped to a port, to which its
switch element is connected. In other words, this switch can be visualized
as an embedded vSwitch that can connect a physical functions's virtual
station interfaces (VSIs) to the egress/ingress port. Egress/ingress
filters will be eventually created and applied on this switch element.
As part of the initialization flow, the driver gets configuration data
from this switch element and stores it.

Scheduler configuration:
The Tx scheduler is a subsystem responsible for setting and enforcing QoS.
As part of the initialization flow, the driver queries and stores the
default scheduler configuration for the given physical function.

Device capabilities:
As part of initialization, the driver has to determine what the device is
capable of (ex. max queues, VSIs, etc). This information is obtained from
the firmware and stored by the driver.

CC: Shannon Nelson <shannon.nel...@oracle.com>
Signed-off-by: Anirudh Venkataramanan <anirudh.venkatarama...@intel.com>
---
v2: Addressed Shannon Nelson's review comment by changing retry count value
 to 2.


The fix looks fine.
sln

Acked-by: Shannon Nelson <shannon.nel...@oracle.com>


Re: [PATCH v2 07/15] ice: Add support for VSI allocation and deallocation

2018-03-16 Thread Shannon Nelson

On 3/15/2018 4:47 PM, Anirudh Venkataramanan wrote:

This patch introduces data structures and functions to alloc/free
VSIs. The driver represents a VSI using the ice_vsi structure.

Some noteworthy points about VSI allocation:

1) A VSI is allocated in the firmware using the "add VSI" admin queue
command (implemented as ice_aq_add_vsi). The firmware returns an
identifier for the allocated VSI. The VSI context is used to program
certain aspects (loopback, queue map, etc.) of the VSI's configuration.

2) A VSI is deleted using the "free VSI" admin queue command (implemented
as ice_aq_free_vsi).

3) The driver represents a VSI using struct ice_vsi. This is allocated
and initialized as part of the ice_vsi_alloc flow, and deallocated
as part of the ice_vsi_delete flow.

4) Once the VSI is created, a netdev is allocated and associated with it.
The VSI's ring and vector related data structures are also allocated
and initialized.

5) A VSI's queues can either be contiguous or scattered. To do this, the
driver maintains a bitmap (vsi->avail_txqs) which is kept in sync with
the firmware's VSI queue allocation imap. If the VSI can't get a
contiguous queue allocation, it will fallback to scatter. This is
implemented in ice_vsi_get_qs which is called as part of the VSI setup
flow. In the release flow, the VSI's queues are released and the bitmap
is updated to reflect this by ice_vsi_put_qs.

CC: Shannon Nelson <shannon.nel...@oracle.com>
Signed-off-by: Anirudh Venkataramanan <anirudh.venkatarama...@intel.com>
---
v2: Addressed Shannon Nelson's comments by
1) using a new define ICE_NO_VSI instead of the magic number 0x.
2) adding missing curly braces and break statements.

Also, ice_set_def_vsi_ctx was changed to ice_set_dflt_vsi_ctx for clarity.


Thanks,
sln
Acked-by: Shannon Nelson <shannon.nel...@oracle.com>



Re: [PATCH v2 03/15] ice: Start hardware initialization

2018-03-16 Thread Shannon Nelson

On 3/15/2018 4:47 PM, Anirudh Venkataramanan wrote:

This patch implements multiple pieces of the initialization flow
as follows:

1) A reset is issued to ensure a clean device state, followed
by initialization of admin queue interface.

2) Once the admin queue interface is up, clear the PF config
and transition the device to non-PXE mode.

3) Get the NVM configuration stored in the device's non-volatile
memory (NVM) using ice_init_nvm.

CC: Shannon Nelson <shannon.nel...@oracle.com>
Signed-off-by: Anirudh Venkataramanan <anirudh.venkatarama...@intel.com>
---
v2: Addressed Shannon Nelson's review comments by
 1) removing an unnecessary register write in ice_aq_clear_pxe_mode.
 2) adding a comment explaining the need to convert word sized values
to byte sized values.


These two changes look fine.

sln

Acked-by: Shannon Nelson <shannon.nel...@oracle.com>



Re: [Intel-wired-lan] [next-queue 4/4] ixgbe: enable tso with ipsec offload

2018-03-15 Thread Shannon Nelson

On 3/15/2018 3:03 PM, Alexander Duyck wrote:

On Thu, Mar 15, 2018 at 2:23 PM, Shannon Nelson
<shannon.nel...@oracle.com> wrote:

Fix things up to support TSO offload in conjunction
with IPsec hw offload.  This raises throughput with
IPsec offload on to nearly line rate.

Signed-off-by: Shannon Nelson <shannon.nel...@oracle.com>
---
  drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c |  7 +--
  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c  | 25 +++--
  2 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index 5ddea43..bfbcfc2 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -896,6 +896,7 @@ void ixgbe_ipsec_rx(struct ixgbe_ring *rx_ring,
  void ixgbe_init_ipsec_offload(struct ixgbe_adapter *adapter)
  {
 struct ixgbe_ipsec *ipsec;
+   netdev_features_t features;
 size_t size;

 if (adapter->hw.mac.type == ixgbe_mac_82598EB)
@@ -929,8 +930,10 @@ void ixgbe_init_ipsec_offload(struct ixgbe_adapter 
*adapter)
 ixgbe_ipsec_clear_hw_tables(adapter);

 adapter->netdev->xfrmdev_ops = _xfrmdev_ops;
-   adapter->netdev->features |= NETIF_F_HW_ESP;
-   adapter->netdev->hw_enc_features |= NETIF_F_HW_ESP;
+
+   features = NETIF_F_HW_ESP | NETIF_F_HW_ESP_TX_CSUM | NETIF_F_GSO_ESP;
+   adapter->netdev->features |= features;
+   adapter->netdev->hw_enc_features |= features;


Instead of adding the local variable you might just create a new
define that includes these 3 feature flags and then use that here. You
could use the way I did IXGBE_GSO_PARTIAL_FEATURES as an example.


 return;

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index a54f3d8..6022666 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -7721,9 +7721,11 @@ static void ixgbe_service_task(struct work_struct *work)

  static int ixgbe_tso(struct ixgbe_ring *tx_ring,
  struct ixgbe_tx_buffer *first,
-u8 *hdr_len)
+u8 *hdr_len,
+struct ixgbe_ipsec_tx_data *itd)
  {
 u32 vlan_macip_lens, type_tucmd, mss_l4len_idx;
+   u32 fceof_saidx = 0;
 struct sk_buff *skb = first->skb;


Reverse xmas tree this. It should probably be moved down to just past
the declaration of paylen and l4_offset.


 union {
 struct iphdr *v4;
@@ -7762,9 +7764,13 @@ static int ixgbe_tso(struct ixgbe_ring *tx_ring,
 unsigned char *trans_start = ip.hdr + (ip.v4->ihl * 4);

 /* IP header will have to cancel out any data that
-* is not a part of the outer IP header
+* is not a part of the outer IP header, except for
+* IPsec where we want the IP+ESP header.
  */
-   ip.v4->check = csum_fold(csum_partial(trans_start,
+   if (first->tx_flags & IXGBE_TX_FLAGS_IPSEC)
+   ip.v4->check = 0;
+   else
+   ip.v4->check = csum_fold(csum_partial(trans_start,
   csum_start - trans_start,
   0));
 type_tucmd |= IXGBE_ADVTXD_TUCMD_IPV4;


I would say this should be flipped like so:
ip.v4->check =  (skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) ?
  csum_fold(csum_partial(trans_start,
csum_start - trans_start, 0) : 0;


@@ -7797,12 +7803,15 @@ static int ixgbe_tso(struct ixgbe_ring *tx_ring,
 mss_l4len_idx = (*hdr_len - l4_offset) << IXGBE_ADVTXD_L4LEN_SHIFT;
 mss_l4len_idx |= skb_shinfo(skb)->gso_size << IXGBE_ADVTXD_MSS_SHIFT;

+   fceof_saidx |= itd->sa_idx;
+   type_tucmd |= itd->flags | itd->trailer_len;
+
 /* vlan_macip_lens: HEADLEN, MACLEN, VLAN tag */
 vlan_macip_lens = l4.hdr - ip.hdr;
 vlan_macip_lens |= (ip.hdr - skb->data) << IXGBE_ADVTXD_MACLEN_SHIFT;
 vlan_macip_lens |= first->tx_flags & IXGBE_TX_FLAGS_VLAN_MASK;

-   ixgbe_tx_ctxtdesc(tx_ring, vlan_macip_lens, 0, type_tucmd,
+   ixgbe_tx_ctxtdesc(tx_ring, vlan_macip_lens, fceof_saidx, type_tucmd,
   mss_l4len_idx);

 return 1;
@@ -8493,7 +8502,8 @@ netdev_tx_t ixgbe_xmit_frame_ring(struct sk_buff *skb,
 if (skb->sp && !ixgbe_ipsec_tx(tx_ring, first, _tx))
 goto out_drop;
  #endif
-   tso = ixgbe_tso(tx_ring, first, _len);
+
+   tso = ixgbe_tso(tx_ring, first, _len, _tx);
 if (tso < 0)
 goto out_drop;
 else if (!tso)


No need for the extr

[next-queue 4/4] ixgbe: enable tso with ipsec offload

2018-03-15 Thread Shannon Nelson
Fix things up to support TSO offload in conjunction
with IPsec hw offload.  This raises throughput with
IPsec offload on to nearly line rate.

Signed-off-by: Shannon Nelson <shannon.nel...@oracle.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c |  7 +--
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c  | 25 +++--
 2 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index 5ddea43..bfbcfc2 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -896,6 +896,7 @@ void ixgbe_ipsec_rx(struct ixgbe_ring *rx_ring,
 void ixgbe_init_ipsec_offload(struct ixgbe_adapter *adapter)
 {
struct ixgbe_ipsec *ipsec;
+   netdev_features_t features;
size_t size;
 
if (adapter->hw.mac.type == ixgbe_mac_82598EB)
@@ -929,8 +930,10 @@ void ixgbe_init_ipsec_offload(struct ixgbe_adapter 
*adapter)
ixgbe_ipsec_clear_hw_tables(adapter);
 
adapter->netdev->xfrmdev_ops = _xfrmdev_ops;
-   adapter->netdev->features |= NETIF_F_HW_ESP;
-   adapter->netdev->hw_enc_features |= NETIF_F_HW_ESP;
+
+   features = NETIF_F_HW_ESP | NETIF_F_HW_ESP_TX_CSUM | NETIF_F_GSO_ESP;
+   adapter->netdev->features |= features;
+   adapter->netdev->hw_enc_features |= features;
 
return;
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index a54f3d8..6022666 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -7721,9 +7721,11 @@ static void ixgbe_service_task(struct work_struct *work)
 
 static int ixgbe_tso(struct ixgbe_ring *tx_ring,
 struct ixgbe_tx_buffer *first,
-u8 *hdr_len)
+u8 *hdr_len,
+struct ixgbe_ipsec_tx_data *itd)
 {
u32 vlan_macip_lens, type_tucmd, mss_l4len_idx;
+   u32 fceof_saidx = 0;
struct sk_buff *skb = first->skb;
union {
struct iphdr *v4;
@@ -7762,9 +7764,13 @@ static int ixgbe_tso(struct ixgbe_ring *tx_ring,
unsigned char *trans_start = ip.hdr + (ip.v4->ihl * 4);
 
/* IP header will have to cancel out any data that
-* is not a part of the outer IP header
+* is not a part of the outer IP header, except for
+* IPsec where we want the IP+ESP header.
 */
-   ip.v4->check = csum_fold(csum_partial(trans_start,
+   if (first->tx_flags & IXGBE_TX_FLAGS_IPSEC)
+   ip.v4->check = 0;
+   else
+   ip.v4->check = csum_fold(csum_partial(trans_start,
  csum_start - trans_start,
  0));
type_tucmd |= IXGBE_ADVTXD_TUCMD_IPV4;
@@ -7797,12 +7803,15 @@ static int ixgbe_tso(struct ixgbe_ring *tx_ring,
mss_l4len_idx = (*hdr_len - l4_offset) << IXGBE_ADVTXD_L4LEN_SHIFT;
mss_l4len_idx |= skb_shinfo(skb)->gso_size << IXGBE_ADVTXD_MSS_SHIFT;
 
+   fceof_saidx |= itd->sa_idx;
+   type_tucmd |= itd->flags | itd->trailer_len;
+
/* vlan_macip_lens: HEADLEN, MACLEN, VLAN tag */
vlan_macip_lens = l4.hdr - ip.hdr;
vlan_macip_lens |= (ip.hdr - skb->data) << IXGBE_ADVTXD_MACLEN_SHIFT;
vlan_macip_lens |= first->tx_flags & IXGBE_TX_FLAGS_VLAN_MASK;
 
-   ixgbe_tx_ctxtdesc(tx_ring, vlan_macip_lens, 0, type_tucmd,
+   ixgbe_tx_ctxtdesc(tx_ring, vlan_macip_lens, fceof_saidx, type_tucmd,
  mss_l4len_idx);
 
return 1;
@@ -8493,7 +8502,8 @@ netdev_tx_t ixgbe_xmit_frame_ring(struct sk_buff *skb,
if (skb->sp && !ixgbe_ipsec_tx(tx_ring, first, _tx))
goto out_drop;
 #endif
-   tso = ixgbe_tso(tx_ring, first, _len);
+
+   tso = ixgbe_tso(tx_ring, first, _len, _tx);
if (tso < 0)
goto out_drop;
else if (!tso)
@@ -9902,8 +9912,11 @@ ixgbe_features_check(struct sk_buff *skb, struct 
net_device *dev,
 
/* We can only support IPV4 TSO in tunnels if we can mangle the
 * inner IP ID field, so strip TSO if MANGLEID is not supported.
+* IPsec offoad sets skb->encapsulation but still can handle
+* the TSO, so it's the exception.
 */
-   if (skb->encapsulation && !(features & NETIF_F_TSO_MANGLEID))
+   if (skb->encapsulation && !(features & NETIF_F_TSO_MANGLEID) &&
+   !skb->sp)
features &= ~NETIF_F_TSO;
 
return features;
-- 
2.7.4



[next-queue 0/4] ixgbe: Enable tso and checksum offload with ipsec

2018-03-15 Thread Shannon Nelson
This patchset fixes up the bits for supporting TSO and checksum
offload in conjunction with IPsec offload.  This brings the
throughput of a simple iperf test back up to nearly line rate.

Shannon Nelson (4):
  ixgbe: no need for ipsec csum feature check
  ixgbe: remove unneeded ipsec test in Tx path
  ixgbe: no need for esp trailer if gso
  ixgbe: enable tso with ipsec offload

 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 45 +++---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c  | 30 -
 2 files changed, 42 insertions(+), 33 deletions(-)

-- 
2.7.4



[next-queue 3/4] ixgbe: no need for esp trailer if gso

2018-03-15 Thread Shannon Nelson
There is no need to calculate the trailer length if we're doing
a GSO/TSO, as there is no trailer added to the packet data.
Also, don't bother clearing the flags field as it was already
cleared earlier.

Signed-off-by: Shannon Nelson <shannon.nel...@oracle.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 37 +++---
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index f225452..5ddea43 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -774,11 +774,7 @@ int ixgbe_ipsec_tx(struct ixgbe_ring *tx_ring,
 
first->tx_flags |= IXGBE_TX_FLAGS_IPSEC | IXGBE_TX_FLAGS_CC;
 
-   itd->flags = 0;
if (xs->id.proto == IPPROTO_ESP) {
-   struct sk_buff *skb = first->skb;
-   int ret, authlen, trailerlen;
-   u8 padlen;
 
itd->flags |= IXGBE_ADVTXD_TUCMD_IPSEC_TYPE_ESP |
  IXGBE_ADVTXD_TUCMD_L4T_TCP;
@@ -790,19 +786,28 @@ int ixgbe_ipsec_tx(struct ixgbe_ring *tx_ring,
 * padlen bytes of padding.  This ends up not the same
 * as the static value found in xs->props.trailer_len (21).
 *
-* The "correct" way to get the auth length would be to use
-*authlen = crypto_aead_authsize(xs->data);
-* but since we know we only have one size to worry about
-* we can let the compiler use the constant and save us a
-* few CPU cycles.
+* ... but if we're doing GSO, don't bother as the stack
+* doesn't add a trailer for those.
 */
-   authlen = IXGBE_IPSEC_AUTH_BITS / 8;
-
-   ret = skb_copy_bits(skb, skb->len - (authlen + 2), , 1);
-   if (unlikely(ret))
-   return 0;
-   trailerlen = authlen + 2 + padlen;
-   itd->trailer_len = trailerlen;
+   if (!skb_is_gso(first->skb)) {
+   /* The "correct" way to get the auth length would be
+* to use
+*authlen = crypto_aead_authsize(xs->data);
+* but since we know we only have one size to worry
+* about * we can let the compiler use the constant
+* and save us a few CPU cycles.
+*/
+   const int authlen = IXGBE_IPSEC_AUTH_BITS / 8;
+   struct sk_buff *skb = first->skb;
+   u8 padlen;
+   int ret;
+
+   ret = skb_copy_bits(skb, skb->len - (authlen + 2),
+   , 1);
+   if (unlikely(ret))
+   return 0;
+   itd->trailer_len = authlen + 2 + padlen;
+   }
}
if (tsa->encrypt)
itd->flags |= IXGBE_ADVTXD_TUCMD_IPSEC_ENCRYPT_EN;
-- 
2.7.4



[next-queue 2/4] ixgbe: remove unneeded ipsec test in TX path

2018-03-15 Thread Shannon Nelson
Since the ipsec data fields will be zero anyway in the non-ipsec
case, we can remove the conditional jump.

Suggested-by: Alexander Duyck <alexander.du...@gmail.com>
Signed-off-by: Shannon Nelson <shannon.nel...@oracle.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 153cd9e..a54f3d8 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -7864,10 +7864,8 @@ static void ixgbe_tx_csum(struct ixgbe_ring *tx_ring,
vlan_macip_lens |= skb_network_offset(skb) << IXGBE_ADVTXD_MACLEN_SHIFT;
vlan_macip_lens |= first->tx_flags & IXGBE_TX_FLAGS_VLAN_MASK;
 
-   if (first->tx_flags & IXGBE_TX_FLAGS_IPSEC) {
-   fceof_saidx |= itd->sa_idx;
-   type_tucmd |= itd->flags | itd->trailer_len;
-   }
+   fceof_saidx |= itd->sa_idx;
+   type_tucmd |= itd->flags | itd->trailer_len;
 
ixgbe_tx_ctxtdesc(tx_ring, vlan_macip_lens, fceof_saidx, type_tucmd, 0);
 }
-- 
2.7.4



[next-queue 1/4] ixgbe: no need for ipsec csum feature check

2018-03-15 Thread Shannon Nelson
With the patch
commit f8aa2696b4af ("esp: check the NETIF_F_HW_ESP_TX_CSUM bit before 
segmenting")
we no longer need to protect ourself from checksum
offload requests on IPsec packets, so we can remove
the check in our .ndo_features_check callback.

Signed-off-by: Shannon Nelson <shannon.nel...@oracle.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 8536942..153cd9e 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -9908,12 +9908,6 @@ ixgbe_features_check(struct sk_buff *skb, struct 
net_device *dev,
if (skb->encapsulation && !(features & NETIF_F_TSO_MANGLEID))
features &= ~NETIF_F_TSO;
 
-#ifdef CONFIG_XFRM_OFFLOAD
-   /* IPsec offload doesn't get along well with others *yet* */
-   if (skb->sp)
-   features &= ~(NETIF_F_TSO | NETIF_F_HW_CSUM);
-#endif
-
return features;
 }
 
-- 
2.7.4



Re: [Intel-wired-lan] [PATCH 03/15] ice: Start hardware initialization

2018-03-15 Thread Shannon Nelson

On 3/14/2018 3:05 PM, Venkataramanan, Anirudh wrote:

On Mon, 2018-03-12 at 19:05 -0700, Shannon Nelson wrote:

On 3/9/2018 9:21 AM, Anirudh Venkataramanan wrote:




+
+/**
+ * ice_read_sr_aq - Read Shadow RAM.
+ * @hw: pointer to the HW structure
+ * @offset: offset in words from module start
+ * @words: number of words to read
+ * @data: buffer for words reads from Shadow RAM
+ * @last_command: tells the AdminQ that this is the last command
+ *
+ * Reads 16-bit word buffers from the Shadow RAM using the admin
command.
+ */
+static enum ice_status
+ice_read_sr_aq(struct ice_hw *hw, u32 offset, u16 words, u16
*data,
+  bool last_command)
+{
+   enum ice_status status;
+
+   status = ice_check_sr_access_params(hw, offset, words);
+   if (!status)
+   status = ice_aq_read_nvm(hw, 0, 2 * offset, 2 *
words, data,


Why the doubling of offset and words?  If this is some general
adjustment made for the AQ interface, it should be made in
ice_aq_read_nvm().  If not, then some explanation is needed here.


ice_read_sr_aq expects a word offset and size in words. The
ice_aq_read_nvm interface expects offset and size in bytes. The
doubling is a conversion from word offset/size to byte offset/size.


In that case, this might be a good place for a small comment for readers 
like me who don't have the spec available.


sln



Re: [Intel-wired-lan] [PATCH 07/15] ice: Add support for VSI allocation and deallocation

2018-03-12 Thread Shannon Nelson

On 3/9/2018 9:21 AM, Anirudh Venkataramanan wrote:

This patch introduces data structures and functions to alloc/free
VSIs. The driver represents a VSI using the ice_vsi structure.

Some noteworthy points about VSI allocation:

1) A VSI is allocated in the firmware using the "add VSI" admin queue
command (implemented as ice_aq_add_vsi). The firmware returns an
identifier for the allocated VSI. The VSI context is used to program
certain aspects (loopback, queue map, etc.) of the VSI's configuration.

2) A VSI is deleted using the "free VSI" admin queue command (implemented
as ice_aq_free_vsi).

3) The driver represents a VSI using struct ice_vsi. This is allocated
and initialized as part of the ice_vsi_alloc flow, and deallocated
as part of the ice_vsi_delete flow.

4) Once the VSI is created, a netdev is allocated and associated with it.
The VSI's ring and vector related data structures are also allocated
and initialized.

5) A VSI's queues can either be contiguous or scattered. To do this, the
driver maintains a bitmap (vsi->avail_txqs) which is kept in sync with
the firmware's VSI queue allocation imap. If the VSI can't get a
contiguous queue allocation, it will fallback to scatter. This is
implemented in ice_vsi_get_qs which is called as part of the VSI setup
flow. In the release flow, the VSI's queues are released and the bitmap
is updated to reflect this by ice_vsi_put_qs.

Signed-off-by: Anirudh Venkataramanan 
---
  drivers/net/ethernet/intel/ice/ice.h|   71 ++
  drivers/net/ethernet/intel/ice/ice_adminq_cmd.h |  199 
  drivers/net/ethernet/intel/ice/ice_main.c   | 1108 +++
  drivers/net/ethernet/intel/ice/ice_switch.c |  115 +++
  drivers/net/ethernet/intel/ice/ice_switch.h |   21 +
  drivers/net/ethernet/intel/ice/ice_txrx.h   |   26 +
  drivers/net/ethernet/intel/ice/ice_type.h   |4 +
  7 files changed, 1544 insertions(+)

diff --git a/drivers/net/ethernet/intel/ice/ice.h 
b/drivers/net/ethernet/intel/ice/ice.h
index c8079c852a48..b169f3751cc9 100644
--- a/drivers/net/ethernet/intel/ice/ice.h
+++ b/drivers/net/ethernet/intel/ice/ice.h
@@ -25,6 +25,8 @@
  #include 
  #include 
  #include 
+#include 
+#include 
  #include 
  #include 
  #include 
@@ -32,6 +34,7 @@
  #include 
  #include 
  #include 
+#include 
  #include 
  #include "ice_devids.h"
  #include "ice_type.h"
@@ -41,17 +44,42 @@
  #include "ice_sched.h"
  
  #define ICE_BAR0		0

+#define ICE_DFLT_NUM_DESC  128
+#define ICE_REQ_DESC_MULTIPLE  32
  #define ICE_INT_NAME_STR_LEN  (IFNAMSIZ + 16)
  #define ICE_AQ_LEN64
  #define ICE_MIN_MSIX  2
  #define ICE_MAX_VSI_ALLOC 130
  #define ICE_MAX_TXQS  2048
  #define ICE_MAX_RXQS  2048
+#define ICE_VSI_MAP_CONTIG 0
+#define ICE_VSI_MAP_SCATTER1
+#define ICE_MAX_SCATTER_TXQS   16
+#define ICE_MAX_SCATTER_RXQS   16
  #define ICE_RES_VALID_BIT 0x8000
  #define ICE_RES_MISC_VEC_ID   (ICE_RES_VALID_BIT - 1)
+#define ICE_INVAL_Q_INDEX  0x
  
  #define ICE_DFLT_NETIF_M (NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK)
  
+#define ICE_MAX_MTU	(ICE_AQ_SET_MAC_FRAME_SIZE_MAX - \

+ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN)
+
+#define ICE_UP_TABLE_TRANSLATE(val, i) \
+   (((val) << ICE_AQ_VSI_UP_TABLE_UP##i##_S) & \
+ ICE_AQ_VSI_UP_TABLE_UP##i##_M)
+
+struct ice_tc_info {
+   u16 qoffset;
+   u16 qcount;
+};
+
+struct ice_tc_cfg {
+   u8 numtc; /* Total number of enabled TCs */
+   u8 ena_tc; /* TX map */
+   struct ice_tc_info tc_info[ICE_MAX_TRAFFIC_CLASS];
+};
+
  struct ice_res_tracker {
u16 num_entries;
u16 search_hint;
@@ -75,8 +103,47 @@ enum ice_state {
  /* struct that defines a VSI, associated with a dev */
  struct ice_vsi {
struct net_device *netdev;
+   struct ice_sw *vsw;  /* switch this VSI is on */
+   struct ice_pf *back; /* back pointer to PF */
struct ice_port_info *port_info; /* back pointer to port_info */
+   struct ice_ring **rx_rings;  /* rx ring array */
+   struct ice_ring **tx_rings;  /* tx ring array */
+   struct ice_q_vector **q_vectors; /* q_vector array */
+   DECLARE_BITMAP(state, __ICE_STATE_NBITS);
+   int num_q_vectors;
+   int base_vector;
+   enum ice_vsi_type type;
u16 vsi_num; /* HW (absolute) index of this VSI */
+   u16 idx; /* software index in pf->vsi[] */
+
+   /* Interrupt thresholds */
+   u16 work_lmt;
+
+   struct ice_aqc_vsi_props info;   /* VSI properties */
+
+   /* queue information */
+   u8 tx_mapping_mode;  /* ICE_MAP_MODE_[CONTIG|SCATTER] */
+   u8 rx_mapping_mode;  /* ICE_MAP_MODE_[CONTIG|SCATTER] */
+   u16 txq_map[ICE_MAX_TXQS];   /* index in pf->avail_txqs */
+ 

Re: [Intel-wired-lan] [PATCH 04/15] ice: Get switch config, scheduler config and device capabilities

2018-03-12 Thread Shannon Nelson

On 3/9/2018 9:21 AM, Anirudh Venkataramanan wrote:

This patch adds to the initialization flow by getting switch
configuration, scheduler configuration and device capabilities.

Switch configuration:
On boot, an L2 switch element is created in the firmware per physical
function. Each physical function is also mapped to a port, to which its
switch element is connected. In other words, this switch can be visualized
as an embedded vSwitch that can connect a physical functions's virtual
station interfaces (VSIs) to the egress/ingress port. Egress/ingress
filters will be eventually created and applied on this switch element.
As part of the initialization flow, the driver gets configuration data
from this switch element and stores it.

Scheduler configuration:
The Tx scheduler is a subsystem responsible for setting and enforcing QoS.
As part of the initialization flow, the driver queries and stores the
default scheduler configuration for the given physical function.

Device capabilities:
As part of initialization, the driver has to determine what the device is
capable of (ex. max queues, VSIs, etc). This information is obtained from
the firmware and stored by the driver.

Signed-off-by: Anirudh Venkataramanan 
---
  drivers/net/ethernet/intel/ice/Makefile |   4 +-
  drivers/net/ethernet/intel/ice/ice.h|   2 +
  drivers/net/ethernet/intel/ice/ice_adminq_cmd.h | 209 ++
  drivers/net/ethernet/intel/ice/ice_common.c | 223 +++
  drivers/net/ethernet/intel/ice/ice_common.h |   2 +
  drivers/net/ethernet/intel/ice/ice_sched.c  | 354 
  drivers/net/ethernet/intel/ice/ice_sched.h  |  42 +++
  drivers/net/ethernet/intel/ice/ice_switch.c | 158 +++
  drivers/net/ethernet/intel/ice/ice_switch.h |  28 ++
  drivers/net/ethernet/intel/ice/ice_type.h   | 109 
  10 files changed, 1130 insertions(+), 1 deletion(-)
  create mode 100644 drivers/net/ethernet/intel/ice/ice_sched.c
  create mode 100644 drivers/net/ethernet/intel/ice/ice_sched.h
  create mode 100644 drivers/net/ethernet/intel/ice/ice_switch.c
  create mode 100644 drivers/net/ethernet/intel/ice/ice_switch.h

diff --git a/drivers/net/ethernet/intel/ice/Makefile 
b/drivers/net/ethernet/intel/ice/Makefile
index 373d481dbb25..809d85c04398 100644
--- a/drivers/net/ethernet/intel/ice/Makefile
+++ b/drivers/net/ethernet/intel/ice/Makefile
@@ -27,4 +27,6 @@ obj-$(CONFIG_ICE) += ice.o
  ice-y := ice_main.o   \
 ice_controlq.o \
 ice_common.o   \
-ice_nvm.o
+ice_nvm.o  \
+ice_switch.o   \
+ice_sched.o
diff --git a/drivers/net/ethernet/intel/ice/ice.h 
b/drivers/net/ethernet/intel/ice/ice.h
index ab2800c31906..f6e3339591bb 100644
--- a/drivers/net/ethernet/intel/ice/ice.h
+++ b/drivers/net/ethernet/intel/ice/ice.h
@@ -30,7 +30,9 @@
  #include 
  #include "ice_devids.h"
  #include "ice_type.h"
+#include "ice_switch.h"
  #include "ice_common.h"
+#include "ice_sched.h"
  
  #define ICE_BAR0		0

  #define ICE_AQ_LEN64
diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h 
b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
index 05b22a1ffd70..66a3f41df673 100644
--- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
@@ -22,6 +22,8 @@
   * descriptor format.  It is shared between Firmware and Software.
   */
  
+#define ICE_AQC_TOPO_MAX_LEVEL_NUM	0x9

+
  struct ice_aqc_generic {
__le32 param0;
__le32 param1;
@@ -82,6 +84,40 @@ struct ice_aqc_req_res {
u8 reserved[2];
  };
  
+/* Get function capabilities (indirect 0x000A)

+ * Get device capabilities (indirect 0x000B)
+ */
+struct ice_aqc_list_caps {
+   u8 cmd_flags;
+   u8 pf_index;
+   u8 reserved[2];
+   __le32 count;
+   __le32 addr_high;
+   __le32 addr_low;
+};
+
+/* Device/Function buffer entry, repeated per reported capability */
+struct ice_aqc_list_caps_elem {
+   __le16 cap;
+#define ICE_AQC_CAPS_VSI   0x0017
+#define ICE_AQC_CAPS_RSS   0x0040
+#define ICE_AQC_CAPS_RXQS  0x0041
+#define ICE_AQC_CAPS_TXQS  0x0042
+#define ICE_AQC_CAPS_MSIX  0x0043
+#define ICE_AQC_CAPS_MAX_MTU   0x0047
+
+   u8 major_ver;
+   u8 minor_ver;
+   /* Number of resources described by this capability */
+   __le32 number;
+   /* Only meaningful for some types of resources */
+   __le32 logical_id;
+   /* Only meaningful for some types of resources */
+   __le32 phys_id;
+   __le64 rsvd1;
+   __le64 rsvd2;
+};
+
  /* Clear PXE Command and response (direct 0x0110) */
  struct ice_aqc_clear_pxe {
u8 rx_cnt;
@@ -89,6 +125,161 @@ struct ice_aqc_clear_pxe {
u8 reserved[15];
  };
  
+/* Get switch configuration (0x0200) */

Re: [Intel-wired-lan] [PATCH 06/15] ice: Initialize PF and setup miscellaneous interrupt

2018-03-12 Thread Shannon Nelson

On 3/9/2018 9:21 AM, Anirudh Venkataramanan wrote:

This patch continues the initialization flow as follows:

1) Allocate and initialize necessary fields (like vsi, num_alloc_vsi,
irq_tracker, etc) in the ice_pf instance.

2) Setup the miscellaneous interrupt handler. This also known as the
"other interrupt causes" (OIC) handler and is used to handle non
hotpath interrupts (like control queue events, link events,
exceptions, etc.

3) Implement a background task to process admin queue receive (ARQ)
events received by the driver.

Signed-off-by: Anirudh Venkataramanan 
---
  drivers/net/ethernet/intel/ice/ice.h|  84 +++
  drivers/net/ethernet/intel/ice/ice_adminq_cmd.h |   2 +
  drivers/net/ethernet/intel/ice/ice_common.c |   6 +
  drivers/net/ethernet/intel/ice/ice_common.h |   3 +
  drivers/net/ethernet/intel/ice/ice_controlq.c   | 101 
  drivers/net/ethernet/intel/ice/ice_controlq.h   |   8 +
  drivers/net/ethernet/intel/ice/ice_hw_autogen.h |  63 +++
  drivers/net/ethernet/intel/ice/ice_main.c   | 719 +++-
  drivers/net/ethernet/intel/ice/ice_txrx.h   |  43 ++
  drivers/net/ethernet/intel/ice/ice_type.h   |  11 +
  10 files changed, 1039 insertions(+), 1 deletion(-)
  create mode 100644 drivers/net/ethernet/intel/ice/ice_txrx.h

diff --git a/drivers/net/ethernet/intel/ice/ice.h 
b/drivers/net/ethernet/intel/ice/ice.h
index 9681e971bcab..c8079c852a48 100644
--- a/drivers/net/ethernet/intel/ice/ice.h
+++ b/drivers/net/ethernet/intel/ice/ice.h
@@ -26,29 +26,113 @@
  #include 
  #include 
  #include 
+#include 
  #include 
+#include 
+#include 
  #include 
  #include 
+#include 
  #include "ice_devids.h"
  #include "ice_type.h"
+#include "ice_txrx.h"
  #include "ice_switch.h"
  #include "ice_common.h"
  #include "ice_sched.h"
  
  #define ICE_BAR0		0

+#define ICE_INT_NAME_STR_LEN   (IFNAMSIZ + 16)
  #define ICE_AQ_LEN64
+#define ICE_MIN_MSIX   2
+#define ICE_MAX_VSI_ALLOC  130
+#define ICE_MAX_TXQS   2048
+#define ICE_MAX_RXQS   2048
+#define ICE_RES_VALID_BIT  0x8000
+#define ICE_RES_MISC_VEC_ID(ICE_RES_VALID_BIT - 1)
  
  #define ICE_DFLT_NETIF_M (NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK)
  
+struct ice_res_tracker {

+   u16 num_entries;
+   u16 search_hint;
+   u16 list[1];
+};
+
+struct ice_sw {
+   struct ice_pf *pf;
+   u16 sw_id;  /* switch ID for this switch */
+   u16 bridge_mode;/* VEB/VEPA/Port Virtualizer */
+};
+
  enum ice_state {
__ICE_DOWN,
+   __ICE_PFR_REQ,  /* set by driver and peers */
+   __ICE_ADMINQ_EVENT_PENDING,
+   __ICE_SERVICE_SCHED,
__ICE_STATE_NBITS   /* must be last */
  };
  
+/* struct that defines a VSI, associated with a dev */

+struct ice_vsi {
+   struct net_device *netdev;
+   struct ice_port_info *port_info; /* back pointer to port_info */
+   u16 vsi_num; /* HW (absolute) index of this VSI */
+} cacheline_internodealigned_in_smp;
+
+enum ice_pf_flags {
+   ICE_FLAG_MSIX_ENA,
+   ICE_FLAG_FLTR_SYNC,
+   ICE_FLAG_RSS_ENA,
+   ICE_PF_FLAGS_NBITS  /* must be last */
+};
+
  struct ice_pf {
struct pci_dev *pdev;
+   struct msix_entry *msix_entries;
+   struct ice_res_tracker *irq_tracker;
+   struct ice_vsi **vsi;   /* VSIs created by the driver */
+   struct ice_sw *first_sw;/* first switch created by firmware */
DECLARE_BITMAP(state, __ICE_STATE_NBITS);
+   DECLARE_BITMAP(avail_txqs, ICE_MAX_TXQS);
+   DECLARE_BITMAP(avail_rxqs, ICE_MAX_RXQS);
+   DECLARE_BITMAP(flags, ICE_PF_FLAGS_NBITS);
+   unsigned long serv_tmr_period;
+   unsigned long serv_tmr_prev;
+   struct timer_list serv_tmr;
+   struct work_struct serv_task;
+   struct mutex avail_q_mutex; /* protects access to avail_[rx|tx]qs */
+   struct mutex sw_mutex;  /* lock for protecting VSI alloc flow */
u32 msg_enable;
+   u32 oicr_idx;   /* Other interrupt cause vector index */
+   u32 num_lan_msix;   /* Total MSIX vectors for base driver */
+   u32 num_avail_msix; /* remaining MSIX vectors left unclaimed */
+   u16 num_lan_tx; /* num lan tx queues setup */
+   u16 num_lan_rx; /* num lan rx queues setup */
+   u16 q_left_tx;  /* remaining num tx queues left unclaimed */
+   u16 q_left_rx;  /* remaining num rx queues left unclaimed */
+   u16 next_vsi;   /* Next free slot in pf->vsi[] - 0-based! */
+   u16 num_alloc_vsi;
+
struct ice_hw hw;
+   char int_name[ICE_INT_NAME_STR_LEN];
  };
+
+/**
+ * ice_irq_dynamic_ena - Enable default interrupt generation settings
+ * @hw: pointer to hw struct
+ */
+static inline void ice_irq_dynamic_ena(struct ice_hw *hw)
+{
+   u32 vector = 

Re: [Intel-wired-lan] [PATCH 02/15] ice: Add support for control queues

2018-03-12 Thread Shannon Nelson

On 3/9/2018 9:21 AM, Anirudh Venkataramanan wrote:

A control queue is a hardware interface which is used by the driver
to interact with other subsystems (like firmware, PHY, etc.). It is
implemented as a producer-consumer ring. More specifically, an
"admin queue" is a type of control queue used to interact with the
firmware.

This patch introduces data structures and functions to initialize
and teardown control/admin queues. Once the admin queue is initialized,
the driver uses it to get the firmware version.

Signed-off-by: Anirudh Venkataramanan 
---
  drivers/net/ethernet/intel/ice/Makefile |   4 +-
  drivers/net/ethernet/intel/ice/ice.h|   1 +
  drivers/net/ethernet/intel/ice/ice_adminq_cmd.h | 108 +++
  drivers/net/ethernet/intel/ice/ice_common.c | 144 
  drivers/net/ethernet/intel/ice/ice_common.h |  39 +
  drivers/net/ethernet/intel/ice/ice_controlq.c   | 979 
  drivers/net/ethernet/intel/ice/ice_controlq.h   |  97 +++
  drivers/net/ethernet/intel/ice/ice_hw_autogen.h |  46 ++
  drivers/net/ethernet/intel/ice/ice_main.c   |  11 +-
  drivers/net/ethernet/intel/ice/ice_osdep.h  |  86 +++
  drivers/net/ethernet/intel/ice/ice_status.h |  35 +
  drivers/net/ethernet/intel/ice/ice_type.h   |  22 +
  12 files changed, 1570 insertions(+), 2 deletions(-)
  create mode 100644 drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
  create mode 100644 drivers/net/ethernet/intel/ice/ice_common.c
  create mode 100644 drivers/net/ethernet/intel/ice/ice_common.h
  create mode 100644 drivers/net/ethernet/intel/ice/ice_controlq.c
  create mode 100644 drivers/net/ethernet/intel/ice/ice_controlq.h
  create mode 100644 drivers/net/ethernet/intel/ice/ice_hw_autogen.h
  create mode 100644 drivers/net/ethernet/intel/ice/ice_osdep.h
  create mode 100644 drivers/net/ethernet/intel/ice/ice_status.h

diff --git a/drivers/net/ethernet/intel/ice/Makefile 
b/drivers/net/ethernet/intel/ice/Makefile
index 2a177ea21b74..eebf619e84a8 100644
--- a/drivers/net/ethernet/intel/ice/Makefile
+++ b/drivers/net/ethernet/intel/ice/Makefile
@@ -24,4 +24,6 @@
  
  obj-$(CONFIG_ICE) += ice.o
  
-ice-y := ice_main.o

+ice-y := ice_main.o\
+ice_controlq.o \
+ice_common.o
diff --git a/drivers/net/ethernet/intel/ice/ice.h 
b/drivers/net/ethernet/intel/ice/ice.h
index d781027330cc..ea2fb63bb095 100644
--- a/drivers/net/ethernet/intel/ice/ice.h
+++ b/drivers/net/ethernet/intel/ice/ice.h
@@ -26,6 +26,7 @@
  #include 
  #include 
  #include 
+#include 
  #include 
  #include "ice_devids.h"
  #include "ice_type.h"
diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h 
b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
new file mode 100644
index ..885fa3c6fec4
--- /dev/null
+++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
@@ -0,0 +1,108 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Intel(R) Ethernet Connection E800 Series Linux Driver
+ * Copyright (c) 2018, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ */
+
+#ifndef _ICE_ADMINQ_CMD_H_
+#define _ICE_ADMINQ_CMD_H_
+
+/* This header file defines the Admin Queue commands, error codes and
+ * descriptor format.  It is shared between Firmware and Software.
+ */
+
+struct ice_aqc_generic {
+   __le32 param0;
+   __le32 param1;
+   __le32 addr_high;
+   __le32 addr_low;
+};
+
+/* Get version (direct 0x0001) */
+struct ice_aqc_get_ver {
+   __le32 rom_ver;
+   __le32 fw_build;
+   u8 fw_branch;
+   u8 fw_major;
+   u8 fw_minor;
+   u8 fw_patch;
+   u8 api_branch;
+   u8 api_major;
+   u8 api_minor;
+   u8 api_patch;
+};
+
+/* Queue Shutdown (direct 0x0003) */
+struct ice_aqc_q_shutdown {
+#define ICE_AQC_DRIVER_UNLOADING   BIT(0)
+   __le32 driver_unloading;
+   u8 reserved[12];
+};
+
+/**
+ * struct ice_aq_desc - Admin Queue (AQ) descriptor
+ * @flags: ICE_AQ_FLAG_* flags
+ * @opcode: AQ command opcode
+ * @datalen: length in bytes of indirect/external data buffer
+ * @retval: return value from firmware
+ * @cookie_h: opaque data high-half
+ * @cookie_l: opaque data low-half
+ * @params: command-specific parameters
+ *
+ * Descriptor format for commands the driver posts on the Admin Transmit Queue
+ * (ATQ).  The firmware writes back onto the command descriptor and returns
+ * the result of the command.  Asynchronous events that are not an immediate
+ * result 

Re: [Intel-wired-lan] [PATCH 03/15] ice: Start hardware initialization

2018-03-12 Thread Shannon Nelson

On 3/9/2018 9:21 AM, Anirudh Venkataramanan wrote:

This patch implements multiple pieces of the initialization flow
as follows:

1) A reset is issued to ensure a clean device state, followed
by initialization of admin queue interface.

2) Once the admin queue interface is up, clear the PF config
and transition the device to non-PXE mode.

3) Get the NVM configuration stored in the device's non-volatile
memory (NVM) using ice_init_nvm.

Signed-off-by: Anirudh Venkataramanan 
---
  drivers/net/ethernet/intel/ice/Makefile |   3 +-
  drivers/net/ethernet/intel/ice/ice.h|   2 +
  drivers/net/ethernet/intel/ice/ice_adminq_cmd.h |  79 +
  drivers/net/ethernet/intel/ice/ice_common.c | 410 
  drivers/net/ethernet/intel/ice/ice_common.h |  11 +
  drivers/net/ethernet/intel/ice/ice_controlq.h   |   3 +
  drivers/net/ethernet/intel/ice/ice_hw_autogen.h |  30 ++
  drivers/net/ethernet/intel/ice/ice_main.c   |  31 ++
  drivers/net/ethernet/intel/ice/ice_nvm.c| 245 ++
  drivers/net/ethernet/intel/ice/ice_osdep.h  |   1 +
  drivers/net/ethernet/intel/ice/ice_status.h |   5 +
  drivers/net/ethernet/intel/ice/ice_type.h   |  49 +++
  12 files changed, 868 insertions(+), 1 deletion(-)
  create mode 100644 drivers/net/ethernet/intel/ice/ice_nvm.c

diff --git a/drivers/net/ethernet/intel/ice/Makefile 
b/drivers/net/ethernet/intel/ice/Makefile
index eebf619e84a8..373d481dbb25 100644
--- a/drivers/net/ethernet/intel/ice/Makefile
+++ b/drivers/net/ethernet/intel/ice/Makefile
@@ -26,4 +26,5 @@ obj-$(CONFIG_ICE) += ice.o
  
  ice-y := ice_main.o	\

 ice_controlq.o \
-ice_common.o
+ice_common.o   \
+ice_nvm.o
diff --git a/drivers/net/ethernet/intel/ice/ice.h 
b/drivers/net/ethernet/intel/ice/ice.h
index ea2fb63bb095..ab2800c31906 100644
--- a/drivers/net/ethernet/intel/ice/ice.h
+++ b/drivers/net/ethernet/intel/ice/ice.h
@@ -30,8 +30,10 @@
  #include 
  #include "ice_devids.h"
  #include "ice_type.h"
+#include "ice_common.h"
  
  #define ICE_BAR0		0

+#define ICE_AQ_LEN 64
  
  #define ICE_DFLT_NETIF_M (NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK)
  
diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h

index 885fa3c6fec4..05b22a1ffd70 100644
--- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
@@ -50,6 +50,67 @@ struct ice_aqc_q_shutdown {
u8 reserved[12];
  };
  
+/* Request resource ownership (direct 0x0008)

+ * Release resource ownership (direct 0x0009)
+ */
+struct ice_aqc_req_res {
+   __le16 res_id;
+#define ICE_AQC_RES_ID_NVM 1
+#define ICE_AQC_RES_ID_SDP 2
+#define ICE_AQC_RES_ID_CHNG_LOCK   3
+#define ICE_AQC_RES_ID_GLBL_LOCK   4
+   __le16 access_type;
+#define ICE_AQC_RES_ACCESS_READ1
+#define ICE_AQC_RES_ACCESS_WRITE   2
+
+   /* Upon successful completion, FW writes this value and driver is
+* expected to release resource before timeout. This value is provided
+* in milliseconds.
+*/
+   __le32 timeout;
+#define ICE_AQ_RES_NVM_READ_DFLT_TIMEOUT_MS3000
+#define ICE_AQ_RES_NVM_WRITE_DFLT_TIMEOUT_MS   18
+#define ICE_AQ_RES_CHNG_LOCK_DFLT_TIMEOUT_MS   1000
+#define ICE_AQ_RES_GLBL_LOCK_DFLT_TIMEOUT_MS   3000
+   /* For SDP: pin id of the SDP */
+   __le32 res_number;
+   /* Status is only used for ICE_AQC_RES_ID_GLBL_LOCK */
+   __le16 status;
+#define ICE_AQ_RES_GLBL_SUCCESS0
+#define ICE_AQ_RES_GLBL_IN_PROG1
+#define ICE_AQ_RES_GLBL_DONE   2
+   u8 reserved[2];


Since these structs all become part of the descriptor's param union, 
perhaps adding reserved space to the end is not necessary.



+};
+
+/* Clear PXE Command and response (direct 0x0110) */
+struct ice_aqc_clear_pxe {
+   u8 rx_cnt;
+#define ICE_AQC_CLEAR_PXE_RX_CNT   0x2
+   u8 reserved[15];
+};
+
+/* NVM Read command (indirect 0x0701)
+ * NVM Erase commands (direct 0x0702)
+ * NVM Update commands (indirect 0x0703)
+ */
+struct ice_aqc_nvm {
+   u8  cmd_flags;
+#define ICE_AQC_NVM_LAST_CMD   BIT(0)
+#define ICE_AQC_NVM_PCIR_REQ   BIT(0)  /* Used by NVM Update reply */
+#define ICE_AQC_NVM_PRESERVATION_S 1
+#define ICE_AQC_NVM_PRESERVATION_M (3 << CSR_AQ_NVM_PRESERVATION_S)
+#define ICE_AQC_NVM_NO_PRESERVATION(0 << CSR_AQ_NVM_PRESERVATION_S)
+#define ICE_AQC_NVM_PRESERVE_ALL   BIT(1)
+#define ICE_AQC_NVM_PRESERVE_SELECTED  (3 << CSR_AQ_NVM_PRESERVATION_S)
+#define ICE_AQC_NVM_FLASH_ONLY BIT(7)
+   u8  module_typeid;
+   __le16  length;
+#define ICE_AQC_NVM_ERASE_LEN  0x
+   __le32  offset;
+   __le32  addr_high;
+   __le32  addr_low;
+};
+
  /**
   * struct ice_aq_desc - Admin Queue (AQ) descriptor
   

[PATCH net] macvlan: filter out unsupported feature flags

2018-03-08 Thread Shannon Nelson
Adding a macvlan device on top of a lowerdev that supports
the xfrm offloads fails with a new regression:
  # ip link add link ens1f0 mv0 type macvlan
  RTNETLINK answers: Operation not permitted

Tracing down the failure shows that the macvlan device inherits
the NETIF_F_HW_ESP and NETIF_F_HW_ESP_TX_CSUM feature flags
from the lowerdev, but with no dev->xfrmdev_ops API filled
in, it doesn't actually support xfrm.  When the request is
made to add the new macvlan device, the XFRM listener for
NETDEV_REGISTER calls xfrm_api_check() which fails the new
registration because dev->xfrmdev_ops is NULL.

The macvlan creation succeeds when we filter out the ESP
feature flags in macvlan_fix_features(), so let's filter them
out like we're already filtering out ~NETIF_F_NETNS_LOCAL.
When XFRM support is added in the future, we can add the flags
into MACVLAN_FEATURES.

This same problem could crop up in the future with any other
new feature flags, so let's filter out any flags that aren't
defined as supported in macvlan.

Fixes: d77e38e612a0 ("xfrm: Add an IPsec hardware offloading API")
Reported-by: Alexey Kodanev <alexey.koda...@oracle.com>
Signed-off-by: Shannon Nelson <shannon.nel...@oracle.com>
---
 drivers/net/macvlan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 8fc02d9..725f4b4 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -1036,7 +1036,7 @@ static netdev_features_t macvlan_fix_features(struct 
net_device *dev,
lowerdev_features &= (features | ~NETIF_F_LRO);
features = netdev_increment_features(lowerdev_features, features, mask);
features |= ALWAYS_ON_FEATURES;
-   features &= ~NETIF_F_NETNS_LOCAL;
+   features &= (ALWAYS_ON_FEATURES | MACVLAN_FEATURES);
 
return features;
 }
-- 
2.7.4



Re: [PATCH net] macvlan: filter out xfrm feature flags

2018-03-08 Thread Shannon Nelson

On 3/8/2018 9:33 AM, David Miller wrote:

From: Shannon Nelson <shannon.nel...@oracle.com>
Date: Tue,  6 Mar 2018 14:57:08 -0800


This isn't broken for vlans because they use a separate features
connection (vlan_features) for inheriting features.  This is
fine, but I don't think trying to add something like this to
every driver for every new upperdev is a good idea - I think
the upperdev should try to protect itself.


I think this fix is correct.

But for how many upperdevs are we going to duplicate code like this,
and how many subtle differences and in fact bugs will result from all
of that duplication?

I think we really need something common for these upperdev drivers
to use.  Maybe just a macro defining feature bits to trim in this
situation.

Thanks.


Thanks, Dave.  Yes, this could use something a little more generic, 
something that would catch any future "dangerous" bits.


I'm not sure we can come up with a generic mask to be used by everyone, 
as each upper and lower dev has their own feature support levels.  But 
we might come up with a better example for others to follow.


Rather than calling out specific non-supported bits, we can probably 
just build a mask from bits that we already know are supported, 
something like this:

features &= (ALWAYS_ON_FEATURES | MACVLAN_FEATURES);

which would take care of NETIF_F_NETNS_LOCAL, the ESP flags, and 
anything else that wasn't already specifically called for.  I'll repost 
with this and see what folks think.


sln


[PATCH net] macvlan: filter out xfrm feature flags

2018-03-06 Thread Shannon Nelson
Adding a macvlan device on top of a lowerdev that supports
the xfrm offloads fails.
# ip link add link ens1f0 mv0 type macvlan
RTNETLINK answers: Operation not permitted

Tracing down the failure shows that the macvlan device inherits
the NETIF_F_HW_ESP and NETIF_F_HW_ESP_TX_CSUM feature flags from
the lowerdev, but doesn't actually support xfrm so doesn't have
the dev->xfrmdev_ops API filled in.  When the request is made
to add the new macvlan device, the various feature flags are
checked by the feature subsystems, and the xfrm_api_check()
fails the check since the dev->xfrmdev_ops are not set up.
The macvlan creation succeeds when we filter out those flags
in macvlan_fix_features().

This isn't broken for vlans because they use a separate features
connection (vlan_features) for inheriting features.  This is
fine, but I don't think trying to add something like this to
every driver for every new upperdev is a good idea - I think
the upperdev should try to protect itself.

Fixes: d77e38e612a0 ("xfrm: Add an IPsec hardware offloading API")
Signed-off-by: Shannon Nelson <shannon.nel...@oracle.com>
---
 drivers/net/macvlan.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 8fc02d9..76b8fb5 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -844,6 +844,10 @@ static struct lock_class_key macvlan_netdev_addr_lock_key;
 NETIF_F_TSO_ECN | NETIF_F_TSO6 | NETIF_F_GRO | NETIF_F_RXCSUM | \
 NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_FILTER)
 
+#define MACVLAN_NON_FEATURES \
+   (NETIF_F_HW_ESP | NETIF_F_HW_ESP_TX_CSUM | NETIF_F_GSO_ESP | \
+NETIF_F_NETNS_LOCAL)
+
 #define MACVLAN_STATE_MASK \
((1<<__LINK_STATE_NOCARRIER) | (1<<__LINK_STATE_DORMANT))
 
@@ -1036,7 +1040,7 @@ static netdev_features_t macvlan_fix_features(struct 
net_device *dev,
lowerdev_features &= (features | ~NETIF_F_LRO);
features = netdev_increment_features(lowerdev_features, features, mask);
features |= ALWAYS_ON_FEATURES;
-   features &= ~NETIF_F_NETNS_LOCAL;
+   features &= ~MACVLAN_NON_FEATURES;
 
return features;
 }
-- 
2.7.4



Re: [PATCH v2 0/4] GSO_BY_FRAGS correctness improvements

2018-03-01 Thread Shannon Nelson

From: Daniel Axtens 
Date: Wed, Feb 28, 2018 at 10:13 PM

As requested [1], I went through and had a look at users of gso_size to
see if there were things that need to be fixed to consider
GSO_BY_FRAGS, and I have tried to improve our helper functions to deal
with this case.

I found a few. This fixes bugs relating to the use of
skb_gso_*_seglen() where GSO_BY_FRAGS is not considered.

Patch 1 renames skb_gso_validate_mtu to skb_gso_validate_network_len.
This is follow-up to my earlier patch 2b16f048729b ("net: create
skb_gso_validate_mac_len()"), and just makes everything a bit clearer.

Patches 2 and 3 replace the final users of skb_gso_network_seglen() -
which doesn't consider GSO_BY_FRAGS - with
skb_gso_validate_network_len(), which does. This allows me to make the
skb_gso_*_seglen functions private in patch 4 - now future users won't
accidentally do the wrong comparison.

Two things remain. One is qdisc_pkt_len_init, which is discussed at
[2] - it's caught up in the GSO_DODGY mess. I don't have any expertise
in GSO_DODGY, and it looks like a good clean fix will involve
unpicking the whole validation mess, so I have left it for now.

Secondly, there are 3 eBPF opcodes that change the gso_size of an SKB
and don't consider GSO_BY_FRAGS. This is going through the bpf tree.

Regards,
Daniel

[1] https://patchwork.ozlabs.org/comment/1852414/
[2] https://www.spinics.net/lists/netdev/msg482397.html

PS: This is all in the core networking stack. For a driver to be
affected by this it would need to support NETIF_F_GSO_SCTP /
NETIF_F_GSO_SOFTWARE and then either use gso_size or not be a purely
virtual device. (Many drivers look at gso_size, but do not support
SCTP segmentation, so the core network will segment an SCTP gso before
it hits them.) Based on that, the only driver that may be affected is
sunvnet, but I have no way of testing it, so I haven't looked at it.


I took a quick peek into sunvnet to check on this, and it looks like the 
code in sunvnet_common.c::vnet_handle_offloads() is already mis-handling 
SCTP gso packets, so I don't think you're adding any more breakage.


I suspect we should change sunvnet's use of NETIF_F_GSO_SOFTWARE to 
NETIF_F_ALL_TSO...


sln



v2: split out bpf stuff
 fix review comments from Dave Miller

Daniel Axtens (4):
   net: rename skb_gso_validate_mtu -> skb_gso_validate_network_len
   net: sched: tbf: handle GSO_BY_FRAGS case in enqueue
   net: xfrm: use skb_gso_validate_network_len() to check gso sizes
   net: make skb_gso_*_seglen functions private

  include/linux/skbuff.h  | 35 +---
  net/core/skbuff.c   | 48 -
  net/ipv4/ip_forward.c   |  2 +-
  net/ipv4/ip_output.c|  2 +-
  net/ipv4/netfilter/nf_flow_table_ipv4.c |  2 +-
  net/ipv4/xfrm4_output.c |  3 ++-
  net/ipv6/ip6_output.c   |  2 +-
  net/ipv6/netfilter/nf_flow_table_ipv6.c |  2 +-
  net/ipv6/xfrm6_output.c |  2 +-
  net/mpls/af_mpls.c  |  2 +-
  net/sched/sch_tbf.c |  3 ++-
  net/xfrm/xfrm_device.c  |  2 +-
  12 files changed, 54 insertions(+), 51 deletions(-)

--
2.14.1





[PATCH ipsec-next] esp: check the NETIF_F_HW_ESP_TX_CSUM bit before segmenting

2018-02-26 Thread Shannon Nelson
If I understand correctly, we should not be asking for a
checksum offload on an ipsec packet if the netdev isn't
advertising NETIF_F_HW_ESP_TX_CSUM.  In that case, we should
clear the NETIF_F_CSUM_MASK bits.

Signed-off-by: Shannon Nelson <shannon.nel...@oracle.com>
---
 net/ipv4/esp4_offload.c | 2 ++
 net/ipv6/esp6_offload.c | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index da5635f..7cf755e 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -138,6 +138,8 @@ static struct sk_buff *esp4_gso_segment(struct sk_buff *skb,
if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle ||
(x->xso.dev != skb->dev))
esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK);
+   else if (!(features & NETIF_F_HW_ESP_TX_CSUM))
+   esp_features = features & ~NETIF_F_CSUM_MASK;
 
xo->flags |= XFRM_GSO_SEGMENT;
 
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
index 3fd1ec7..27f59b6 100644
--- a/net/ipv6/esp6_offload.c
+++ b/net/ipv6/esp6_offload.c
@@ -165,6 +165,8 @@ static struct sk_buff *esp6_gso_segment(struct sk_buff *skb,
if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle ||
(x->xso.dev != skb->dev))
esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK);
+   else if (!(features & NETIF_F_HW_ESP_TX_CSUM))
+   esp_features = features & ~NETIF_F_CSUM_MASK;
 
xo->flags |= XFRM_GSO_SEGMENT;
 
-- 
2.7.4



[PATCH next-queue 3/3] ixgbe: remove unneeded ipsec state free callback

2018-02-22 Thread Shannon Nelson
With commit 7f05b467a735 ("xfrm: check for xdo_dev_state_free")
we no longer need to add an empty callback function
to the driver, so now let's remove the useless code.

Signed-off-by: Shannon Nelson <shannon.nel...@oracle.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 13 -
 1 file changed, 13 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index 8623013..f225452 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -724,23 +724,10 @@ static bool ixgbe_ipsec_offload_ok(struct sk_buff *skb, 
struct xfrm_state *xs)
return true;
 }
 
-/**
- * ixgbe_ipsec_free - called by xfrm garbage collections
- * @xs: pointer to transformer state struct
- *
- * We don't have any garbage to collect, so we shouldn't bother
- * implementing this function, but the XFRM code doesn't check for
- * existence before calling the API callback.
- **/
-static void ixgbe_ipsec_free(struct xfrm_state *xs)
-{
-}
-
 static const struct xfrmdev_ops ixgbe_xfrmdev_ops = {
.xdo_dev_state_add = ixgbe_ipsec_add_sa,
.xdo_dev_state_delete = ixgbe_ipsec_del_sa,
.xdo_dev_offload_ok = ixgbe_ipsec_offload_ok,
-   .xdo_dev_state_free = ixgbe_ipsec_free,
 };
 
 /**
-- 
2.7.4



[PATCH next-queue 2/3] ixgbe: fix ipsec trailer length

2018-02-22 Thread Shannon Nelson
Fix up the Tx trailer length calculation.  We can't believe the
trailer len from the xstate information because it was calculated
before the packet was put together and padding added.  This bit
of code finds the padding value in the trailer, adds it to the
authentication length, and saves it so later we can put it into
the Tx descriptor to tell the device where to stop the checksum
calculation.

Fixes: 592594704761 ("ixgbe: process the Tx ipsec offload")
Signed-off-by: Shannon Nelson <shannon.nel...@oracle.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 24 +++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index 8b7dbc8..8623013 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -789,11 +789,33 @@ int ixgbe_ipsec_tx(struct ixgbe_ring *tx_ring,
 
itd->flags = 0;
if (xs->id.proto == IPPROTO_ESP) {
+   struct sk_buff *skb = first->skb;
+   int ret, authlen, trailerlen;
+   u8 padlen;
+
itd->flags |= IXGBE_ADVTXD_TUCMD_IPSEC_TYPE_ESP |
  IXGBE_ADVTXD_TUCMD_L4T_TCP;
if (first->protocol == htons(ETH_P_IP))
itd->flags |= IXGBE_ADVTXD_TUCMD_IPV4;
-   itd->trailer_len = xs->props.trailer_len;
+
+   /* The actual trailer length is authlen (16 bytes) plus
+* 2 bytes for the proto and the padlen values, plus
+* padlen bytes of padding.  This ends up not the same
+* as the static value found in xs->props.trailer_len (21).
+*
+* The "correct" way to get the auth length would be to use
+*authlen = crypto_aead_authsize(xs->data);
+* but since we know we only have one size to worry about
+* we can let the compiler use the constant and save us a
+* few CPU cycles.
+*/
+   authlen = IXGBE_IPSEC_AUTH_BITS / 8;
+
+   ret = skb_copy_bits(skb, skb->len - (authlen + 2), , 1);
+   if (unlikely(ret))
+   return 0;
+   trailerlen = authlen + 2 + padlen;
+   itd->trailer_len = trailerlen;
}
if (tsa->encrypt)
itd->flags |= IXGBE_ADVTXD_TUCMD_IPSEC_ENCRYPT_EN;
-- 
2.7.4



[PATCH next-queue 0/3] ixgbe: ipsec fixups

2018-02-22 Thread Shannon Nelson
These are a couple of updates for the ixgbe IPsec offload support.

Shannon Nelson (3):
  ixgbe: check for 128-bit authentication
  ixgbe: fix ipsec trailer length
  ixgbe: remove unneeded ipsec state free callback

 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 53 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.h |  1 +
 2 files changed, 35 insertions(+), 19 deletions(-)

-- 
2.7.4



  1   2   3   4   >