[bpf-next v2 2/3] bpf: libbpf support for msg_push_data
Add support for new bpf_msg_push_data in libbpf.

Signed-off-by: John Fastabend
---
 tools/include/uapi/linux/bpf.h            | 20 +++-
 tools/testing/selftests/bpf/bpf_helpers.h |  2 ++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index a2fb333..852dc17 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2240,6 +2240,23 @@ struct bpf_stack_build_id {
  *		pointer that was returned from bpf_sk_lookup_xxx\ ().
  *	Return
  *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_msg_push_data(struct sk_buff *skb, u32 start, u32 len, u64 flags)
+ *	Description
+ *		For socket policies, insert *len* bytes into msg at offset
+ *		*start*.
+ *
+ *		If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a
+ *		*msg* it may want to insert metadata or options into the msg.
+ *		This can later be read and used by any of the lower layer BPF
+ *		hooks.
+ *
+ *		This helper may fail under memory pressure (a malloc
+ *		fails); in these cases BPF programs will get an appropriate
+ *		error and will need to handle it.
+ *
+ *	Return
+ *		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2331,7 +2348,8 @@ struct bpf_stack_build_id {
 	FN(sk_release),			\
 	FN(map_push_elem),		\
 	FN(map_pop_elem),		\
-	FN(map_peek_elem),
+	FN(map_peek_elem),		\
+	FN(msg_push_data),

 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 6407a3d..686e57c 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -111,6 +111,8 @@
 static int (*bpf_msg_cork_bytes)(void *ctx, int len) =
 	(void *) BPF_FUNC_msg_cork_bytes;
 static int (*bpf_msg_pull_data)(void *ctx, int start, int end, int flags) =
 	(void *) BPF_FUNC_msg_pull_data;
+static int (*bpf_msg_push_data)(void *ctx, int start, int end, int flags) =
+	(void *) BPF_FUNC_msg_push_data;
 static int (*bpf_bind)(void *ctx, void *addr, int addr_len) =
 	(void *) BPF_FUNC_bind;
 static int (*bpf_xdp_adjust_tail)(void *ctx, int offset) =
--
1.9.1
[bpf-next v2 0/3] sockmap, bpf_msg_push_data helper
This series adds a new helper bpf_msg_push_data to be used by sk_msg programs. The helper can be used to insert extra bytes into the message that can then be used by the program as metadata tags among other things.

The first patch adds the helper, the second patch the libbpf support, and the last patch updates test_sockmap to run msg_push_data tests.

v2: rebase after queue map and in filter.c convert int -> u32

John Fastabend (3):
  bpf: sk_msg program helper bpf_msg_push_data
  bpf: libbpf support for msg_push_data
  bpf: test_sockmap add options to use msg_push_data

 include/linux/skmsg.h                           |   5 +
 include/uapi/linux/bpf.h                        |  20 +++-
 net/core/filter.c                               | 134 ++++
 tools/include/uapi/linux/bpf.h                  |  20 +++-
 tools/testing/selftests/bpf/bpf_helpers.h       |   2 +
 tools/testing/selftests/bpf/test_sockmap.c      |  58 +-
 tools/testing/selftests/bpf/test_sockmap_kern.h |  97 +
 7 files changed, 308 insertions(+), 28 deletions(-)

--
1.9.1
[bpf-next v2 1/3] bpf: sk_msg program helper bpf_msg_push_data
This allows a user to push data into a msg using sk_msg program types. The format is as follows:

	bpf_msg_push_data(msg, offset, len, flags)

This will insert 'len' bytes at offset 'offset'. For example, to prepend 10 bytes at the front of the message the user can do:

	bpf_msg_push_data(msg, 0, 10, 0);

This will invalidate data bounds, so the BPF user will have to recheck data bounds after calling this. After this the msg size will have been updated and the user is free to write into the added bytes. We allow any offset/len as long as it is within the (data, data_end) range. However, a copy will be required if the ring is full, and it's possible for the helper to fail with ENOMEM or EINVAL errors, which need to be handled by the BPF program.

This can be used, similar to XDP metadata, to pass data between the sk_msg layer and lower layers.

Signed-off-by: John Fastabend
---
 include/linux/skmsg.h    |   5 ++
 include/uapi/linux/bpf.h |  20 ++-
 net/core/filter.c        | 134 +++
 3 files changed, 158 insertions(+), 1 deletion(-)

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 84e1886..2a11e9d 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -207,6 +207,11 @@ static inline struct scatterlist *sk_msg_elem(struct sk_msg *msg, int which)
 	return &msg->sg.data[which];
 }

+static inline struct scatterlist sk_msg_elem_cpy(struct sk_msg *msg, int which)
+{
+	return msg->sg.data[which];
+}
+
 static inline struct page *sk_msg_page(struct sk_msg *msg, int which)
 {
 	return sg_page(sk_msg_elem(msg, which));
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a2fb333..852dc17 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2240,6 +2240,23 @@ struct bpf_stack_build_id {
  *		pointer that was returned from bpf_sk_lookup_xxx\ ().
  *	Return
  *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_msg_push_data(struct sk_buff *skb, u32 start, u32 len, u64 flags)
+ *	Description
+ *		For socket policies, insert *len* bytes into msg at offset
+ *		*start*.
+ *
+ *		If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a
+ *		*msg* it may want to insert metadata or options into the msg.
+ *		This can later be read and used by any of the lower layer BPF
+ *		hooks.
+ *
+ *		This helper may fail under memory pressure (a malloc
+ *		fails); in these cases BPF programs will get an appropriate
+ *		error and will need to handle it.
+ *
+ *	Return
+ *		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2331,7 +2348,8 @@ struct bpf_stack_build_id {
 	FN(sk_release),			\
 	FN(map_push_elem),		\
 	FN(map_pop_elem),		\
-	FN(map_peek_elem),
+	FN(map_peek_elem),		\
+	FN(msg_push_data),

 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/net/core/filter.c b/net/core/filter.c
index 5fd5139..35c6933 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2297,6 +2297,137 @@ int skb_do_redirect(struct sk_buff *skb)
 	.arg4_type	= ARG_ANYTHING,
 };

+BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
+	   u32, len, u64, flags)
+{
+	struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge;
+	u32 new, i = 0, l, space, copy = 0, offset = 0;
+	u8 *raw, *to, *from;
+	struct page *page;
+
+	if (unlikely(flags))
+		return -EINVAL;
+
+	/* First find the starting scatterlist element */
+	i = msg->sg.start;
+	do {
+		l = sk_msg_elem(msg, i)->length;
+
+		if (start < offset + l)
+			break;
+		offset += l;
+		sk_msg_iter_var_next(i);
+	} while (i != msg->sg.end);
+
+	if (start >= offset + l)
+		return -EINVAL;
+
+	space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);
+
+	/* If no space available will fallback to copy, we need at
+	 * least one scatterlist elem available to push data into
+	 * when start aligns to the beginning of an element or two
+	 * when it falls inside an element. We handle the start equals
+	 * offset case because it's the common case for inserting a
+	 * header.
+	 */
+	if (!space || (space == 1 && start != offset))
+		copy = msg->sg.data[i].length;
+
+	page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP,
+			   get_order(copy + len));
+	if (unlikely(!page))
+		return -ENOMEM;
+
+	if (copy) {
+		int front, back;
+
[bpf-next v2 3/3] bpf: test_sockmap add options to use msg_push_data
Add options to run msg_push_data. This patch creates two more flags in test_sockmap that can be used to specify the offset and length of bytes to be added. The new options are --txmsg_start_push to specify where bytes should be inserted and --txmsg_end_push to specify how many bytes. This is analogous to the options that are used to pull data, --txmsg_start and --txmsg_end.

In addition to adding the options, tests are added to the test suite to run the tests similar to what was done for msg_pull_data.

Signed-off-by: John Fastabend
---
 tools/testing/selftests/bpf/test_sockmap.c      | 58 ++-
 tools/testing/selftests/bpf/test_sockmap_kern.h | 97 +++-
 2 files changed, 129 insertions(+), 26 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index cbd1c0b..622ade0 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -77,6 +77,8 @@
 int txmsg_cork;
 int txmsg_start;
 int txmsg_end;
+int txmsg_start_push;
+int txmsg_end_push;
 int txmsg_ingress;
 int txmsg_skb;
 int ktls;
@@ -100,6 +102,8 @@
 	{"txmsg_cork", required_argument, NULL, 'k'},
 	{"txmsg_start", required_argument, NULL, 's'},
 	{"txmsg_end", required_argument, NULL, 'e'},
+	{"txmsg_start_push", required_argument, NULL, 'p'},
+	{"txmsg_end_push", required_argument, NULL, 'q'},
 	{"txmsg_ingress", no_argument, &txmsg_ingress, 1 },
 	{"txmsg_skb", no_argument, &txmsg_skb, 1 },
 	{"ktls", no_argument, &ktls, 1 },
@@ -903,6 +907,30 @@ static int run_options(struct sockmap_options *options, int cg_fd, int test)
 		}
 	}

+	if (txmsg_start_push) {
+		i = 2;
+		err = bpf_map_update_elem(map_fd[5],
+					  &i, &txmsg_start_push, BPF_ANY);
+		if (err) {
+			fprintf(stderr,
+				"ERROR: bpf_map_update_elem (txmsg_start_push): %d (%s)\n",
+				err, strerror(errno));
+			goto out;
+		}
+	}
+
+	if (txmsg_end_push) {
+		i = 3;
+		err = bpf_map_update_elem(map_fd[5],
+					  &i, &txmsg_end_push, BPF_ANY);
+		if (err) {
+			fprintf(stderr,
+				"ERROR: bpf_map_update_elem %i@%i (txmsg_end_push): %d (%s)\n",
+				txmsg_end_push, i, err, strerror(errno));
+			goto out;
+		}
+	}
+
 	if (txmsg_ingress) {
 		int in = BPF_F_INGRESS;
@@ -1235,6 +1263,8 @@ static int test_mixed(int cgrp)
 	txmsg_pass = txmsg_noisy = txmsg_redir_noisy = txmsg_drop = 0;
 	txmsg_apply = txmsg_cork = 0;
 	txmsg_start = txmsg_end = 0;
+	txmsg_start_push = txmsg_end_push = 0;
+
 	/* Test small and large iov_count values with pass/redir/apply/cork */
 	txmsg_pass = 1;
 	txmsg_redir = 0;
@@ -1351,6 +1381,8 @@ static int test_start_end(int cgrp)
 	/* Test basic start/end with lots of iov_count and iov_lengths */
 	txmsg_start = 1;
 	txmsg_end = 2;
+	txmsg_start_push = 1;
+	txmsg_end_push = 2;
 	err = test_txmsg(cgrp);
 	if (err)
 		goto out;
@@ -1364,6 +1396,8 @@
 	for (i = 99; i <= 1600; i += 500) {
 		txmsg_start = 0;
 		txmsg_end = i;
+		txmsg_start_push = 0;
+		txmsg_end_push = i;
 		err = test_exec(cgrp, &opt);
 		if (err)
 			goto out;
 	}
@@ -1373,6 +1407,8 @@
 	for (i = 199; i <= 1600; i += 500) {
 		txmsg_start = 100;
 		txmsg_end = i;
+		txmsg_start_push = 100;
+		txmsg_end_push = i;
 		err = test_exec(cgrp, &opt);
 		if (err)
 			goto out;
 	}

@@ -1381,6 +1417,8 @@
 	/* Test start/end with cork pulling last sg entry */
 	txmsg_start = 1500;
 	txmsg_end = 1600;
+	txmsg_start_push = 1500;
+	txmsg_end_push = 1600;
 	err = test_exec(cgrp, &opt);
 	if (err)
 		goto out;

 	/* Test start/end pull of single byte in last page */
 	txmsg_start = ;
 	txmsg_end = 1112;
+	txmsg_start_push = ;
+	txmsg_end_push = 1112;
 	err = test_exec(cgrp, &opt);
 	if (err)
Re: [PATCH net-next] net: loopback: clear skb->tstamp before netif_rx()
On Fri, Oct 19, 2018 at 10:11 PM Eric Dumazet wrote:
>
> At least UDP / TCP stacks can now cook skbs with a tstamp using
> MONOTONIC base (or arbitrary values with SCM_TXTIME)
>
> Since loopback driver does not call (directly or indirectly)
> skb_scrub_packet(), we need to clear skb->tstamp so that
> net_timestamp_check() can eventually resample the time,
> using ktime_get_real().
>
> Fixes: 80b14dee2bea ("net: Add a new socket option for a future transmit time.")
> Fixes: fb420d5d91c1 ("tcp/fq: move back to CLOCK_MONOTONIC")
> Signed-off-by: Eric Dumazet
> Cc: Willem de Bruijn
> Cc: Soheil Hassas Yeganeh

Acked-by: Soheil Hassas Yeganeh

Thank you, Eric!

> ---
> drivers/net/loopback.c | 4
> 1 file changed, 4 insertions(+)
>
> diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
> index a7207fa7e451311aed13cdeb100e0ea7922931bf..2df7f60fe05220c19896a251b6b15239f4b95112 100644
> --- a/drivers/net/loopback.c
> +++ b/drivers/net/loopback.c
> @@ -69,6 +69,10 @@ static netdev_tx_t loopback_xmit(struct sk_buff *skb,
> 	int len;
>
> 	skb_tx_timestamp(skb);
> +
> +	/* do not fool net_timestamp_check() with various clock bases */
> +	skb->tstamp = 0;
> +
> 	skb_orphan(skb);
>
> 	/* Before queueing this packet to netif_rx(),
> --
> 2.19.1.568.g152ad8e336-goog
>
[PATCH net-next] net: loopback: clear skb->tstamp before netif_rx()
At least UDP / TCP stacks can now cook skbs with a tstamp using MONOTONIC base (or arbitrary values with SCM_TXTIME).

Since the loopback driver does not call (directly or indirectly) skb_scrub_packet(), we need to clear skb->tstamp so that net_timestamp_check() can eventually resample the time, using ktime_get_real().

Fixes: 80b14dee2bea ("net: Add a new socket option for a future transmit time.")
Fixes: fb420d5d91c1 ("tcp/fq: move back to CLOCK_MONOTONIC")
Signed-off-by: Eric Dumazet
Cc: Willem de Bruijn
Cc: Soheil Hassas Yeganeh
---
 drivers/net/loopback.c | 4
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index a7207fa7e451311aed13cdeb100e0ea7922931bf..2df7f60fe05220c19896a251b6b15239f4b95112 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -69,6 +69,10 @@ static netdev_tx_t loopback_xmit(struct sk_buff *skb,
 	int len;

 	skb_tx_timestamp(skb);
+
+	/* do not fool net_timestamp_check() with various clock bases */
+	skb->tstamp = 0;
+
 	skb_orphan(skb);

 	/* Before queueing this packet to netif_rx(),
--
2.19.1.568.g152ad8e336-goog
RE: [Intel-wired-lan] [PATCH] igb: shorten maximum PHC timecounter update interval
> From: Intel-wired-lan [mailto:intel-wired-lan-boun...@osuosl.org] On
> Behalf Of Miroslav Lichvar
> Sent: Friday, October 12, 2018 4:14 AM
> To: intel-wired-...@lists.osuosl.org; netdev@vger.kernel.org
> Cc: Thomas Gleixner ; Richard Cochran
> Subject: [Intel-wired-lan] [PATCH] igb: shorten maximum PHC timecounter
> update interval
>
> The timecounter needs to be updated at least once per ~550 seconds in
> order to avoid a 40-bit SYSTIM timestamp being misinterpreted as an old
> timestamp.
>
> Since commit 500462a9d ("timers: Switch to a non-cascading wheel"),
> scheduling of delayed work seems to be less accurate and a requested
> delay of 540 seconds may actually be longer than 550 seconds. Shorten
> the delay to 480 seconds to be sure the timecounter is updated in time.
>
> This fixes an issue with HW timestamps on 82580/I350/I354 being off by
> ~1100 seconds for a few seconds every ~9 minutes.
>
> Cc: Jacob Keller
> Cc: Richard Cochran
> Cc: Thomas Gleixner
> Signed-off-by: Miroslav Lichvar
> ---
> drivers/net/ethernet/intel/igb/igb_ptp.c | 8 +++-
> 1 file changed, 7 insertions(+), 1 deletion(-)
>

Tested-by: Aaron Brown
Re: [PATCH net] net: fix pskb_trim_rcsum_slow() with odd trim offset
On 10/19/2018 05:07 PM, Dimitris Michailidis wrote:
> We've been getting checksum errors involving small UDP packets, usually
> 59B packets with 1 extra non-zero padding byte. netdev_rx_csum_fault()
> has been complaining that HW is providing bad checksums. Turns out the
> problem is in pskb_trim_rcsum_slow(), introduced in commit 88078d98d1bb
> ("net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends").
>
> The source of the problem is that when the bytes we are trimming start
> at an odd address, as in the case of the 1 padding byte above,
> skb_checksum() returns a byte-swapped value. We cannot just combine this
> with skb->csum using csum_sub(). We need to use csum_block_sub() here
> that takes into account the parity of the start address and handles the
> swapping.
>
> Matches existing code in __skb_postpull_rcsum() and esp_remove_trailer().
>
> Fixes: 88078d98d1bb ("net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends")
> Signed-off-by: Dimitris Michailidis

Thanks a lot Dimitris for finding this.

Reviewed-by: Eric Dumazet
[PATCH net] net: fix pskb_trim_rcsum_slow() with odd trim offset
We've been getting checksum errors involving small UDP packets, usually 59B packets with 1 extra non-zero padding byte. netdev_rx_csum_fault() has been complaining that HW is providing bad checksums. Turns out the problem is in pskb_trim_rcsum_slow(), introduced in commit 88078d98d1bb ("net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends").

The source of the problem is that when the bytes we are trimming start at an odd address, as in the case of the 1 padding byte above, skb_checksum() returns a byte-swapped value. We cannot just combine this with skb->csum using csum_sub(). We need to use csum_block_sub() here that takes into account the parity of the start address and handles the swapping.

Matches existing code in __skb_postpull_rcsum() and esp_remove_trailer().

Fixes: 88078d98d1bb ("net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends")
Signed-off-by: Dimitris Michailidis
---
 net/core/skbuff.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 428094b577fc..f817f336595d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1846,8 +1846,9 @@ int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len)
 	if (skb->ip_summed == CHECKSUM_COMPLETE) {
 		int delta = skb->len - len;

-		skb->csum = csum_sub(skb->csum,
-				     skb_checksum(skb, len, delta, 0));
+		skb->csum = csum_block_sub(skb->csum,
+					   skb_checksum(skb, len, delta, 0),
+					   len);
 	}
 	return __pskb_trim(skb, len);
 }
--
2.19.1.568.g152ad8e336-goog
Re: [PATCH] net: ethernet: lpc_eth: remove unused local variable
From: Vladimir Zapolskiy
Date: Fri, 19 Oct 2018 02:06:53 +0300

> A trivial change which removes an unused local variable, the issue
> is reported as a compile time warning:
>
> drivers/net/ethernet/nxp/lpc_eth.c: In function 'lpc_eth_drv_probe':
> drivers/net/ethernet/nxp/lpc_eth.c:1250:21: warning: variable 'phydev' set but not used [-Wunused-but-set-variable]
>   struct phy_device *phydev;
>                      ^~
>
> Signed-off-by: Vladimir Zapolskiy

Applied.
Re: [PATCH] net: ethernet: lpc_eth: add device and device node local variables
From: Vladimir Zapolskiy
Date: Fri, 19 Oct 2018 02:25:11 +0300

> Trivial non-functional change added to simplify getting multiple
> references to device pointer in lpc_eth_drv_probe().
>
> Signed-off-by: Vladimir Zapolskiy

Applied.
Re: [PATCH] net: ethernet: lpc_eth: clean up the list of included headers
From: Vladimir Zapolskiy
Date: Fri, 19 Oct 2018 01:53:25 +0300

> The change removes all unnecessary included headers from the driver
> source code, the remaining list is sorted in alphabetical order.
>
> Signed-off-by: Vladimir Zapolskiy

Applied.
Re: [PATCH] net: ethernet: lpc_eth: remove CONFIG_OF guard from the driver
From: Vladimir Zapolskiy
Date: Fri, 19 Oct 2018 01:58:41 +0300

> The MAC controller device is available on NXP LPC32xx platform only,
> and the LPC32xx platform supports OF builds only, so additional
> checks in the device driver are not needed.
>
> Signed-off-by: Vladimir Zapolskiy

Applied.
Re: [PATCH net-next v2] netpoll: allow cleanup to be synchronous
From: Debabrata Banerjee
Date: Thu, 18 Oct 2018 11:18:26 -0400

> This fixes a problem introduced by:
> commit 2cde6acd49da ("netpoll: Fix __netpoll_rcu_free so that it can hold the rtnl lock")
>
> When using netconsole on a bond, __netpoll_cleanup can asynchronously
> recurse multiple times, each __netpoll_free_async call can result in
> more __netpoll_free_async's. This means there is now a race between
> cleanup_work queues on multiple netpoll_info's on multiple devices and
> the configuration of a new netpoll. For example if a netconsole is set
> to enable 0, reconfigured, and enable 1 immediately, this netconsole
> will likely not work.
>
> Given the reason for __netpoll_free_async is it can be called when rtnl
> is not locked, if it is locked, we should be able to execute
> synchronously. It appears to be locked everywhere it's called from.
>
> Generalize the design pattern from the teaming driver for current
> callers of __netpoll_free_async.
>
> CC: Neil Horman
> CC: "David S. Miller"
> Signed-off-by: Debabrata Banerjee

Applied, thank you.
Re: [bpf-next v3 0/2] Fix kcm + sockmap by checking psock type
On 10/19/2018 03:57 PM, Daniel Borkmann wrote:
> On 10/20/2018 12:51 AM, Daniel Borkmann wrote:
>> On 10/18/2018 10:58 PM, John Fastabend wrote:
>>> We check if the sk_user_data (the psock in skmsg) is in fact a sockmap
>>> type too late, after we read the refcnt, which is an error. This
>>> series moves the check up before reading refcnt and also adds a test
>>> to test_maps to test trying to add a KCM socket into a sockmap.
>>>
>>> While reviewing this code I also found an issue with KCM and kTLS
>>> where each uses sk_data_ready hooks and associated stream parser
>>> breaking expectations in kcm, ktls or both. But that fix will need
>>> to go to net.
>>>
>>> Thanks to Eric for reporting.
>>>
>>> v2: Fix up file +/- my scripts lost track of them
>>> v3: return EBUSY if refcnt is zero
>>>
>>> John Fastabend (2):
>>>   bpf: skmsg, fix psock create on existing kcm/tls port
>>>   bpf: test_maps add a test to catch kcm + sockmap
>>>
>>>  include/linux/skmsg.h                     | 25 +---
>>>  net/core/sock_map.c                       | 11 +++---
>>>  tools/testing/selftests/bpf/Makefile      |  2 +-
>>>  tools/testing/selftests/bpf/sockmap_kcm.c | 14 +++
>>>  tools/testing/selftests/bpf/test_maps.c   | 64 ++-
>>>  5 files changed, 103 insertions(+), 13 deletions(-)
>>>  create mode 100644 tools/testing/selftests/bpf/sockmap_kcm.c
>>
>> Applied, thanks!
>
> Fyi, I've only applied patch 1/2 for now to get the bug fixed. The patch 2/2 throws
> a bunch of warnings that look like the below. Also, I think we leak the kcm socket
> in error paths and once we're done with testing, so it would be good to close it
> once unneeded. Please respin the test as a stand-alone commit, thanks:
>

Thanks, I didn't see the warnings below locally but will look into spinning a good version tonight with the closing sock fix as well.

John

> [...]
> bpf-next/tools/testing/selftests/bpf/libbpf.a -lcap -lelf -lrt -lpthread -o /home/darkstar/trees/bpf-next-ok/tools/testing/selftests/bpf/test_maps
> test_maps.c: In function ‘test_sockmap’:
> test_maps.c:869:0: warning: "AF_KCM" redefined
>  #define AF_KCM 41
>
> In file included from /usr/include/sys/socket.h:38:0,
>                  from test_maps.c:21:
> /usr/include/bits/socket.h:133:0: note: this is the location of the previous definition
>  #define AF_KCM PF_KCM
>
Re: [PATCH bpf-next v2 02/13] bpf: btf: Add BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO
On Fri, Oct 19, 2018 at 10:26:53PM +0100, Edward Cree wrote:
> On 19/10/18 20:36, Martin Lau wrote:
>> On Fri, Oct 19, 2018 at 06:04:11PM +0100, Edward Cree wrote:
>>> But you *do* have such a new section.
>>> The patch comment talks about a 'FuncInfo Table' which appears to
>> Note that the new section, which contains the FuncInfo Table,
>> is in a new ELF section ".BTF.ext" instead of the ".BTF".
>> It is not in the ".BTF" section because it is only useful during
>> bpf_prog_load().
> I thought it was because it needed to be munged by the loader/linker?
>
>> IIUC, I think what you are suggesting here is to use (type_id, name)
>> to describe DW_TAG_subprogram "int foo1(int) {}", "int foo2(int) {}",
>> "int foo3(int) {}" where type_id here is referring to the same
>> DW_TAG_subroutine_type, and only define that _one_
>> DW_TAG_subroutine_type in the BTF "type" section.
> Yes, something like that.
>
>> If the concern is having both FUNC and FUNC_PROTO is confusing,
> The concern is that you're conflating different entities (types
> and instances); FUNC_PROTO is just a symptom/canary of that.
>
>> we could go back to the CTF way which adds a new function section
>> in ".BTF" and it is only for DW_TAG_subprogram.
>> BTF_KIND_FUNC_PROTO is then no longer necessary.
>> Some of the new BTF verifier checks may actually go away also.
>> The down side is there will be two id spaces.
> Two id spaces... one for types and the other for subprograms.
> These are different things, so why would you _want_ them to share
> an id space? I don't, for instance, see any situation in which
> you'd want some other record to have a field that could reference
> either.
> And the 'subprogram id' doesn't have to be just for subprograms;
> it could be for instances generally — like I've been saying, a
> variable declaration is to an object type what a subprogram is to
> a function type, just with a few complications like "subprograms
> can only appear at file scope, not nested in other functions" and
> "variables of function type are immutable".
> (I'm assuming that at some point we're going to want to be able to
> have BTF information for e.g. variables stored on a subprogram's
> stack, if only for stuff like single-stepping in a debugger in
> userspace with some sort of mock. At that point, the variable
> has to have its own record — you can't just have some sort of
> magic type record because e.g. "struct foo bar;" has two names,
> one for the type and one for the variable.)
>
btf_type is not exactly a C type. btf_type is debug-info. Each btf_type carries a specific piece of debug information, and the name is part of the debug-info/btf_type. If something carries different debug-info, it is another btf_type. Take struct: the member names of a struct are part of the btf_type, so a struct with the same member types but different member names is a different btf_type. The same goes for functions: a function with a different function name or different arg names is a different btf_type.

>> Discussed a bit offline with folks about the two id spaces
>> situation and it is not good for debugging purpose.
> Could you unpack this a bit more?
Having two id spaces for debug-info is confusing. They are all debug-info at the end.
Re: [PATCH v8 bpf-next 0/2] bpf: add cg_skb_is_valid_access
On Fri, Oct 19, 2018 at 09:57:56AM -0700, Song Liu wrote:
> Changes v7 -> v8:
> 1. Dynamically allocate the dummy sk to avoid race conditions.
>
> Changes v6 -> v7:
> 1. Make dummy sk a global variable (test_run_sk).
>
> Changes v5 -> v6:
> 1. Fixed dummy sk in bpf_prog_test_run_skb() as suggested by Eric Dumazet.
>
> Changes v4 -> v5:
> 1. Replaced bpf_compute_and_save_data_pointers() with
>    bpf_compute_and_save_data_end();
>    Replaced bpf_restore_data_pointers() with bpf_restore_data_end().
> 2. Fixed indentation in test_verifier.c
>
> Changes v3 -> v4:
> 1. Fixed crash issue reported by Alexei.
>
> Changes v2 -> v3:
> 1. Added helper function bpf_compute_and_save_data_pointers() and
>    bpf_restore_data_pointers().
>
> Changes v1 -> v2:
> 1. Updated the list of read-only fields, and read-write fields.
> 2. Added dummy sk to bpf_prog_test_run_skb().
>
> This set enables BPF program of type BPF_PROG_TYPE_CGROUP_SKB to access
> some __skb_buff data directly.

Applied, Thanks
Re: [PATCH bpf-next] bpf: Extend the sk_lookup() helper to XDP hookpoint.
On Fri, 19 Oct 2018 22:32:28 +0200 Daniel Borkmann wrote:
> On 10/19/2018 06:47 PM, Joe Stringer wrote:
> > On Thu, 18 Oct 2018 at 22:07, Martin Lau wrote:
> >> On Thu, Oct 18, 2018 at 04:52:40PM -0700, Joe Stringer wrote:
> >>> On Thu, 18 Oct 2018 at 14:20, Daniel Borkmann
> >>> wrote:
> On 10/18/2018 11:06 PM, Joe Stringer wrote:
> > On Thu, 18 Oct 2018 at 11:54, Nitin Hande
> > wrote:
> [...]
> >> Open Issue
> >> * The underlying code relies on presence of an skb to find out
> >> the right sk for the case of REUSEPORT socket option. Since
> >> there is no skb available at XDP hookpoint, the helper
> >> function will return the first available sk based off the 5
> >> tuple hash. If the desire is to return a particular sk
> >> matching reuseport_cb function, please suggest way to tackle
> >> it, which can be addressed in a future commit.
>
> >> Signed-off-by: Nitin Hande
> >
> > Thanks Nitin, LGTM overall.
> >
> > The REUSEPORT thing suggests that the usage of this helper from
> > XDP layer may lead to a different socket being selected vs. the
> > equivalent call at TC hook, or other places where the selection
> > may occur. This could be a bit counter-intuitive.
> >
> > One thought I had to work around this was to introduce a flag,
> > something like BPF_F_FIND_REUSEPORT_SK_BY_HASH. This flag would
> > effectively communicate in the API that the bpf_sk_lookup_xxx()
> > functions will only select a REUSEPORT socket based on the hash
> > and not by, for example BPF_PROG_TYPE_SK_REUSEPORT programs.
> > The absence of the flag would support finding REUSEPORT sockets
> > by other mechanisms (which would be allowed for now from TC
> > hooks but would be disallowed from XDP, since there's no
> > specific plan to support this).
>
> Hmm, given skb is NULL here the only way to lookup the socket in
> such scenario is based on hash, that is, inet_ehashfn() /
> inet6_ehashfn(), perhaps alternative is to pass this hash in
> from XDP itself to the helper so it could be custom selector. Do
> you have a specific use case on this for XDP (just curious)?
> >>>
> >>> I don't have a use case for SO_REUSEPORT introspection from XDP,
> >>> so I'm primarily thinking from the perspective of making the
> >>> behaviour clear in the API in a way that leaves open the
> >>> possibility for a reasonable implementation in future. From that
> >>> perspective, my main concern is that it may surprise some BPF
> >>> writers that the same "bpf_sk_lookup_tcp()" call (with identical
> >>> parameters) may have different behaviour at TC vs. XDP layers, as
> >>> the BPF selection of sockets is respected at TC but not at XDP.
> >>>
> >>> FWIW we're already out of parameters for the actual call, so if we
> >>> wanted to allow passing a hash in, we'd need to either dedicate
> >>> half the 'flags' field for this configurable hash, or consider
> >>> adding the new hash parameter to 'struct bpf_sock_tuple'.
> >>>
> >>> +Martin for any thoughts on SO_REUSEPORT and XDP here.
> >> The XDP/TC prog has read access to the sk fields through
> >> 'struct bpf_sock'?
> >>
> >> A quick thought...
> >> Considering all sk in the same reuse->socks[] share
> >> many things (e.g. family,type,protocol,ip,port..etc are the same),
> >> I wonder returning which particular sk from reuse->socks[] will
> >> matter too much since most of the fields from 'struct bpf_sock'
> >> will be the same. Some of fields in 'struct bpf_sock' could be
> >> different though, like priority? Hence, another possibility is to
> >> limit the accessible fields for the XDP prog. Only allow
> >> accessing the fields that must be the same among the sk in the
> >> same reuse->socks[].
> >
> > This sounds pretty reasonable to me.
>
> Agree, and in any case this difference in returned sk selection should
> probably also be documented in the uapi helper description.

Okay, will do in a v2.

Thanks
Nitin
Re: [bpf-next v3 0/2] Fix kcm + sockmap by checking psock type
On 10/20/2018 12:51 AM, Daniel Borkmann wrote:
> On 10/18/2018 10:58 PM, John Fastabend wrote:
>> We check if the sk_user_data (the psock in skmsg) is in fact a sockmap
>> type too late, after we read the refcnt, which is an error. This
>> series moves the check up before reading refcnt and also adds a test
>> to test_maps to test trying to add a KCM socket into a sockmap.
>>
>> While reviewing this code I also found an issue with KCM and kTLS
>> where each uses sk_data_ready hooks and associated stream parser
>> breaking expectations in kcm, ktls or both. But that fix will need
>> to go to net.
>>
>> Thanks to Eric for reporting.
>>
>> v2: Fix up file +/- my scripts lost track of them
>> v3: return EBUSY if refcnt is zero
>>
>> John Fastabend (2):
>>   bpf: skmsg, fix psock create on existing kcm/tls port
>>   bpf: test_maps add a test to catch kcm + sockmap
>>
>>  include/linux/skmsg.h                     | 25 +---
>>  net/core/sock_map.c                       | 11 +++---
>>  tools/testing/selftests/bpf/Makefile      |  2 +-
>>  tools/testing/selftests/bpf/sockmap_kcm.c | 14 +++
>>  tools/testing/selftests/bpf/test_maps.c   | 64 ++-
>>  5 files changed, 103 insertions(+), 13 deletions(-)
>>  create mode 100644 tools/testing/selftests/bpf/sockmap_kcm.c
>
> Applied, thanks!

Fyi, I've only applied patch 1/2 for now to get the bug fixed. The patch 2/2 throws a bunch of warnings that look like the below. Also, I think we leak the kcm socket in error paths and once we're done with testing, so it would be good to close it once unneeded. Please respin the test as a stand-alone commit, thanks:

[...]
bpf-next/tools/testing/selftests/bpf/libbpf.a -lcap -lelf -lrt -lpthread -o /home/darkstar/trees/bpf-next-ok/tools/testing/selftests/bpf/test_maps
test_maps.c: In function ‘test_sockmap’:
test_maps.c:869:0: warning: "AF_KCM" redefined
 #define AF_KCM 41

In file included from /usr/include/sys/socket.h:38:0,
                 from test_maps.c:21:
/usr/include/bits/socket.h:133:0: note: this is the location of the previous definition
 #define AF_KCM PF_KCM
Re: [bpf-next v3 0/2] Fix kcm + sockmap by checking psock type
On 10/18/2018 10:58 PM, John Fastabend wrote:
> We check if the sk_user_data (the psock in skmsg) is in fact a sockmap
> type too late, after we read the refcnt, which is an error. This
> series moves the check up before reading refcnt and also adds a test
> to test_maps to test trying to add a KCM socket into a sockmap.
>
> While reviewing this code I also found an issue with KCM and kTLS
> where each uses sk_data_ready hooks and associated stream parser,
> breaking expectations in kcm, ktls or both. But that fix will need
> to go to net.
>
> Thanks to Eric for reporting.
>
> v2: Fix up file +/- my scripts lost track of them
> v3: return EBUSY if refcnt is zero
>
> John Fastabend (2):
>   bpf: skmsg, fix psock create on existing kcm/tls port
>   bpf: test_maps add a test to catch kcm + sockmap
>
>  include/linux/skmsg.h                     | 25 +---
>  net/core/sock_map.c                       | 11 +++---
>  tools/testing/selftests/bpf/Makefile      |  2 +-
>  tools/testing/selftests/bpf/sockmap_kcm.c | 14 +++
>  tools/testing/selftests/bpf/test_maps.c   | 64 ++-
>  5 files changed, 103 insertions(+), 13 deletions(-)
>  create mode 100644 tools/testing/selftests/bpf/sockmap_kcm.c

Applied, thanks!
Re: Fw: [Bug 201423] New: eth0: hw csum failure
On 10/19/2018 02:58 PM, Eric Dumazet wrote:
>
>
> On 10/16/2018 06:00 AM, Eric Dumazet wrote:
>> On Mon, Oct 15, 2018 at 11:30 PM Andre Tomt wrote:
>>>
>>> On 15.10.2018 17:41, Eric Dumazet wrote:
>>>> On Mon, Oct 15, 2018 at 8:15 AM Stephen Hemminger
>>>>> Something is changed between 4.17.12 and 4.18, after bisecting the
>>>>> problem I got the following first bad commit:
>>>>>
>>>>> commit 88078d98d1bb085d72af8437707279e203524fa5
>>>>> Author: Eric Dumazet
>>>>> Date: Wed Apr 18 11:43:15 2018 -0700
>>>>>
>>>>>     net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends
>>>>>
>>>>>     After working on IP defragmentation lately, I found that some large
>>>>>     packets defeat CHECKSUM_COMPLETE optimization because of NIC adding
>>>>>     zero paddings on the last (small) fragment.
>>>>>
>>>>>     While removing the padding with pskb_trim_rcsum(), we set skb->ip_summed
>>>>>     to CHECKSUM_NONE, forcing a full csum validation, even if all prior
>>>>>     fragments had CHECKSUM_COMPLETE set.
>>>>>
>>>>>     We can instead compute the checksum of the part we are trimming,
>>>>>     usually smaller than the part we keep.
>>>>>
>>>>>     Signed-off-by: Eric Dumazet
>>>>>     Signed-off-by: David S. Miller
>>>>
>>>> Thanks for bisecting !
>>>>
>>>> This commit is known to expose some NIC/driver bugs.
>>>>
>>>> Look at commit 12b03558cef6d655d0d394f5e98a6fd07c1f6c0f
>>>> ("net: sungem: fix rx checksum support") for one driver needing a fix.
>>>>
>>>> I assume SKY2_HW_NEW_LE is not set on your NIC ?
>>>>
>>> I've seen similar on several systems with mlx4 cards when using 4.18.x -
>>> that is hw csum failure followed by some backtrace.
>>>
>>> Only seems to happen on systems dealing with quite a bit of UDP.
>>>
>> Strange, because mlx4 on IPv6+UDP should not use CHECKSUM_COMPLETE,
>> but CHECKSUM_UNNECESSARY
>>
>> It would be nice to track this a bit further, maybe by providing the
>> full packet content.
>>>
>>> Example from 4.18.10:
>>>> [635607.740574] p0xe0: hw csum failure
>>>> [635607.740598] CPU: 4 PID: 0 Comm: swapper/4 Not tainted 4.18.0-1 #1
>>>> [635607.740599] Hardware name: Supermicro Super Server/X10SRL-F, BIOS 2.0b 05/02/2017
>>>> [635607.740599] Call Trace:
>>>> [635607.740602]
>>>> [635607.740611] dump_stack+0x5c/0x7b
>>>> [635607.740617] __skb_gro_checksum_complete+0x9a/0xa0
>>>> [635607.740621] udp6_gro_receive+0x211/0x290
>>>> [635607.740624] ipv6_gro_receive+0x1a8/0x390
>>>> [635607.740627] dev_gro_receive+0x33e/0x550
>>>> [635607.740628] napi_gro_frags+0xa2/0x210
>>>> [635607.740635] mlx4_en_process_rx_cq+0xa01/0xb40 [mlx4_en]
>>>> [635607.740648] ? mlx4_cq_completion+0x23/0x70 [mlx4_core]
>>>> [635607.740654] ? mlx4_eq_int+0x373/0xc80 [mlx4_core]
>>>> [635607.740657] mlx4_en_poll_rx_cq+0x55/0xf0 [mlx4_en]
>>>> [635607.740658] net_rx_action+0xe0/0x2e0
>>>> [635607.740662] __do_softirq+0xd8/0x2e5
>>>> [635607.740666] irq_exit+0xb4/0xc0
>>>> [635607.740667] do_IRQ+0x85/0xd0
>>>> [635607.740670] common_interrupt+0xf/0xf
>>>> [635607.740671]
>>>> [635607.740675] RIP: 0010:cpuidle_enter_state+0xb4/0x2a0
>>>> [635607.740675] Code: 31 ff e8 df a6 ba ff 45 84 f6 74 17 9c 58 0f 1f 44 00 00 f6 c4 02 0f 85 d8 01 00 00 31 ff e8 13 81 bf ff fb 66 0f 1f 44 00 00 <4c> 29 fb 48 ba cf f7 53 e3 a5 9b c4 20 48 89 d8 48 c1 fb 3f 48 f7
>>>> [635607.740701] RSP: 0018:a5c206353ea8 EFLAGS: 0246 ORIG_RAX: ffd9
>>>> [635607.740703] RAX: 8d72ffd20f00 RBX: 00024214f597c5b0 RCX: 001f
>>>> [635607.740703] RDX: 00024214f597c5b0 RSI: 00020780 RDI:
>>>> [635607.740704] RBP: 0004 R08: 002542bfbefa99fa R09:
>>>> [635607.740705] R10: a5c206353e88 R11: 00c5 R12: af0aaf78
>>>> [635607.740706] R13: 8d72ffd297d8 R14: R15: 00024214f58c2ed5
>>>> [635607.740709] ? cpuidle_enter_state+0x91/0x2a0
>>>> [635607.740712] do_idle+0x1d0/0x240
>>>> [635607.740715] cpu_startup_entry+0x5f/0x70
>>>> [635607.740719] start_secondary+0x185/0x1a0
>>>> [635607.740722] secondary_startup_64+0xa5/0xb0
>>>> [635607.740731] p0xe0: hw csum failure
>>>> [635607.740745] CPU: 4 PID: 0 Comm: swapper/4 Not tainted 4.18.0-1 #1
>>>> [635607.740746] Hardware name: Supermicro Super Server/X10SRL-F, BIOS 2.0b 05/02/2017
>>>> [635607.740746] Call Trace:
>>>> [635607.740747]
>>>> [635607.740750] dump_stack+0x5c/0x7b
>>>> [635607.740755] __skb_checksum_complete+0xb8/0xd0
>>>> [635607.740760] __udp6_lib_rcv+0xa6b/0xa70
>>>> [635607.740767] ? nft_do_chain_inet+0x7a/0xd0 [nf_tables]
>>>> [635607.740770] ? nft_do_chain_inet+0x7a/0xd0 [nf_tables]
>>>> [635607.740774] ip6_input_finish+0xc0/0x460
>>>> [635607.740776] ip6_input+0x2b/0x90
>>>> [635607.740778] ? ip6_rcv_finish+0x110/0x110
>>>> [635607.740780] ipv6_rcv+0x2cd/0x4b0
>>>> [635607.740783] ? udp6_lib_lookup_skb+0x59/0x80
>>>> [635607.740785]
Re: Fw: [Bug 201423] New: eth0: hw csum failure
On 10/16/2018 06:00 AM, Eric Dumazet wrote:
> On Mon, Oct 15, 2018 at 11:30 PM Andre Tomt wrote:
>>
>> On 15.10.2018 17:41, Eric Dumazet wrote:
>>> On Mon, Oct 15, 2018 at 8:15 AM Stephen Hemminger
>>>> Something is changed between 4.17.12 and 4.18, after bisecting the
>>>> problem I got the following first bad commit:
>>>>
>>>> commit 88078d98d1bb085d72af8437707279e203524fa5
>>>> Author: Eric Dumazet
>>>> Date: Wed Apr 18 11:43:15 2018 -0700
>>>>
>>>>     net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends
>>>>
>>>>     After working on IP defragmentation lately, I found that some large
>>>>     packets defeat CHECKSUM_COMPLETE optimization because of NIC adding
>>>>     zero paddings on the last (small) fragment.
>>>>
>>>>     While removing the padding with pskb_trim_rcsum(), we set skb->ip_summed
>>>>     to CHECKSUM_NONE, forcing a full csum validation, even if all prior
>>>>     fragments had CHECKSUM_COMPLETE set.
>>>>
>>>>     We can instead compute the checksum of the part we are trimming,
>>>>     usually smaller than the part we keep.
>>>>
>>>>     Signed-off-by: Eric Dumazet
>>>>     Signed-off-by: David S. Miller
>>>
>>> Thanks for bisecting !
>>>
>>> This commit is known to expose some NIC/driver bugs.
>>>
>>> Look at commit 12b03558cef6d655d0d394f5e98a6fd07c1f6c0f
>>> ("net: sungem: fix rx checksum support") for one driver needing a fix.
>>>
>>> I assume SKY2_HW_NEW_LE is not set on your NIC ?
>>>
>> I've seen similar on several systems with mlx4 cards when using 4.18.x -
>> that is hw csum failure followed by some backtrace.
>>
>> Only seems to happen on systems dealing with quite a bit of UDP.
>>
> Strange, because mlx4 on IPv6+UDP should not use CHECKSUM_COMPLETE,
> but CHECKSUM_UNNECESSARY
>
> It would be nice to track this a bit further, maybe by providing the
> full packet content.
>>
>> Example from 4.18.10:
>>> [635607.740574] p0xe0: hw csum failure
>>> [635607.740598] CPU: 4 PID: 0 Comm: swapper/4 Not tainted 4.18.0-1 #1
>>> [635607.740599] Hardware name: Supermicro Super Server/X10SRL-F, BIOS 2.0b 05/02/2017
>>> [635607.740599] Call Trace:
>>> [635607.740602]
>>> [635607.740611] dump_stack+0x5c/0x7b
>>> [635607.740617] __skb_gro_checksum_complete+0x9a/0xa0
>>> [635607.740621] udp6_gro_receive+0x211/0x290
>>> [635607.740624] ipv6_gro_receive+0x1a8/0x390
>>> [635607.740627] dev_gro_receive+0x33e/0x550
>>> [635607.740628] napi_gro_frags+0xa2/0x210
>>> [635607.740635] mlx4_en_process_rx_cq+0xa01/0xb40 [mlx4_en]
>>> [635607.740648] ? mlx4_cq_completion+0x23/0x70 [mlx4_core]
>>> [635607.740654] ? mlx4_eq_int+0x373/0xc80 [mlx4_core]
>>> [635607.740657] mlx4_en_poll_rx_cq+0x55/0xf0 [mlx4_en]
>>> [635607.740658] net_rx_action+0xe0/0x2e0
>>> [635607.740662] __do_softirq+0xd8/0x2e5
>>> [635607.740666] irq_exit+0xb4/0xc0
>>> [635607.740667] do_IRQ+0x85/0xd0
>>> [635607.740670] common_interrupt+0xf/0xf
>>> [635607.740671]
>>> [635607.740675] RIP: 0010:cpuidle_enter_state+0xb4/0x2a0
>>> [635607.740675] Code: 31 ff e8 df a6 ba ff 45 84 f6 74 17 9c 58 0f 1f 44 00 00 f6 c4 02 0f 85 d8 01 00 00 31 ff e8 13 81 bf ff fb 66 0f 1f 44 00 00 <4c> 29 fb 48 ba cf f7 53 e3 a5 9b c4 20 48 89 d8 48 c1 fb 3f 48 f7
>>> [635607.740701] RSP: 0018:a5c206353ea8 EFLAGS: 0246 ORIG_RAX: ffd9
>>> [635607.740703] RAX: 8d72ffd20f00 RBX: 00024214f597c5b0 RCX: 001f
>>> [635607.740703] RDX: 00024214f597c5b0 RSI: 00020780 RDI:
>>> [635607.740704] RBP: 0004 R08: 002542bfbefa99fa R09:
>>> [635607.740705] R10: a5c206353e88 R11: 00c5 R12: af0aaf78
>>> [635607.740706] R13: 8d72ffd297d8 R14: R15: 00024214f58c2ed5
>>> [635607.740709] ? cpuidle_enter_state+0x91/0x2a0
>>> [635607.740712] do_idle+0x1d0/0x240
>>> [635607.740715] cpu_startup_entry+0x5f/0x70
>>> [635607.740719] start_secondary+0x185/0x1a0
>>> [635607.740722] secondary_startup_64+0xa5/0xb0
>>> [635607.740731] p0xe0: hw csum failure
>>> [635607.740745] CPU: 4 PID: 0 Comm: swapper/4 Not tainted 4.18.0-1 #1
>>> [635607.740746] Hardware name: Supermicro Super Server/X10SRL-F, BIOS 2.0b 05/02/2017
>>> [635607.740746] Call Trace:
>>> [635607.740747]
>>> [635607.740750] dump_stack+0x5c/0x7b
>>> [635607.740755] __skb_checksum_complete+0xb8/0xd0
>>> [635607.740760] __udp6_lib_rcv+0xa6b/0xa70
>>> [635607.740767] ? nft_do_chain_inet+0x7a/0xd0 [nf_tables]
>>> [635607.740770] ? nft_do_chain_inet+0x7a/0xd0 [nf_tables]
>>> [635607.740774] ip6_input_finish+0xc0/0x460
>>> [635607.740776] ip6_input+0x2b/0x90
>>> [635607.740778] ? ip6_rcv_finish+0x110/0x110
>>> [635607.740780] ipv6_rcv+0x2cd/0x4b0
>>> [635607.740783] ? udp6_lib_lookup_skb+0x59/0x80
>>> [635607.740785] __netif_receive_skb_core+0x455/0xb30
>>> [635607.740788] ? ipv6_gro_receive+0x1a8/0x390
>>> [635607.740790] ? netif_receive_skb_internal+0x24/0xb0
>>> [635607
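The bisected commit's idea, subtracting the checksum of the trimmed padding instead of throwing skb->csum away, is plain ones'-complement arithmetic. A user-space sketch of that arithmetic follows; the helper names are illustrative and do not match the kernel's csum_sub()/skb_checksum() signatures:

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Fold a 32-bit accumulator into the 16-bit ones'-complement sum used
 * by the Internet checksum. */
static uint16_t csum_fold(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

/* Sum a byte range as big-endian 16-bit words (odd tail zero-padded). */
static uint32_t csum_partial(const uint8_t *buf, size_t len, uint32_t sum)
{
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += (uint32_t)buf[i] << 8 | buf[i + 1];
	if (len & 1)
		sum += (uint32_t)buf[len - 1] << 8;
	return sum;
}

/* Remove a partial checksum from a larger one: instead of dropping to
 * CHECKSUM_NONE after trimming padding, subtract the checksum of the
 * trimmed tail. In ones'-complement arithmetic, a - b == a + ~b. */
static uint16_t csum_sub16(uint16_t full, uint16_t part)
{
	return csum_fold((uint32_t)full + (uint16_t)~part);
}
```

For a packet split into an even-length head plus a trimmed tail, subtracting the tail's sum from the whole-packet sum recovers the head's sum, which is exactly what lets the fragments keep CHECKSUM_COMPLETE.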
Re: [PATCH v2 net] r8169: fix NAPI handling under high load
Eric Dumazet :
> On 10/18/2018 03:59 PM, Francois Romieu wrote:
>> Eric Dumazet :
>> [...]
>>> One has to wonder why rtl8169_poll(), which might be called in a loop
>>> under DOS, has to call rtl_ack_events() ?
>>
>> So as to cover a wider temporal range before any event can trigger an
>> extra irq. I was more worried about irq cost than about IO cost (and
>> I still am).
>
> Normally the IRQ would not be enabled under DOS.

Yes. My concern was not the DOS situation when NAPI runs at full speed.
As far as I was able to experiment with it, the driver did not seem too
bad here.

The location of the ack targets the interim situation where the IRQ
rate can increase before NAPI kicks in. By increasing the time range
whose events can be acked, the maximum irq rate should be lowered.

-- 
Ueimor
Re: [PATCH bpf-next v2 02/13] bpf: btf: Add BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO
On 19/10/18 20:36, Martin Lau wrote:
> On Fri, Oct 19, 2018 at 06:04:11PM +0100, Edward Cree wrote:
>> But you *do* have such a new section.
>> The patch comment talks about a 'FuncInfo Table' which appears to
> Note that the new section, which contains the FuncInfo Table,
> is in a new ELF section ".BTF.ext" instead of the ".BTF".
> It is not in the ".BTF" section because it is only useful during
> bpf_prog_load().
I thought it was because it needed to be munged by the loader/linker?

> IIUC, I think what you are suggesting here is to use (type_id, name)
> to describe DW_TAG_subprogram "int foo1(int) {}", "int foo2(int) {}",
> "int foo3(int) {}" where type_id here is referring to the same
> DW_TAG_subroutine_type, and only define that _one_
> DW_TAG_subroutine_type in the BTF "type" section.
Yes, something like that.

> If the concern is having both FUNC and FUNC_PROTO is confusing,
The concern is that you're conflating different entities (types and
instances); FUNC_PROTO is just a symptom/canary of that.

> we could go back to the CTF way which adds a new function section
> in ".BTF" and it is only for DW_TAG_subprogram.
> BTF_KIND_FUNC_PROTO is then no longer necessary.
> Some of new BTF verifier checkings may actually go away also.
> The down side is there will be two id spaces.
Two id spaces... one for types and the other for subprograms. These are
different things, so why would you _want_ them to share an id space?
I don't, for instance, see any situation in which you'd want some other
record to have a field that could reference either.

And the 'subprogram id' doesn't have to be just for subprograms; it
could be for instances generally — like I've been saying, a variable
declaration is to an object type what a subprogram is to a function
type, just with a few complications like "subprograms can only appear
at file scope, not nested in other functions" and "variables of
function type are immutable".

(I'm assuming that at some point we're going to want to be able to have
BTF information for e.g. variables stored on a subprogram's stack, if
only for stuff like single-stepping in a debugger in userspace with
some sort of mock. At that point, the variable has to have its own
record — you can't just have some sort of magic type record because
e.g. "struct foo bar;" has two names, one for the type and one for the
variable.)

> Discussed a bit offline with folks about the two id spaces
> situation and it is not good for debugging purpose.
Could you unpack this a bit more?

-Ed
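The type-versus-instance split argued for above can be sketched as two tables: a deduplicated type table, and a per-subprogram info table carrying (name, type_id) pairs. This is an illustrative toy model only, not the real BTF or ".BTF.ext" encoding:

```c
#include <assert.h>
#include <string.h>

/* Toy model: one function *type* record ("int (int)"), shared by many
 * function *instances*, which live in a separate table and carry their
 * own names. All structures here are invented for illustration. */
struct toy_func_type {		/* deduplicated prototype */
	const char *ret;
	const char *arg;
};

struct toy_func_info {		/* one entry per subprogram */
	const char *name;
	unsigned int type_id;	/* index into the type table */
};

static const struct toy_func_type types[] = {
	{ "int", "int" },	/* type_id 0: int (int) */
};

static const struct toy_func_info funcs[] = {
	{ "foo1", 0 },
	{ "foo2", 0 },
	{ "foo3", 0 },		/* all three share one prototype record */
};

static int same_proto(const struct toy_func_info *a,
		      const struct toy_func_info *b)
{
	return a->type_id == b->type_id;
}

static const char *proto_ret(const struct toy_func_info *f)
{
	return types[f->type_id].ret;
}
```

The point of the model: foo1/foo2/foo3 resolve to a single prototype entry, while their names live only in the instance table, so types and instances never need to share an id space.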
[no subject]
From: David Howells
Date: Fri, 19 Oct 2018 21:51:35 +0100

> David Miller wrote:
>
>> > Is there going to be a merge of net into net-next before the merge
>> > window opens? Or do you have a sample merge that I can rebase my
>> > afs-next branch on?
>>
>> I'll be doing a net to net-next merge some time today.
>
> Excellent, thanks!

And this is now complete.
[no subject]
David Miller wrote:
> > Is there going to be a merge of net into net-next before the merge
> > window opens? Or do you have a sample merge that I can rebase my
> > afs-next branch on?
>
> I'll be doing a net to net-next merge some time today.

Excellent, thanks!

David
RE: [PATCH net-next v2] netpoll: allow cleanup to be synchronous
> From: Neil Horman > I presume you've tested this with some of the stacked devices? I think I'm > ok with this change, but I'd like confirmation that its worked. > > Neil Yes I've tested this on a bond device with vlan stacked on top. -Deb > > > CC: Neil Horman > > CC: "David S. Miller" > > Signed-off-by: Debabrata Banerjee > > --- > > drivers/net/bonding/bond_main.c | 3 ++- > > drivers/net/macvlan.c | 2 +- > > drivers/net/team/team.c | 5 + > > include/linux/netpoll.h | 4 +--- > > net/8021q/vlan_dev.c| 3 +-- > > net/bridge/br_device.c | 2 +- > > net/core/netpoll.c | 20 +--- > > net/dsa/slave.c | 2 +- > > 8 files changed, 13 insertions(+), 28 deletions(-) > > > > diff --git a/drivers/net/bonding/bond_main.c > > b/drivers/net/bonding/bond_main.c index ee28ec9e0aba..ffa37adb7681 > > 100644 > > --- a/drivers/net/bonding/bond_main.c > > +++ b/drivers/net/bonding/bond_main.c > > @@ -963,7 +963,8 @@ static inline void slave_disable_netpoll(struct slave > *slave) > > return; > > > > slave->np = NULL; > > - __netpoll_free_async(np); > > + > > + __netpoll_free(np); > > } > > > > static void bond_poll_controller(struct net_device *bond_dev) diff > > --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index > > cfda146f3b3b..fc8d5f1ee1ad 100644 > > --- a/drivers/net/macvlan.c > > +++ b/drivers/net/macvlan.c > > @@ -1077,7 +1077,7 @@ static void macvlan_dev_netpoll_cleanup(struct > > net_device *dev) > > > > vlan->netpoll = NULL; > > > > - __netpoll_free_async(netpoll); > > + __netpoll_free(netpoll); > > } > > #endif /* CONFIG_NET_POLL_CONTROLLER */ > > > > diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index > > d887016e54b6..db633ae9f784 100644 > > --- a/drivers/net/team/team.c > > +++ b/drivers/net/team/team.c > > @@ -1104,10 +1104,7 @@ static void team_port_disable_netpoll(struct > team_port *port) > > return; > > port->np = NULL; > > > > - /* Wait for transmitting packets to finish before freeing. 
*/ > > - synchronize_rcu_bh(); > > - __netpoll_cleanup(np); > > - kfree(np); > > + __netpoll_free(np); > > } > > #else > > static int team_port_enable_netpoll(struct team_port *port) diff > > --git a/include/linux/netpoll.h b/include/linux/netpoll.h index > > 3ef82d3a78db..676f1ff161a9 100644 > > --- a/include/linux/netpoll.h > > +++ b/include/linux/netpoll.h > > @@ -31,8 +31,6 @@ struct netpoll { > > bool ipv6; > > u16 local_port, remote_port; > > u8 remote_mac[ETH_ALEN]; > > - > > - struct work_struct cleanup_work; > > }; > > > > struct netpoll_info { > > @@ -63,7 +61,7 @@ int netpoll_parse_options(struct netpoll *np, char > > *opt); int __netpoll_setup(struct netpoll *np, struct net_device > > *ndev); int netpoll_setup(struct netpoll *np); void > > __netpoll_cleanup(struct netpoll *np); -void > > __netpoll_free_async(struct netpoll *np); > > +void __netpoll_free(struct netpoll *np); > > void netpoll_cleanup(struct netpoll *np); void > > netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, > > struct net_device *dev); > > diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index > > 546af0e73ac3..ff720f1ebf73 100644 > > --- a/net/8021q/vlan_dev.c > > +++ b/net/8021q/vlan_dev.c > > @@ -756,8 +756,7 @@ static void vlan_dev_netpoll_cleanup(struct > net_device *dev) > > return; > > > > vlan->netpoll = NULL; > > - > > - __netpoll_free_async(netpoll); > > + __netpoll_free(netpoll); > > } > > #endif /* CONFIG_NET_POLL_CONTROLLER */ > > > > diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index > > e053a4e43758..c6abf927f0c9 100644 > > --- a/net/bridge/br_device.c > > +++ b/net/bridge/br_device.c > > @@ -344,7 +344,7 @@ void br_netpoll_disable(struct net_bridge_port *p) > > > > p->np = NULL; > > > > - __netpoll_free_async(np); > > + __netpoll_free(np); > > } > > > > #endif > > diff --git a/net/core/netpoll.c b/net/core/netpoll.c index > > de1d1ba92f2d..6ac71624ead4 100644 > > --- a/net/core/netpoll.c > > +++ b/net/core/netpoll.c > > @@ -591,7 
+591,6 @@ int __netpoll_setup(struct netpoll *np, struct > > net_device *ndev) > > > > np->dev = ndev; > > strlcpy(np->dev_name, ndev->name, IFNAMSIZ); > > - INIT_WORK(&np->cleanup_work, netpoll_async_cleanup); > > > > if (ndev->priv_flags & IFF_DISABLE_NETPOLL) { > > np_err(np, "%s doesn't support polling, aborting\n", @@ - > 790,10 > > +789,6 @@ void __netpoll_cleanup(struct netpoll *np) { > > struct netpoll_info *npinfo; > > > > - /* rtnl_dereference would be preferable here but > > -* rcu_cleanup_netpoll path can put us in here safely without > > -* holding the rtnl, so plain rcu_dereference it is > > -*/ > > npinfo = rtnl_dereference(np->dev->npinfo); > > if (!npinfo) > > return; > > @@ -814,21 +809,16 @@ void _
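The pattern the patch generalizes, free synchronously because every caller already holds rtnl, can be sketched in user space with a flag standing in for the lock. All names here are illustrative stand-ins, not the kernel API:

```c
#include <assert.h>
#include <stdlib.h>

/* When every caller already holds the (rtnl-like) configuration lock,
 * cleanup can run inline instead of bouncing through an async work
 * item whose completion races with the next reconfiguration. */
static int cfg_locked;			/* stand-in for "rtnl is held" */

struct toy_np {
	int active;
};

static void toy_cleanup(struct toy_np *np)
{
	np->active = 0;			/* stand-in for __netpoll_cleanup() */
}

/* Synchronous replacement for the old async free: detach, clean up,
 * and free while the caller holds the lock, so no cleanup work can
 * still be pending when a new netpoll is configured. */
static void toy_free(struct toy_np **slot)
{
	struct toy_np *np = *slot;

	assert(cfg_locked);		/* callers must hold the lock */
	if (!np)
		return;
	*slot = NULL;			/* detach first, as the drivers do */
	toy_cleanup(np);
	free(np);
}
```

Because the slot is NULLed and the object freed before the lock is dropped, a subsequent "enable 1" style reconfiguration can never observe a half-torn-down instance, which is the race the commit message describes.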
Re: [PATCH bpf-next v2 0/2] improve and fix barriers for walking perf ring buffer
On Fri, Oct 19, 2018 at 03:51:01PM +0200, Daniel Borkmann wrote:
> This set first adds smp_* barrier variants to tools infrastructure
> and updates perf and libbpf to make use of them. For details, please
> see individual patches, thanks!
>
> Arnaldo, if there are no objections, could this be routed via bpf-next
> with Acked-by's due to later dependencies in libbpf? Alternatively,
> I could also get the 2nd patch out during merge window, but perhaps
> it's okay to do in one go as there shouldn't be much conflict in perf
> itself.
>
> Thanks!
>
> v1 -> v2:
>  - add common helper and switch to acquire/release variants
>    when possible, thanks Peter!

Applied, Thanks
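The acquire/release pairing the cover letter refers to can be sketched with C11 atomics. The real perf ring buffer differs (data_tail handshake, variable-size records, kernel-side smp_store_release()/smp_load_acquire()), so this is only a model of the barrier pairing:

```c
#include <assert.h>
#include <stdatomic.h>
#include <stdint.h>

/* Minimal single-producer/single-consumer ring: the consumer's
 * acquire-load of head pairs with the producer's release-store, so a
 * record written before the head update is visible once the updated
 * head is observed; the tail works the same way in reverse. */
#define RING_SLOTS 8

struct ring {
	_Atomic uint64_t head;	/* written by producer */
	_Atomic uint64_t tail;	/* written by consumer */
	int slot[RING_SLOTS];
};

static int ring_push(struct ring *r, int v)
{
	uint64_t head = atomic_load_explicit(&r->head, memory_order_relaxed);
	uint64_t tail = atomic_load_explicit(&r->tail, memory_order_acquire);

	if (head - tail == RING_SLOTS)
		return -1;			/* full */
	r->slot[head % RING_SLOTS] = v;		/* write record first... */
	atomic_store_explicit(&r->head, head + 1, memory_order_release);
	return 0;				/* ...then publish head */
}

static int ring_pop(struct ring *r, int *v)
{
	uint64_t tail = atomic_load_explicit(&r->tail, memory_order_relaxed);
	uint64_t head = atomic_load_explicit(&r->head, memory_order_acquire);

	if (head == tail)
		return -1;			/* empty */
	*v = r->slot[tail % RING_SLOTS];	/* read record first... */
	atomic_store_explicit(&r->tail, tail + 1, memory_order_release);
	return 0;				/* ...then free the slot */
}
```

The design point is the ordering, not the queue itself: a relaxed load of one's own index is fine, but the peer's index must be acquired, and one's own index must be released only after the data access completes.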
[PATCH iproute2-next] Tree wide: Drop sockaddr_nl arg
From: David Ahern No command, filter, or print function uses the sockaddr_nl arg, so just drop it. Signed-off-by: David Ahern --- bridge/br_common.h | 9 +++-- bridge/fdb.c | 2 +- bridge/link.c| 3 +-- bridge/mdb.c | 2 +- bridge/monitor.c | 9 - bridge/vlan.c| 12 +++- genl/ctrl.c | 10 -- genl/genl.c | 3 +-- genl/genl_utils.h| 3 +-- include/libnetlink.h | 6 ++ include/ll_map.h | 3 +-- ip/ip_common.h | 36 ip/ipaddress.c | 26 ++ ip/ipaddrlabel.c | 4 ++-- ip/ipfou.c | 3 +-- ip/ipila.c | 3 +-- ip/ipl2tp.c | 6 ++ ip/iplink.c | 9 +++-- ip/iplink_bridge.c | 3 +-- ip/ipmacsec.c| 3 +-- ip/ipmonitor.c | 25 - ip/ipmroute.c| 2 +- ip/ipneigh.c | 2 +- ip/ipnetconf.c | 8 +++- ip/ipnetns.c | 5 ++--- ip/ipntable.c| 3 +-- ip/ipprefix.c| 2 +- ip/iproute.c | 17 +++-- ip/iprule.c | 11 --- ip/ipseg6.c | 5 ++--- ip/iptoken.c | 2 +- ip/iptuntap.c| 3 +-- ip/rtmon.c | 7 +++ ip/tcp_metrics.c | 5 ++--- ip/tunnel.c | 3 +-- ip/xfrm.h| 6 ++ ip/xfrm_monitor.c| 37 +++-- ip/xfrm_policy.c | 9 +++-- ip/xfrm_state.c | 11 --- lib/libnetlink.c | 7 +++ lib/ll_map.c | 3 +-- misc/ifstat.c| 6 ++ misc/ss.c| 31 --- tc/m_action.c| 6 ++ tc/tc_class.c| 3 +-- tc/tc_common.h | 8 tc/tc_filter.c | 4 ++-- tc/tc_monitor.c | 11 +-- tc/tc_qdisc.c| 6 ++ 49 files changed, 155 insertions(+), 248 deletions(-) diff --git a/bridge/br_common.h b/bridge/br_common.h index 00a4e9ea125d..23d653df931d 100644 --- a/bridge/br_common.h +++ b/bridge/br_common.h @@ -7,12 +7,9 @@ ((struct rtattr *)(((char *)(r)) + RTA_ALIGN(sizeof(__u32 void print_vlan_info(struct rtattr *tb, int ifindex); -int print_linkinfo(const struct sockaddr_nl *who, - struct nlmsghdr *n, void *arg); -int print_fdb(const struct sockaddr_nl *who, -struct nlmsghdr *n, void *arg); -int print_mdb(const struct sockaddr_nl *who, -struct nlmsghdr *n, void *arg); +int print_linkinfo(struct nlmsghdr *n, void *arg); +int print_fdb(struct nlmsghdr *n, void *arg); +int print_mdb(struct nlmsghdr *n, void *arg); int do_fdb(int argc, char **argv); int do_mdb(int argc, char **argv); 
diff --git a/bridge/fdb.c b/bridge/fdb.c index 828fdab264cb..d759f7ec12e2 100644 --- a/bridge/fdb.c +++ b/bridge/fdb.c @@ -126,7 +126,7 @@ static void fdb_print_stats(FILE *fp, const struct nda_cacheinfo *ci) } } -int print_fdb(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg) +int print_fdb(struct nlmsghdr *n, void *arg) { FILE *fp = arg; struct ndmsg *r = NLMSG_DATA(n); diff --git a/bridge/link.c b/bridge/link.c index 4a14845da591..3290c16f0951 100644 --- a/bridge/link.c +++ b/bridge/link.c @@ -190,8 +190,7 @@ static void print_af_spec(struct rtattr *attr, int ifindex) print_vlan_info(aftb[IFLA_BRIDGE_VLAN_INFO], ifindex); } -int print_linkinfo(const struct sockaddr_nl *who, - struct nlmsghdr *n, void *arg) +int print_linkinfo(struct nlmsghdr *n, void *arg) { FILE *fp = arg; struct ifinfomsg *ifi = NLMSG_DATA(n); diff --git a/bridge/mdb.c b/bridge/mdb.c index 03fcc91f0219..855a6a4552c7 100644 --- a/bridge/mdb.c +++ b/bridge/mdb.c @@ -225,7 +225,7 @@ static void print_router_entries(FILE *fp, struct nlmsghdr *n, close_json_array(PRINT_JSON, NULL); } -int print_mdb(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg) +int print_mdb(struct nlmsghdr *n, void *arg) { FILE *fp = arg; struct br_port_msg *r = NLMSG_DATA(n); diff --git a/bridge/monitor.c b/bridge/monitor.c index d294269e1092..82bc6b407a06 100644 --- a/bridge/monitor.c +++ b/bridge/monitor.c @@ -35,8 +35,7 @@ static void usage(void) exit(-1); } -static int accept_msg(const struct sockaddr_nl *who, - struct rtnl_ctrl_data *ctrl, +static int accept_msg(struct rtnl_ctrl_data *ctrl, struct nlmsghdr *n, void *arg) { FILE *fp = arg; @@ -50,19 +49,19 @@ static int accept_msg(const struct sockaddr_nl *who, if (prefix_banner) fprintf(fp, "[LINK]"); - return print_linkinfo(who, n, arg); + return print_linkinfo(n, arg); case RTM_NEWNEIGH: case RTM_DELNEIGH: if (prefix_banner) fprintf(fp, "[NEIGH]"); - r
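The shape of the change can be sketched with a stand-in callback type: every print/filter function now takes only (nlmsghdr, arg). The stub struct below is illustrative; real code uses struct nlmsghdr from <linux/netlink.h> and libnetlink's dispatch helpers:

```c
#include <assert.h>
#include <stddef.h>

/* Minimal stand-in for the netlink message header, just enough to
 * model the simplified callback signature after the patch. */
struct nlmsghdr_stub {
	unsigned int nlmsg_len;
	unsigned short nlmsg_type;
};

/* New-style filter: no sockaddr_nl threaded through anymore. */
typedef int (*rtnl_filter)(struct nlmsghdr_stub *n, void *arg);

static int count_msgs(struct nlmsghdr_stub *n, void *arg)
{
	int *count = arg;

	(void)n;		/* a real filter would decode the message */
	(*count)++;
	return 0;
}

static int dispatch(rtnl_filter fn, struct nlmsghdr_stub *n, void *arg)
{
	return fn(n, arg);	/* caller no longer supplies a sockaddr_nl */
}
```

Dropping the unused parameter touches every callback in the tree at once, which is why the diffstat above spans bridge, ip, tc, and misc, but each conversion is mechanical.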
Re: [PATCH net v2] net/sched: act_gact: properly init 'goto chain'
On Thu, Oct 18, 2018 at 8:30 AM Davide Caratti wrote:
> The alternative is, we systematically forbid usage of 'goto chain' in
> tcfg_paction, so that:
>
> # tc f a dev v0 egress matchall action random determ goto chain 4 5
>
> is systematically rejected with -EINVAL. This command never worked, so we
> are not breaking anything in userspace.

This is exactly why I asked you if we really need to support it. :)

If no one finds it useful, disallowing it is a good solution here, as
we don't need to introduce any additional code to handle filter chains.
Re: [PATCH bpf-next v2 02/13] bpf: btf: Add BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO
On Fri, Oct 19, 2018 at 12:36:49PM -0700, Martin Lau wrote:
> On Fri, Oct 19, 2018 at 06:04:11PM +0100, Edward Cree wrote:
>> On 18/10/18 22:19, Martin Lau wrote:
>>> As I have mentioned earlier, it is also special to
>>> the kernel because the BTF verifier and bpf_prog_load()
>>> need to do different checks for FUNC and FUNC_PROTO to
>>> ensure they are sane.
>>>
>>> First, we need to agree that the kernel needs to verify
>>> them differently, and then we can proceed to discuss how
>>> to distinguish them.
>>> We picked the current way to avoid adding a new BTF
>>> function section and keep it straightforward to distinguish
>>> them w/o relying on other hints from 'struct btf_type'.
>>>
>>> Are you suggesting another way of doing it?
>> But you *do* have such a new section.
>> The patch comment talks about a 'FuncInfo Table' which appears to
> Note that the new section, which contains the FuncInfo Table,
> is in a new ELF section ".BTF.ext" instead of the ".BTF".
> It is not in the ".BTF" section because it is only useful during
> bpf_prog_load().
>
> I was meaning a new function section within ".BTF".
>
>> map (section, insn_idx) to type_id. (I think this gets added in
>> .BTF.ext per patch 9?) So when you're looking at a FUNC type
>> because you looked up a type_id from that table, you know it's
>> the signature of a subprogram, and you're checking it as such.
>> Whereas, if you were doing something with some other type and it
>> referenced a FUNC type (e.g., in the patch comment's example,
>> you're checking foo's first argument against the type bar) in
>> its type_id, you know you're using it as a formal type (a FUNC_
>> PROTO in your parlance) and not as a subprogram.
>> The context in which you are using a type entry tells you which
>> kind it is. And the verifier can and should be smart enough to
>> know what it's doing in this way.
>>
>> And it's entirely reasonable for the same type entry to get used
>> for both those cases; in my example, you'd have a FUNC type for
>> int foo(int), referenced both by the func_info entry for foo
>> and by the PTR type for bar. And if you had another subprogram
>> int baz(int), its func_info entry could reference the same
>> type_id, because the (reference to the) name of the function
>> should live in the func_info, not in the type.
> IIUC, I think what you are suggesting here is to use (type_id, name)
> to describe DW_TAG_subprogram "int foo1(int) {}", "int foo2(int) {}",
> "int foo3(int) {}" where type_id here is referring to the same
> DW_TAG_subroutine_type, and only define that _one_
> DW_TAG_subroutine_type in the BTF "type" section.
>
> That will require more manipulation/type-merging in the dwarf2btf
> process and it could get quite complicated.
>
> Note that CTF is also fully spelling out the return type
> and arg types for each DW_TAG_subprogram in a separate
> function section (still within the same ELF section).
> The only difference here is they are merged into the type
> section and FUNC_PROTO is added.
>
> If the concern is having both FUNC and FUNC_PROTO is confusing,
> we could go back to the CTF way which adds a new function section
> in ".BTF" and it is only for DW_TAG_subprogram.
> BTF_KIND_FUNC_PROTO is then no longer necessary.
> Some of the new BTF verifier checks may actually go away also.
> The down side is there will be two id spaces.

Discussed a bit offline with folks about the two id spaces
situation and it is not good for debugging purposes.

If we must get rid of FUNC_PROTO, it is better to use the
name_off == 0 check instead of adding a new function section.
We will go for this path in the next respin.

>>> What you are proposing seems to be saying "if we have this
>>> particular special btf_kind, then this BTF entry doesn't just
>>> define a type, it declares a subprogram of that type". Oh,
>>> and with the name of the type as the subprogram name. Which
>>> just creates blurry confusion as to whether BTF entries define
>>> types or declare objects; IMNSHO the correct approach is for
>>> objects to be declared elsewhere and to reference BTF types by
>>> their type_id.
>>> Which is what the func_info table in patch 9 appears to do.
>>>
>>> (It also rather bothers me the way we are using special type
>>> names to associate maps with their k/v types, rather than
>>> extending the records in the maps section to include type_ids
>>> referencing them. It's the same kind of weird implicitness,
>>> and if I'd spotted it when it was going in I'd've nacked it,
>>> but I suppose it's ABI now and too late to change.)
>>>
>>> -Ed
Re: [PATCH net-next v2] netpoll: allow cleanup to be synchronous
On Thu, Oct 18, 2018 at 11:18:26AM -0400, Debabrata Banerjee wrote: > This fixes a problem introduced by: > commit 2cde6acd49da ("netpoll: Fix __netpoll_rcu_free so that it can hold the > rtnl lock") > > When using netconsole on a bond, __netpoll_cleanup can asynchronously > recurse multiple times, each __netpoll_free_async call can result in > more __netpoll_free_async's. This means there is now a race between > cleanup_work queues on multiple netpoll_info's on multiple devices and > the configuration of a new netpoll. For example if a netconsole is set > to enable 0, reconfigured, and enable 1 immediately, this netconsole > will likely not work. > > Given the reason for __netpoll_free_async is it can be called when rtnl > is not locked, if it is locked, we should be able to execute > synchronously. It appears to be locked everywhere it's called from. > > Generalize the design pattern from the teaming driver for current > callers of __netpoll_free_async. > I presume you've tested this with some of the stacked devices? I think I'm ok with this change, but I'd like confirmation that its worked. Neil > CC: Neil Horman > CC: "David S. 
Miller" > Signed-off-by: Debabrata Banerjee > --- > drivers/net/bonding/bond_main.c | 3 ++- > drivers/net/macvlan.c | 2 +- > drivers/net/team/team.c | 5 + > include/linux/netpoll.h | 4 +--- > net/8021q/vlan_dev.c| 3 +-- > net/bridge/br_device.c | 2 +- > net/core/netpoll.c | 20 +--- > net/dsa/slave.c | 2 +- > 8 files changed, 13 insertions(+), 28 deletions(-) > > diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c > index ee28ec9e0aba..ffa37adb7681 100644 > --- a/drivers/net/bonding/bond_main.c > +++ b/drivers/net/bonding/bond_main.c > @@ -963,7 +963,8 @@ static inline void slave_disable_netpoll(struct slave > *slave) > return; > > slave->np = NULL; > - __netpoll_free_async(np); > + > + __netpoll_free(np); > } > > static void bond_poll_controller(struct net_device *bond_dev) > diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c > index cfda146f3b3b..fc8d5f1ee1ad 100644 > --- a/drivers/net/macvlan.c > +++ b/drivers/net/macvlan.c > @@ -1077,7 +1077,7 @@ static void macvlan_dev_netpoll_cleanup(struct > net_device *dev) > > vlan->netpoll = NULL; > > - __netpoll_free_async(netpoll); > + __netpoll_free(netpoll); > } > #endif /* CONFIG_NET_POLL_CONTROLLER */ > > diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c > index d887016e54b6..db633ae9f784 100644 > --- a/drivers/net/team/team.c > +++ b/drivers/net/team/team.c > @@ -1104,10 +1104,7 @@ static void team_port_disable_netpoll(struct team_port > *port) > return; > port->np = NULL; > > - /* Wait for transmitting packets to finish before freeing. 
*/ > - synchronize_rcu_bh(); > - __netpoll_cleanup(np); > - kfree(np); > + __netpoll_free(np); > } > #else > static int team_port_enable_netpoll(struct team_port *port) > diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h > index 3ef82d3a78db..676f1ff161a9 100644 > --- a/include/linux/netpoll.h > +++ b/include/linux/netpoll.h > @@ -31,8 +31,6 @@ struct netpoll { > bool ipv6; > u16 local_port, remote_port; > u8 remote_mac[ETH_ALEN]; > - > - struct work_struct cleanup_work; > }; > > struct netpoll_info { > @@ -63,7 +61,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt); > int __netpoll_setup(struct netpoll *np, struct net_device *ndev); > int netpoll_setup(struct netpoll *np); > void __netpoll_cleanup(struct netpoll *np); > -void __netpoll_free_async(struct netpoll *np); > +void __netpoll_free(struct netpoll *np); > void netpoll_cleanup(struct netpoll *np); > void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, >struct net_device *dev); > diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c > index 546af0e73ac3..ff720f1ebf73 100644 > --- a/net/8021q/vlan_dev.c > +++ b/net/8021q/vlan_dev.c > @@ -756,8 +756,7 @@ static void vlan_dev_netpoll_cleanup(struct net_device > *dev) > return; > > vlan->netpoll = NULL; > - > - __netpoll_free_async(netpoll); > + __netpoll_free(netpoll); > } > #endif /* CONFIG_NET_POLL_CONTROLLER */ > > diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c > index e053a4e43758..c6abf927f0c9 100644 > --- a/net/bridge/br_device.c > +++ b/net/bridge/br_device.c > @@ -344,7 +344,7 @@ void br_netpoll_disable(struct net_bridge_port *p) > > p->np = NULL; > > - __netpoll_free_async(np); > + __netpoll_free(np); > } > > #endif > diff --git a/net/core/netpoll.c b/net/core/netpoll.c > index de1d1ba92f2d..6ac71624ead4 100644 > --- a/net/core/netpoll.c > +++ b/net/core/netpoll.c > @@ -591,7 +591,6 @@ int __netpoll_setup(struct netpoll *np, struct net_de
Re: [PATCH bpf-next] bpf: Extend the sk_lookup() helper to XDP hookpoint.
On 10/19/2018 06:47 PM, Joe Stringer wrote: > On Thu, 18 Oct 2018 at 22:07, Martin Lau wrote: >> On Thu, Oct 18, 2018 at 04:52:40PM -0700, Joe Stringer wrote: >>> On Thu, 18 Oct 2018 at 14:20, Daniel Borkmann wrote: On 10/18/2018 11:06 PM, Joe Stringer wrote: > On Thu, 18 Oct 2018 at 11:54, Nitin Hande wrote: [...] >> Open Issue >> * The underlying code relies on the presence of an skb to find out the >> right sk for the case of the REUSEPORT socket option. Since there is >> no skb available at the XDP hookpoint, the helper function will return >> the first available sk based on the 5-tuple hash. If the desire >> is to return a particular sk matching the reuseport_cb function, please >> suggest a way to tackle it, which can be addressed in a future commit. >> Signed-off-by: Nitin Hande > > Thanks Nitin, LGTM overall. > > The REUSEPORT thing suggests that the usage of this helper from XDP > layer may lead to a different socket being selected vs. the equivalent > call at TC hook, or other places where the selection may occur. This > could be a bit counter-intuitive. > > One thought I had to work around this was to introduce a flag, > something like BPF_F_FIND_REUSEPORT_SK_BY_HASH. This flag would > effectively communicate in the API that the bpf_sk_lookup_xxx() > functions will only select a REUSEPORT socket based on the hash and > not by, for example BPF_PROG_TYPE_SK_REUSEPORT programs. The absence > of the flag would support finding REUSEPORT sockets by other > mechanisms (which would be allowed for now from TC hooks but would be > disallowed from XDP, since there's no specific plan to support this). Hmm, given skb is NULL here the only way to look up the socket in such a scenario is based on hash, that is, inet_ehashfn() / inet6_ehashfn(), perhaps an alternative is to pass this hash in from XDP itself to the helper so it could be a custom selector. Do you have a specific use case on this for XDP (just curious)?
>>> >>> I don't have a use case for SO_REUSEPORT introspection from XDP, so >>> I'm primarily thinking from the perspective of making the behaviour >>> clear in the API in a way that leaves open the possibility for a >>> reasonable implementation in future. From that perspective, my main >>> concern is that it may surprise some BPF writers that the same >>> "bpf_sk_lookup_tcp()" call (with identical parameters) may have >>> different behaviour at TC vs. XDP layers, as the BPF selection of >>> sockets is respected at TC but not at XDP. >>> >>> FWIW we're already out of parameters for the actual call, so if we >>> wanted to allow passing a hash in, we'd need to either dedicate half >>> the 'flags' field for this configurable hash, or consider adding the >>> new hash parameter to 'struct bpf_sock_tuple'. >>> >>> +Martin for any thoughts on SO_REUSEPORT and XDP here. >> The XDP/TC prog has read access to the sk fields through >> 'struct bpf_sock'? >> >> A quick thought... >> Considering all sk in the same reuse->socks[] share >> many things (e.g. family,type,protocol,ip,port..etc are the same), >> I wonder whether it will matter too much which particular sk is >> returned from reuse->socks[], since most of the fields from 'struct bpf_sock' will >> be the same. Some of the fields in 'struct bpf_sock' could be different >> though, like priority? Hence, another possibility is to limit the >> accessible fields for the XDP prog. Only allow accessing the fields >> that must be the same among the sk in the same reuse->socks[]. > > This sounds pretty reasonable to me. Agree, and in any case this difference in returned sk selection should probably also be documented in the uapi helper description.
Re: [PATCH bpf-next v3 0/7] Implement queue/stack maps
On Fri, Oct 19, 2018 at 10:08:08PM +0200, Daniel Borkmann wrote: > On 10/18/2018 03:16 PM, Mauricio Vasquez B wrote: > > In some applications it is necessary to have a pool of free elements, for > > example the list of free L4 ports in a SNAT. None of the current maps allow > > doing this, as it is not possible to get any element without having the key > > it is associated with; even if it were possible, the lack of locking > > mechanisms in > > eBPF would make it almost impossible to implement without data races. > > > > This patchset implements two new kinds of eBPF maps: queue and stack. > > Those maps provide eBPF programs with the peek, push and pop operations, and > > for > > userspace applications a new bpf_map_lookup_and_delete_elem() is added. > > > > Signed-off-by: Mauricio Vasquez B > Acked-by: Daniel Borkmann Applied, Thanks
[PATCH iproute2-next] iplink: Remove flags argument from iplink_get
From: David Ahern iplink_get has 1 caller and the flags arg is 0, so just remove it. Signed-off-by: David Ahern --- ip/ip_common.h | 2 +- ip/ipaddress.c | 2 +- ip/iplink.c| 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ip/ip_common.h b/ip/ip_common.h index 200be5e23dd1..458a9cb7ff2c 100644 --- a/ip/ip_common.h +++ b/ip/ip_common.h @@ -91,7 +91,7 @@ void vrf_reset(void); int netns_identify_pid(const char *pidstr, char *name, int len); int do_seg6(int argc, char **argv); -int iplink_get(unsigned int flags, char *name, __u32 filt_mask); +int iplink_get(char *name, __u32 filt_mask); int iplink_ifla_xstats(int argc, char **argv); int ip_linkaddr_list(int family, req_filter_fn_t filter_fn, diff --git a/ip/ipaddress.c b/ip/ipaddress.c index c0c1fbbe4c74..9481f241cb36 100644 --- a/ip/ipaddress.c +++ b/ip/ipaddress.c @@ -1940,7 +1940,7 @@ static int ipaddr_list_flush_or_save(int argc, char **argv, int action) * the link device */ if (filter_dev && filter.group == -1 && do_link == 1) { - if (iplink_get(0, filter_dev, RTEXT_FILTER_VF) < 0) { + if (iplink_get(filter_dev, RTEXT_FILTER_VF) < 0) { perror("Cannot send link get request"); delete_json_obj(); exit(1); diff --git a/ip/iplink.c b/ip/iplink.c index 50ccb49a0263..9f39e3826c19 100644 --- a/ip/iplink.c +++ b/ip/iplink.c @@ -1087,11 +1087,11 @@ static int iplink_modify(int cmd, unsigned int flags, int argc, char **argv) return 0; } -int iplink_get(unsigned int flags, char *name, __u32 filt_mask) +int iplink_get(char *name, __u32 filt_mask) { struct iplink_req req = { .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), - .n.nlmsg_flags = NLM_F_REQUEST | flags, + .n.nlmsg_flags = NLM_F_REQUEST, .n.nlmsg_type = RTM_GETLINK, .i.ifi_family = preferred_family, }; -- 2.11.0
[PATCH net] qlcnic: fix a return in qlcnic_dcb_get_capability()
These functions are supposed to return one on failure and zero on success. Returning a zero here could cause uninitialized variable bugs in several of the callers. For example: drivers/scsi/cxgbi/cxgb4i/cxgb4i.c:1660 get_iscsi_dcb_priority() error: uninitialized symbol 'caps'. Fixes: 48365e485275 ("qlcnic: dcb: Add support for CEE Netlink interface.") Signed-off-by: Dan Carpenter --- drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.c index 4b76c69fe86d..834208e55f7b 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.c @@ -883,7 +883,7 @@ static u8 qlcnic_dcb_get_capability(struct net_device *netdev, int capid, struct qlcnic_adapter *adapter = netdev_priv(netdev); if (!test_bit(QLCNIC_DCB_STATE, &adapter->dcb->state)) - return 0; + return 1; switch (capid) { case DCB_CAP_ATTR_PG: -- 2.11.0
Re: [PATCH bpf-next v3 0/7] Implement queue/stack maps
On 10/18/2018 03:16 PM, Mauricio Vasquez B wrote: > In some applications it is necessary to have a pool of free elements, for > example the list of free L4 ports in a SNAT. None of the current maps allow > doing this, as it is not possible to get any element without having the key > it is associated with; even if it were possible, the lack of locking mechanisms > in > eBPF would make it almost impossible to implement without data races. > > This patchset implements two new kinds of eBPF maps: queue and stack. > Those maps provide eBPF programs with the peek, push and pop operations, and for > userspace applications a new bpf_map_lookup_and_delete_elem() is added. > > Signed-off-by: Mauricio Vasquez B > > v2 -> v3: > - Remove "almost dead code" in syscall.c > - Remove unnecessary copy_from_user in bpf_map_lookup_and_delete_elem > - Rebase > > v1 -> v2: > - Put ARG_PTR_TO_UNINIT_MAP_VALUE logic into a separate patch > - Fix missing __this_cpu_dec & preempt_enable calls in kernel/bpf/syscall.c > > RFC v4 -> v1: > - Remove roundup to power of 2 in memory allocation > - Remove count and use a free slot to check if queue/stack is empty > - Use if + assignment for wrapping indexes > - Fix some minor style issues > - Squash two patches together > > RFC v3 -> RFC v4: > - Revert renaming of kernel/bpf/stackmap.c > - Remove restriction on value size > - Remove len arguments from peek/pop helpers > - Add new ARG_PTR_TO_UNINIT_MAP_VALUE > > RFC v2 -> RFC v3: > - Return elements by value instead of by reference > - Implement queue/stack based on an array and head + tail indexes > - Rename stack trace related files to avoid confusion and conflicts > > RFC v1 -> RFC v2: > - Create two separate maps instead of single one + flags > - Implement bpf_map_lookup_and_delete syscall > - Support peek operation > - Define replacement policy through flags in the update() method > - Add eBPF side tests > > --- > > Mauricio Vasquez B (7): > bpf: rename stack trace map operations > bpf/syscall: allow key 
to be null in map functions > bpf/verifier: add ARG_PTR_TO_UNINIT_MAP_VALUE > bpf: add queue and stack maps > bpf: add MAP_LOOKUP_AND_DELETE_ELEM syscall > Sync uapi/bpf.h to tools/include > selftests/bpf: add test cases for queue and stack maps > > > include/linux/bpf.h|7 > include/linux/bpf_types.h |4 > include/uapi/linux/bpf.h | 30 ++ > kernel/bpf/Makefile|2 > kernel/bpf/core.c |3 > kernel/bpf/helpers.c | 43 +++ > kernel/bpf/queue_stack_maps.c | 288 > > kernel/bpf/stackmap.c |2 > kernel/bpf/syscall.c | 91 ++ > kernel/bpf/verifier.c | 28 ++ > net/core/filter.c |6 > tools/include/uapi/linux/bpf.h | 30 ++ > tools/lib/bpf/bpf.c| 12 + > tools/lib/bpf/bpf.h|2 > tools/testing/selftests/bpf/Makefile |5 > tools/testing/selftests/bpf/bpf_helpers.h |7 > tools/testing/selftests/bpf/test_maps.c| 122 > tools/testing/selftests/bpf/test_progs.c | 99 +++ > tools/testing/selftests/bpf/test_queue_map.c |4 > tools/testing/selftests/bpf/test_queue_stack_map.h | 59 > tools/testing/selftests/bpf/test_stack_map.c |4 > 21 files changed, 834 insertions(+), 14 deletions(-) > create mode 100644 kernel/bpf/queue_stack_maps.c > create mode 100644 tools/testing/selftests/bpf/test_queue_map.c > create mode 100644 tools/testing/selftests/bpf/test_queue_stack_map.h > create mode 100644 tools/testing/selftests/bpf/test_stack_map.c > > -- > Series: Acked-by: Daniel Borkmann
[PATCH net-next 3/4] net/ipv4: Add support for dumping addresses for a specific device
From: David Ahern If an RTM_GETADDR dump request has ifa_index set in the ifaddrmsg header, then return only the addresses for that device. Signed-off-by: David Ahern --- net/ipv4/devinet.c | 28 +++- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 67f382c560ba..63d5b58fbfdb 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -109,6 +109,7 @@ struct inet_fill_args { int event; unsigned int flags; int netnsid; + int ifindex; }; #define IN4_ADDR_HSIZE_SHIFT 8 @@ -1663,8 +1664,9 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, struct inet_fill_args *fillargs, struct net **tgt_net, struct sock *sk, - struct netlink_ext_ack *extack) + struct netlink_callback *cb) { + struct netlink_ext_ack *extack = cb->extack; struct nlattr *tb[IFA_MAX+1]; struct ifaddrmsg *ifm; int err, i; @@ -1679,9 +1681,11 @@ static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for address dump request"); return -EINVAL; } - if (ifm->ifa_index) { - NL_SET_ERR_MSG(extack, "ipv4: Filter by device index not supported for address dump"); - return -EINVAL; + + fillargs->ifindex = ifm->ifa_index; + if (fillargs->ifindex) { + cb->answer_flags |= NLM_F_DUMP_FILTERED; + fillargs->flags |= NLM_F_DUMP_FILTERED; } err = nlmsg_parse_strict(nlh, sizeof(*ifm), tb, IFA_MAX, @@ -1765,9 +1769,22 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) if (cb->strict_check) { err = inet_valid_dump_ifaddr_req(nlh, &fillargs, &tgt_net, -skb->sk, cb->extack); +skb->sk, cb); if (err < 0) return err; + + if (fillargs.ifindex) { + dev = __dev_get_by_index(tgt_net, fillargs.ifindex); + if (!dev) + return -ENODEV; + + in_dev = __in_dev_get_rtnl(dev); + if (in_dev) { + err = in_dev_dump_addr(in_dev, skb, cb, s_ip_idx, + &fillargs); + } + goto put_tgt_net; + } } for (h = s_h; h < 
NETDEV_HASHENTRIES; h++, s_idx = 0) { @@ -1800,6 +1817,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) done: cb->args[0] = h; cb->args[1] = idx; +put_tgt_net: if (fillargs.netnsid >= 0) put_net(tgt_net); -- 2.11.0
[PATCH net-next 0/4] net: Add support for dumping addresses for a specific device
From: David Ahern Use the recently added kernel side filter infrastructure to add support for dumping addresses only for a specific device. Patch 1 creates an IPv4 version similar to IPv6's in6_dump_addrs function. Patch 2 simplifies in6_dump_addrs by moving index tracking of IP addresses from inet6_dump_addr to in6_dump_addrs. Patches 3 and 4 use the device-based address dump helpers to limit a dump to just the addresses on a specific device. David Ahern (4): net/ipv4: Move loop over addresses in dumps into in_dev_dump_addr net/ipv6: Remove ip_idx arg to in6_dump_addrs net/ipv4: Add support for dumping addresses for a specific device net/ipv6: Add support for dumping addresses for a specific device net/ipv4/devinet.c | 77 +++-- net/ipv6/addrconf.c | 43 +++--- 2 files changed, 85 insertions(+), 35 deletions(-) -- 2.11.0
[PATCH net-next 2/4] net/ipv6: Remove ip_idx arg to in6_dump_addrs
From: David Ahern ip_idx is always 0 going into in6_dump_addrs; it is passed as a pointer to save the last good index into cb. Since cb is already argument to in6_dump_addrs, just save the value there. Signed-off-by: David Ahern --- net/ipv6/addrconf.c | 16 ++-- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index e39c284e2954..6b659846ff8a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4955,14 +4955,13 @@ static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca, /* called with rcu_read_lock() */ static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb, - struct netlink_callback *cb, - int s_ip_idx, int *p_ip_idx, + struct netlink_callback *cb, int s_ip_idx, struct inet6_fill_args *fillargs) { struct ifmcaddr6 *ifmca; struct ifacaddr6 *ifaca; + int ip_idx = 0; int err = 1; - int ip_idx = *p_ip_idx; read_lock_bh(&idev->lock); switch (fillargs->type) { @@ -5012,7 +5011,7 @@ static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb, break; } read_unlock_bh(&idev->lock); - *p_ip_idx = ip_idx; + cb->args[2] = ip_idx; return err; } @@ -5081,16 +5080,15 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, }; struct net *net = sock_net(skb->sk); struct net *tgt_net = net; + int idx, s_idx, s_ip_idx; int h, s_h; - int idx, ip_idx; - int s_idx, s_ip_idx; struct net_device *dev; struct inet6_dev *idev; struct hlist_head *head; s_h = cb->args[0]; s_idx = idx = cb->args[1]; - s_ip_idx = ip_idx = cb->args[2]; + s_ip_idx = cb->args[2]; if (cb->strict_check) { int err; @@ -5111,12 +5109,11 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, goto cont; if (h > s_h || idx > s_idx) s_ip_idx = 0; - ip_idx = 0; idev = __in6_dev_get(dev); if (!idev) goto cont; - if (in6_dump_addrs(idev, skb, cb, s_ip_idx, &ip_idx, + if (in6_dump_addrs(idev, skb, cb, s_ip_idx, &fillargs) < 0) goto done; cont: @@ -5127,7 +5124,6 @@ 
static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, rcu_read_unlock(); cb->args[0] = h; cb->args[1] = idx; - cb->args[2] = ip_idx; if (fillargs.netnsid >= 0) put_net(tgt_net); -- 2.11.0
[PATCH net-next 1/4] net/ipv4: Move loop over addresses on a device into in_dev_dump_addr
From: David Ahern Similar to IPv6 move the logic that walks over the ipv4 address list for a device into a helper. Signed-off-by: David Ahern --- net/ipv4/devinet.c | 49 ++--- 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index d122ebbe5980..67f382c560ba 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1713,6 +1713,32 @@ static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, return 0; } +static int in_dev_dump_addr(struct in_device *in_dev, struct sk_buff *skb, + struct netlink_callback *cb, int s_ip_idx, + struct inet_fill_args *fillargs) +{ + struct in_ifaddr *ifa; + int ip_idx = 0; + int err; + + for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next, ip_idx++) { + if (ip_idx < s_ip_idx) + continue; + + err = inet_fill_ifaddr(skb, ifa, fillargs); + if (err < 0) + goto done; + + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + } + err = 0; + +done: + cb->args[2] = ip_idx; + + return err; +} + static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; @@ -1727,19 +1753,17 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) struct net *tgt_net = net; int h, s_h; int idx, s_idx; - int ip_idx, s_ip_idx; + int s_ip_idx; struct net_device *dev; struct in_device *in_dev; - struct in_ifaddr *ifa; struct hlist_head *head; + int err; s_h = cb->args[0]; s_idx = idx = cb->args[1]; - s_ip_idx = ip_idx = cb->args[2]; + s_ip_idx = cb->args[2]; if (cb->strict_check) { - int err; - err = inet_valid_dump_ifaddr_req(nlh, &fillargs, &tgt_net, skb->sk, cb->extack); if (err < 0) @@ -1761,15 +1785,11 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) if (!in_dev) goto cont; - for (ifa = in_dev->ifa_list, ip_idx = 0; ifa; -ifa = ifa->ifa_next, ip_idx++) { - if (ip_idx < s_ip_idx) - continue; - if (inet_fill_ifaddr(skb, ifa, &fillargs) < 0) { - rcu_read_unlock(); - goto done; - } - 
nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + err = in_dev_dump_addr(in_dev, skb, cb, s_ip_idx, + &fillargs); + if (err < 0) { + rcu_read_unlock(); + goto done; } cont: idx++; @@ -1780,7 +1800,6 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) done: cb->args[0] = h; cb->args[1] = idx; - cb->args[2] = ip_idx; if (fillargs.netnsid >= 0) put_net(tgt_net); -- 2.11.0
[PATCH net-next 4/4] net/ipv6: Add support for dumping addresses for a specific device
From: David Ahern If an RTM_GETADDR dump request has ifa_index set in the ifaddrmsg header, then return only the addresses for that device. Since inet6_dump_addr is reused for multicast and anycast addresses, this adds support for device-specific dumps of RTM_GETMULTICAST and RTM_GETANYCAST as well. Signed-off-by: David Ahern --- net/ipv6/addrconf.c | 27 ++- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 6b659846ff8a..45b84dd5c4eb 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4821,6 +4821,7 @@ struct inet6_fill_args { int event; unsigned int flags; int netnsid; + int ifindex; enum addr_type_t type; }; @@ -5018,8 +5019,9 @@ static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb, static int inet6_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, struct inet6_fill_args *fillargs, struct net **tgt_net, struct sock *sk, - struct netlink_ext_ack *extack) + struct netlink_callback *cb) { + struct netlink_ext_ack *extack = cb->extack; struct nlattr *tb[IFA_MAX+1]; struct ifaddrmsg *ifm; int err, i; @@ -5034,9 +5036,11 @@ static int inet6_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for address dump request"); return -EINVAL; } - if (ifm->ifa_index) { - NL_SET_ERR_MSG_MOD(extack, "Filter by device index not supported for address dump"); - return -EINVAL; + + fillargs->ifindex = ifm->ifa_index; + if (fillargs->ifindex) { + cb->answer_flags |= NLM_F_DUMP_FILTERED; + fillargs->flags |= NLM_F_DUMP_FILTERED; } err = nlmsg_parse_strict(nlh, sizeof(*ifm), tb, IFA_MAX, @@ -5094,9 +5098,21 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, int err; err = inet6_valid_dump_ifaddr_req(nlh, &fillargs, &tgt_net, - skb->sk, cb->extack); + skb->sk, cb); if (err < 0) return err; + + if (fillargs.ifindex) { + dev = __dev_get_by_index(tgt_net, fillargs.ifindex); + if (!dev) + return -ENODEV; + idev = 
__in6_dev_get(dev); + if (idev) { + err = in6_dump_addrs(idev, skb, cb, s_ip_idx, +&fillargs); + } + goto put_tgt_net; + } } rcu_read_lock(); @@ -5124,6 +5140,7 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, rcu_read_unlock(); cb->args[0] = h; cb->args[1] = idx; +put_tgt_net: if (fillargs.netnsid >= 0) put_net(tgt_net); -- 2.11.0
Re: [PATCH bpf-next v2 02/13] bpf: btf: Add BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO
On Fri, Oct 19, 2018 at 06:04:11PM +0100, Edward Cree wrote: > On 18/10/18 22:19, Martin Lau wrote: > > As I have mentioned earlier, it is also special to > > the kernel because the BTF verifier and bpf_prog_load() > > need to do different checks for FUNC and FUNC_PROTO to > > ensure they are sane. > > > > First, we need to agree that the kernel needs to verify > > them differently. > > > > and then we can proceed to discuss how to distinguish them. > > We picked the current way to avoid adding a > > new BTF function section and keep it > > straightforward to distinguish them w/o relying > > on other hints from 'struct btf_type'. > > > > Are you suggesting another way of doing it? > But you *do* have such a new section. > The patch comment talks about a 'FuncInfo Table' which appears to Note that the new section, which contains the FuncInfo Table, is in a new ELF section ".BTF.ext" instead of the ".BTF". It is not in the ".BTF" section because it is only useful during bpf_prog_load(). I meant a new function section within ".BTF". > map (section, insn_idx) to type_id. (I think this gets added in > .BTF.ext per patch 9?) So when you're looking at a FUNC type > because you looked up a type_id from that table, you know it's > the signature of a subprogram, and you're checking it as such. > Whereas, if you were doing something with some other type and it > referenced a FUNC type (e.g., in the patch comment's example, > you're checking foo's first argument against the type bar) in > its type_id, you know you're using it as a formal type (a > FUNC_PROTO in your parlance) and not as a subprogram. > The context in which you are using a type entry tells you which > kind it is. And the verifier can and should be smart enough to > know what it's doing in this way.
> > And it's entirely reasonable for the same type entry to get used > for both those cases; in my example, you'd have a FUNC type for > int foo(int), referenced both by the func_info entry for foo > and by the PTR type for bar. And if you had another subprogram > int baz(int), its func_info entry could reference the same > type_id, because the (reference to the) name of the function > should live in the func_info, not in the type. IIUC, I think what you are suggesting here is to use (type_id, name) to describe DW_TAG_subprogram "int foo1(int) {}", "int foo2(int) {}", "int foo3(int) {}" where type_id here is referring to the same DW_TAG_subroutine_type, and only define that _one_ DW_TAG_subroutine_type in the BTF "type" section. That will require more manipulation/type-merging in the dwarf2btf process and it could get quite complicated. Note that CTF is also fully spelling out the return type and arg types for each DW_TAG_subprogram in a separate function section (still within the same ELF section). The only difference here is they are merged into the type section and FUNC_PROTO is added. If the concern is having both FUNC and FUNC_PROTO is confusing, we could go back to the CTF way which adds a new function section in ".BTF" and it is only for DW_TAG_subprogram. BTF_KIND_FUNC_PROTO is then no longer necessary. Some of new BTF verifier checkings may actually go away also. The down side is there will be two id spaces. > > What you are proposing seems to be saying "if we have this > particular special btf_kind, then this BTF entry doesn't just > define a type, it declares a subprogram of that type". Oh, > and with the name of the type as the subprogram name. Which > just creates blurry confusion as to whether BTF entries define > types or declare objects; IMNSHO the correct approach is for > objects to be declared elsewhere and to reference BTF types by > their type_id. > Which is what the func_info table in patch 9 appears to do. 
> > (It also rather bothers me the way we are using special type > names to associate maps with their k/v types, rather than > extending the records in the maps section to include type_ids > referencing them. It's the same kind of weird implicitness, > and if I'd spotted it when it was going in I'd've nacked it, > but I suppose it's ABI now and too late to change.) > > -Ed
Re: C45 Phys and PHY_FORCING state
On 10/19/2018 05:02 AM, Jose Abreu wrote: > Hello Andrew and Florian, > > Currently I have a 10G C45 phy that is fixed at a 10G link. This > version does not support auto-negotiation so I'm turning off the > feature in the phydev struct field. I found out that when I do this > phylib is not composing C45 frames and is instead using C22. This > is due to the call to genphy_update_link() which doesn't work on my > phy because it doesn't support C22. > > If I apply the attached patch then things work perfectly fine. Can > you please review it? Looks reasonable, I could not find other functions in the state machine that were not already abstracting the clause type, or letting a driver callback be called. Can you submit this as a formal patch against net-next (and not attached, but inline)? I would suggest creating a helper, e.g. phy_update_link(); that way everything is well namespaced and clear within the state machine itself. -- Florian
[PATCH][next] igc: fix error return handling from call to netif_set_real_num_tx_queues
From: Colin Ian King The call to netif_set_real_num_tx_queues is not assigning the error return to variable err even though the next line checks err for an error. Fix this by adding the missing err assignment. Detected by CoverityScan, CID#1474551 ("Logically dead code") Fixes: 3df25e4c1e66 ("igc: Add interrupt support") Signed-off-by: Colin Ian King --- drivers/net/ethernet/intel/igc/igc_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 9d85707e8a81..80ddbd987764 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -3358,7 +3358,7 @@ static int __igc_open(struct net_device *netdev, bool resuming) goto err_req_irq; /* Notify the stack of the actual queue counts. */ - netif_set_real_num_tx_queues(netdev, adapter->num_tx_queues); + err = netif_set_real_num_tx_queues(netdev, adapter->num_tx_queues); if (err) goto err_set_queues; -- 2.19.1
Re: [PATCH v2 net-next 0/8] net: dsa: microchip: Modify KSZ9477 DSA driver in preparation to add other KSZ switch drivers
On 08/16/2018 02:34 PM, tristram...@microchip.com wrote: >> -Original Message- >> From: Florian Fainelli >> Sent: Wednesday, August 15, 2018 5:29 PM >> To: Tristram Ha - C24268 ; Andrew Lunn >> ; Pavel Machek ; Ruediger Schmitt >> >> Cc: Arkadi Sharshevsky ; UNGLinuxDriver >> ; netdev@vger.kernel.org >> Subject: Re: [PATCH v2 net-next 0/8] net: dsa: microchip: Modify KSZ9477 >> DSA driver in preparation to add other KSZ switch drivers >> >> On 12/05/2017 05:46 PM, tristram...@microchip.com wrote: >>> From: Tristram Ha >>> >>> This series of patches is to modify the original KSZ9477 DSA driver so >>> that other KSZ switch drivers can be added and use the common code. >>> >>> There are several steps to accomplish this achievement. First is to >>> rename some function names with a prefix to indicate chip specific >>> function. Second is to move common code into header that can be shared. >>> Last is to modify tag_ksz.c so that it can handle many tail tag formats >>> used by different KSZ switch drivers. >>> >>> ksz_common.c will contain the common code used by all KSZ switch drivers. >>> ksz9477.c will contain KSZ9477 code from the original ksz_common.c. >>> ksz9477_spi.c is renamed from ksz_spi.c. >>> ksz9477_reg.h is renamed from ksz_9477_reg.h. >>> ksz_common.h is added to provide common code access to KSZ switch >>> drivers. >>> ksz_spi.h is added to provide common SPI access functions to KSZ SPI >>> drivers. >> >> Is something gating this series from getting included? It's been nearly >> 8 months now and this has not been include nor resubmitted, any plans to >> rebase that patch series and work towards inclusion in net-next when it >> opens back again? >> >> Thank you! > > Sorry for the long delay. I will restart my kernel submission effort next > month > after finishing the work on current development project. > Tristram, any chance of resubmitting this or should someone with access to those switches take up your series and submit it? -- Florian
[no subject]
From: David Howells Date: Fri, 19 Oct 2018 15:40:53 +0100 > Is there going to be a merge of net into net-next before the merge > window opens? Or do you have a sample merge that I can rebase my > afs-next branch on? I'll be doing a net to net-next merge some time today.
Re: [PATCH net-next] rocker: Drop pointless static qualifier
From: YueHaibing Date: Fri, 19 Oct 2018 12:02:59 + > There is no need to have the 'struct rocker_desc_info *desc_info' > variable static since new value always be assigned before use it. > > Signed-off-by: YueHaibing Applied, thank you.
Re: [PATCH bpf-next v2 02/13] bpf: btf: Add BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO
On 18/10/18 22:19, Martin Lau wrote: > As I have mentioned earlier, it is also special to > the kernel because the BTF verifier and bpf_prog_load() > need to do different checks for FUNC and FUNC_PROTO to > ensure they are sane. > > First, we need to agree that the kernel needs to verify > them differently. > > and then we can proceed to discuss how to distinguish them. > We picked the current way to avoid adding a > new BTF function section and keep it > straightforward to distinguish them w/o relying > on other hints from 'struct btf_type'. > > Are you suggesting another way of doing it? But you *do* have such a new section. The patch comment talks about a 'FuncInfo Table' which appears to map (section, insn_idx) to type_id. (I think this gets added in .BTF.ext per patch 9?) So when you're looking at a FUNC type because you looked up a type_id from that table, you know it's the signature of a subprogram, and you're checking it as such. Whereas, if you were doing something with some other type and it referenced a FUNC type (e.g., in the patch comment's example, you're checking foo's first argument against the type bar) in its type_id, you know you're using it as a formal type (a FUNC_PROTO in your parlance) and not as a subprogram. The context in which you are using a type entry tells you which kind it is. And the verifier can and should be smart enough to know what it's doing in this way. And it's entirely reasonable for the same type entry to get used for both those cases; in my example, you'd have a FUNC type for int foo(int), referenced both by the func_info entry for foo and by the PTR type for bar. And if you had another subprogram int baz(int), its func_info entry could reference the same type_id, because the (reference to the) name of the function should live in the func_info, not in the type.
What you are proposing seems to be saying "if we have this particular special btf_kind, then this BTF entry doesn't just define a type, it declares a subprogram of that type". Oh, and with the name of the type as the subprogram name. Which just creates blurry confusion as to whether BTF entries define types or declare objects; IMNSHO the correct approach is for objects to be declared elsewhere and to reference BTF types by their type_id. Which is what the func_info table in patch 9 appears to do. (It also rather bothers me the way we are using special type names to associate maps with their k/v types, rather than extending the records in the maps section to include type_ids referencing them. It's the same kind of weird implicitness, and if I'd spotted it when it was going in I'd've nacked it, but I suppose it's ABI now and too late to change.) -Ed
[PATCH net] net/ipv6: Fix index counter for unicast addresses in in6_dump_addrs
From: David Ahern

The loop wants to skip previously dumped addresses, so loops until
current index >= saved index. If the message fills it wants to save
the index for the next address to dump - ie., the one that did not
fit in the current message.

Currently, it is incrementing the index counter before comparing to
the saved index, and then the saved index is off by 1 - it assumes
the current address is going to fit in the message.

Change the index handling to increment only after a successful dump.

Fixes: 502a2ffd7376a ("ipv6: convert idev_list to list macros")
Signed-off-by: David Ahern
---
 net/ipv6/addrconf.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index c63ccce6425f..4e81ff2f4588 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4928,8 +4928,8 @@ static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb,
 		/* unicast address incl. temp addr */
 		list_for_each_entry(ifa, &idev->addr_list, if_list) {
-			if (++ip_idx < s_ip_idx)
-				continue;
+			if (ip_idx < s_ip_idx)
+				goto next;
 			err = inet6_fill_ifaddr(skb, ifa,
 					NETLINK_CB(cb->skb).portid,
 					cb->nlh->nlmsg_seq,
@@ -4938,6 +4938,8 @@ static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb,
 			if (err < 0)
 				break;
 			nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+next:
+			ip_idx++;
 		}
 		break;
 	}
--
2.11.0
[PATCH v8 bpf-next 1/2] bpf: add cg_skb_is_valid_access for BPF_PROG_TYPE_CGROUP_SKB
BPF programs of BPF_PROG_TYPE_CGROUP_SKB need to access headers in the skb. This patch enables direct access of skb for these programs. Two helper functions bpf_compute_and_save_data_end() and bpf_restore_data_end() are introduced. These are used in __cgroup_bpf_run_filter_skb(), to compute the proper data_end for the BPF program, and restore the original value afterwards. Signed-off-by: Song Liu --- include/linux/filter.h | 21 + kernel/bpf/cgroup.c | 6 ++ net/core/filter.c | 36 +++- 3 files changed, 62 insertions(+), 1 deletion(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 5771874bc01e..91b4c934f02e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -548,6 +548,27 @@ static inline void bpf_compute_data_pointers(struct sk_buff *skb) cb->data_end = skb->data + skb_headlen(skb); } +/* Similar to bpf_compute_data_pointers(), except that it saves the + * original data_end for later restore. + */ +static inline void bpf_compute_and_save_data_end( + struct sk_buff *skb, void **saved_data_end) +{ + struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; + + *saved_data_end = cb->data_end; + cb->data_end = skb->data + skb_headlen(skb); +} + +/* Restore data_end saved by bpf_compute_and_save_data_end().
*/ +static inline void bpf_restore_data_end( + struct sk_buff *skb, void *saved_data_end) +{ + struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; + + cb->data_end = saved_data_end; +} + static inline u8 *bpf_skb_cb(struct sk_buff *skb) { /* eBPF programs may read/write skb->cb[] area to transfer meta diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 00f6ed2e4f9a..9425c2fb872f 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -553,6 +553,7 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, { unsigned int offset = skb->data - skb_network_header(skb); struct sock *save_sk; + void *saved_data_end; struct cgroup *cgrp; int ret; @@ -566,8 +567,13 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, save_sk = skb->sk; skb->sk = sk; __skb_push(skb, offset); + + /* compute pointers for the bpf prog */ + bpf_compute_and_save_data_end(skb, &saved_data_end); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, bpf_prog_run_save_cb); + bpf_restore_data_end(skb, saved_data_end); __skb_pull(skb, offset); skb->sk = save_sk; return ret == 1 ? 
0 : -EPERM; diff --git a/net/core/filter.c b/net/core/filter.c index 1a3ac6c46873..e3ca30bd6840 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5346,6 +5346,40 @@ static bool sk_filter_is_valid_access(int off, int size, return bpf_skb_is_valid_access(off, size, type, prog, info); } +static bool cg_skb_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + switch (off) { + case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range(struct __sk_buff, data_meta): + case bpf_ctx_range(struct __sk_buff, flow_keys): + return false; + } + if (type == BPF_WRITE) { + switch (off) { + case bpf_ctx_range(struct __sk_buff, mark): + case bpf_ctx_range(struct __sk_buff, priority): + case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): + break; + default: + return false; + } + } + + switch (off) { + case bpf_ctx_range(struct __sk_buff, data): + info->reg_type = PTR_TO_PACKET; + break; + case bpf_ctx_range(struct __sk_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; + break; + } + + return bpf_skb_is_valid_access(off, size, type, prog, info); +} + static bool lwt_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, @@ -7038,7 +7072,7 @@ const struct bpf_prog_ops xdp_prog_ops = { const struct bpf_verifier_ops cg_skb_verifier_ops = { .get_func_proto = cg_skb_func_proto, - .is_valid_access= sk_filter_is_valid_access, + .is_valid_access= cg_skb_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; -- 2.17.1
[PATCH v8 bpf-next 0/2] bpf: add cg_skb_is_valid_access
Changes v7 -> v8: 1. Dynamically allocate the dummy sk to avoid race conditions. Changes v6 -> v7: 1. Make dummy sk a global variable (test_run_sk). Changes v5 -> v6: 1. Fixed dummy sk in bpf_prog_test_run_skb() as suggested by Eric Dumazet. Changes v4 -> v5: 1. Replaced bpf_compute_and_save_data_pointers() with bpf_compute_and_save_data_end(); Replaced bpf_restore_data_pointers() with bpf_restore_data_end(). 2. Fixed indentation in test_verifier.c Changes v3 -> v4: 1. Fixed crash issue reported by Alexei. Changes v2 -> v3: 1. Added helper function bpf_compute_and_save_data_pointers() and bpf_restore_data_pointers(). Changes v1 -> v2: 1. Updated the list of read-only fields, and read-write fields. 2. Added dummy sk to bpf_prog_test_run_skb(). This set enables BPF program of type BPF_PROG_TYPE_CGROUP_SKB to access some __skb_buff data directly. Song Liu (2): bpf: add cg_skb_is_valid_access for BPF_PROG_TYPE_CGROUP_SKB bpf: add tests for direct packet access from CGROUP_SKB include/linux/filter.h | 21 +++ kernel/bpf/cgroup.c | 6 + net/bpf/test_run.c | 15 ++ net/core/filter.c | 36 - tools/testing/selftests/bpf/test_verifier.c | 171 5 files changed, 248 insertions(+), 1 deletion(-) -- 2.17.1
[PATCH v8 bpf-next 2/2] bpf: add tests for direct packet access from CGROUP_SKB
Tests are added to make sure CGROUP_SKB cannot access: tc_classid, data_meta, flow_keys and can read and write: mark, prority, and cb[0-4] and can read other fields. To make selftest with skb->sk work, a dummy sk is added in bpf_prog_test_run_skb(). Signed-off-by: Song Liu --- net/bpf/test_run.c | 15 ++ tools/testing/selftests/bpf/test_verifier.c | 171 2 files changed, 186 insertions(+) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 0c423b8cd75c..c89c22c49015 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -10,6 +10,8 @@ #include #include #include +#include +#include static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx, struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) @@ -115,6 +117,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, u32 retval, duration; int hh_len = ETH_HLEN; struct sk_buff *skb; + struct sock *sk; void *data; int ret; @@ -137,11 +140,21 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, break; } + sk = kzalloc(sizeof(struct sock), GFP_USER); + if (!sk) { + kfree(data); + return -ENOMEM; + } + sock_net_set(sk, current->nsproxy->net_ns); + sock_init_data(NULL, sk); + skb = build_skb(data, 0); if (!skb) { kfree(data); + kfree(sk); return -ENOMEM; } + skb->sk = sk; skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); __skb_put(skb, size); @@ -159,6 +172,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, if (pskb_expand_head(skb, nhead, 0, GFP_USER)) { kfree_skb(skb); + kfree(sk); return -ENOMEM; } } @@ -171,6 +185,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, size = skb_headlen(skb); ret = bpf_test_finish(kattr, uattr, skb->data, size, retval, duration); kfree_skb(skb); + kfree(sk); return ret; } diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index cf4cd32b6772..f1ae8d09770f 100644 --- 
a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -4862,6 +4862,177 @@ static struct bpf_test tests[] = { .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, + { + "direct packet read test#1 for CGROUP_SKB", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, + offsetof(struct __sk_buff, len)), + BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1, + offsetof(struct __sk_buff, pkt_type)), + BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1, + offsetof(struct __sk_buff, mark)), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, + offsetof(struct __sk_buff, mark)), + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1, + offsetof(struct __sk_buff, queue_mapping)), + BPF_LDX_MEM(BPF_W, BPF_REG_8, BPF_REG_1, + offsetof(struct __sk_buff, protocol)), + BPF_LDX_MEM(BPF_W, BPF_REG_9, BPF_REG_1, + offsetof(struct __sk_buff, vlan_present)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + }, + { + "direct packet read test#2 for CGROUP_SKB", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, + offsetof(struct __sk_buff, vlan_tci)), + BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1, + offsetof(struct __sk_buff, vlan_proto)), + BPF_LDX_M
RE: Improving accuracy of PHC readings
> -Original Message- > From: netdev-ow...@vger.kernel.org [mailto:netdev-ow...@vger.kernel.org] On > Behalf Of Miroslav Lichvar > Sent: Friday, October 19, 2018 2:52 AM > To: netdev@vger.kernel.org > Cc: Richard Cochran ; Keller, Jacob E > > Subject: Improving accuracy of PHC readings > > I think there might be a way how we could significantly improve > accuracy of synchronization between the system clock and a PTP > hardware clock, at least with some network drivers. > > Currently, the PTP_SYS_OFFSET ioctl reads the system clock, reads the > PHC using the gettime64 function of the driver, and reads the system > clock again. The ioctl can repeat this to provide multiple readings to > the user space. > > phc2sys (or another program synchronizing the system clock to the PHC) > assumes the PHC timestamps were captured in the middle between the two > closest system clock timestamps. > > The trouble is that gettime64 typically reads multiple (2-3) registers > and the timestamp is latched on the first one, so the assumption about > middle point is wrong. There is an asymmetry, even if the delays on > the PCIe bus are perfectly symmetric. > Right! I feel like this is obvious now that you said it, so I'm surprised no one thought of it before... > A solution to this would be a new driver function that wraps the > latching register read with readings of the system clock and return > three timestamps instead of one. 
For example: > > ktime_get_real_ts64(&sys_ts1); > IXGBE_READ_REG(hw, IXGBE_SYSTIMR); > ktime_get_real_ts64(&sys_ts2); > phc_ts.tv_nsec = IXGBE_READ_REG(hw, IXGBE_SYSTIML); > phc_ts.tv_sec = IXGBE_READ_REG(hw, IXGBE_SYSTIMH); > > The extra timestamp doesn't fit the API of the PTP_SYS_OFFSET ioctl, > so it would need to shift the timestamp it returns by the missing > intervals (assuming the frequency offset between the PHC and system > clock is small), or a new ioctl could be introduced that would return > all timestamps in an array looking like this: > > [sys, phc, sys, sys, phc, sys, ...] > I think the new ioctl is probably the better solution. > This should significantly improve the accuracy of the synchronization, > reduce the uncertainty in the readings to less than a half or third, > and also reduce the jitter as there are fewer register reads sensitive > to the PCIe delay. > > What do you think? > Nice! I think this is good. I'd love to see some data to back it up, but it makes sense to me. Thanks, Jake > -- > Miroslav Lichvar
Re: [PATCH bpf-next] bpf: Extend the sk_lookup() helper to XDP hookpoint.
On Thu, 18 Oct 2018 at 22:07, Martin Lau wrote: > > On Thu, Oct 18, 2018 at 04:52:40PM -0700, Joe Stringer wrote: > > On Thu, 18 Oct 2018 at 14:20, Daniel Borkmann wrote: > > > > > > On 10/18/2018 11:06 PM, Joe Stringer wrote: > > > > On Thu, 18 Oct 2018 at 11:54, Nitin Hande wrote: > > > [...] > > > >> Open Issue > > > >> * The underlying code relies on presence of an skb to find out the > > > >> right sk for the case of REUSEPORT socket option. Since there is > > > >> no skb available at XDP hookpoint, the helper function will return > > > >> the first available sk based off the 5 tuple hash. If the desire > > > >> is to return a particular sk matching reuseport_cb function, please > > > >> suggest way to tackle it, which can be addressed in a future commit. > > > > > > >> Signed-off-by: Nitin Hande > > > > > > > > Thanks Nitin, LGTM overall. > > > > > > > > The REUSEPORT thing suggests that the usage of this helper from XDP > > > > layer may lead to a different socket being selected vs. the equivalent > > > > call at TC hook, or other places where the selection may occur. This > > > > could be a bit counter-intuitive. > > > > > > > > One thought I had to work around this was to introduce a flag, > > > > something like BPF_F_FIND_REUSEPORT_SK_BY_HASH. This flag would > > > > effectively communicate in the API that the bpf_sk_lookup_xxx() > > > > functions will only select a REUSEPORT socket based on the hash and > > > > not by, for example BPF_PROG_TYPE_SK_REUSEPORT programs. The absence > > > > of the flag would support finding REUSEPORT sockets by other > > > > mechanisms (which would be allowed for now from TC hooks but would be > > > > disallowed from XDP, since there's no specific plan to support this). 
> > > > > > Hmm, given skb is NULL here the only way to lookup the socket in such > > > scenario is based on hash, that is, inet_ehashfn() / inet6_ehashfn(), > > > perhaps alternative is to pass this hash in from XDP itself to the > > > helper so it could be custom selector. Do you have a specific use case > > > on this for XDP (just curious)? > > > > I don't have a use case for SO_REUSEPORT introspection from XDP, so > > I'm primarily thinking from the perspective of making the behaviour > > clear in the API in a way that leaves open the possibility for a > > reasonable implementation in future. From that perspective, my main > > concern is that it may surprise some BPF writers that the same > > "bpf_sk_lookup_tcp()" call (with identical parameters) may have > > different behaviour at TC vs. XDP layers, as the BPF selection of > > sockets is respected at TC but not at XDP. > > > > FWIW we're already out of parameters for the actual call, so if we > > wanted to allow passing a hash in, we'd need to either dedicate half > > the 'flags' field for this configurable hash, or consider adding the > > new hash parameter to 'struct bpf_sock_tuple'. > > > > +Martin for any thoughts on SO_REUSEPORT and XDP here. > The XDP/TC prog has read access to the sk fields through > 'struct bpf_sock'? > > A quick thought... > Considering all sk in the same reuse->socks[] share > many things (e.g. family,type,protocol,ip,port..etc are the same), > I wonder returning which particular sk from reuse->socks[] will > matter too much since most of the fields from 'struct bpf_sock' will > be the same. Some of fields in 'struct bpf_sock' could be different > though, like priority? Hence, another possibility is to limit the > accessible fields for the XDP prog. Only allow accessing the fields > that must be the same among the sk in the same reuse->socks[]. This sounds pretty reasonable to me.
Re: [PATCH v7 bpf-next 2/2] bpf: add tests for direct packet access from CGROUP_SKB
On 10/19/2018 09:27 AM, Song Liu wrote: > Tests are added to make sure CGROUP_SKB cannot access: > tc_classid, data_meta, flow_keys > > and can read and write: > mark, prority, and cb[0-4] > > and can read other fields. > > To make selftest with skb->sk work, a dummy sk is added in > bpf_prog_test_run_skb(). > > Signed-off-by: Song Liu > --- > net/bpf/test_run.c | 8 + > tools/testing/selftests/bpf/test_verifier.c | 171 > 2 files changed, 179 insertions(+) > > diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c > index 0c423b8cd75c..ae2ab89a9291 100644 > --- a/net/bpf/test_run.c > +++ b/net/bpf/test_run.c > @@ -10,6 +10,8 @@ > #include > #include > #include > +#include > +#include > > static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx, > struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) > @@ -106,6 +108,8 @@ static void *bpf_test_init(const union bpf_attr *kattr, > u32 size, > return data; > } > > +static struct sock test_run_sk = {0}; No need for the {0} : bss is guaranteed to be zero. > + > int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, > union bpf_attr __user *uattr) > { > @@ -137,11 +141,15 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const > union bpf_attr *kattr, > break; > } > > + sock_net_set(&test_run_sk, current->nsproxy->net_ns); > + sock_init_data(NULL, &test_run_sk); > + > Can bpf_prog_test_run_skb() be used in parallel from different CPUS/threads ? If yes, this looks racy, and I would suggest to use a kzalloc()ed socket just to be safe.
[PATCH v7 bpf-next 0/2] bpf: add cg_skb_is_valid_access
Changes v6 -> v7: 1. Make dummy sk a global variable (test_run_sk). Changes v5 -> v6: 1. Fixed dummy sk in bpf_prog_test_run_skb() as suggested by Eric Dumazet. Changes v4 -> v5: 1. Replaced bpf_compute_and_save_data_pointers() with bpf_compute_and_save_data_end(); Replaced bpf_restore_data_pointers() with bpf_restore_data_end(). 2. Fixed indentation in test_verifier.c Changes v3 -> v4: 1. Fixed crash issue reported by Alexei. Changes v2 -> v3: 1. Added helper function bpf_compute_and_save_data_pointers() and bpf_restore_data_pointers(). Changes v1 -> v2: 1. Updated the list of read-only fields, and read-write fields. 2. Added dummy sk to bpf_prog_test_run_skb(). This set enables BPF program of type BPF_PROG_TYPE_CGROUP_SKB to access some __skb_buff data directly. Song Liu (2): bpf: add cg_skb_is_valid_access for BPF_PROG_TYPE_CGROUP_SKB bpf: add tests for direct packet access from CGROUP_SKB include/linux/filter.h | 21 +++ kernel/bpf/cgroup.c | 6 + net/bpf/test_run.c | 8 + net/core/filter.c | 36 - tools/testing/selftests/bpf/test_verifier.c | 171 5 files changed, 241 insertions(+), 1 deletion(-) -- 2.17.1
[PATCH v7 bpf-next 1/2] bpf: add cg_skb_is_valid_access for BPF_PROG_TYPE_CGROUP_SKB
BPF programs of BPF_PROG_TYPE_CGROUP_SKB need to access headers in the skb. This patch enables direct access of skb for these programs. Two helper functions bpf_compute_and_save_data_end() and bpf_restore_data_end() are introduced. These are used in __cgroup_bpf_run_filter_skb(), to compute the proper data_end for the BPF program, and restore the original value afterwards. Signed-off-by: Song Liu --- include/linux/filter.h | 21 + kernel/bpf/cgroup.c | 6 ++ net/core/filter.c | 36 +++- 3 files changed, 62 insertions(+), 1 deletion(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 5771874bc01e..91b4c934f02e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -548,6 +548,27 @@ static inline void bpf_compute_data_pointers(struct sk_buff *skb) cb->data_end = skb->data + skb_headlen(skb); } +/* Similar to bpf_compute_data_pointers(), except that it saves the + * original data_end for later restore. + */ +static inline void bpf_compute_and_save_data_end( + struct sk_buff *skb, void **saved_data_end) +{ + struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; + + *saved_data_end = cb->data_end; + cb->data_end = skb->data + skb_headlen(skb); +} + +/* Restore data_end saved by bpf_compute_and_save_data_end().
*/ +static inline void bpf_restore_data_end( + struct sk_buff *skb, void *saved_data_end) +{ + struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; + + cb->data_end = saved_data_end; +} + static inline u8 *bpf_skb_cb(struct sk_buff *skb) { /* eBPF programs may read/write skb->cb[] area to transfer meta diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 00f6ed2e4f9a..9425c2fb872f 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -553,6 +553,7 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, { unsigned int offset = skb->data - skb_network_header(skb); struct sock *save_sk; + void *saved_data_end; struct cgroup *cgrp; int ret; @@ -566,8 +567,13 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, save_sk = skb->sk; skb->sk = sk; __skb_push(skb, offset); + + /* compute pointers for the bpf prog */ + bpf_compute_and_save_data_end(skb, &saved_data_end); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, bpf_prog_run_save_cb); + bpf_restore_data_end(skb, saved_data_end); __skb_pull(skb, offset); skb->sk = save_sk; return ret == 1 ? 
0 : -EPERM; diff --git a/net/core/filter.c b/net/core/filter.c index 1a3ac6c46873..e3ca30bd6840 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5346,6 +5346,40 @@ static bool sk_filter_is_valid_access(int off, int size, return bpf_skb_is_valid_access(off, size, type, prog, info); } +static bool cg_skb_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + switch (off) { + case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range(struct __sk_buff, data_meta): + case bpf_ctx_range(struct __sk_buff, flow_keys): + return false; + } + if (type == BPF_WRITE) { + switch (off) { + case bpf_ctx_range(struct __sk_buff, mark): + case bpf_ctx_range(struct __sk_buff, priority): + case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): + break; + default: + return false; + } + } + + switch (off) { + case bpf_ctx_range(struct __sk_buff, data): + info->reg_type = PTR_TO_PACKET; + break; + case bpf_ctx_range(struct __sk_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; + break; + } + + return bpf_skb_is_valid_access(off, size, type, prog, info); +} + static bool lwt_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, @@ -7038,7 +7072,7 @@ const struct bpf_prog_ops xdp_prog_ops = { const struct bpf_verifier_ops cg_skb_verifier_ops = { .get_func_proto = cg_skb_func_proto, - .is_valid_access= sk_filter_is_valid_access, + .is_valid_access= cg_skb_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; -- 2.17.1
[PATCH v7 bpf-next 2/2] bpf: add tests for direct packet access from CGROUP_SKB
Tests are added to make sure CGROUP_SKB cannot access: tc_classid, data_meta, flow_keys and can read and write: mark, prority, and cb[0-4] and can read other fields. To make selftest with skb->sk work, a dummy sk is added in bpf_prog_test_run_skb(). Signed-off-by: Song Liu --- net/bpf/test_run.c | 8 + tools/testing/selftests/bpf/test_verifier.c | 171 2 files changed, 179 insertions(+) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 0c423b8cd75c..ae2ab89a9291 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -10,6 +10,8 @@ #include #include #include +#include +#include static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx, struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) @@ -106,6 +108,8 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 size, return data; } +static struct sock test_run_sk = {0}; + int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { @@ -137,11 +141,15 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, break; } + sock_net_set(&test_run_sk, current->nsproxy->net_ns); + sock_init_data(NULL, &test_run_sk); + skb = build_skb(data, 0); if (!skb) { kfree(data); return -ENOMEM; } + skb->sk = &test_run_sk; skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); __skb_put(skb, size); diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index cf4cd32b6772..f1ae8d09770f 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -4862,6 +4862,177 @@ static struct bpf_test tests[] = { .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, + { + "direct packet read test#1 for CGROUP_SKB", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, + 
offsetof(struct __sk_buff, len)), + BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1, + offsetof(struct __sk_buff, pkt_type)), + BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1, + offsetof(struct __sk_buff, mark)), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, + offsetof(struct __sk_buff, mark)), + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1, + offsetof(struct __sk_buff, queue_mapping)), + BPF_LDX_MEM(BPF_W, BPF_REG_8, BPF_REG_1, + offsetof(struct __sk_buff, protocol)), + BPF_LDX_MEM(BPF_W, BPF_REG_9, BPF_REG_1, + offsetof(struct __sk_buff, vlan_present)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + }, + { + "direct packet read test#2 for CGROUP_SKB", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, + offsetof(struct __sk_buff, vlan_tci)), + BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1, + offsetof(struct __sk_buff, vlan_proto)), + BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1, + offsetof(struct __sk_buff, priority)), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, + offsetof(struct __sk_buff, priority)), + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1, + offsetof(struct __sk_buff, +ingress_ifindex)), + BPF_LDX_MEM(BPF_W, BPF_REG_8, BPF_REG_1, + offsetof(struct __sk_buff, tc_index)), + BPF_LDX_MEM(BPF_W, BPF_REG_9, BPF_REG_1, + offsetof(struct __sk_buff, hash)), + BPF_MOV6
Re: [danie...@cisco.com: Re: gianfar: Implement MAC reset and reconfig procedure]
On Thu, Oct 18, 2018 at 04:49:26PM +, Claudiu Manoil wrote: > > I can only advise you to check whether the MACCFG2 register settings are > consistent > at this point, when ping fails. You should check the I/F Mode bits (22-23) > and the > Full Duplex bit (31), in big-endian format. If these do not match the > 100Mbps full > duplex link mode, then it might be that another thread (probably doing > reset_gfar) > changes MACCFG2 concurrently. I think MACCFG2 may be dumped with ethtool -d. > I can get my hands on a board no sooner than maybe next week. What does the MACCFG2 register actually do ? Is that connected to the phy somehow ? I'm wondering because it seems like the gianfar driver is doing the right things, and adjust_link() is getting called etc.. Something seems not to tolerate the change from GMII to MII. Daniel
Re: [PATCH 1/2] net: emac: implement 802.1Q VLAN TX tagging support
On Wednesday, October 17, 2018 10:09:10 PM CEST Florian Fainelli wrote: > On 10/17/2018 01:08 PM, Florian Fainelli wrote: > > On 10/17/2018 12:53 PM, Christian Lamparter wrote: > >> As per' APM82181 Embedded Processor User Manual 26.1 EMAC Features: > >> VLAN: > >> - Support for VLAN tag ID in compliance with IEEE 802.3ac. > >> - VLAN tag insertion or replacement for transmit packets > >> > >> This patch completes the missing code for the VLAN tx tagging > >> support, as the the EMAC_MR1_VLE was already enabled. > >> > >> Signed-off-by: Christian Lamparter > >> --- > >> drivers/net/ethernet/ibm/emac/core.c | 32 > >> drivers/net/ethernet/ibm/emac/core.h | 6 +- > >> 2 files changed, 33 insertions(+), 5 deletions(-) > >> > >> diff --git a/drivers/net/ethernet/ibm/emac/core.c > >> b/drivers/net/ethernet/ibm/emac/core.c > >> index 760b2ad8e295..be560f9031f4 100644 > >> --- a/drivers/net/ethernet/ibm/emac/core.c > >> +++ b/drivers/net/ethernet/ibm/emac/core.c > >> @@ -37,6 +37,7 @@ > >> #include > >> #include > >> #include > >> +#include > >> #include > >> #include > >> #include > >> @@ -674,7 +675,7 @@ static int emac_configure(struct emac_instance *dev) > >> ndev->dev_addr[5]); > >> > >>/* VLAN Tag Protocol ID */ > >> - out_be32(&p->vtpid, 0x8100); > >> + out_be32(&p->vtpid, ETH_P_8021Q); > >> > >>/* Receive mode register */ > >>r = emac_iff2rmr(ndev); > >> @@ -1435,6 +1436,22 @@ static inline netdev_tx_t emac_xmit_finish(struct > >> emac_instance *dev, int len) > >>return NETDEV_TX_OK; > >> } > >> > >> +static inline u16 emac_tx_vlan(struct emac_instance *dev, struct sk_buff > >> *skb) > >> +{ > >> + /* Handle VLAN TPID and TCI insert if this is a VLAN skb */ > >> + if (emac_has_feature(dev, EMAC_FTR_HAS_VLAN_CTAG_TX) && > >> + skb_vlan_tag_present(skb)) { > >> + struct emac_regs __iomem *p = dev->emacp; > >> + > >> + /* update the VLAN TCI */ > >> + out_be32(&p->vtci, (u32)skb_vlan_tag_get(skb)); > > > > The only case where this is likely not going to be 
0x8100/ETH_P_8021Q is > > if you do 802.1ad (QinQ) and you decided to somehow offload the S-Tag > > instead of the C-Tag. > > Sorry, looks like I mixed up TCI and TPID here, this looks obviously > correct ;) Ok, I wasn't really sure what to write anyway ;). The hardware documentation mentions that: "Support for VLAN tag ID in compliance with IEEE Draft 802.3ac/D1.0 standard". It's too old for offloading any fancy QinQ stuff :(.
[iproute PATCH] tc: htb: Print default value in hex
Value of 'default' is assumed to be hexadecimal when parsing, so consequently it should be printed in hex as well. This is a regression introduced when adding JSON output. Fixes: f354fa6aa5ff0 ("tc: jsonify htb qdisc") Signed-off-by: Phil Sutter --- tc/q_htb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tc/q_htb.c b/tc/q_htb.c index c8b2941d945b7..c69485db8ef7d 100644 --- a/tc/q_htb.c +++ b/tc/q_htb.c @@ -332,7 +332,7 @@ static int htb_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) if (RTA_PAYLOAD(tb[TCA_HTB_INIT]) < sizeof(*gopt)) return -1; print_int(PRINT_ANY, "r2q", "r2q %d", gopt->rate2quantum); - print_uint(PRINT_ANY, "default", " default %u", gopt->defcls); + print_uint(PRINT_ANY, "default", " default %x", gopt->defcls); print_uint(PRINT_ANY, "direct_packets_stat", " direct_packets_stat %u", gopt->direct_pkts); if (show_details) { -- 2.19.0
Re: [PATCH 2/2] net: emac: implement TCP TSO
Hello, On Wednesday, October 17, 2018 10:09:44 PM CEST Florian Fainelli wrote: > On 10/17/2018 12:53 PM, Christian Lamparter wrote: > > This patch enables TSO(v4) hw feature for emac driver. > > As atleast the APM82181's TCP/IP acceleration hardware > > controller (TAH) provides TCP segmentation support in > > the transmit path. > > > > Signed-off-by: Christian Lamparter > > --- > > diff --git a/drivers/net/ethernet/ibm/emac/core.c > > b/drivers/net/ethernet/ibm/emac/core.c > > index be560f9031f4..49ffbd6e1707 100644 > > --- a/drivers/net/ethernet/ibm/emac/core.c > > +++ b/drivers/net/ethernet/ibm/emac/core.c > > @@ -1410,6 +1413,52 @@ static inline u16 emac_tx_csum(struct emac_instance > > *dev, > > return 0; > > } > > > > +const u32 tah_ss[TAH_NO_SSR] = { 9000, 4500, 1500, 1300, 576, 176 }; > > + > > +static int emac_tx_tso(struct emac_instance *dev, struct sk_buff *skb, > > + u16 *ctrl) > > +{ > > + if (emac_has_feature(dev, EMAC_FTR_TAH_HAS_TSO) && > > + skb_is_gso(skb) && !!(skb_shinfo(skb)->gso_type & > > + (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { > > + u32 seg_size = 0, i; > > + > > + /* Get the MTU */ > > + seg_size = skb_shinfo(skb)->gso_size + tcp_hdrlen(skb) > > + + skb_network_header_len(skb); > > + > > + /* Restriction applied for the segmentation size > > +* to use HW segmentation offload feature: the size > > +* of the segment must not be less than 168 bytes for > > +* DIX formatted segments, or 176 bytes for > > +* IEEE formatted segments. > > +* > > +* I use value 176 to check for the segment size here > > +* as it can cover both 2 conditions above. 
> > +*/ > > + if (seg_size < 176) > > + return -ENODEV; > > + > > + /* Get the best suitable MTU */ > > + for (i = 0; i < ARRAY_SIZE(tah_ss); i++) { > > + u32 curr_seg = tah_ss[i]; > > + > > + if (curr_seg > dev->ndev->mtu || > > + curr_seg > seg_size) > > + continue; > > + > > + *ctrl &= ~EMAC_TX_CTRL_TAH_CSUM; > > + *ctrl |= EMAC_TX_CTRL_TAH_SSR(i); > > + return 0; > > This is something that you can possibly take out of your hot path and > recalculate when the MTU actually changes? Do you mean the curr_seg > dev->ndev->mtu check? Because from what I know, the MSS can be manually set by a socket option (TCP_MAXSEG) on a "per-socket" base. (Altough, iperf warns that "Setting MSS is very buggy"... Which it is as I see more way retries with a iperf run with a MSS less than 768-ish. Ideally, the change_mtu could update the TAH_SS register ) > [snip] > > > +static netdev_tx_t emac_sw_tso(struct sk_buff *skb, struct net_device > > *ndev) > > +{ > > + struct emac_instance *dev = netdev_priv(ndev); > > + struct sk_buff *segs, *curr; > > + > > + segs = skb_gso_segment(skb, ndev->features & > > + ~(NETIF_F_TSO | NETIF_F_TSO6)); > > + if (IS_ERR_OR_NULL(segs)) { > > + goto drop; > > + } else { > > + while (segs) { > > + /* check for overflow */ > > + if (dev->tx_cnt >= NUM_TX_BUFF) { > > + dev_kfree_skb_any(segs); > > + goto drop; > > + } > > Would setting dev->max_gso_segs somehow help make sure the stack does > not feed you oversized GSO'd skbs? Ok, thanks's for that pointer. I'll look at dev->gso_max_segs and dev->gso_max_size. The hardware documentation doesn't mention any hard upper limit for how many segments it can do. What it does tell is that it just needs an extra 512Bytes in the TX FIFO as a buffer to store the header template and the calculated checksums and what not. But this should be no problem because that TX FIFO is 10 KiB. so even the 9000 Jumbo frames should have plenty of "free real estate". 
As for the "overflow" case: There's a check in emac_start_xmit_sg() before the emac_tx_tso() call that does an *estimation* check and goes to "stop_queue" if it doesn't fit:

|	/* Note, this is only an *estimation*, we can still run out of empty
|	 * slots because of the additional fragmentation into
|	 * MAL_MAX_TX_SIZE-sized chunks
|	 */
|	if (unlikely(dev->tx_cnt + nr_frags + mal_tx_chunks(len) > NUM_TX_BUFF))
|		goto stop_queue;
|	[...]
|
|	if (emac_tx_tso(dev, skb, &ctrl))
|		return emac_sw_tso(skb, ndev);
|	[...]
|
| stop_queue:
|	netif_stop_queue(ndev);
|	DBG2(dev, "stopped TX queue" NL);
|	return NETDEV_TX_BUSY;

emac_start_xmit_sg() can also drop the whole skb later if it turns out that it doesn't fit. (see the "undo_frame" goto label)
Re: [PATCH v6 bpf-next 2/2] bpf: add tests for direct packet access from CGROUP_SKB
On 10/18/2018 10:53 PM, Song Liu wrote: > Tests are added to make sure CGROUP_SKB cannot access: > tc_classid, data_meta, flow_keys > > and can read and write: > mark, prority, and cb[0-4] > > and can read other fields. > > To make selftest with skb->sk work, a dummy sk is added in > bpf_prog_test_run_skb(). > > Signed-off-by: Song Liu > --- > net/bpf/test_run.c | 7 + > tools/testing/selftests/bpf/test_verifier.c | 171 > 2 files changed, 178 insertions(+) > > diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c > index 0c423b8cd75c..87ea279cb095 100644 > --- a/net/bpf/test_run.c > +++ b/net/bpf/test_run.c > @@ -10,6 +10,8 @@ > #include > #include > #include > +#include > +#include > > static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx, > struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) > @@ -115,6 +117,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const > union bpf_attr *kattr, > u32 retval, duration; > int hh_len = ETH_HLEN; > struct sk_buff *skb; > + struct sock sk = {0}; > void *data; > int ret; > A dummy on stack, Nah forget it. Please compile your test kernels with common debugging features, I am sure some horrible thing will show up. (Like debug_object_is_on_stack())
[no subject]
Hi Dave,

Is there going to be a merge of net into net-next before the merge window opens? Or do you have a sample merge that I can rebase my afs-next branch on?

The problem I have is that there's a really necessary patch in net that's not in net-next:

	d7b4c24f45d2efe51b8f213da4593fefd49240ba
	rxrpc: Fix an uninitialised variable

(it fixes a fix that went into net just before you last merged it into net-next). So I would like to base my branch on both net and net-next, but the merge is non-trivial, and I'd rather not hand Linus a merge that conflicts with yours. The issues are:

 (*) net/sched/cls_api.c

     I think nlmsg_parse() needs to take both rtm_tca_policy and cb->extack as its last two arguments. Each branch fills in one argument and leaves the other NULL.

 (*) net/ipv4/ipmr_base.c

     mr_rtm_dumproute() got a piece abstracted out and modified in one branch, but the unabstracted branch has a fix in the same area. I think the thing to do is to apply the fix (removing the same two lines) from the abstracted-out branch.

Thanks,
David
[RFC PATCH v2 08/10] selftests: conditionally enable XDP support in udpgso_bench_rx
XDP support will be used by a later patch to test the GRO path in a net namespace, leveraging the veth XDP implementation. To avoid breaking existing setup, XDP support is conditionally enabled and build only if llc is locally available. Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/Makefile | 69 +++ tools/testing/selftests/net/udpgso_bench_rx.c | 37 ++ tools/testing/selftests/net/xdp_dummy.c | 13 3 files changed, 119 insertions(+) create mode 100644 tools/testing/selftests/net/xdp_dummy.c diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 256d82d5fa87..176459b7c4d6 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -16,8 +16,77 @@ TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls KSFT_KHDR_INSTALL := 1 + +# Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: +# make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang +LLC ?= llc +CLANG ?= clang +LLVM_OBJCOPY ?= llvm-objcopy +BTF_PAHOLE ?= pahole +HAS_LLC := $(shell which $(LLC) 2>/dev/null) + +# conditional enable testes requiring llc +ifneq (, $(HAS_LLC)) +TEST_GEN_FILES += xdp_dummy.o +endif + include ../lib.mk +ifneq (, $(HAS_LLC)) + +# Detect that we're cross compiling and use the cross compiler +ifdef CROSS_COMPILE +CLANG_ARCH_ARGS = -target $(ARCH) +endif + +PROBE := $(shell $(LLC) -march=bpf -mcpu=probe -filetype=null /dev/null 2>&1) + +# Let newer LLVM versions transparently probe the kernel for availability +# of full BPF instruction set. +ifeq ($(PROBE),) + CPU ?= probe +else + CPU ?= generic +endif + +SRC_PATH := $(abspath ../../../..) 
+LIB_PATH := $(SRC_PATH)/tools/lib +XDP_CFLAGS := -D SUPPORT_XDP=1 -I$(LIB_PATH) +LIBBPF = $(LIB_PATH)/bpf/libbpf.a +BTF_LLC_PROBE := $(shell $(LLC) -march=bpf -mattr=help 2>&1 | grep dwarfris) +BTF_PAHOLE_PROBE := $(shell $(BTF_PAHOLE) --help 2>&1 | grep BTF) +BTF_OBJCOPY_PROBE := $(shell $(LLVM_OBJCOPY) --help 2>&1 | grep -i 'usage.*llvm') +CLANG_SYS_INCLUDES := $(shell $(CLANG) -v -E - &1 \ +| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') +CLANG_FLAGS = -I. -I$(SRC_PATH)/include -I../bpf/ \ + $(CLANG_SYS_INCLUDES) -Wno-compare-distinct-pointer-types + +ifneq ($(and $(BTF_LLC_PROBE),$(BTF_PAHOLE_PROBE),$(BTF_OBJCOPY_PROBE)),) + CLANG_CFLAGS += -g + LLC_FLAGS += -mattr=dwarfris + DWARF2BTF = y +endif + +$(LIBBPF): FORCE +# Fix up variables inherited from Kbuild that tools/ build system won't like + $(MAKE) -C $(dir $@) RM='rm -rf' LDFLAGS= srctree=$(SRC_PATH) O= $(nodir $@) + +$(OUTPUT)/udpgso_bench_rx: $(OUTPUT)/udpgso_bench_rx.c $(LIBBPF) + $(CC) -o $@ $(XDP_CFLAGS) $(CFLAGS) $(LOADLIBES) $(LDLIBS) $^ -lelf + +FORCE: + +# bpf program[s] generation +$(OUTPUT)/%.o: %.c + $(CLANG) $(CLANG_FLAGS) \ +-O2 -target bpf -emit-llvm -c $< -o - | \ + $(LLC) -march=bpf -mcpu=$(CPU) $(LLC_FLAGS) -filetype=obj -o $@ +ifeq ($(DWARF2BTF),y) + $(BTF_PAHOLE) -J $@ +endif + +endif + $(OUTPUT)/reuseport_bpf_numa: LDFLAGS += -lnuma $(OUTPUT)/tcp_mmap: LDFLAGS += -lpthread $(OUTPUT)/tcp_inq: LDFLAGS += -lpthread diff --git a/tools/testing/selftests/net/udpgso_bench_rx.c b/tools/testing/selftests/net/udpgso_bench_rx.c index c55acdd1e27b..84f101852805 100644 --- a/tools/testing/selftests/net/udpgso_bench_rx.c +++ b/tools/testing/selftests/net/udpgso_bench_rx.c @@ -31,6 +31,10 @@ #include #include +#ifdef SUPPORT_XDP +#include "bpf/libbpf.h" +#endif + #ifndef UDP_GRO #define UDP_GRO104 #endif @@ -40,6 +44,9 @@ static bool cfg_tcp; static bool cfg_verify; static bool cfg_read_all; static bool cfg_gro_segment; +#ifdef SUPPORT_XDP +static int 
cfg_xdp_iface; +#endif static bool interrupted; static unsigned long packets, bytes; @@ -227,6 +234,13 @@ static void parse_opts(int argc, char **argv) cfg_verify = true; cfg_read_all = true; break; +#ifdef SUPPORT_XDP + case 'x': + cfg_xdp_iface = if_nametoindex(optarg); + if (!cfg_xdp_iface) + error(1, errno, "unknown interface %s", optarg); + break; +#endif } } @@ -240,6 +254,9 @@ static void parse_opts(int argc, char **argv) static void do_recv(void) { unsigned long tnow, treport; +#ifdef SUPPORT_XDP + int prog_fd = -1; +#endif int fd; fd = do_socket(cfg_tcp); @@ -250,6 +267,22 @@ static void do_recv(void) error(1, errno, "setsockopt UDP_GRO"); } +#ifdef SUPPORT_XDP + if (cfg_xdp_iface) { + struct bpf_prog_load_attr prog_load_attr = { +
[RFC PATCH v2 07/10] selftests: add GRO support to udp bench rx program
And fix a couple of buglets (port option processing, clean termination on SIGINT). This is preparatory work for GRO tests. Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/udpgso_bench_rx.c | 37 +++ 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/net/udpgso_bench_rx.c b/tools/testing/selftests/net/udpgso_bench_rx.c index 727cf67a3f75..c55acdd1e27b 100644 --- a/tools/testing/selftests/net/udpgso_bench_rx.c +++ b/tools/testing/selftests/net/udpgso_bench_rx.c @@ -31,9 +31,15 @@ #include #include +#ifndef UDP_GRO +#define UDP_GRO104 +#endif + static int cfg_port = 8000; static bool cfg_tcp; static bool cfg_verify; +static bool cfg_read_all; +static bool cfg_gro_segment; static bool interrupted; static unsigned long packets, bytes; @@ -63,6 +69,8 @@ static void do_poll(int fd) do { ret = poll(&pfd, 1, 10); + if (interrupted) + break; if (ret == -1) error(1, errno, "poll"); if (ret == 0) @@ -70,7 +78,7 @@ static void do_poll(int fd) if (pfd.revents != POLLIN) error(1, errno, "poll: 0x%x expected 0x%x\n", pfd.revents, POLLIN); - } while (!ret && !interrupted); + } while (!ret); } static int do_socket(bool do_tcp) @@ -102,6 +110,8 @@ static int do_socket(bool do_tcp) error(1, errno, "listen"); do_poll(accept_fd); + if (interrupted) + exit(0); fd = accept(accept_fd, NULL, NULL); if (fd == -1) @@ -167,10 +177,10 @@ static void do_verify_udp(const char *data, int len) /* Flush all outstanding datagrams. Verify first few bytes of each. */ static void do_flush_udp(int fd) { - static char rbuf[ETH_DATA_LEN]; + static char rbuf[65535]; int ret, len, budget = 256; - len = cfg_verify ? sizeof(rbuf) : 0; + len = cfg_read_all ? 
sizeof(rbuf) : 0; while (budget--) { /* MSG_TRUNC will make return value full datagram length */ ret = recv(fd, rbuf, len, MSG_TRUNC | MSG_DONTWAIT); @@ -178,7 +188,7 @@ static void do_flush_udp(int fd) return; if (ret == -1) error(1, errno, "recv"); - if (len) { + if (len && cfg_verify) { if (ret == 0) error(1, errno, "recv: 0 byte datagram\n"); @@ -192,23 +202,30 @@ static void do_flush_udp(int fd) static void usage(const char *filepath) { - error(1, 0, "Usage: %s [-tv] [-p port]", filepath); + error(1, 0, "Usage: %s [-Grtv] [-p port]", filepath); } static void parse_opts(int argc, char **argv) { int c; - while ((c = getopt(argc, argv, "ptv")) != -1) { + while ((c = getopt(argc, argv, "Gp:rtvx:")) != -1) { switch (c) { + case 'G': + cfg_gro_segment = true; + break; case 'p': - cfg_port = htons(strtoul(optarg, NULL, 0)); + cfg_port = strtoul(optarg, NULL, 0); + break; + case 'r': + cfg_read_all = true; break; case 't': cfg_tcp = true; break; case 'v': cfg_verify = true; + cfg_read_all = true; break; } } @@ -227,6 +244,12 @@ static void do_recv(void) fd = do_socket(cfg_tcp); + if (cfg_gro_segment && !cfg_tcp) { + int val = 1; + if (setsockopt(fd, IPPROTO_UDP, UDP_GRO, &val, sizeof(val))) + error(1, errno, "setsockopt UDP_GRO"); + } + treport = gettimeofday_ms() + 1000; do { do_poll(fd); -- 2.17.2
[RFC PATCH v2 09/10] selftests: add some benchmark for UDP GRO
Run on top of veth pair, using a dummy XDP program to enable the GRO. Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/Makefile| 1 + tools/testing/selftests/net/udpgro_bench.sh | 92 + 2 files changed, 93 insertions(+) create mode 100755 tools/testing/selftests/net/udpgro_bench.sh diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 176459b7c4d6..ac999354af54 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -7,6 +7,7 @@ CFLAGS += -I../../../../usr/include/ TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh rtnetlink.sh TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh udpgso.sh ip_defrag.sh TEST_PROGS += udpgso_bench.sh fib_rule_tests.sh msg_zerocopy.sh psock_snd.sh +TEST_PROGS += udpgro_bench.sh TEST_PROGS_EXTENDED := in_netns.sh TEST_GEN_FILES = socket TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy diff --git a/tools/testing/selftests/net/udpgro_bench.sh b/tools/testing/selftests/net/udpgro_bench.sh new file mode 100755 index ..03d37e5e7424 --- /dev/null +++ b/tools/testing/selftests/net/udpgro_bench.sh @@ -0,0 +1,92 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Run a series of udpgro benchmarks + +readonly PEER_NS="ns-peer-$(mktemp -u XX)" + +cleanup() { + local -r jobs="$(jobs -p)" + local -r ns="$(ip netns list|grep $PEER_NS)" + + [ -n "${jobs}" ] && kill -INT ${jobs} 2>/dev/null + [ -n "$ns" ] && ip netns del $ns 2>/dev/null +} +trap cleanup EXIT + +run_one() { + # use 'rx' as separator between sender args and receiver args + local -r all="$@" + local -r tx_args=${all%rx*} + local -r rx_args=${all#*rx} + + ip netns add "${PEER_NS}" + ip -netns "${PEER_NS}" link set lo up + ip link add type veth + ip link set dev veth0 up + ip addr add dev veth0 192.168.1.2/24 + ip addr add dev veth0 2001:db8::2/64 nodad + + ip link set dev veth1 netns "${PEER_NS}" + ip -netns "${PEER_NS}" addr add dev veth1 192.168.1.1/24 + ip 
-netns "${PEER_NS}" addr add dev veth1 2001:db8::1/64 nodad + ip -netns "${PEER_NS}" link set dev veth1 up + + ip netns exec "${PEER_NS}" ./udpgso_bench_rx ${rx_args} -r -x veth1 & + ip netns exec "${PEER_NS}" ./udpgso_bench_rx -t ${rx_args} -r & + + # Hack: let bg programs complete the startup + sleep 0.1 + ./udpgso_bench_tx ${tx_args} +} + +run_in_netns() { + local -r args=$@ + + ./in_netns.sh $0 __subprocess ${args} +} + +run_udp() { + local -r args=$@ + + echo "udp gso - over veth touching data" + run_in_netns ${args} -S rx + + echo "udp gso and gro - over veth touching data" + run_in_netns ${args} -S rx -G +} + +run_tcp() { + local -r args=$@ + + echo "tcp - over veth touching data" + run_in_netns ${args} -t rx +} + +run_all() { + local -r core_args="-l 4" + local -r ipv4_args="${core_args} -4 -D 192.168.1.1" + local -r ipv6_args="${core_args} -6 -D 2001:db8::1" + + echo "ipv4" + run_tcp "${ipv4_args}" + run_udp "${ipv4_args}" + + echo "ipv6" + run_tcp "${ipv4_args}" + run_udp "${ipv6_args}" +} + +if [ ! -f xdp_dummy.o ]; then + echo "Skipping GRO benchmarks - missing LLC" + exit 0 +fi + +if [[ $# -eq 0 ]]; then + run_all +elif [[ $1 == "__subprocess" ]]; then + shift + run_one $@ +else + run_in_netns $@ +fi -- 2.17.2
[RFC PATCH v2 10/10] selftests: add functional tests for UDP GRO
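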
Extends the existing udp programs to allow checking for proper GRO aggregation/GSO size, and run the tests via a shell script, using a veth pair with XDP program attached to trigger the GRO code path. Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/Makefile | 2 +- tools/testing/selftests/net/udpgro.sh | 144 ++ tools/testing/selftests/net/udpgro_bench.sh | 8 +- tools/testing/selftests/net/udpgso_bench.sh | 2 +- tools/testing/selftests/net/udpgso_bench_rx.c | 125 +-- tools/testing/selftests/net/udpgso_bench_tx.c | 22 ++- 6 files changed, 281 insertions(+), 22 deletions(-) create mode 100755 tools/testing/selftests/net/udpgro.sh diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index ac999354af54..a8a0d256aafb 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -7,7 +7,7 @@ CFLAGS += -I../../../../usr/include/ TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh rtnetlink.sh TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh udpgso.sh ip_defrag.sh TEST_PROGS += udpgso_bench.sh fib_rule_tests.sh msg_zerocopy.sh psock_snd.sh -TEST_PROGS += udpgro_bench.sh +TEST_PROGS += udpgro_bench.sh udpgro.sh TEST_PROGS_EXTENDED := in_netns.sh TEST_GEN_FILES = socket TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy diff --git a/tools/testing/selftests/net/udpgro.sh b/tools/testing/selftests/net/udpgro.sh new file mode 100755 index ..eb380c7babf0 --- /dev/null +++ b/tools/testing/selftests/net/udpgro.sh @@ -0,0 +1,144 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Run a series of udpgro functional tests. 
+ +readonly PEER_NS="ns-peer-$(mktemp -u XX)" + +cleanup() { + local -r jobs="$(jobs -p)" + local -r ns="$(ip netns list|grep $PEER_NS)" + + [ -n "${jobs}" ] && kill -1 ${jobs} 2>/dev/null + [ -n "$ns" ] && ip netns del $ns 2>/dev/null +} +trap cleanup EXIT + +cfg_veth() { + ip netns add "${PEER_NS}" + ip -netns "${PEER_NS}" link set lo up + ip link add type veth + ip link set dev veth0 up + ip addr add dev veth0 192.168.1.2/24 + ip addr add dev veth0 2001:db8::2/64 nodad + + ip link set dev veth1 netns "${PEER_NS}" + ip -netns "${PEER_NS}" addr add dev veth1 192.168.1.1/24 + ip -netns "${PEER_NS}" addr add dev veth1 2001:db8::1/64 nodad + ip -netns "${PEER_NS}" link set dev veth1 up +} + +run_one() { + # use 'rx' as separator between sender args and receiver args + local -r all="$@" + local -r tx_args=${all%rx*} + local -r rx_args=${all#*rx} + + cfg_veth + + ip netns exec "${PEER_NS}" ./udpgso_bench_rx ${rx_args} && \ + echo "ok" || \ + echo "failed" & + + # Hack: let bg programs complete the startup + sleep 0.1 + ./udpgso_bench_tx ${tx_args} + wait $(jobs -p) +} + +run_test() { + local -r args=$@ + + printf " %-40s" "$1" + ./in_netns.sh $0 __subprocess $2 rx -G -r -x veth1 $3 +} + +run_one_nat() { + # use 'rx' as separator between sender args and receiver args + local addr1 addr2 pid family="" ipt_cmd=ip6tables + local -r all="$@" + local -r tx_args=${all%rx*} + local -r rx_args=${all#*rx} + + if [[ ${tx_args} = *-4* ]]; then + ipt_cmd=iptables + family=-4 + addr1=192.168.1.1 + addr2=192.168.1.3/24 + else + addr1=2001:db8::1 + addr2="2001:db8::3/64 nodad" + fi + + cfg_veth + ip -netns "${PEER_NS}" addr add dev veth1 ${addr2} + + # fool the GRO engine changing the destination address ... + ip netns exec "${PEER_NS}" $ipt_cmd -t nat -I PREROUTING -d ${addr1} -j DNAT --to-destination ${addr2%/*} + + # ... 
so that GRO will match the UDP_GRO enabled socket, but packets + # will land on the 'plain' one + ip netns exec "${PEER_NS}" ./udpgso_bench_rx -G ${family} -x veth1 -b ${addr1} -n 0 & + pid=$! + ip netns exec "${PEER_NS}" ./udpgso_bench_rx ${family} -b ${addr2%/*} ${rx_args} && \ + echo "ok" || \ + echo "failed"& + + sleep 0.1 + ./udpgso_bench_tx ${tx_args} + kill -INT $pid + wait $(jobs -p) +} + +run_nat_test() { + local -r args=$@ + + printf " %-40s" "$1" + ./in_netns.sh $0 __subprocess_nat $2 rx -r $3 +} + +run_all() { + local -r core_args="-l 4" + local -r ipv4_args="${core_args} -4 -D 192.168.1.1" + local -r ipv6_args="${core_args} -6 -D 2001:db8::1" + + echo "ipv4" + run_test "no GRO" "${ipv4_args} -M 10 -s 1400" "-4 -n 10 -l 1400" + run_test "no GRO chk cmsg" "${ipv4_args} -M 10 -s 1400" "-4 -n 10 -l 1400 -S -1" + + # the GSO packets are aggregated because: + # * veth schedule napi after each xmit +
[RFC PATCH v2 06/10] udp: cope with UDP GRO packet misdirection
In some scenarios, the GRO engine can assemble an UDP GRO packet that ultimately lands on a non GRO-enabled socket. This patch tries to address the issue explicitly checking for the UDP socket features before enqueuing the packet, and eventually segmenting the unexpected GRO packet, as needed. We must also cope with re-insertion requests: after segmentation the UDP code calls the helper introduced by the previous patches, as needed. Signed-off-by: Paolo Abeni --- include/linux/udp.h | 23 +++ net/ipv4/udp.c | 25 - net/ipv6/udp.c | 27 ++- 3 files changed, 73 insertions(+), 2 deletions(-) diff --git a/include/linux/udp.h b/include/linux/udp.h index e23d5024f42f..19bcb396cd1b 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -132,6 +132,29 @@ static inline void udp_cmsg_recv(struct msghdr *msg, struct sock *sk, } } +static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb) +{ + return !udp_sk(sk)->gro_enabled && skb_is_gso(skb) && + skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4; +} + +static inline struct sk_buff *udp_rcv_segment(struct sock *sk, + struct sk_buff *skb) +{ + struct sk_buff *segs; + + /* the GSO CB lays after the UDP one, no need to save and restore any +* CB fragment, just initialize it +*/ + segs = __skb_gso_segment(skb, NETIF_F_SG, false); + if (unlikely(IS_ERR(segs))) + kfree_skb(skb); + else if (segs) + consume_skb(skb); + return segs; +} + + #define udp_portaddr_for_each_entry(__sk, list) \ hlist_for_each_entry(__sk, list, __sk_common.skc_portaddr_node) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 2331ac9de954..0d55145ce9f5 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1909,7 +1909,7 @@ EXPORT_SYMBOL(udp_encap_enable); * Note that in the success and error cases, the skb is assumed to * have either been requeued or freed. 
*/ -static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) { struct udp_sock *up = udp_sk(sk); int is_udplite = IS_UDPLITE(sk); @@ -2012,6 +2012,29 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) return -1; } +void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int proto); + +static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + struct sk_buff *next, *segs; + int ret; + + if (likely(!udp_unexpected_gso(sk, skb))) + return udp_queue_rcv_one_skb(sk, skb); + + BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_SGO_CB_OFFSET); + __skb_push(skb, -skb_mac_offset(skb)); + segs = udp_rcv_segment(sk, skb); + for (skb = segs; skb; skb = next) { + next = skb->next; + __skb_pull(skb, skb_transport_offset(skb)); + ret = udp_queue_rcv_one_skb(sk, skb); + if (ret > 0) + ip_protocol_deliver_rcu(dev_net(skb->dev), skb, -ret); + } + return 0; +} + /* For TCP sockets, sk_rx_dst is protected by socket lock * For UDP, we use xchg() to guard against concurrent changes. 
*/ diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 05f723b98cab..d892c064657c 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -558,7 +558,7 @@ void udpv6_encap_enable(void) } EXPORT_SYMBOL(udpv6_encap_enable); -static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) { struct udp_sock *up = udp_sk(sk); int is_udplite = IS_UDPLITE(sk); @@ -641,6 +641,31 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) return -1; } +void ip6_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int nexthdr, + bool have_final); + +static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + struct sk_buff *next, *segs; + int ret; + + if (likely(!udp_unexpected_gso(sk, skb))) + return udpv6_queue_rcv_one_skb(sk, skb); + + __skb_push(skb, -skb_mac_offset(skb)); + segs = udp_rcv_segment(sk, skb); + for (skb = segs; skb; skb = next) { + next = skb->next; + __skb_pull(skb, skb_transport_offset(skb)); + + ret = udpv6_queue_rcv_one_skb(sk, skb); + if (ret > 0) + ip6_protocol_deliver_rcu(dev_net(skb->dev), skb, ret, +true); + } + return 0; +} + static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk, __be16 loc_port, const struct in6_addr *loc_addr, __be16 rmt_port, const struct in6_addr *rmt_addr,
[RFC PATCH v2 03/10] udp: add support for UDP_GRO cmsg
When UDP GRO is enabled, the UDP_GRO cmsg will carry the ingress datagram size. User-space can use such info to compute the original packets layout. Signed-off-by: Paolo Abeni --- CHECK: should we use a separate setsockopt to explicitly enable gso_size cmsg reception? So that user space can enable UDP_GRO and fetch cmsg without forcefully receiving GRO related info. --- include/linux/udp.h | 11 +++ net/ipv4/udp.c | 4 net/ipv6/udp.c | 3 +++ 3 files changed, 18 insertions(+) diff --git a/include/linux/udp.h b/include/linux/udp.h index f613b329852e..e23d5024f42f 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -121,6 +121,17 @@ static inline bool udp_get_no_check6_rx(struct sock *sk) return udp_sk(sk)->no_check6_rx; } +static inline void udp_cmsg_recv(struct msghdr *msg, struct sock *sk, +struct sk_buff *skb) +{ + int gso_size; + + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) { + gso_size = skb_shinfo(skb)->gso_size; + put_cmsg(msg, SOL_UDP, UDP_GRO, sizeof(gso_size), &gso_size); + } +} + #define udp_portaddr_for_each_entry(__sk, list) \ hlist_for_each_entry(__sk, list, __sk_common.skc_portaddr_node) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 3c277378814f..2331ac9de954 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1714,6 +1714,10 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); } + + if (udp_sk(sk)->gro_enabled) + udp_cmsg_recv(msg, sk, skb); + if (inet->cmsg_flags) ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 8bb50ba32a6f..05f723b98cab 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -421,6 +421,9 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, *addr_len = sizeof(*sin6); } + if (udp_sk(sk)->gro_enabled) + udp_cmsg_recv(msg, sk, skb); + if (np->rxopt.all) ip6_datagram_recv_common_ctl(sk, msg, skb); -- 2.17.2
[RFC PATCH v2 02/10] udp: implement GRO for plain UDP sockets.
This is the RX counterpart of commit bec1f6f69736 ("udp: generate gso with UDP_SEGMENT"). When UDP_GRO is enabled, such socket is also eligible for GRO in the rx path: UDP segments directed to such socket are assembled into a larger GSO_UDP_L4 packet. The core UDP GRO support is enabled with setsockopt(UDP_GRO). Initial benchmark numbers: Before: udp rx: 1079 MB/s 769065 calls/s After: udp rx: 1466 MB/s24877 calls/s This change introduces a side effect in respect to UDP tunnels: after a UDP tunnel creation, now the kernel performs a lookup per ingress UDP packet, while before such lookup happened only if the ingress packet carried a valid internal header csum. v1 -> v2: - use a new option to enable UDP GRO - use static keys to protect the UDP GRO socket lookup Signed-off-by: Paolo Abeni --- include/linux/udp.h | 3 +- include/uapi/linux/udp.h | 1 + net/ipv4/udp.c | 7 +++ net/ipv4/udp_offload.c | 109 +++ net/ipv6/udp_offload.c | 6 +-- 5 files changed, 98 insertions(+), 28 deletions(-) diff --git a/include/linux/udp.h b/include/linux/udp.h index a4dafff407fb..f613b329852e 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -50,11 +50,12 @@ struct udp_sock { __u8 encap_type;/* Is this an Encapsulation socket? */ unsigned charno_check6_tx:1,/* Send zero UDP6 checksums on TX? */ no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */ -encap_enabled:1; /* This socket enabled encap +encap_enabled:1, /* This socket enabled encap * processing; UDP tunnels and * different encapsulation layer set * this */ +gro_enabled:1; /* Can accept GRO packets */ /* * Following member retains the information to create a UDP header * when the socket is uncorked. 
diff --git a/include/uapi/linux/udp.h b/include/uapi/linux/udp.h index 09502de447f5..30baccb6c9c4 100644 --- a/include/uapi/linux/udp.h +++ b/include/uapi/linux/udp.h @@ -33,6 +33,7 @@ struct udphdr { #define UDP_NO_CHECK6_TX 101 /* Disable sending checksum for UDP6X */ #define UDP_NO_CHECK6_RX 102 /* Disable accpeting checksum for UDP6 */ #define UDP_SEGMENT103 /* Set GSO segmentation size */ +#define UDP_GRO104 /* This socket can receive UDP GRO packets */ /* UDP encapsulation types */ #define UDP_ENCAP_ESPINUDP_NON_IKE 1 /* draft-ietf-ipsec-nat-t-ike-00/01 */ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 9fcb5374e166..3c277378814f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -115,6 +115,7 @@ #include "udp_impl.h" #include #include +#include struct udp_table udp_table __read_mostly; EXPORT_SYMBOL(udp_table); @@ -2459,6 +2460,12 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, up->gso_size = val; break; + case UDP_GRO: + if (valbool) + udp_tunnel_encap_enable(sk->sk_socket); + up->gro_enabled = valbool; + break; + /* * UDP-Lite's partial checksum coverage (RFC 3828). 
*/ diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 802f2bc00d69..d93c1e8097ba 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -343,6 +343,54 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, return segs; } +#define UDO_GRO_CNT_MAX 64 +static struct sk_buff *udp_gro_receive_segment(struct list_head *head, + struct sk_buff *skb) +{ + struct udphdr *uh = udp_hdr(skb); + struct sk_buff *pp = NULL; + struct udphdr *uh2; + struct sk_buff *p; + + /* requires non zero csum, for simmetry with GSO */ + if (!uh->check) { + NAPI_GRO_CB(skb)->flush = 1; + return NULL; + } + + /* pull encapsulating udp header */ + skb_gro_pull(skb, sizeof(struct udphdr)); + skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); + + list_for_each_entry(p, head, list) { + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + uh2 = udp_hdr(p); + + /* Match ports only, as csum is always non zero */ + if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + + /* Terminate the flow on len mismatch or if it grow "too much". +* Under small packet flood GRO count could elsewhere grow a lot +* leading to execessive truesize values +*/ + if (!skb_gro_receive(p, skb
[RFC PATCH v2 05/10] ipv6: factor out protocol delivery helper
So that we can re-use it at the UDP level in the next patch. Signed-off-by: Paolo Abeni --- net/ipv6/ip6_input.c | 28  1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 96577e742afd..3065226bdc57 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -319,28 +319,26 @@ void ipv6_list_rcv(struct list_head *head, struct packet_type *pt, /* * Deliver the packet to the host */ - - -static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +void ip6_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int nexthdr, + bool have_final) { const struct inet6_protocol *ipprot; struct inet6_dev *idev; unsigned int nhoff; - int nexthdr; bool raw; - bool have_final = false; /* * Parse extension headers */ - rcu_read_lock(); resubmit: idev = ip6_dst_idev(skb_dst(skb)); - if (!pskb_pull(skb, skb_transport_offset(skb))) - goto discard; nhoff = IP6CB(skb)->nhoff; - nexthdr = skb_network_header(skb)[nhoff]; + if (!have_final) { + if (!pskb_pull(skb, skb_transport_offset(skb))) + goto discard; + nexthdr = skb_network_header(skb)[nhoff]; + } resubmit_final: raw = raw6_local_deliver(skb, nexthdr); @@ -411,13 +409,19 @@ static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *sk consume_skb(skb); } } - rcu_read_unlock(); - return 0; + return; discard: __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); - rcu_read_unlock(); kfree_skb(skb); +} + +static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + rcu_read_lock(); + ip6_protocol_deliver_rcu(net, skb, 0, false); + rcu_read_unlock(); + return 0; } -- 2.17.2
[RFC PATCH v2 00/10] udp: implement GRO support
This series implements GRO support for UDP sockets, as the RX counterpart of commit bec1f6f69736 ("udp: generate gso with UDP_SEGMENT"). The core functionality is implemented by the second patch, introducing a new sockopt to enable UDP_GRO, while patch 3 implements support for passing the segment size to user space via a new cmsg.

UDP GRO performs a socket lookup for each ingress packet and aggregates datagrams directed to UDP GRO enabled sockets with a constant L4 tuple.

UDP GRO packets can land on non GRO-enabled sockets, e.g. due to iptables NAT rules, and that could potentially confuse existing applications. The solution adopted here is to de-segment the GRO packet before enqueuing as needed. Since we must cope with packet reinsertion after de-segmentation, the relevant code is factored out into ipv4 and ipv6 specific helpers and exposed to UDP usage. While the current code can probably be improved, this safeguard, implemented in patches 4-7, allows future enhancements to enable UDP GSO offload on more virtual devices, eventually even on forwarded packets.

The last 4 patches implement some performance and functional self-tests, re-using the existing udpgso infrastructure. The problematic scenario described above is explicitly tested.

v1 -> v2:
- use a new option to enable UDP GRO
- use static keys to protect the UDP GRO socket lookup
- cope with UDP GRO misdirection
- add self-tests

Paolo Abeni (10):
  udp: implement complete book-keeping for encap_needed
  udp: implement GRO for plain UDP sockets.
  udp: add support for UDP_GRO cmsg
  ip: factor out protocol delivery helper
  ipv6: factor out protocol delivery helper
  udp: cope with UDP GRO packet misdirection
  selftests: add GRO support to udp bench rx program
  selftests: conditionally enable XDP support in udpgso_bench_rx
  selftests: add some benchmarks for UDP GRO
  selftests: add functional tests for UDP GRO

 include/linux/udp.h                           |  42 +++-
 include/net/udp_tunnel.h                      |   6 +
 include/uapi/linux/udp.h                      |   1 +
 net/ipv4/ip_input.c                           |  73 ---
 net/ipv4/udp.c                                |  54 -
 net/ipv4/udp_offload.c                        | 109 --
 net/ipv6/ip6_input.c                          |  28 +--
 net/ipv6/udp.c                                |  44 +++-
 net/ipv6/udp_offload.c                        |   6 +-
 tools/testing/selftests/net/Makefile          |  70 +++
 tools/testing/selftests/net/udpgro.sh         | 144 +
 tools/testing/selftests/net/udpgro_bench.sh   |  94 +
 tools/testing/selftests/net/udpgso_bench.sh   |   2 +-
 tools/testing/selftests/net/udpgso_bench_rx.c | 195 --
 tools/testing/selftests/net/udpgso_bench_tx.c |  22 +-
 tools/testing/selftests/net/xdp_dummy.c       |  13 ++
 16 files changed, 790 insertions(+), 113 deletions(-)
 create mode 100755 tools/testing/selftests/net/udpgro.sh
 create mode 100755 tools/testing/selftests/net/udpgro_bench.sh
 create mode 100644 tools/testing/selftests/net/xdp_dummy.c
-- 2.17.2
[RFC PATCH v2 01/10] udp: implement complete book-keeping for encap_needed
The *encap_needed static keys are enabled by UDP tunnels and several UDP encapsulation types, but they are never turned off. This can cause unneeded overall performance degradation for systems where such features are used transiently. This patch introduces complete book-keeping for such keys, decreasing the usage count at socket destruction time, if needed, and ensuring that the same socket cannot increase the key usage multiple times. Signed-off-by: Paolo Abeni --- include/linux/udp.h | 7 ++- include/net/udp_tunnel.h | 6 ++ net/ipv4/udp.c | 18 -- net/ipv6/udp.c | 14 +- 4 files changed, 33 insertions(+), 12 deletions(-) diff --git a/include/linux/udp.h b/include/linux/udp.h index 320d49d85484..a4dafff407fb 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -49,7 +49,12 @@ struct udp_sock { unsigned int corkflag; /* Cork is required */ __u8 encap_type;/* Is this an Encapsulation socket? */ unsigned charno_check6_tx:1,/* Send zero UDP6 checksums on TX? */ -no_check6_rx:1;/* Allow zero UDP6 checksums on RX? */ +no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */ +encap_enabled:1; /* This socket enabled encap + * processing; UDP tunnels and + * different encapsulation layer set + * this + */ /* * Following member retains the information to create a UDP header * when the socket is uncorked.
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index fe680ab6b15a..3fbe56430e3b 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -165,6 +165,12 @@ static inline int udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum) static inline void udp_tunnel_encap_enable(struct socket *sock) { + struct udp_sock *up = udp_sk(sock->sk); + + if (up->encap_enabled) + return; + + up->encap_enabled = 1; #if IS_ENABLED(CONFIG_IPV6) if (sock->sk->sk_family == PF_INET6) ipv6_stub->udpv6_encap_enable(); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index cf8252d05a01..9fcb5374e166 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2382,11 +2382,15 @@ void udp_destroy_sock(struct sock *sk) bool slow = lock_sock_fast(sk); udp_flush_pending_frames(sk); unlock_sock_fast(sk, slow); - if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) { - void (*encap_destroy)(struct sock *sk); - encap_destroy = READ_ONCE(up->encap_destroy); - if (encap_destroy) - encap_destroy(sk); + if (static_branch_unlikely(&udp_encap_needed_key)) { + if (up->encap_type) { + void (*encap_destroy)(struct sock *sk); + encap_destroy = READ_ONCE(up->encap_destroy); + if (encap_destroy) + encap_destroy(sk); + } + if (up->encap_enabled) + static_branch_disable(&udp_encap_needed_key); } } @@ -2431,7 +2435,9 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, /* FALLTHROUGH */ case UDP_ENCAP_L2TPINUDP: up->encap_type = val; - udp_encap_enable(); + if (!up->encap_enabled) + udp_encap_enable(); + up->encap_enabled = 1; break; default: err = -ENOPROTOOPT; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 374e7d302f26..8bb50ba32a6f 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1460,11 +1460,15 @@ void udpv6_destroy_sock(struct sock *sk) udp_v6_flush_pending_frames(sk); release_sock(sk); - if (static_branch_unlikely(&udpv6_encap_needed_key) && up->encap_type) { - void (*encap_destroy)(struct sock *sk); - encap_destroy = 
READ_ONCE(up->encap_destroy); - if (encap_destroy) - encap_destroy(sk); + if (static_branch_unlikely(&udpv6_encap_needed_key)) { + if (up->encap_type) { + void (*encap_destroy)(struct sock *sk); + encap_destroy = READ_ONCE(up->encap_destroy); + if (encap_destroy) + encap_destroy(sk); + } + if (up->encap_enabled) + static_branch_disable(&udpv6_encap_needed_key); } inet6_destroy_sock(sk); -- 2.17.2
[RFC PATCH v2 04/10] ip: factor out protocol delivery helper
So that we can re-use it at the UDP level in a later patch. Signed-off-by: Paolo Abeni --- net/ipv4/ip_input.c | 73 ++--- 1 file changed, 36 insertions(+), 37 deletions(-) diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 35a786c0aaa0..72250b4e466d 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -188,51 +188,50 @@ bool ip_call_ra_chain(struct sk_buff *skb) return false; } -static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol) { - __skb_pull(skb, skb_network_header_len(skb)); - - rcu_read_lock(); - { - int protocol = ip_hdr(skb)->protocol; - const struct net_protocol *ipprot; - int raw; + const struct net_protocol *ipprot; + int raw, ret; - resubmit: - raw = raw_local_deliver(skb, protocol); +resubmit: + raw = raw_local_deliver(skb, protocol); - ipprot = rcu_dereference(inet_protos[protocol]); - if (ipprot) { - int ret; - - if (!ipprot->no_policy) { - if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { - kfree_skb(skb); - goto out; - } - nf_reset(skb); + ipprot = rcu_dereference(inet_protos[protocol]); + if (ipprot) { + if (!ipprot->no_policy) { + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { + kfree_skb(skb); + return; } - ret = ipprot->handler(skb); - if (ret < 0) { - protocol = -ret; - goto resubmit; + nf_reset(skb); + } + ret = ipprot->handler(skb); + if (ret < 0) { + protocol = -ret; + goto resubmit; + } + __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); + } else { + if (!raw) { + if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { + __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS); + icmp_send(skb, ICMP_DEST_UNREACH, + ICMP_PROT_UNREACH, 0); } - 
kfree_skb(skb); - } else { - __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); - consume_skb(skb); - } + __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); + consume_skb(skb); } } - out: +} + +static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + __skb_pull(skb, skb_network_header_len(skb)); + + rcu_read_lock(); + ip_protocol_deliver_rcu(net, skb, ip_hdr(skb)->protocol); rcu_read_unlock(); return 0; -- 2.17.2
[PATCH bpf-next v2 1/2] tools, perf: add and use optimized ring_buffer_{read_head,write_tail} helpers
Currently, on x86-64, perf uses LFENCE and MFENCE (rmb() and mb(), respectively) when processing events from the perf ring buffer, which is unnecessarily expensive as we can do something more lightweight, in particular given this is a critical fast path in perf.

According to Peter, rmb()/mb() were added back then via a94d342b9cb0 ("tools/perf: Add required memory barriers") at a time when the kernel still supported chips that needed them, but nowadays support for those has been ditched completely, therefore we can fix this up as well.

While for x86-64, replacing rmb() and mb() with smp_*() variants would result in just a compiler barrier for the former and LOCK + ADD for the latter (__sync_synchronize() uses the slower MFENCE, by the way), Peter suggested we can use smp_{load_acquire,store_release}() instead for architectures where their implementation doesn't resolve into a slower smp_mb(). Thus, e.g. on x86-64 we would be able to avoid a CPU barrier entirely due to TSO. For architectures where the latter needs to use smp_mb(), e.g. on arm, we stick to the cheaper smp_rmb() variant for fetching the head.

This work adds helpers ring_buffer_read_head() and ring_buffer_write_tail() for the tools infrastructure that either switch to smp_load_acquire() for architectures where it is cheaper, or use a READ_ONCE() + smp_rmb() barrier for those where it is not, in order to fetch the data_head from the perf control page; smp_store_release() is used to write the data_tail. The latter is a smp_mb() + WRITE_ONCE() combination, or a cheaper variant if the architecture allows for it. Architectures that rely on smp_rmb() and smp_mb() can further improve performance in a follow-up step by implementing the two under tools/arch/*/include/asm/barrier.h such that they don't have to fall back to rmb() and mb() in tools/include/asm/barrier.h.

Switch perf to use ring_buffer_read_head() and ring_buffer_write_tail() so it can make use of the optimizations. Later, we convert libbpf as well to use the same helpers.
Side note [0]: the topic has been raised of whether one could simply use the C11 gcc builtins [1] for smp_load_acquire() and smp_store_release() instead:

  __atomic_load_n(ptr, __ATOMIC_ACQUIRE);
  __atomic_store_n(ptr, val, __ATOMIC_RELEASE);

The kernel and (presumably) the tooling shipped along with the kernel have a minimum requirement of being able to build with gcc-4.6, and the latter does not have the C11 builtins. While generally the C11 memory models don't align with the kernel's, the C11 load-acquire and store-release alone /could/ suffice, however. The issue is that this is implementation-dependent on how the load-acquire and store-release are done by the compiler, and the mapping of supported compilers must align to be compatible with the kernel's implementation; thus it needs to be verified/tracked on a case-by-case basis whether they match (unless an architecture uses them also from the kernel side). The implementations for smp_load_acquire() and smp_store_release() in this patch have been adapted from the kernel-side ones to have a concrete and compatible mapping in place.

[0] http://patchwork.ozlabs.org/patch/985422/
[1] https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html

Signed-off-by: Daniel Borkmann Acked-by: Peter Zijlstra (Intel) Cc: "Paul E.
McKenney" Cc: Will Deacon Cc: Arnaldo Carvalho de Melo --- tools/arch/arm64/include/asm/barrier.h| 70 + tools/arch/ia64/include/asm/barrier.h | 13 ++ tools/arch/powerpc/include/asm/barrier.h | 16 +++ tools/arch/s390/include/asm/barrier.h | 13 ++ tools/arch/sparc/include/asm/barrier_64.h | 13 ++ tools/arch/x86/include/asm/barrier.h | 14 ++ tools/include/asm/barrier.h | 35 +++ tools/include/linux/ring_buffer.h | 73 +++ tools/perf/util/mmap.h| 15 ++- 9 files changed, 250 insertions(+), 12 deletions(-) create mode 100644 tools/include/linux/ring_buffer.h diff --git a/tools/arch/arm64/include/asm/barrier.h b/tools/arch/arm64/include/asm/barrier.h index 40bde6b..12835ea 100644 --- a/tools/arch/arm64/include/asm/barrier.h +++ b/tools/arch/arm64/include/asm/barrier.h @@ -14,4 +14,74 @@ #define wmb() asm volatile("dmb ishst" ::: "memory") #define rmb() asm volatile("dmb ishld" ::: "memory") +#define smp_store_release(p, v)\ +do { \ + union { typeof(*p) __val; char __c[1]; } __u = \ + { .__val = (__force typeof(*p)) (v) }; \ + \ + switch (sizeof(*p)) { \ + case 1: \ + asm volatile ("stlrb %w1, %0" \ + : "=Q" (*p) \ + : "r" (*(__u8 *)__u.__c
[PATCH bpf-next v2 2/2] bpf, libbpf: use correct barriers in perf ring buffer walk
Given libbpf is a generic library and not restricted to x86-64 only, the compiler barrier in bpf_perf_event_read_simple() after fetching the head needs to be replaced with smp_rmb() at minimum. Also, writing out the tail we should use WRITE_ONCE() to avoid store tearing. Now that we have the logic in place in ring_buffer_read_head() and ring_buffer_write_tail() helper also used by perf tool which would select the correct and best variant for a given architecture (e.g. x86-64 can avoid CPU barriers entirely), make use of these in order to fix bpf_perf_event_read_simple(). Fixes: d0cabbb021be ("tools: bpf: move the event reading loop to libbpf") Fixes: 39111695b1b8 ("samples: bpf: add bpf_perf_event_output example") Signed-off-by: Daniel Borkmann Cc: Peter Zijlstra Cc: "Paul E. McKenney" Cc: Will Deacon Cc: Arnaldo Carvalho de Melo --- tools/lib/bpf/libbpf.c | 10 -- 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index bd71efc..0c21355 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -2418,13 +2419,12 @@ bpf_perf_event_read_simple(void *mem, unsigned long size, unsigned long page_size, void **buf, size_t *buf_len, bpf_perf_event_print_t fn, void *priv) { - volatile struct perf_event_mmap_page *header = mem; + struct perf_event_mmap_page *header = mem; + __u64 data_head = ring_buffer_read_head(header); __u64 data_tail = header->data_tail; - __u64 data_head = header->data_head; int ret = LIBBPF_PERF_EVENT_ERROR; void *base, *begin, *end; - asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */ if (data_head == data_tail) return LIBBPF_PERF_EVENT_CONT; @@ -2467,8 +2467,6 @@ bpf_perf_event_read_simple(void *mem, unsigned long size, data_tail += ehdr->size; } - __sync_synchronize(); /* smp_mb() */ - header->data_tail = data_tail; - + ring_buffer_write_tail(header, data_tail); return ret; } -- 
2.9.5
[PATCH bpf-next v2 0/2] improve and fix barriers for walking perf ring buffer
This set first adds smp_* barrier variants to tools infrastructure and updates perf and libbpf to make use of them. For details, please see individual patches, thanks! Arnaldo, if there are no objections, could this be routed via bpf-next with Acked-by's due to later dependencies in libbpf? Alternatively, I could also get the 2nd patch out during merge window, but perhaps it's okay to do in one go as there shouldn't be much conflict in perf itself. Thanks! v1 -> v2: - add common helper and switch to acquire/release variants when possible, thanks Peter! Daniel Borkmann (2): tools, perf: add and use optimized ring_buffer_{read_head,write_tail} helpers bpf, libbpf: use correct barriers in perf ring buffer walk tools/arch/arm64/include/asm/barrier.h| 70 + tools/arch/ia64/include/asm/barrier.h | 13 ++ tools/arch/powerpc/include/asm/barrier.h | 16 +++ tools/arch/s390/include/asm/barrier.h | 13 ++ tools/arch/sparc/include/asm/barrier_64.h | 13 ++ tools/arch/x86/include/asm/barrier.h | 14 ++ tools/include/asm/barrier.h | 35 +++ tools/include/linux/ring_buffer.h | 73 +++ tools/lib/bpf/libbpf.c| 10 ++--- tools/perf/util/mmap.h| 15 ++- 10 files changed, 254 insertions(+), 18 deletions(-) create mode 100644 tools/include/linux/ring_buffer.h -- 2.9.5
[PATCH 13/17] octeontx2-af: Install ucast and bcast pkt forwarding rules
From: Sunil Goutham

Upon NIXLF_ALLOC install a unicast forwarding rule in NPC MCAM like below:
 - Match pkt DMAC with NIXLF attached PF/VF's MAC address.
 - Ingress channel
 - Action is UCAST
 - Forward to PF_FUNC of this NIXLF

And a broadcast pkt forwarding rule as:
 - Match L2B bit in MCAM search key
 - Ingress channel
 - Action is UCAST, for now; later it will be changed to MCAST.
   Only PFs can install this rule.

Upon NIXLF_FREE disable all MCAM entries in use by that NIXLF.

Signed-off-by: Sunil Goutham --- drivers/net/ethernet/marvell/octeontx2/af/npc.h| 19 ++ drivers/net/ethernet/marvell/octeontx2/af/rvu.h| 5 + .../net/ethernet/marvell/octeontx2/af/rvu_nix.c| 14 + .../net/ethernet/marvell/octeontx2/af/rvu_npc.c| 333 + 4 files changed, 371 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/npc.h b/drivers/net/ethernet/marvell/octeontx2/af/npc.h index 58d8f0b..9cbcac2 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/npc.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/npc.h @@ -240,4 +240,23 @@ struct npc_kpu_pkind_cpi_def { u64 ena: 1; #endif }; + +struct nix_rx_action { +#if defined(__BIG_ENDIAN_BITFIELD) + u64 rsvd_63_61 :3; + u64 flow_key_alg:5; + u64 match_id:16; + u64 index :20; + u64 pf_func :16; + u64 op :4; +#else + u64 op :4; + u64 pf_func :16; + u64 index :20; + u64 match_id:16; + u64 flow_key_alg:5; + u64 rsvd_63_61 :3; +#endif +}; + #endif /* NPC_H */ diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h index 12391d2..e83d324 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h @@ -347,4 +347,9 @@ int rvu_npc_init(struct rvu *rvu); void rvu_npc_freemem(struct rvu *rvu); int rvu_npc_get_pkind(struct rvu *rvu, u16 pf); void rvu_npc_set_pkind(struct rvu *rvu, int pkind, struct rvu_pfvf *pfvf); +void rvu_npc_install_ucast_entry(struct rvu *rvu, u16 pcifunc, +int nixlf, u64 chan, u8 *mac_addr); +void 
rvu_npc_install_bcast_match_entry(struct rvu *rvu, u16 pcifunc, + int nixlf, u64 chan); +void rvu_npc_disable_mcam_entries(struct rvu *rvu, u16 pcifunc, int nixlf); #endif /* RVU_H */ diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c index 86b1e9b..fbe4ff0 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c @@ -151,13 +151,24 @@ static int nix_interface_init(struct rvu *rvu, u16 pcifunc, int type, int nixlf) break; } + /* Add a UCAST forwarding rule in MCAM with this NIXLF attached +* RVU PF/VF's MAC address. +*/ + rvu_npc_install_ucast_entry(rvu, pcifunc, nixlf, + pfvf->rx_chan_base, pfvf->mac_addr); + /* Add this PF_FUNC to bcast pkt replication list */ err = nix_update_bcast_mce_list(rvu, pcifunc, true); if (err) { dev_err(rvu->dev, "Bcast list, failed to enable PF_FUNC 0x%x\n", pcifunc); + return err; } + + rvu_npc_install_bcast_match_entry(rvu, pcifunc, + nixlf, pfvf->rx_chan_base); + return 0; } @@ -172,6 +183,9 @@ static void nix_interface_deinit(struct rvu *rvu, u16 pcifunc, u8 nixlf) "Bcast list, failed to disable PF_FUNC 0x%x\n", pcifunc); } + + /* Free and disable any MCAM entries used by this NIX LF */ + rvu_npc_disable_mcam_entries(rvu, pcifunc, nixlf); } static void nix_setup_lso_tso_l3(struct rvu *rvu, int blkaddr, diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c index 1c29436..e283372 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c @@ -26,6 +26,14 @@ #define NPC_PARSE_RESULT_DMAC_OFFSET 8 +struct mcam_entry { +#define NPC_MAX_KWS_IN_KEY 7 /* Number of keywords in max keywidth */ + u64 kw[NPC_MAX_KWS_IN_KEY]; + u64 kw_mask[NPC_MAX_KWS_IN_KEY]; + u64 action; + u64 vtag_action; +}; + void rvu_npc_set_pkind(struct rvu *rvu, int pkind, struct rvu_pfvf *pfvf) { int blkaddr; 
@@ -54,6 +62,331 @@ int rvu_npc_get_pkind(struct rvu *rvu, u16 pf) return -1; } +static int npc_get_nixlf_mcam_index(struct npc_mcam *mcam, + u16 pcifunc, int nixlf, int type) +{ + int pf = rvu_get_pf(pcifunc); +
[PATCH 05/17] octeontx2-af: Config NPC KPU engines with parser profile
From: Sunil Goutham This patch configures all 16 KPUs and iKPU (pkinds) with the KPU parser profile defined in npc_profile.h. Each KPU engine has a 128 entry CAM, only CAM entries which are listed in the profile are enabled and rest are left disabled. Also - Memory is allocated for pkind's bitmap and PFFUNC, interface channel mapping. - Added all CSR offsets of NPC HW block. Signed-off-by: Sunil Goutham --- drivers/net/ethernet/marvell/octeontx2/af/Makefile | 2 +- drivers/net/ethernet/marvell/octeontx2/af/npc.h| 100 ++ drivers/net/ethernet/marvell/octeontx2/af/rvu.c| 8 + drivers/net/ethernet/marvell/octeontx2/af/rvu.h| 12 ++ .../net/ethernet/marvell/octeontx2/af/rvu_npc.c| 203 + .../net/ethernet/marvell/octeontx2/af/rvu_reg.h| 61 +++ 6 files changed, 385 insertions(+), 1 deletion(-) create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c diff --git a/drivers/net/ethernet/marvell/octeontx2/af/Makefile b/drivers/net/ethernet/marvell/octeontx2/af/Makefile index 264cbd7..06329ac 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/Makefile +++ b/drivers/net/ethernet/marvell/octeontx2/af/Makefile @@ -8,4 +8,4 @@ obj-$(CONFIG_OCTEONTX2_AF) += octeontx2_af.o octeontx2_mbox-y := mbox.o octeontx2_af-y := cgx.o rvu.o rvu_cgx.o rvu_npa.o rvu_nix.o \ - rvu_reg.o + rvu_reg.o rvu_npc.o diff --git a/drivers/net/ethernet/marvell/octeontx2/af/npc.h b/drivers/net/ethernet/marvell/octeontx2/af/npc.h index 3e7ec10..58d8f0b 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/npc.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/npc.h @@ -140,4 +140,104 @@ struct npc_kpu_profile { struct npc_kpu_profile_action *action; }; +/* NPC KPU register formats */ +struct npc_kpu_cam { +#if defined(__BIG_ENDIAN_BITFIELD) + u64 rsvd_63_56 : 8; + u64 state : 8; + u64 dp2_data : 16; + u64 dp1_data : 16; + u64 dp0_data : 16; +#else + u64 dp0_data : 16; + u64 dp1_data : 16; + u64 dp2_data : 16; + u64 state : 8; + u64 rsvd_63_56 : 8; +#endif +}; + +struct npc_kpu_action0 { +#if 
defined(__BIG_ENDIAN_BITFIELD) + u64 rsvd_63_57 : 7; + u64 byp_count : 3; + u64 capture_ena: 1; + u64 parse_done : 1; + u64 next_state : 8; + u64 rsvd_43: 1; + u64 capture_lid: 3; + u64 capture_ltype : 4; + u64 capture_flags : 8; + u64 ptr_advance: 8; + u64 var_len_offset : 8; + u64 var_len_mask : 8; + u64 var_len_right : 1; + u64 var_len_shift : 3; +#else + u64 var_len_shift : 3; + u64 var_len_right : 1; + u64 var_len_mask : 8; + u64 var_len_offset : 8; + u64 ptr_advance: 8; + u64 capture_flags : 8; + u64 capture_ltype : 4; + u64 capture_lid: 3; + u64 rsvd_43: 1; + u64 next_state : 8; + u64 parse_done : 1; + u64 capture_ena: 1; + u64 byp_count : 3; + u64 rsvd_63_57 : 7; +#endif +}; + +struct npc_kpu_action1 { +#if defined(__BIG_ENDIAN_BITFIELD) + u64 rsvd_63_36 : 28; + u64 errlev : 4; + u64 errcode: 8; + u64 dp2_offset : 8; + u64 dp1_offset : 8; + u64 dp0_offset : 8; +#else + u64 dp0_offset : 8; + u64 dp1_offset : 8; + u64 dp2_offset : 8; + u64 errcode: 8; + u64 errlev : 4; + u64 rsvd_63_36 : 28; +#endif +}; + +struct npc_kpu_pkind_cpi_def { +#if defined(__BIG_ENDIAN_BITFIELD) + u64 ena: 1; + u64 rsvd_62_59 : 4; + u64 lid: 3; + u64 ltype_match: 4; + u64 ltype_mask : 4; + u64 flags_match: 8; + u64 flags_mask : 8; + u64 add_offset : 8; + u64 add_mask : 8; + u64 rsvd_15: 1; + u64 add_shift : 3; + u64 rsvd_11_10 : 2; + u64 cpi_base : 10; +#else + u64 cpi_base : 10; + u64 rsvd_11_10 : 2; + u64 add_shift : 3; + u64 rsvd_15: 1; + u64 add_mask : 8; + u64 add_offset : 8; + u64 flags_mask : 8; + u64 flags_match: 8; + u64 ltype_mask : 4; + u64 ltype_match: 4; + u64 lid: 3; + u64 rsvd_62_59 : 4; + u64 ena: 1; +#endif +}; #endif /* NPC_H */ diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c index 9594432..3cb7f76 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c @@ -180,6 +180,9 @@ int rvu_get_blkaddr(struct rvu *rvu, int blktype, u16 pcifunc) bool 
is_pf; switch (blktype) { + case BLKTYP
[PATCH 12/17] octeontx2-af: Add LMAC channel info to NIXLF_ALLOC response
From: Stanislaw Kardach Add LMAC channel info like Rx/Tx channel base and count to NIXLF_ALLOC mailbox message response. This info is used by NIXLF attached RVU PF/VF to configure SQ's default channel, TL3_TL2_LINKX_CFG and to install MCAM rules in NPC based on matching ingress channel number. Signed-off-by: Stanislaw Kardach Signed-off-by: Tomasz Duszynski Signed-off-by: Sunil Goutham --- drivers/net/ethernet/marvell/octeontx2/af/common.h | 1 + drivers/net/ethernet/marvell/octeontx2/af/mbox.h| 4 drivers/net/ethernet/marvell/octeontx2/af/rvu.h | 5 + drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c | 8 4 files changed, 18 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/common.h b/drivers/net/ethernet/marvell/octeontx2/af/common.h index e438f92..6c8150d 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/common.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/common.h @@ -169,6 +169,7 @@ enum nix_scheduler { #define MAX_LMAC_PKIND 12 #define NIX_LINK_CGX_LMAC(a, b)(0 + 4 * (a) + (b)) +#define NIX_CHAN_CGX_LMAC_CHX(a, b, c) (0x800 + 0x100 * (a) + 0x10 * (b) + (c)) /* NIX LSO format indices. * As of now TSO is the only one using, so statically assigning indices. 
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h index b60ac9d..0e2552c 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h @@ -382,6 +382,10 @@ struct nix_lf_alloc_req { struct nix_lf_alloc_rsp { struct mbox_msghdr hdr; u16 sqb_size; + u16 rx_chan_base; + u16 tx_chan_base; + u8 rx_chan_cnt; /* total number of RX channels */ + u8 tx_chan_cnt; /* total number of TX channels */ u8 lso_tsov4_idx; u8 lso_tsov6_idx; u8 mac_addr[ETH_ALEN]; diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h index 9fa5183..12391d2 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h @@ -118,6 +118,11 @@ struct rvu_pfvf { unsigned long *rq_bmap; unsigned long *cq_bmap; + u16 rx_chan_base; + u16 tx_chan_base; + u8 rx_chan_cnt; /* total number of RX channels */ + u8 tx_chan_cnt; /* total number of TX channels */ + u8 mac_addr[ETH_ALEN]; /* MAC address of this PF/VF */ /* Broadcast pkt replication info */ diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c index 55075e7..86b1e9b 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c @@ -140,6 +140,10 @@ static int nix_interface_init(struct rvu *rvu, u16 pcifunc, int type, int nixlf) "PF_Func 0x%x: Invalid pkind\n", pcifunc); return -EINVAL; } + pfvf->rx_chan_base = NIX_CHAN_CGX_LMAC_CHX(cgx_id, lmac_id, 0); + pfvf->tx_chan_base = pfvf->rx_chan_base; + pfvf->rx_chan_cnt = 1; + pfvf->tx_chan_cnt = 1; cgx_set_pkind(rvu_cgx_pdata(cgx_id, rvu), lmac_id, pkind); rvu_npc_set_pkind(rvu, pkind, pfvf); break; @@ -799,6 +803,10 @@ int rvu_mbox_handler_NIX_LF_ALLOC(struct rvu *rvu, /* set SQB size info */ cfg = rvu_read64(rvu, blkaddr, NIX_AF_SQ_CONST); rsp->sqb_size = (cfg >> 
34) & 0x; + rsp->rx_chan_base = pfvf->rx_chan_base; + rsp->tx_chan_base = pfvf->tx_chan_base; + rsp->rx_chan_cnt = pfvf->rx_chan_cnt; + rsp->tx_chan_cnt = pfvf->tx_chan_cnt; rsp->lso_tsov4_idx = NIX_LSO_FORMAT_IDX_TSOV4; rsp->lso_tsov6_idx = NIX_LSO_FORMAT_IDX_TSOV6; return rc; -- 2.7.4
[PATCH 04/17] octeontx2-af: Add NPC KPU profile
From: Hao Zheng NPC block is responsible for parsing and forwarding packets to different NIXLFs. NPC has 16 KPU engines (Kangaroo parse engine) and one iKPU which represents pkinds. Each physical port either CGX/LBK is assigned a pkind and upon receiving a packet HW takes that port's pkind and starts parsing as per the KPU engines config. This patch adds header files which contain configuration profile/array for each of the iKPU and 16 KPU engines. Signed-off-by: Hao Zheng Signed-off-by: Sunil Goutham --- drivers/net/ethernet/marvell/octeontx2/af/npc.h| 143 + .../ethernet/marvell/octeontx2/af/npc_profile.h| 5709 2 files changed, 5852 insertions(+) create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/npc.h create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/npc_profile.h diff --git a/drivers/net/ethernet/marvell/octeontx2/af/npc.h b/drivers/net/ethernet/marvell/octeontx2/af/npc.h new file mode 100644 index 000..3e7ec10 --- /dev/null +++ b/drivers/net/ethernet/marvell/octeontx2/af/npc.h @@ -0,0 +1,143 @@ +/* SPDX-License-Identifier: GPL-2.0 + * Marvell OcteonTx2 RVU Admin Function driver + * + * Copyright (C) 2018 Marvell International Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef NPC_H +#define NPC_H + +enum NPC_LID_E { + NPC_LID_LA = 0, + NPC_LID_LB, + NPC_LID_LC, + NPC_LID_LD, + NPC_LID_LE, + NPC_LID_LF, + NPC_LID_LG, + NPC_LID_LH, +}; + +#define NPC_LT_NA 0 + +enum npc_kpu_la_ltype { + NPC_LT_LA_8023 = 1, + NPC_LT_LA_ETHER, +}; + +enum npc_kpu_lb_ltype { + NPC_LT_LB_ETAG = 1, + NPC_LT_LB_CTAG, + NPC_LT_LB_STAG, + NPC_LT_LB_BTAG, + NPC_LT_LB_QINQ, + NPC_LT_LB_ITAG, +}; + +enum npc_kpu_lc_ltype { + NPC_LT_LC_IP = 1, + NPC_LT_LC_IP6, + NPC_LT_LC_ARP, + NPC_LT_LC_RARP, + NPC_LT_LC_MPLS, + NPC_LT_LC_NSH, + NPC_LT_LC_PTP, + NPC_LT_LC_FCOE, +}; + +/* Don't modify Ltypes upto SCTP, otherwise it will + * effect flow tag calculation and thus RSS. + */ +enum npc_kpu_ld_ltype { + NPC_LT_LD_TCP = 1, + NPC_LT_LD_UDP, + NPC_LT_LD_ICMP, + NPC_LT_LD_SCTP, + NPC_LT_LD_IGMP, + NPC_LT_LD_ICMP6, + NPC_LT_LD_ESP, + NPC_LT_LD_AH, + NPC_LT_LD_GRE, + NPC_LT_LD_GRE_MPLS, + NPC_LT_LD_GRE_NSH, + NPC_LT_LD_TU_MPLS, +}; + +enum npc_kpu_le_ltype { + NPC_LT_LE_TU_ETHER = 1, + NPC_LT_LE_TU_PPP, + NPC_LT_LE_TU_MPLS_IN_NSH, + NPC_LT_LE_TU_3RD_NSH, +}; + +enum npc_kpu_lf_ltype { + NPC_LT_LF_TU_IP = 1, + NPC_LT_LF_TU_IP6, + NPC_LT_LF_TU_ARP, + NPC_LT_LF_TU_MPLS_IP, + NPC_LT_LF_TU_MPLS_IP6, + NPC_LT_LF_TU_MPLS_ETHER, +}; + +enum npc_kpu_lg_ltype { + NPC_LT_LG_TU_TCP = 1, + NPC_LT_LG_TU_UDP, + NPC_LT_LG_TU_SCTP, + NPC_LT_LG_TU_ICMP, + NPC_LT_LG_TU_IGMP, + NPC_LT_LG_TU_ICMP6, + NPC_LT_LG_TU_ESP, + NPC_LT_LG_TU_AH, +}; + +enum npc_kpu_lh_ltype { + NPC_LT_LH_TCP_DATA = 1, + NPC_LT_LH_HTTP_DATA, + NPC_LT_LH_HTTPS_DATA, + NPC_LT_LH_PPTP_DATA, + NPC_LT_LH_UDP_DATA, +}; + +struct npc_kpu_profile_cam { + u8 state; + u8 state_mask; + u16 dp0; + u16 dp0_mask; + u16 dp1; + u16 dp1_mask; + u16 dp2; + u16 dp2_mask; +}; + +struct npc_kpu_profile_action { + u8 errlev; + u8 errcode; + u8 dp0_offset; + u8 dp1_offset; + u8 dp2_offset; + u8 bypass_count; + u8 parse_done; + u8 next_state; + u8 ptr_advance; + u8 cap_ena; + u8 lid; + u8 ltype; + u8 flags; + u8 offset; + u8 mask; + 
u8 right; + u8 shift; +}; + +struct npc_kpu_profile { + int cam_entries; + int action_entries; + struct npc_kpu_profile_cam *cam; + struct npc_kpu_profile_action *action; +}; + +#endif /* NPC_H */ diff --git a/drivers/net/ethernet/marvell/octeontx2/af/npc_profile.h b/drivers/net/ethernet/marvell/octeontx2/af/npc_profile.h new file mode 100644 index 000..b2ce957 --- /dev/null +++ b/drivers/net/ethernet/marvell/octeontx2/af/npc_profile.h @@ -0,0 +1,5709 @@ +/* SPDX-License-Identifier: GPL-2.0 + * Marvell OcteonTx2 RVU Admin Function driver + * + * Copyright (C) 2018 Marvell International Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef NPC_PROFILE_H +#define NPC_PROFILE_H + +#define NPC_ETYPE_IP 0x0800 +#define NPC_ETYPE_IP6 0x86dd +#define NPC_ETYPE_ARP 0x0806 +#define NPC_ETYPE_RARP 0x8035 +#define NPC_ETYPE_MPLSU0x8847 +#
[PATCH 16/17] octeontx2-af: Support for setting MAC address
From: Sunil Goutham Added a new mailbox message for a PF/VF to set/update its NIXLF's MAC address. Also updates the unicast NPC MCAM entry with this address as the matching DMAC. Signed-off-by: Sunil Goutham --- drivers/net/ethernet/marvell/octeontx2/af/mbox.h | 8 ++- drivers/net/ethernet/marvell/octeontx2/af/rvu.h| 3 +++ .../net/ethernet/marvell/octeontx2/af/rvu_nix.c| 25 ++ 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h index 32d70bf..afa2ead 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h @@ -158,7 +158,8 @@ M(NIX_TXSCH_FREE, 0x8005, nix_txsch_free_req, msg_rsp) \ M(NIX_TXSCHQ_CFG, 0x8006, nix_txschq_config, msg_rsp) \ M(NIX_STATS_RST, 0x8007, msg_req, msg_rsp) \ M(NIX_VTAG_CFG,0x8008, nix_vtag_config, msg_rsp) \ -M(NIX_RSS_FLOWKEY_CFG, 0x8009, nix_rss_flowkey_cfg, msg_rsp) +M(NIX_RSS_FLOWKEY_CFG, 0x8009, nix_rss_flowkey_cfg, msg_rsp) \ +M(NIX_SET_MAC_ADDR,0x800a, nix_set_mac_addr, msg_rsp) /* Messages initiated by AF (range 0xC00 - 0xDFF) */ #define MBOX_UP_CGX_MESSAGES \ @@ -507,4 +508,9 @@ struct nix_rss_flowkey_cfg { u8 group; /* RSS context or group */ }; +struct nix_set_mac_addr { + struct mbox_msghdr hdr; + u8 mac_addr[ETH_ALEN]; /* MAC address to be set for this pcifunc */ +}; + #endif /* MBOX_H */ diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h index b169657..93e6891 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h @@ -344,6 +344,9 @@ int rvu_mbox_handler_NIX_VTAG_CFG(struct rvu *rvu, int rvu_mbox_handler_NIX_RSS_FLOWKEY_CFG(struct rvu *rvu, struct nix_rss_flowkey_cfg *req, struct msg_rsp *rsp); +int rvu_mbox_handler_NIX_SET_MAC_ADDR(struct rvu *rvu, + struct nix_set_mac_addr *req, + struct msg_rsp *rsp); /* NPC APIs */ int rvu_npc_init(struct rvu
*rvu); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c index d4dcdbb..3caf81b 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c @@ -1720,6 +1720,31 @@ static void nix_rx_flowkey_alg_cfg(struct rvu *rvu, int blkaddr) } } +int rvu_mbox_handler_NIX_SET_MAC_ADDR(struct rvu *rvu, + struct nix_set_mac_addr *req, + struct msg_rsp *rsp) +{ + struct rvu_hwinfo *hw = rvu->hw; + u16 pcifunc = req->hdr.pcifunc; + struct rvu_pfvf *pfvf; + int blkaddr, nixlf; + + pfvf = rvu_get_pfvf(rvu, pcifunc); + blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NIX, pcifunc); + if (!pfvf->nixlf || blkaddr < 0) + return NIX_AF_ERR_AF_LF_INVALID; + + nixlf = rvu_get_lf(rvu, &hw->block[blkaddr], pcifunc, 0); + if (nixlf < 0) + return NIX_AF_ERR_AF_LF_INVALID; + + ether_addr_copy(pfvf->mac_addr, req->mac_addr); + + rvu_npc_install_ucast_entry(rvu, pcifunc, nixlf, + pfvf->rx_chan_base, req->mac_addr); + return 0; +} + static int nix_calibrate_x2p(struct rvu *rvu, int blkaddr) { int idx, err; -- 2.7.4
[PATCH 17/17] octeontx2-af: Support for NIXLF's UCAST/PROMISC/ALLMULTI modes
From: Sunil Goutham By default a NIXLF is set in UCAST mode. This patch adds a new mailbox message which, when sent by an RVU PF, changes this default mode. When promiscuous mode is needed, the reserved promisc entry of each RVU PF is set up to match against the ingress channel number only, so that all pkts on that channel are accepted and forwarded to the requesting PF_FUNC's NIXLF. PROMISC and ALLMULTI modes are supported only for PFs; for VFs only UCAST mode is supported. Signed-off-by: Sunil Goutham --- drivers/net/ethernet/marvell/octeontx2/af/mbox.h | 11 - drivers/net/ethernet/marvell/octeontx2/af/rvu.h| 5 ++ .../net/ethernet/marvell/octeontx2/af/rvu_nix.c| 33 + .../net/ethernet/marvell/octeontx2/af/rvu_npc.c| 57 ++ 4 files changed, 105 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h index afa2ead..a15a59c 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h @@ -159,7 +159,8 @@ M(NIX_TXSCHQ_CFG, 0x8006, nix_txschq_config, msg_rsp) \ M(NIX_STATS_RST, 0x8007, msg_req, msg_rsp) \ M(NIX_VTAG_CFG,0x8008, nix_vtag_config, msg_rsp) \ M(NIX_RSS_FLOWKEY_CFG, 0x8009, nix_rss_flowkey_cfg, msg_rsp) \ -M(NIX_SET_MAC_ADDR,0x800a, nix_set_mac_addr, msg_rsp) +M(NIX_SET_MAC_ADDR,0x800a, nix_set_mac_addr, msg_rsp) \ +M(NIX_SET_RX_MODE, 0x800b, nix_rx_mode, msg_rsp) /* Messages initiated by AF (range 0xC00 - 0xDFF) */ #define MBOX_UP_CGX_MESSAGES \ @@ -513,4 +514,12 @@ struct nix_set_mac_addr { u8 mac_addr[ETH_ALEN]; /* MAC address to be set for this pcifunc */ }; +struct nix_rx_mode { + struct mbox_msghdr hdr; +#define NIX_RX_MODE_UCAST BIT(0) +#define NIX_RX_MODE_PROMISCBIT(1) +#define NIX_RX_MODE_ALLMULTI BIT(2) + u16 mode; +}; + #endif /* MBOX_H */ diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h index 93e6891..2c0580c 100644 ---
a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h @@ -347,6 +347,8 @@ int rvu_mbox_handler_NIX_RSS_FLOWKEY_CFG(struct rvu *rvu, int rvu_mbox_handler_NIX_SET_MAC_ADDR(struct rvu *rvu, struct nix_set_mac_addr *req, struct msg_rsp *rsp); +int rvu_mbox_handler_NIX_SET_RX_MODE(struct rvu *rvu, struct nix_rx_mode *req, +struct msg_rsp *rsp); /* NPC APIs */ int rvu_npc_init(struct rvu *rvu); @@ -355,6 +357,9 @@ int rvu_npc_get_pkind(struct rvu *rvu, u16 pf); void rvu_npc_set_pkind(struct rvu *rvu, int pkind, struct rvu_pfvf *pfvf); void rvu_npc_install_ucast_entry(struct rvu *rvu, u16 pcifunc, int nixlf, u64 chan, u8 *mac_addr); +void rvu_npc_install_promisc_entry(struct rvu *rvu, u16 pcifunc, + int nixlf, u64 chan, bool allmulti); +void rvu_npc_disable_promisc_entry(struct rvu *rvu, u16 pcifunc, int nixlf); void rvu_npc_install_bcast_match_entry(struct rvu *rvu, u16 pcifunc, int nixlf, u64 chan); void rvu_npc_disable_mcam_entries(struct rvu *rvu, u16 pcifunc, int nixlf); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c index 3caf81b..8890c95 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c @@ -1745,6 +1745,39 @@ int rvu_mbox_handler_NIX_SET_MAC_ADDR(struct rvu *rvu, return 0; } +int rvu_mbox_handler_NIX_SET_RX_MODE(struct rvu *rvu, struct nix_rx_mode *req, +struct msg_rsp *rsp) +{ + bool allmulti = false, disable_promisc = false; + struct rvu_hwinfo *hw = rvu->hw; + u16 pcifunc = req->hdr.pcifunc; + struct rvu_pfvf *pfvf; + int blkaddr, nixlf; + + pfvf = rvu_get_pfvf(rvu, pcifunc); + blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NIX, pcifunc); + if (!pfvf->nixlf || blkaddr < 0) + return NIX_AF_ERR_AF_LF_INVALID; + + nixlf = rvu_get_lf(rvu, &hw->block[blkaddr], pcifunc, 0); + if (nixlf < 0) + return NIX_AF_ERR_AF_LF_INVALID; + + if (req->mode & NIX_RX_MODE_PROMISC) + allmulti = false; 
+ else if (req->mode & NIX_RX_MODE_ALLMULTI) + allmulti = true; + else + disable_promisc = true; + + if (disable_promisc) + rvu_npc_disable_promisc_entry(rvu, pcifunc, nixlf); + else + rvu_npc_install_promisc_entry(rvu, pcifunc, nixlf, + pfvf->rx_chan_base, allmul
[PATCH 11/17] octeontx2-af: NPC MCAM and LDATA extract minimal configuration
From: Sunil Goutham This patch adds some minimal configuration for NPC MCAM and LDATA extraction which is sufficient to install ucast/bcast/promiscuous forwarding rules. Below is the config done - LDATA extraction config to extract DMAC from pkt to offset 64bit in MCAM search key. - Set MCAM lookup keysize to 224bits - Set MCAM TX miss action to UCAST_DEFAULT - Set MCAM RX miss action to DROP Also, in order to have guaranteed space in MCAM to install a ucast forwarding rule for each RVU PF/VF, reserved one MCAM entry per NIXLF for the ucast rule, and two entries for each RVU PF: one for bcast pkt replication and the other for promiscuous mode, which allows all pkts received on a HW CGX/LBK channel. Signed-off-by: Sunil Goutham --- drivers/net/ethernet/marvell/octeontx2/af/common.h | 21 drivers/net/ethernet/marvell/octeontx2/af/rvu.h| 14 +++ .../net/ethernet/marvell/octeontx2/af/rvu_nix.c| 12 ++ .../net/ethernet/marvell/octeontx2/af/rvu_npc.c| 134 + 4 files changed, 181 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/common.h b/drivers/net/ethernet/marvell/octeontx2/af/common.h index 7c53ba3..e438f92 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/common.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/common.h @@ -143,6 +143,27 @@ enum nix_scheduler { NIX_TXSCH_LVL_CNT = 0x5, }; +/* NIX RX action operation*/ +#define NIX_RX_ACTIONOP_DROP (0x0ull) +#define NIX_RX_ACTIONOP_UCAST (0x1ull) +#define NIX_RX_ACTIONOP_UCAST_IPSEC(0x2ull) +#define NIX_RX_ACTIONOP_MCAST (0x3ull) +#define NIX_RX_ACTIONOP_RSS(0x4ull) + +/* NIX TX action operation*/ +#define NIX_TX_ACTIONOP_DROP (0x0ull) +#define NIX_TX_ACTIONOP_UCAST_DEFAULT (0x1ull) +#define NIX_TX_ACTIONOP_UCAST_CHAN (0x2ull) +#define NIX_TX_ACTIONOP_MCAST (0x3ull) +#define NIX_TX_ACTIONOP_DROP_VIOL (0x5ull) + +#define NPC_MCAM_KEY_X10 +#define NPC_MCAM_KEY_X21 +#define NPC_MCAM_KEY_X42 + +#define NIX_INTF_RX0 +#define NIX_INTF_TX1 + #define NIX_INTF_TYPE_CGX 0 #define NIX_INTF_TYPE_LBK 1
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h index 1e85e80..9fa5183 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h @@ -73,6 +73,18 @@ struct nix_mce_list { int max; }; +struct npc_mcam { + spinlock_t lock; /* MCAM entries and counters update lock */ + u8 keysize;/* MCAM keysize 112/224/448 bits */ + u8 banks; /* Number of MCAM banks */ + u8 banks_per_entry;/* Number of keywords in key */ + u16 banksize; /* Number of MCAM entries in each bank */ + u16 total_entries; /* Total number of MCAM entries */ + u16 entries;/* Total minus reserved for NIX LFs */ + u16 nixlf_offset; /* Offset of nixlf rsvd uncast entries */ + u16 pf_offset; /* Offset of PF's rsvd bcast, promisc entries */ +}; + /* Structure for per RVU func info ie PF/VF */ struct rvu_pfvf { boolnpalf; /* Only one NPALF per RVU_FUNC */ @@ -144,6 +156,7 @@ struct rvu_hwinfo { struct rvu_block block[BLK_COUNT]; /* Block info */ struct nix_hw*nix0; struct npc_pkind pkind; + struct npc_mcam mcam; }; struct rvu { @@ -297,6 +310,7 @@ int rvu_mbox_handler_NPA_LF_FREE(struct rvu *rvu, struct msg_req *req, /* NIX APIs */ int rvu_nix_init(struct rvu *rvu); void rvu_nix_freemem(struct rvu *rvu); +int rvu_get_nixlf_count(struct rvu *rvu); int rvu_mbox_handler_NIX_LF_ALLOC(struct rvu *rvu, struct nix_lf_alloc_req *req, struct nix_lf_alloc_rsp *rsp); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c index 02e1d16..55075e7 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c @@ -55,6 +55,18 @@ struct mce { u16 pcifunc; }; +int rvu_get_nixlf_count(struct rvu *rvu) +{ + struct rvu_block *block; + int blkaddr; + + blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NIX, 0); + if (blkaddr < 0) + return 0; + block = &rvu->hw->block[blkaddr]; + return block->lf.max; +} + 
static void nix_mce_list_init(struct nix_mce_list *list, int max) { INIT_HLIST_HEAD(&list->head); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c index cc1d8c9..1c29436 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c +++ b/drivers/net/etherne
[PATCH 08/17] octeontx2-af: Update bcast list upon NIXLF alloc/free
From: Sunil Goutham Upon NIXLF ALLOC/FREE, add or remove corresponding PF_FUNC from the broadcast packet replication list of the CGX LMAC mapped RVU PF. Signed-off-by: Sunil Goutham --- .../net/ethernet/marvell/octeontx2/af/rvu_nix.c| 133 + 1 file changed, 133 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c index 947424a..8333283 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c @@ -16,6 +16,8 @@ #include "rvu.h" #include "cgx.h" +static int nix_update_bcast_mce_list(struct rvu *rvu, u16 pcifunc, bool add); + enum mc_tbl_sz { MC_TBL_SZ_256, MC_TBL_SZ_512, @@ -108,6 +110,7 @@ static int nix_interface_init(struct rvu *rvu, u16 pcifunc, int type, int nixlf) struct rvu_pfvf *pfvf = rvu_get_pfvf(rvu, pcifunc); u8 cgx_id, lmac_id; int pkind, pf; + int err; pf = rvu_get_pf(pcifunc); if (!is_pf_cgxmapped(rvu, pf) && type != NIX_INTF_TYPE_LBK) @@ -130,9 +133,30 @@ static int nix_interface_init(struct rvu *rvu, u16 pcifunc, int type, int nixlf) case NIX_INTF_TYPE_LBK: break; } + + /* Add this PF_FUNC to bcast pkt replication list */ + err = nix_update_bcast_mce_list(rvu, pcifunc, true); + if (err) { + dev_err(rvu->dev, + "Bcast list, failed to enable PF_FUNC 0x%x\n", + pcifunc); + } return 0; } +static void nix_interface_deinit(struct rvu *rvu, u16 pcifunc, u8 nixlf) +{ + int err; + + /* Remove this PF_FUNC from bcast pkt replication list */ + err = nix_update_bcast_mce_list(rvu, pcifunc, false); + if (err) { + dev_err(rvu->dev, + "Bcast list, failed to disable PF_FUNC 0x%x\n", + pcifunc); + } +} + static void nix_setup_lso_tso_l3(struct rvu *rvu, int blkaddr, u64 format, bool v4, u64 *fidx) { @@ -786,6 +810,8 @@ int rvu_mbox_handler_NIX_LF_FREE(struct rvu *rvu, struct msg_req *req, if (nixlf < 0) return NIX_AF_ERR_AF_LF_INVALID; + nix_interface_deinit(rvu, pcifunc, nixlf); + /* Reset this NIX LF */ err = 
rvu_lf_reset(rvu, block, nixlf); if (err) { @@ -1147,6 +1173,113 @@ static int nix_setup_mce(struct rvu *rvu, int mce, u8 op, return 0; } +static int nix_update_mce_list(struct nix_mce_list *mce_list, + u16 pcifunc, int idx, bool add) +{ + struct mce *mce, *tail = NULL; + bool delete = false; + + /* Scan through the current list */ + hlist_for_each_entry(mce, &mce_list->head, node) { + /* If already exists, then delete */ + if (mce->pcifunc == pcifunc && !add) { + delete = true; + break; + } + tail = mce; + } + + if (delete) { + hlist_del(&mce->node); + kfree(mce); + mce_list->count--; + return 0; + } + + if (!add) + return 0; + + /* Add a new one to the list, at the tail */ + mce = kzalloc(sizeof(*mce), GFP_KERNEL); + if (!mce) + return -ENOMEM; + mce->idx = idx; + mce->pcifunc = pcifunc; + if (!tail) + hlist_add_head(&mce->node, &mce_list->head); + else + hlist_add_behind(&mce->node, &tail->node); + mce_list->count++; + return 0; +} + +static int nix_update_bcast_mce_list(struct rvu *rvu, u16 pcifunc, bool add) +{ + int err = 0, idx, next_idx, count; + struct nix_mce_list *mce_list; + struct mce *mce, *next_mce; + struct nix_mcast *mcast; + struct nix_hw *nix_hw; + struct rvu_pfvf *pfvf; + int blkaddr; + + blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NIX, pcifunc); + if (blkaddr < 0) + return 0; + + nix_hw = get_nix_hw(rvu->hw, blkaddr); + if (!nix_hw) + return 0; + + mcast = &nix_hw->mcast; + + /* Get this PF/VF func's MCE index */ + pfvf = rvu_get_pfvf(rvu, pcifunc & ~RVU_PFVF_FUNC_MASK); + idx = pfvf->bcast_mce_idx + (pcifunc & RVU_PFVF_FUNC_MASK); + + mce_list = &pfvf->bcast_mce_list; + if (idx > (pfvf->bcast_mce_idx + mce_list->max)) { + dev_err(rvu->dev, + "%s: Idx %d > max MCE idx %d, for PF%d bcast list\n", + __func__, idx, mce_list->max, + pcifunc >> RVU_PFVF_PF_SHIFT); + return -EINVAL; + } + + spin_lock(&mcast->mce_lock); + + err = nix_update_mce_list(mce_list, pcifunc, idx, add); + if (err) + goto end; + + /* Disable MCAM entry in NPC */ + +
[PATCH 01/17] octeontx2-af: NIX Tx scheduler queues alloc/free
From: Sunil Goutham Added support for a PF/VF to allocate or free NIX transmit scheduler queues via mbox. For setting up pkt transmission priorities between queues, the scheduler queues have to be contiguous w.r.t their HW indices. So both contiguous and non-contiguous allocations are supported. Upon receiving NIX_TXSCH_FREE mbox msg all scheduler queues allocated to sending PFFUNC (PF/VF) will be freed. Selective free is not supported. Signed-off-by: Sunil Goutham Signed-off-by: Nithin Dabilpuram --- drivers/net/ethernet/marvell/octeontx2/af/mbox.h | 36 +++- drivers/net/ethernet/marvell/octeontx2/af/rvu.c| 4 +- drivers/net/ethernet/marvell/octeontx2/af/rvu.h| 9 +- .../net/ethernet/marvell/octeontx2/af/rvu_nix.c| 220 + 4 files changed, 265 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h index c339024..282e556 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h @@ -15,6 +15,7 @@ #include #include "rvu_struct.h" +#include "common.h" #define MBOX_SIZE SZ_64K @@ -151,7 +152,9 @@ M(NPA_HWCTX_DISABLE,0x403, hwctx_disable_req, msg_rsp) \ M(NIX_LF_ALLOC,0x8000, nix_lf_alloc_req, nix_lf_alloc_rsp) \ M(NIX_LF_FREE, 0x8001, msg_req, msg_rsp) \ M(NIX_AQ_ENQ, 0x8002, nix_aq_enq_req, nix_aq_enq_rsp) \ -M(NIX_HWCTX_DISABLE, 0x8003, hwctx_disable_req, msg_rsp) +M(NIX_HWCTX_DISABLE, 0x8003, hwctx_disable_req, msg_rsp) \ +M(NIX_TXSCH_ALLOC, 0x8004, nix_txsch_alloc_req, nix_txsch_alloc_rsp) \ +M(NIX_TXSCH_FREE, 0x8005, nix_txsch_free_req, msg_rsp) /* Messages initiated by AF (range 0xC00 - 0xDFF) */ #define MBOX_UP_CGX_MESSAGES \ @@ -414,4 +417,35 @@ struct nix_aq_enq_rsp { }; }; +/* Tx scheduler/shaper mailbox messages */ + +#define MAX_TXSCHQ_PER_FUNC128 + +struct nix_txsch_alloc_req { + struct mbox_msghdr hdr; + /* Scheduler queue count request at each level */ + u16 schq_contig[NIX_TXSCH_LVL_CNT]; /* No of contiguous 
queues */ + u16 schq[NIX_TXSCH_LVL_CNT]; /* No of non-contiguous queues */ +}; + +struct nix_txsch_alloc_rsp { + struct mbox_msghdr hdr; + /* Scheduler queue count allocated at each level */ + u16 schq_contig[NIX_TXSCH_LVL_CNT]; + u16 schq[NIX_TXSCH_LVL_CNT]; + /* Scheduler queue list allocated at each level */ + u16 schq_contig_list[NIX_TXSCH_LVL_CNT][MAX_TXSCHQ_PER_FUNC]; + u16 schq_list[NIX_TXSCH_LVL_CNT][MAX_TXSCHQ_PER_FUNC]; +}; + +struct nix_txsch_free_req { + struct mbox_msghdr hdr; +#define TXSCHQ_FREE_ALL BIT_ULL(0) + u16 flags; + /* Scheduler queue level to be freed */ + u16 schq_lvl; + /* List of scheduler queues to be freed */ + u16 schq; +}; + #endif /* MBOX_H */ diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c index c06cca9..9594432 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c @@ -80,7 +80,7 @@ int rvu_alloc_rsrc(struct rsrc_bmap *rsrc) return id; } -static int rvu_alloc_rsrc_contig(struct rsrc_bmap *rsrc, int nrsrc) +int rvu_alloc_rsrc_contig(struct rsrc_bmap *rsrc, int nrsrc) { int start; @@ -105,7 +105,7 @@ static void rvu_free_rsrc_contig(struct rsrc_bmap *rsrc, int nrsrc, int start) bitmap_clear(rsrc->bmap, start, nrsrc); } -static bool rvu_rsrc_check_contig(struct rsrc_bmap *rsrc, int nrsrc) +bool rvu_rsrc_check_contig(struct rsrc_bmap *rsrc, int nrsrc) { int start; diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h index b48b5af..c402eba 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h @@ -180,11 +180,12 @@ static inline u64 rvupf_read64(struct rvu *rvu, u64 offset) /* Function Prototypes * RVU */ - int rvu_alloc_bitmap(struct rsrc_bmap *rsrc); int rvu_alloc_rsrc(struct rsrc_bmap *rsrc); void rvu_free_rsrc(struct rsrc_bmap *rsrc, int id); int rvu_rsrc_free_count(struct rsrc_bmap *rsrc); +int 
rvu_alloc_rsrc_contig(struct rsrc_bmap *rsrc, int nrsrc); +bool rvu_rsrc_check_contig(struct rsrc_bmap *rsrc, int nrsrc); int rvu_get_pf(u16 pcifunc); struct rvu_pfvf *rvu_get_pfvf(struct rvu *rvu, int pcifunc); void rvu_get_pf_numvfs(struct rvu *rvu, int pf, int *numvfs, int *hwvf); @@ -270,4 +271,10 @@ int rvu_mbox_handler_NIX_AQ_ENQ(struct rvu *rvu, int rvu_mbox_handler_NIX_HWCTX_DISABLE(struct rvu *rvu, struct hwctx_disable_req *req, struct msg_rsp *rsp); +int rvu_mbox_handler_NIX_TXSCH_ALLOC
[PATCH 09/17] octeontx2-af: Support for VTAG strip and capture
From: Vamsi Attunuru Added support for PF/VF drivers to configure NIX to capture and/or strip VLAN tag from ingress packets. Signed-off-by: Vamsi Attunuru Signed-off-by: Sunil Goutham --- drivers/net/ethernet/marvell/octeontx2/af/mbox.h | 35 - drivers/net/ethernet/marvell/octeontx2/af/rvu.h| 3 ++ .../net/ethernet/marvell/octeontx2/af/rvu_nix.c| 59 ++ .../net/ethernet/marvell/octeontx2/af/rvu_struct.h | 5 ++ 4 files changed, 101 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h index f8efeaa..b60ac9d 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h @@ -156,7 +156,8 @@ M(NIX_HWCTX_DISABLE,0x8003, hwctx_disable_req, msg_rsp) \ M(NIX_TXSCH_ALLOC, 0x8004, nix_txsch_alloc_req, nix_txsch_alloc_rsp) \ M(NIX_TXSCH_FREE, 0x8005, nix_txsch_free_req, msg_rsp)\ M(NIX_TXSCHQ_CFG, 0x8006, nix_txschq_config, msg_rsp) \ -M(NIX_STATS_RST, 0x8007, msg_req, msg_rsp) +M(NIX_STATS_RST, 0x8007, msg_req, msg_rsp) \ +M(NIX_VTAG_CFG,0x8008, nix_vtag_config, msg_rsp) /* Messages initiated by AF (range 0xC00 - 0xDFF) */ #define MBOX_UP_CGX_MESSAGES \ @@ -462,4 +463,36 @@ struct nix_txschq_config { u64 regval[MAX_REGS_PER_MBOX_MSG]; }; +struct nix_vtag_config { + struct mbox_msghdr hdr; + u8 vtag_size; + /* cfg_type is '0' for tx vlan cfg +* cfg_type is '1' for rx vlan cfg +*/ + u8 cfg_type; + union { + /* valid when cfg_type is '0' */ + struct { + /* tx vlan0 tag(C-VLAN) */ + u64 vlan0; + /* tx vlan1 tag(S-VLAN) */ + u64 vlan1; + /* insert tx vlan tag */ + u8 insert_vlan :1; + /* insert tx double vlan tag */ + u8 double_vlan :1; + } tx; + + /* valid when cfg_type is '1' */ + struct { + /* rx vtag type index */ + u8 vtag_type; + /* rx vtag strip */ + u8 strip_vtag :1; + /* rx vtag capture */ + u8 capture_vtag :1; + } rx; + }; +}; + #endif /* MBOX_H */ diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h 
b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h index b39400d..1e85e80 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h @@ -319,6 +319,9 @@ int rvu_mbox_handler_NIX_TXSCHQ_CFG(struct rvu *rvu, struct msg_rsp *rsp); int rvu_mbox_handler_NIX_STATS_RST(struct rvu *rvu, struct msg_req *req, struct msg_rsp *rsp); +int rvu_mbox_handler_NIX_VTAG_CFG(struct rvu *rvu, + struct nix_vtag_config *req, + struct msg_rsp *rsp); /* NPC APIs */ int rvu_npc_init(struct rvu *rvu); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c index 8333283..7de5417 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c @@ -1143,6 +1143,65 @@ int rvu_mbox_handler_NIX_TXSCHQ_CFG(struct rvu *rvu, return 0; } +static int nix_rx_vtag_cfg(struct rvu *rvu, int nixlf, int blkaddr, + struct nix_vtag_config *req) +{ + u64 regval = 0; + +#define NIX_VTAGTYPE_MAX 0x8ull +#define NIX_VTAGSIZE_MASK 0x7ull +#define NIX_VTAGSTRIP_CAP_MASK 0x30ull + + if (req->rx.vtag_type >= NIX_VTAGTYPE_MAX || + req->vtag_size > VTAGSIZE_T8) + return -EINVAL; + + regval = rvu_read64(rvu, blkaddr, + NIX_AF_LFX_RX_VTAG_TYPEX(nixlf, req->rx.vtag_type)); + + if (req->rx.strip_vtag && req->rx.capture_vtag) + regval |= BIT_ULL(4) | BIT_ULL(5); + else if (req->rx.strip_vtag) + regval |= BIT_ULL(4); + else + regval &= ~(BIT_ULL(4) | BIT_ULL(5)); + + regval &= ~NIX_VTAGSIZE_MASK; + regval |= req->vtag_size & NIX_VTAGSIZE_MASK; + + rvu_write64(rvu, blkaddr, + NIX_AF_LFX_RX_VTAG_TYPEX(nixlf, req->rx.vtag_type), regval); + return 0; +} + +int rvu_mbox_handler_NIX_VTAG_CFG(struct rvu *rvu, + struct nix_vtag_config *req, + struct msg_rsp *rsp) +{ + struct rvu_hwinfo *hw = rvu->hw; + u16 pcifunc = req->hdr.pcifunc; + int blkaddr, nixlf, err; + + blkaddr = rvu_get_blkaddr
[PATCH 00/17] octeontx2-af: NPC parser and NIX blocks initialization
From: Sunil Goutham This patchset is a continuation of two earlier submitted patch series adding a new driver for the Resource Virtualization Unit (RVU) admin function of Marvell's OcteonTX2 SOC. 1. octeontx2-af: Add RVU Admin Function driver https://www.spinics.net/lists/netdev/msg528272.html 2. octeontx2-af: NPA and NIX blocks initialization https://www.spinics.net/lists/netdev/msg529163.html This patch series adds more NIX block configuration logic and additionally adds NPC block parser profile configuration. In brief, below is what this series adds. NIX block: - Support for PF/VF to allocate/free transmit scheduler queues, their maintenance and configuration. - Support for packet replication lists; only broadcast packets are covered for now. - Defines a few RSS flow algorithms for HW to distribute packets. This is not the hash algorithm (i.e. Toeplitz or CRC32); here SW defines which fields in the packet HW should take and calculate the hash over. - Support for PF/VF to configure VTAG strip and capture capabilities. - Reset of NIXLF statistics. NPC block: This block has multiple parser engines which support packet parsing at multiple layers and generate a parse result, which is further used to generate a key. Based on packet field offsets in the key, SW can install packet forwarding rules. This patch series adds - An initial parser profile to be programmed into the parser engines. - Default forwarding rules to forward packets to different logical interfaces having a NIXLF attached. - Support for promiscuous and multicast modes.
Geetha sowjanya (1): octeontx2-af: Config pkind for CGX mapped PFs Hao Zheng (1): octeontx2-af: Add NPC KPU profile Stanislaw Kardach (1): octeontx2-af: Add LMAC channel info to NIXLF_ALLOC response Sunil Goutham (12): octeontx2-af: NIX Tx scheduler queues alloc/free octeontx2-af: NIX Tx scheduler queue config support octeontx2-af: Config NPC KPU engines with parser profile octeontx2-af: Broadcast packet replication support octeontx2-af: Update bcast list upon NIXLF alloc/free octeontx2-af: Enable packet length and csum validation octeontx2-af: NPC MCAM and LDATA extract minimal configuration octeontx2-af: Install ucast and bcast pkt forwarding rules octeontx2-af: NIX Rx flowkey configuration for RSS octeontx2-af: Support for changing RSS algorithm octeontx2-af: Support for setting MAC address octeontx2-af: Support for NIXLF's UCAST/PROMISC/ALLMULTI modes Vamsi Attunuru (2): octeontx2-af: Reset NIXLF's Rx/Tx stats octeontx2-af: Support for VTAG strip and capture drivers/net/ethernet/marvell/octeontx2/af/Makefile |3 +- drivers/net/ethernet/marvell/octeontx2/af/cgx.c| 12 + drivers/net/ethernet/marvell/octeontx2/af/cgx.h|1 + drivers/net/ethernet/marvell/octeontx2/af/common.h | 50 + drivers/net/ethernet/marvell/octeontx2/af/mbox.h | 110 +- drivers/net/ethernet/marvell/octeontx2/af/npc.h| 262 + .../ethernet/marvell/octeontx2/af/npc_profile.h| 5709 drivers/net/ethernet/marvell/octeontx2/af/rvu.c| 12 +- drivers/net/ethernet/marvell/octeontx2/af/rvu.h| 97 +- .../net/ethernet/marvell/octeontx2/af/rvu_cgx.c|7 +- .../net/ethernet/marvell/octeontx2/af/rvu_nix.c| 1067 .../net/ethernet/marvell/octeontx2/af/rvu_npc.c| 812 +++ .../net/ethernet/marvell/octeontx2/af/rvu_reg.c| 71 + .../net/ethernet/marvell/octeontx2/af/rvu_reg.h| 61 + .../net/ethernet/marvell/octeontx2/af/rvu_struct.h | 35 + 15 files changed, 8302 insertions(+), 7 deletions(-) create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/npc.h create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/npc_profile.h 
create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/rvu_reg.c -- 2.7.4
[PATCH 15/17] octeontx2-af: Support for changing RSS algorithm
From: Sunil Goutham This patch adds support for an RVU PF/VF to change the NIX Rx flowkey algorithm index in the NPC RX RSS_ACTION, e.g. an ethtool command changing the RSS algorithm for a netdev interface would trigger this change in NPC. If the PF/VF doesn't specify any MCAM entry index, then the default UCAST entry of the NIXLF attached to the PF/VF will be updated with the RSS_ACTION and flowkey index. Signed-off-by: Sunil Goutham --- drivers/net/ethernet/marvell/octeontx2/af/mbox.h | 10 - drivers/net/ethernet/marvell/octeontx2/af/rvu.h| 5 +++ .../net/ethernet/marvell/octeontx2/af/rvu_nix.c| 51 ++ .../net/ethernet/marvell/octeontx2/af/rvu_npc.c| 43 ++ 4 files changed, 108 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h index 0e2552c..32d70bf 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h @@ -157,7 +157,8 @@ M(NIX_TXSCH_ALLOC, 0x8004, nix_txsch_alloc_req, nix_txsch_alloc_rsp) \ M(NIX_TXSCH_FREE, 0x8005, nix_txsch_free_req, msg_rsp)\ M(NIX_TXSCHQ_CFG, 0x8006, nix_txschq_config, msg_rsp) \ M(NIX_STATS_RST, 0x8007, msg_req, msg_rsp) \ -M(NIX_VTAG_CFG,0x8008, nix_vtag_config, msg_rsp) +M(NIX_VTAG_CFG,0x8008, nix_vtag_config, msg_rsp) \ +M(NIX_RSS_FLOWKEY_CFG, 0x8009, nix_rss_flowkey_cfg, msg_rsp) /* Messages initiated by AF (range 0xC00 - 0xDFF) */ #define MBOX_UP_CGX_MESSAGES \ @@ -499,4 +500,11 @@ struct nix_vtag_config { }; }; +struct nix_rss_flowkey_cfg { + struct mbox_msghdr hdr; + int mcam_index; /* MCAM entry index to modify */ + u32 flowkey_cfg; /* Flowkey types selected */ + u8 group; /* RSS context or group */ +}; + #endif /* MBOX_H */ diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h index e83d324..b169657 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h @@ -341,6 +341,9 @@ int
rvu_mbox_handler_NIX_STATS_RST(struct rvu *rvu, struct msg_req *req, int rvu_mbox_handler_NIX_VTAG_CFG(struct rvu *rvu, struct nix_vtag_config *req, struct msg_rsp *rsp); +int rvu_mbox_handler_NIX_RSS_FLOWKEY_CFG(struct rvu *rvu, +struct nix_rss_flowkey_cfg *req, +struct msg_rsp *rsp); /* NPC APIs */ int rvu_npc_init(struct rvu *rvu); @@ -352,4 +355,6 @@ void rvu_npc_install_ucast_entry(struct rvu *rvu, u16 pcifunc, void rvu_npc_install_bcast_match_entry(struct rvu *rvu, u16 pcifunc, int nixlf, u64 chan); void rvu_npc_disable_mcam_entries(struct rvu *rvu, u16 pcifunc, int nixlf); +void rvu_npc_update_flowkey_alg_idx(struct rvu *rvu, u16 pcifunc, int nixlf, + int group, int alg_idx, int mcam_index); #endif /* RVU_H */ diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c index e4c2c52..d4dcdbb 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c @@ -1536,6 +1536,57 @@ int rvu_mbox_handler_NIX_STATS_RST(struct rvu *rvu, struct msg_req *req, return 0; } +/* Returns the ALG index to be set into NPC_RX_ACTION */ +static int get_flowkey_alg_idx(u32 flow_cfg) +{ + u32 ip_cfg; + + flow_cfg &= ~FLOW_KEY_TYPE_PORT; + ip_cfg = FLOW_KEY_TYPE_IPV4 | FLOW_KEY_TYPE_IPV6; + if (flow_cfg == ip_cfg) + return FLOW_KEY_ALG_IP; + else if (flow_cfg == (ip_cfg | FLOW_KEY_TYPE_TCP)) + return FLOW_KEY_ALG_TCP; + else if (flow_cfg == (ip_cfg | FLOW_KEY_TYPE_UDP)) + return FLOW_KEY_ALG_UDP; + else if (flow_cfg == (ip_cfg | FLOW_KEY_TYPE_SCTP)) + return FLOW_KEY_ALG_SCTP; + else if (flow_cfg == (ip_cfg | FLOW_KEY_TYPE_TCP | FLOW_KEY_TYPE_UDP)) + return FLOW_KEY_ALG_TCP_UDP; + else if (flow_cfg == (ip_cfg | FLOW_KEY_TYPE_TCP | FLOW_KEY_TYPE_SCTP)) + return FLOW_KEY_ALG_TCP_SCTP; + else if (flow_cfg == (ip_cfg | FLOW_KEY_TYPE_UDP | FLOW_KEY_TYPE_SCTP)) + return FLOW_KEY_ALG_UDP_SCTP; + else if (flow_cfg == (ip_cfg | FLOW_KEY_TYPE_TCP | + FLOW_KEY_TYPE_UDP | 
FLOW_KEY_TYPE_SCTP)) + return FLOW_KEY_ALG_TCP_UDP_SCTP; + + return FLOW_KEY_ALG_PORT; +} + +int rvu_mbox_handler_NIX_RSS_FLOWKEY_CFG(struct rvu *rvu, +struct nix_rss_flowkey_cfg *req, +struct msg_rsp *rsp) +{ + struct
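The mapping from a requested flowkey_cfg bitmask to one of the nine pre-built algorithm indices can be exercised in userspace. The sketch below mirrors get_flowkey_alg_idx() from the hunk above, with the FLOW_KEY_TYPE_* bits and FLOW_KEY_ALG_* indices as defined in patch 14's common.h; it is a model for illustration, not the kernel code itself.

```c
#include <assert.h>
#include <stdint.h>

/* Bit flags and algorithm indices as defined in common.h (patch 14) */
#define FLOW_KEY_TYPE_PORT (1u << 0)
#define FLOW_KEY_TYPE_IPV4 (1u << 1)
#define FLOW_KEY_TYPE_IPV6 (1u << 2)
#define FLOW_KEY_TYPE_TCP  (1u << 3)
#define FLOW_KEY_TYPE_UDP  (1u << 4)
#define FLOW_KEY_TYPE_SCTP (1u << 5)

enum {
	FLOW_KEY_ALG_PORT,
	FLOW_KEY_ALG_IP,
	FLOW_KEY_ALG_TCP,
	FLOW_KEY_ALG_UDP,
	FLOW_KEY_ALG_SCTP,
	FLOW_KEY_ALG_TCP_UDP,
	FLOW_KEY_ALG_TCP_SCTP,
	FLOW_KEY_ALG_UDP_SCTP,
	FLOW_KEY_ALG_TCP_UDP_SCTP,
};

/* Userspace model of get_flowkey_alg_idx(): the PORT bit is masked off
 * first, so any combination not matching a pre-built L3/L4 algorithm
 * falls back to the 2-tuple (port) algorithm.
 */
static int get_flowkey_alg_idx(uint32_t flow_cfg)
{
	uint32_t ip_cfg = FLOW_KEY_TYPE_IPV4 | FLOW_KEY_TYPE_IPV6;

	flow_cfg &= ~FLOW_KEY_TYPE_PORT;
	if (flow_cfg == ip_cfg)
		return FLOW_KEY_ALG_IP;
	if (flow_cfg == (ip_cfg | FLOW_KEY_TYPE_TCP))
		return FLOW_KEY_ALG_TCP;
	if (flow_cfg == (ip_cfg | FLOW_KEY_TYPE_UDP))
		return FLOW_KEY_ALG_UDP;
	if (flow_cfg == (ip_cfg | FLOW_KEY_TYPE_SCTP))
		return FLOW_KEY_ALG_SCTP;
	if (flow_cfg == (ip_cfg | FLOW_KEY_TYPE_TCP | FLOW_KEY_TYPE_UDP))
		return FLOW_KEY_ALG_TCP_UDP;
	if (flow_cfg == (ip_cfg | FLOW_KEY_TYPE_TCP | FLOW_KEY_TYPE_SCTP))
		return FLOW_KEY_ALG_TCP_SCTP;
	if (flow_cfg == (ip_cfg | FLOW_KEY_TYPE_UDP | FLOW_KEY_TYPE_SCTP))
		return FLOW_KEY_ALG_UDP_SCTP;
	if (flow_cfg == (ip_cfg | FLOW_KEY_TYPE_TCP | FLOW_KEY_TYPE_UDP |
			 FLOW_KEY_TYPE_SCTP))
		return FLOW_KEY_ALG_TCP_UDP_SCTP;
	return FLOW_KEY_ALG_PORT;
}
```

Note the exact-equality comparisons: requesting IPv4 alone (without IPv6) matches none of the pre-built IP-based algorithms and degrades to the port algorithm.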
[PATCH 03/17] octeontx2-af: Reset NIXLF's Rx/Tx stats
From: Vamsi Attunuru This patch adds a new mailbox message to reset a NIXLF's receive and transmit HW stats. Signed-off-by: Vamsi Attunuru Signed-off-by: Sunil Goutham --- drivers/net/ethernet/marvell/octeontx2/af/mbox.h | 3 ++- drivers/net/ethernet/marvell/octeontx2/af/rvu.h| 2 ++ .../net/ethernet/marvell/octeontx2/af/rvu_nix.c| 30 ++ 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h index f2e0743..f8efeaa 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h @@ -155,7 +155,8 @@ M(NIX_AQ_ENQ, 0x8002, nix_aq_enq_req, nix_aq_enq_rsp) \ M(NIX_HWCTX_DISABLE, 0x8003, hwctx_disable_req, msg_rsp) \ M(NIX_TXSCH_ALLOC, 0x8004, nix_txsch_alloc_req, nix_txsch_alloc_rsp) \ M(NIX_TXSCH_FREE, 0x8005, nix_txsch_free_req, msg_rsp)\ -M(NIX_TXSCHQ_CFG, 0x8006, nix_txschq_config, msg_rsp) +M(NIX_TXSCHQ_CFG, 0x8006, nix_txschq_config, msg_rsp) \ +M(NIX_STATS_RST, 0x8007, msg_req, msg_rsp) /* Messages initiated by AF (range 0xC00 - 0xDFF) */ #define MBOX_UP_CGX_MESSAGES \ diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h index 4b15552..f041d0a 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h @@ -288,4 +288,6 @@ int rvu_mbox_handler_NIX_TXSCH_FREE(struct rvu *rvu, int rvu_mbox_handler_NIX_TXSCHQ_CFG(struct rvu *rvu, struct nix_txschq_config *req, struct msg_rsp *rsp); +int rvu_mbox_handler_NIX_STATS_RST(struct rvu *rvu, struct msg_req *req, + struct msg_rsp *rsp); #endif /* RVU_H */ diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c index 56f242d..62d8913 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c @@ -1053,6 +1053,36 @@ static int 
nix_setup_txschq(struct rvu *rvu, struct nix_hw *nix_hw, int blkaddr) return 0; } +int rvu_mbox_handler_NIX_STATS_RST(struct rvu *rvu, struct msg_req *req, + struct msg_rsp *rsp) +{ + struct rvu_hwinfo *hw = rvu->hw; + u16 pcifunc = req->hdr.pcifunc; + int i, nixlf, blkaddr; + u64 stats; + + blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NIX, pcifunc); + if (blkaddr < 0) + return NIX_AF_ERR_AF_LF_INVALID; + + nixlf = rvu_get_lf(rvu, &hw->block[blkaddr], pcifunc, 0); + if (nixlf < 0) + return NIX_AF_ERR_AF_LF_INVALID; + + /* Get stats count supported by HW */ + stats = rvu_read64(rvu, blkaddr, NIX_AF_CONST1); + + /* Reset tx stats */ + for (i = 0; i < ((stats >> 24) & 0xFF); i++) + rvu_write64(rvu, blkaddr, NIX_AF_LFX_TX_STATX(nixlf, i), 0); + + /* Reset rx stats */ + for (i = 0; i < ((stats >> 32) & 0xFF); i++) + rvu_write64(rvu, blkaddr, NIX_AF_LFX_RX_STATX(nixlf, i), 0); + + return 0; +} + static int nix_calibrate_x2p(struct rvu *rvu, int blkaddr) { int idx, err; -- 2.7.4
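The reset-loop bounds in the handler above come from two 8-bit fields of NIX_AF_CONST1. A small userspace sketch makes the bit positions explicit (the helper names are illustrative, not kernel APIs):

```c
#include <assert.h>
#include <stdint.h>

/* Per rvu_mbox_handler_NIX_STATS_RST() above, NIX_AF_CONST1 carries the
 * number of per-LF Tx stat registers in bits <31:24> and the number of
 * Rx stat registers in bits <39:32>.
 */
static unsigned int nix_lf_tx_stat_cnt(uint64_t const1)
{
	return (const1 >> 24) & 0xFF;
}

static unsigned int nix_lf_rx_stat_cnt(uint64_t const1)
{
	return (const1 >> 32) & 0xFF;
}
```

Reading the count from hardware rather than hardcoding it lets the same reset loop work across silicon revisions that implement different numbers of stat registers.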
[PATCH 14/17] octeontx2-af: NIX Rx flowkey configuration for RSS
From: Sunil Goutham Add NIX RX flowkey algorithm configuration to support RSS (receive side scaling). Currently, support for only L3/L4 2-tuple and 4-tuple hashing of IPv4/v6/TCP/UDP/SCTP is added. HW supports up to 32 different flowkey algorithms which SW can define; this patch defines 9. NPC RX ACTION has to point to one of these flowkey indices for RSS to work. The configuration depends on the NPC parse result's layer info, so if the NPC KPU profile changes such that the LID/LTYPE values of the above protocols change, then this configuration will most likely be affected. Signed-off-by: Sunil Goutham --- drivers/net/ethernet/marvell/octeontx2/af/common.h | 22 .../net/ethernet/marvell/octeontx2/af/rvu_nix.c| 135 + .../net/ethernet/marvell/octeontx2/af/rvu_struct.h | 30 + 3 files changed, 187 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/common.h b/drivers/net/ethernet/marvell/octeontx2/af/common.h index 6c8150d..d39ada4 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/common.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/common.h @@ -186,4 +186,26 @@ enum nix_scheduler { #define DEFAULT_RSS_CONTEXT_GROUP 0 #define MAX_RSS_INDIR_TBL_SIZE 256 /* 1 << Max adder bits */ +/* NIX flow tag, key type flags */ +#define FLOW_KEY_TYPE_PORT BIT(0) +#define FLOW_KEY_TYPE_IPV4 BIT(1) +#define FLOW_KEY_TYPE_IPV6 BIT(2) +#define FLOW_KEY_TYPE_TCP BIT(3) +#define FLOW_KEY_TYPE_UDP BIT(4) +#define FLOW_KEY_TYPE_SCTP BIT(5) + +/* NIX flow tag algorithm indices, max is 31 */ +enum { + FLOW_KEY_ALG_PORT, + FLOW_KEY_ALG_IP, + FLOW_KEY_ALG_TCP, + FLOW_KEY_ALG_UDP, + FLOW_KEY_ALG_SCTP, + FLOW_KEY_ALG_TCP_UDP, + FLOW_KEY_ALG_TCP_SCTP, + FLOW_KEY_ALG_UDP_SCTP, + FLOW_KEY_ALG_TCP_UDP_SCTP, + FLOW_KEY_ALG_MAX, +}; + #endif /* COMMON_H */ diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c index fbe4ff0..e4c2c52 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +++ 
b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c @@ -1536,6 +1536,139 @@ int rvu_mbox_handler_NIX_STATS_RST(struct rvu *rvu, struct msg_req *req, return 0; } +static void set_flowkey_fields(struct nix_rx_flowkey_alg *alg, u32 flow_cfg) +{ + struct nix_rx_flowkey_alg *field = NULL; + int idx, key_type; + + if (!alg) + return; + + /* FIELD0: IPv4 +* FIELD1: IPv6 +* FIELD2: TCP/UDP/SCTP/ALL +* FIELD3: Unused +* FIELD4: Unused +* +* Each of the 32 possible flow key algorithm definitions should +* fall into above incremental config (except ALG0). Otherwise a +* single NPC MCAM entry is not sufficient for supporting RSS. +* +* If a different definition or combination needed then NPC MCAM +* has to be programmed to filter such pkts and it's action should +* point to this definition to calculate flowtag or hash. +*/ + for (idx = 0; idx < 32; idx++) { + key_type = flow_cfg & BIT_ULL(idx); + if (!key_type) + continue; + switch (key_type) { + case FLOW_KEY_TYPE_PORT: + field = &alg[0]; + field->sel_chan = true; + /* This should be set to 1, when SEL_CHAN is set */ + field->bytesm1 = 1; + break; + case FLOW_KEY_TYPE_IPV4: + field = &alg[0]; + field->lid = NPC_LID_LC; + field->ltype_match = NPC_LT_LC_IP; + field->hdr_offset = 12; /* SIP offset */ + field->bytesm1 = 7; /* SIP + DIP, 8 bytes */ + field->ltype_mask = 0xF; /* Match only IPv4 */ + break; + case FLOW_KEY_TYPE_IPV6: + field = &alg[1]; + field->lid = NPC_LID_LC; + field->ltype_match = NPC_LT_LC_IP6; + field->hdr_offset = 8; /* SIP offset */ + field->bytesm1 = 31; /* SIP + DIP, 32 bytes */ + field->ltype_mask = 0xF; /* Match only IPv6 */ + break; + case FLOW_KEY_TYPE_TCP: + case FLOW_KEY_TYPE_UDP: + case FLOW_KEY_TYPE_SCTP: + field = &alg[2]; + field->lid = NPC_LID_LD; + field->bytesm1 = 3; /* Sport + Dport, 4 bytes */ + if (key_type == FLOW_KEY_TYPE_TCP) + field->ltype_match |= NPC_LT_LD_TCP; + else if (key_type == FLOW_KEY_TYPE_UDP) + field->ltype_match |= NPC_LT_LD_UDP; +
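The per-key-type field layout chosen by set_flowkey_fields() amounts to an (offset, length) extent into the matched header, where bytesm1 is "bytes to hash, minus one". The sketch below restates the extents from the switch statement above as a userspace table; the struct and function names are illustrative.

```c
#include <assert.h>
#include <stdint.h>

#define FLOW_KEY_TYPE_IPV4 (1u << 1)
#define FLOW_KEY_TYPE_IPV6 (1u << 2)
#define FLOW_KEY_TYPE_TCP  (1u << 3)

struct flow_extent {
	int hdr_offset; /* byte offset into the matched layer's header */
	int bytesm1;    /* number of bytes hashed, minus one */
};

/* Restates the extents programmed by set_flowkey_fields():
 *   IPv4: SIP+DIP, 8 bytes at header offset 12
 *   IPv6: SIP+DIP, 32 bytes at header offset 8
 *   TCP/UDP/SCTP: sport+dport, 4 bytes at the start of the L4 header
 */
static struct flow_extent flowkey_extent(uint32_t key_type)
{
	switch (key_type) {
	case FLOW_KEY_TYPE_IPV4:
		return (struct flow_extent){ 12, 7 };
	case FLOW_KEY_TYPE_IPV6:
		return (struct flow_extent){ 8, 31 };
	default: /* TCP/UDP/SCTP share field slot 2 */
		return (struct flow_extent){ 0, 3 };
	}
}
```

Because IPv4 and IPv6 occupy different field slots (FIELD0 vs FIELD1) while TCP/UDP/SCTP share FIELD2, one algorithm definition can hash either IP version plus whichever L4 header the parser matched.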
[PATCH 02/17] octeontx2-af: NIX Tx scheduler queue config support
From: Sunil Goutham This patch adds support for a PF/VF driver to configure NIX transmit scheduler queues via mbox. Since PF/VF doesn't know the absolute HW index of the NIXLF attached to it, AF traps the register config and overwrites with the correct NIXLF index. HW supports shaping, colouring and policing of packets with these multilevel traffic scheduler queues. Instead of introducing different mbox message formats for different configurations and making both AF & PF/VF driver implementation cumbersome, access to the scheduler queue's CSRs is provided via mbox. AF checks whether the sender PF/VF has the corresponding queue allocated or not and dumps the config to HW. With a single mbox msg 20 registers can be configured. Signed-off-by: Sunil Goutham --- drivers/net/ethernet/marvell/octeontx2/af/Makefile | 3 +- drivers/net/ethernet/marvell/octeontx2/af/mbox.h | 15 ++- drivers/net/ethernet/marvell/octeontx2/af/rvu.h| 11 +++ .../net/ethernet/marvell/octeontx2/af/rvu_nix.c| 104 - .../net/ethernet/marvell/octeontx2/af/rvu_reg.c| 71 ++ 5 files changed, 199 insertions(+), 5 deletions(-) create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/rvu_reg.c diff --git a/drivers/net/ethernet/marvell/octeontx2/af/Makefile b/drivers/net/ethernet/marvell/octeontx2/af/Makefile index 45b108f..264cbd7 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/Makefile +++ b/drivers/net/ethernet/marvell/octeontx2/af/Makefile @@ -7,4 +7,5 @@ obj-$(CONFIG_OCTEONTX2_MBOX) += octeontx2_mbox.o obj-$(CONFIG_OCTEONTX2_AF) += octeontx2_af.o octeontx2_mbox-y := mbox.o -octeontx2_af-y := cgx.o rvu.o rvu_cgx.o rvu_npa.o rvu_nix.o +octeontx2_af-y := cgx.o rvu.o rvu_cgx.o rvu_npa.o rvu_nix.o \ + rvu_reg.o diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h index 282e556..f2e0743 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h @@ -154,7 +154,8 @@ M(NIX_LF_FREE, 
0x8001, msg_req, msg_rsp) \ M(NIX_AQ_ENQ, 0x8002, nix_aq_enq_req, nix_aq_enq_rsp) \ M(NIX_HWCTX_DISABLE, 0x8003, hwctx_disable_req, msg_rsp) \ M(NIX_TXSCH_ALLOC, 0x8004, nix_txsch_alloc_req, nix_txsch_alloc_rsp) \ -M(NIX_TXSCH_FREE, 0x8005, nix_txsch_free_req, msg_rsp) +M(NIX_TXSCH_FREE, 0x8005, nix_txsch_free_req, msg_rsp)\ +M(NIX_TXSCHQ_CFG, 0x8006, nix_txschq_config, msg_rsp) /* Messages initiated by AF (range 0xC00 - 0xDFF) */ #define MBOX_UP_CGX_MESSAGES \ @@ -448,4 +449,16 @@ struct nix_txsch_free_req { u16 schq; }; +struct nix_txschq_config { + struct mbox_msghdr hdr; + u8 lvl; /* SMQ/MDQ/TL4/TL3/TL2/TL1 */ +#define TXSCHQ_IDX_SHIFT 16 +#define TXSCHQ_IDX_MASK(BIT_ULL(10) - 1) +#define TXSCHQ_IDX(reg, shift) (((reg) >> (shift)) & TXSCHQ_IDX_MASK) + u8 num_regs; +#define MAX_REGS_PER_MBOX_MSG 20 + u64 reg[MAX_REGS_PER_MBOX_MSG]; + u64 regval[MAX_REGS_PER_MBOX_MSG]; +}; + #endif /* MBOX_H */ diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h index c402eba..4b15552 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h @@ -195,6 +195,14 @@ int rvu_lf_reset(struct rvu *rvu, struct rvu_block *block, int lf); int rvu_get_blkaddr(struct rvu *rvu, int blktype, u16 pcifunc); int rvu_poll_reg(struct rvu *rvu, u64 block, u64 offset, u64 mask, bool zero); +/* RVU HW reg validation */ +enum regmap_block { + TXSCHQ_HWREGMAP = 0, + MAX_HWREGMAP, +}; + +bool rvu_check_valid_reg(int regmap, int regblk, u64 reg); + /* NPA/NIX AQ APIs */ int rvu_aq_alloc(struct rvu *rvu, struct admin_queue **ad_queue, int qsize, int inst_size, int res_size); @@ -277,4 +285,7 @@ int rvu_mbox_handler_NIX_TXSCH_ALLOC(struct rvu *rvu, int rvu_mbox_handler_NIX_TXSCH_FREE(struct rvu *rvu, struct nix_txsch_free_req *req, struct msg_rsp *rsp); +int rvu_mbox_handler_NIX_TXSCHQ_CFG(struct rvu *rvu, + struct nix_txschq_config *req, + struct msg_rsp *rsp); #endif /* RVU_H */ diff --git 
a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c index e8374d9..56f242d 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c @@ -738,10 +738,10 @@ static void nix_reset_tx_linkcfg(struct rvu *rvu, int blkaddr, if (lvl == NIX_TXSCH_LVL_TL4) rvu_write64(rvu, blkaddr, NIX_AF_TL4X_SDP_LINK_CFG(schq), 0x00); - if (lvl != NIX_TXSCH_LVL_TL3) + if (lvl != NIX_TXSCH_L
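Since the PF/VF passes raw register offsets in nix_txschq_config and does not know the absolute HW queue index, the AF extracts the 10-bit queue index embedded in each trapped offset and rewrites it. The extraction macro comes from the mbox.h hunk above; the rewrite helper is an illustrative sketch of what the AF does, not code from the patch.

```c
#include <assert.h>
#include <stdint.h>

#define BIT_ULL(n)		(1ULL << (n))
#define TXSCHQ_IDX_SHIFT	16
#define TXSCHQ_IDX_MASK		(BIT_ULL(10) - 1)
#define TXSCHQ_IDX(reg, shift)	(((reg) >> (shift)) & TXSCHQ_IDX_MASK)

/* Illustrative: replace the 10-bit queue index in a trapped register
 * offset with the absolute HW index the AF resolved for this PF/VF.
 */
static uint64_t txschq_rewrite_idx(uint64_t reg, uint64_t hw_idx)
{
	reg &= ~(TXSCHQ_IDX_MASK << TXSCHQ_IDX_SHIFT);
	return reg | ((hw_idx & TXSCHQ_IDX_MASK) << TXSCHQ_IDX_SHIFT);
}
```

Validating and rewriting offsets this way is what lets a single mbox message format carry up to MAX_REGS_PER_MBOX_MSG (20) arbitrary scheduler-queue CSR writes.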
[PATCH 07/17] octeontx2-af: Broadcast packet replication support
From: Sunil Goutham Allocate memory for mcast/bcast/mirror replication entry contexts, replication buffers (used by HW) and config HW with corresponding memory bases. Added support for installing MCEs via NIX AQ mbox. For now support is restricted to broadcast pkt replication, hence MCE table size and number of replication buffers allocated are less. Each CGX LMAC mapped RVU PF is assigned a MCE table of size 'num VFs of that PF + PF'. Signed-off-by: Sunil Goutham --- drivers/net/ethernet/marvell/octeontx2/af/rvu.h| 19 ++ .../net/ethernet/marvell/octeontx2/af/rvu_nix.c| 201 + 2 files changed, 220 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h index b3dbbd6..b39400d 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h @@ -59,6 +59,20 @@ struct rvu_block { unsigned char name[NAME_SIZE]; }; +struct nix_mcast { + struct qmem *mce_ctx; + struct qmem *mcast_buf; + int replay_pkind; + int next_free_mce; + spinlock_t mce_lock; /* Serialize MCE updates */ +}; + +struct nix_mce_list { + struct hlist_head head; + int count; + int max; +}; + /* Structure for per RVU func info ie PF/VF */ struct rvu_pfvf { boolnpalf; /* Only one NPALF per RVU_FUNC */ @@ -93,6 +107,10 @@ struct rvu_pfvf { unsigned long *cq_bmap; u8 mac_addr[ETH_ALEN]; /* MAC address of this PF/VF */ + + /* Broadcast pkt replication info */ + u16 bcast_mce_idx; + struct nix_mce_list bcast_mce_list; }; struct nix_txsch { @@ -108,6 +126,7 @@ struct npc_pkind { struct nix_hw { struct nix_txsch txsch[NIX_TXSCH_LVL_CNT]; /* Tx schedulers */ + struct nix_mcast mcast; }; struct rvu_hwinfo { diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c index 4f2528e..947424a 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c @@ -16,6 +16,61 @@ #include 
"rvu.h" #include "cgx.h" +enum mc_tbl_sz { + MC_TBL_SZ_256, + MC_TBL_SZ_512, + MC_TBL_SZ_1K, + MC_TBL_SZ_2K, + MC_TBL_SZ_4K, + MC_TBL_SZ_8K, + MC_TBL_SZ_16K, + MC_TBL_SZ_32K, + MC_TBL_SZ_64K, +}; + +enum mc_buf_cnt { + MC_BUF_CNT_8, + MC_BUF_CNT_16, + MC_BUF_CNT_32, + MC_BUF_CNT_64, + MC_BUF_CNT_128, + MC_BUF_CNT_256, + MC_BUF_CNT_512, + MC_BUF_CNT_1024, + MC_BUF_CNT_2048, +}; + +/* For now considering MC resources needed for broadcast + * pkt replication only. i.e 256 HWVFs + 12 PFs. + */ +#define MC_TBL_SIZEMC_TBL_SZ_512 +#define MC_BUF_CNT MC_BUF_CNT_128 + +struct mce { + struct hlist_node node; + u16 idx; + u16 pcifunc; +}; + +static void nix_mce_list_init(struct nix_mce_list *list, int max) +{ + INIT_HLIST_HEAD(&list->head); + list->count = 0; + list->max = max; +} + +static u16 nix_alloc_mce_list(struct nix_mcast *mcast, int count) +{ + int idx; + + if (!mcast) + return 0; + + idx = mcast->next_free_mce; + mcast->next_free_mce += count; + return idx; +} + static inline struct nix_hw *get_nix_hw(struct rvu_hwinfo *hw, int blkaddr) { if (blkaddr == BLKADDR_NIX0 && hw->nix0) @@ -315,6 +370,19 @@ static int rvu_nix_aq_enq_inst(struct rvu *rvu, struct nix_aq_enq_req *req, (req->qidx >= (256UL << (cfg & 0xF rc = NIX_AF_ERR_AQ_ENQUEUE; break; + case NIX_AQ_CTYPE_MCE: + cfg = rvu_read64(rvu, blkaddr, NIX_AF_RX_MCAST_CFG); + /* Check if index exceeds MCE list length */ + if (!hw->nix0->mcast.mce_ctx || + (req->qidx >= (256UL << (cfg & 0xF + rc = NIX_AF_ERR_AQ_ENQUEUE; + + /* Adding multicast lists for requests from PF/VFs is not +* yet supported, so ignore this. +*/ + if (rsp) + rc = NIX_AF_ERR_AQ_ENQUEUE; + break; default: rc = NIX_AF_ERR_AQ_ENQUEUE; } @@ -361,6 +429,9 @@ static int rvu_nix_aq_enq_inst(struct rvu *rvu, struct nix_aq_enq_req *req, else if (req->ctype == NIX_AQ_CTYPE_RSS) memcpy(mask, &req->rss_mask, sizeof(struct nix_rsse_s)); + else if (req->ctype == NIX_AQ_CTYPE_MCE) + memcpy(mask, &req->mce_mask, + sizeof(struct nix_rx_mce_s)); /* Fall through */
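nix_alloc_mce_list() in the hunk above is a simple bump allocator over the MCE context table: there is no free or reuse at this stage, only a watermark that advances by the requested count. A userspace model of the same logic:

```c
#include <assert.h>
#include <stddef.h>

struct nix_mcast {
	int next_free_mce;
};

/* Model of nix_alloc_mce_list(): hand out 'count' consecutive MCE
 * indices starting at the current watermark, then advance it.
 */
static int nix_alloc_mce_list(struct nix_mcast *mcast, int count)
{
	int idx;

	if (!mcast)
		return 0;

	idx = mcast->next_free_mce;
	mcast->next_free_mce += count;
	return idx;
}
```

Each CGX-LMAC-mapped PF reserves a contiguous range of "num VFs + 1" entries this way for its broadcast replication list, which is why MC_TBL_SZ_512 suffices for 256 HWVFs plus 12 PFs.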
[PATCH 06/17] octeontx2-af: Config pkind for CGX mapped PFs
From: Geetha sowjanya For each CGX LMAC that is mapped to an RVU PF, allocate a pkind and configure the same in CGX. For a packet received at the CGX LMAC interface, this pkind is used by the NPC block to start parsing the packet. Signed-off-by: Geetha sowjanya Signed-off-by: Sunil Goutham --- drivers/net/ethernet/marvell/octeontx2/af/cgx.c| 12 drivers/net/ethernet/marvell/octeontx2/af/cgx.h| 1 + drivers/net/ethernet/marvell/octeontx2/af/common.h | 6 drivers/net/ethernet/marvell/octeontx2/af/rvu.h| 4 +++ .../net/ethernet/marvell/octeontx2/af/rvu_cgx.c| 7 +++-- .../net/ethernet/marvell/octeontx2/af/rvu_nix.c| 34 ++ .../net/ethernet/marvell/octeontx2/af/rvu_npc.c| 28 ++ 7 files changed, 90 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c index 352501b..12db256 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c @@ -186,6 +186,18 @@ u64 cgx_lmac_addr_get(u8 cgx_id, u8 lmac_id) } EXPORT_SYMBOL(cgx_lmac_addr_get); +int cgx_set_pkind(void *cgxd, u8 lmac_id, int pkind) +{ + struct cgx *cgx = cgxd; + + if (!cgx || lmac_id >= cgx->lmac_count) + return -ENODEV; + + cgx_write(cgx, lmac_id, CGXX_CMRX_RX_ID_MAP, (pkind & 0x3F)); + return 0; +} +EXPORT_SYMBOL(cgx_set_pkind); + static inline u8 cgx_get_lmac_type(struct cgx *cgx, int lmac_id) { u64 cfg; diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.h b/drivers/net/ethernet/marvell/octeontx2/af/cgx.h index ada25ed..0a66d27 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.h @@ -97,6 +97,7 @@ extern struct pci_driver cgx_driver; int cgx_get_cgx_cnt(void); int cgx_get_lmac_cnt(void *cgxd); void *cgx_get_pdata(int cgx_id); +int cgx_set_pkind(void *cgxd, u8 lmac_id, int pkind); int cgx_lmac_evh_register(struct cgx_event_cb *cb, void *cgxd, int lmac_id); int cgx_get_tx_stats(void *cgxd, int lmac_id, int idx, u64 *tx_stat); 
int cgx_get_rx_stats(void *cgxd, int lmac_id, int idx, u64 *rx_stat); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/common.h b/drivers/net/ethernet/marvell/octeontx2/af/common.h index 28eb691..7c53ba3 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/common.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/common.h @@ -143,6 +143,12 @@ enum nix_scheduler { NIX_TXSCH_LVL_CNT = 0x5, }; +#define NIX_INTF_TYPE_CGX 0 +#define NIX_INTF_TYPE_LBK 1 + +#define MAX_LMAC_PKIND 12 +#define NIX_LINK_CGX_LMAC(a, b)(0 + 4 * (a) + (b)) + /* NIX LSO format indices. * As of now TSO is the only one using, so statically assigning indices. */ diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h index e8e16a7..b3dbbd6 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h @@ -67,6 +67,7 @@ struct rvu_pfvf { u16 ssow; u16 cptlfs; u16 timlfs; + u8 cgx_lmac; /* Block LF's MSIX vector info */ struct rsrc_bmap msix; /* Bitmap for MSIX vector alloc */ @@ -230,6 +231,7 @@ static inline void rvu_get_cgx_lmac_id(u8 map, u8 *cgx_id, u8 *lmac_id) int rvu_cgx_probe(struct rvu *rvu); void rvu_cgx_wq_destroy(struct rvu *rvu); +void *rvu_cgx_pdata(u8 cgx_id, struct rvu *rvu); int rvu_cgx_config_rxtx(struct rvu *rvu, u16 pcifunc, bool start); int rvu_mbox_handler_CGX_START_RXTX(struct rvu *rvu, struct msg_req *req, struct msg_rsp *rsp); @@ -302,4 +304,6 @@ int rvu_mbox_handler_NIX_STATS_RST(struct rvu *rvu, struct msg_req *req, /* NPC APIs */ int rvu_npc_init(struct rvu *rvu); void rvu_npc_freemem(struct rvu *rvu); +int rvu_npc_get_pkind(struct rvu *rvu, u16 pf); +void rvu_npc_set_pkind(struct rvu *rvu, int pkind, struct rvu_pfvf *pfvf); #endif /* RVU_H */ diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c index e0aee21..188185c 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c +++ 
b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c @@ -50,7 +50,7 @@ static inline u8 cgxlmac_id_to_bmap(u8 cgx_id, u8 lmac_id) return ((cgx_id & 0xF) << 4) | (lmac_id & 0xF); } -static void *rvu_cgx_pdata(u8 cgx_id, struct rvu *rvu) +void *rvu_cgx_pdata(u8 cgx_id, struct rvu *rvu) { if (cgx_id >= rvu->cgx_cnt) return NULL; @@ -60,10 +60,11 @@ static void *rvu_cgx_pdata(u8 cgx_id, struct rvu *rvu) static int rvu_map_cgx_lmac_pf(struct rvu *rvu) { + struct npc_pkind *pkind = &rvu->hw->pkind; int cgx_cnt = rvu->cgx_cnt; int cgx, lmac_cnt, lmac; int pf =
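Two small bit-packing details from this patch can be checked standalone: cgxlmac_id_to_bmap() packs the CGX id and LMAC id into one map byte, and cgx_set_pkind() writes only the low 6 bits of the pkind into CGXX_CMRX_RX_ID_MAP. The sketch below models both (the second helper name is illustrative; the kernel writes the masked value directly).

```c
#include <assert.h>
#include <stdint.h>

/* Model of cgxlmac_id_to_bmap() from rvu_cgx.c: CGX id in the high
 * nibble, LMAC id in the low nibble of an 8-bit map value.
 */
static uint8_t cgxlmac_id_to_bmap(uint8_t cgx_id, uint8_t lmac_id)
{
	return (uint8_t)(((cgx_id & 0xF) << 4) | (lmac_id & 0xF));
}

/* cgx_set_pkind() masks the pkind to 6 bits before the register write,
 * matching MAX_LMAC_PKIND (12) fitting comfortably in that field.
 */
static uint64_t cgx_rx_id_map_val(int pkind)
{
	return (uint64_t)(pkind & 0x3F);
}
```

The nibble packing caps the scheme at 16 CGX devices of 16 LMACs each, which is why both ids are masked with 0xF before combining.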
[PATCH 10/17] octeontx2-af: Enable packet length and csum validation
From: Sunil Goutham Config NPC layer info from KPU profile into protocol checker to identify outer L2/IPv4/TCP/UDP headers in a packet. And enable IPv4 checksum validation. L3/L4 and L4 CSUM validation will be enabled by PF/VF drivers by configuring NIX_AF_LF(0..127)_RX_CFG via mbox i.e 'nix_lf_alloc_req->rx_cfg' Also enable setting of NPC_RESULT_S[L2B] when an outer L2 broadcast address is detected. This will help in installing NPC MCAM rules for broadcast packets. Signed-off-by: Sunil Goutham --- drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c | 14 ++ drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c | 14 ++ 2 files changed, 28 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c index 7de5417..02e1d16 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c @@ -14,6 +14,7 @@ #include "rvu_struct.h" #include "rvu_reg.h" #include "rvu.h" +#include "npc.h" #include "cgx.h" static int nix_update_bcast_mce_list(struct rvu *rvu, u16 pcifunc, bool add); @@ -1630,6 +1631,19 @@ int rvu_nix_init(struct rvu *rvu) err = nix_setup_mcast(rvu, hw->nix0, blkaddr); if (err) return err; + + /* Config Outer L2, IP, TCP and UDP's NPC layer info. +* This helps HW protocol checker to identify headers +* and validate length and checksums. 
+*/ + rvu_write64(rvu, blkaddr, NIX_AF_RX_DEF_OL2, + (NPC_LID_LA << 8) | (NPC_LT_LA_ETHER << 4) | 0x0F); + rvu_write64(rvu, blkaddr, NIX_AF_RX_DEF_OUDP, + (NPC_LID_LD << 8) | (NPC_LT_LD_UDP << 4) | 0x0F); + rvu_write64(rvu, blkaddr, NIX_AF_RX_DEF_OTCP, + (NPC_LID_LD << 8) | (NPC_LT_LD_TCP << 4) | 0x0F); + rvu_write64(rvu, blkaddr, NIX_AF_RX_DEF_OIP4, + (NPC_LID_LC << 8) | (NPC_LT_LC_IP << 4) | 0x0F); } return 0; } diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c index a973895..cc1d8c9 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c @@ -220,6 +220,20 @@ int rvu_npc_init(struct rvu *rvu) /* Configure KPU profile */ npc_parser_profile_init(rvu, blkaddr); + /* Config Outer L2, IPv4's NPC layer info */ + rvu_write64(rvu, blkaddr, NPC_AF_PCK_DEF_OL2, + (NPC_LID_LA << 8) | (NPC_LT_LA_ETHER << 4) | 0x0F); + rvu_write64(rvu, blkaddr, NPC_AF_PCK_DEF_OIP4, + (NPC_LID_LC << 8) | (NPC_LT_LC_IP << 4) | 0x0F); + + /* Enable below for Rx pkts. +* - Outer IPv4 header checksum validation. +* - Detect outer L2 broadcast address and set NPC_RESULT_S[L2M]. +*/ + rvu_write64(rvu, blkaddr, NPC_AF_PCK_CFG, + rvu_read64(rvu, blkaddr, NPC_AF_PCK_CFG) | + BIT_ULL(6) | BIT_ULL(2)); + return 0; } -- 2.7.4
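Every NIX_AF_RX_DEF_* and NPC_AF_PCK_DEF_* write in this patch uses the same encoding: layer id in bits <15:8>, layer type in bits <7:4>, and a 0x0F layer-type mask in the low nibble. A helper makes the encoding explicit; note that the actual NPC_LID_*/NPC_LT_* numeric values live in npc.h and are not shown in this patch, so the test below uses placeholder values.

```c
#include <assert.h>
#include <stdint.h>

/* Build the layer-definition value written to NIX_AF_RX_DEF_* and
 * NPC_AF_PCK_DEF_*: (lid << 8) | (ltype << 4) | ltype_mask, with the
 * mask fixed at 0x0F to match the layer type exactly.
 */
static uint64_t npc_layer_def(uint8_t lid, uint8_t ltype)
{
	return ((uint64_t)lid << 8) | ((uint64_t)ltype << 4) | 0x0F;
}
```

Keeping these definitions derived from the KPU profile's LID/LTYPE constants (rather than hardcoded numbers) is what ties the protocol checker to the parser profile, as the commit message notes.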
Re: [PATCH net-next] octeontx2-af: Remove set but not used variable 'block'
On Fri, Oct 19, 2018 at 6:11 PM YueHaibing wrote: > > Fixes gcc '-Wunused-but-set-variable' warning: > > drivers/net/ethernet/marvell/octeontx2/af/rvu_npa.c: In function > 'rvu_npa_init': > drivers/net/ethernet/marvell/octeontx2/af/rvu_npa.c:446:20: warning: > variable 'block' set but not used [-Wunused-but-set-variable] > > It never used since introduction in > commit 7a37245ef23f ("octeontx2-af: NPA block admin queue init") > > Signed-off-by: YueHaibing > --- > drivers/net/ethernet/marvell/octeontx2/af/rvu_npa.c | 3 --- > 1 file changed, 3 deletions(-) > > diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npa.c > b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npa.c > index 0e43a69..7531fdc 100644 > --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npa.c > +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npa.c > @@ -443,15 +443,12 @@ static int npa_aq_init(struct rvu *rvu, struct > rvu_block *block) > int rvu_npa_init(struct rvu *rvu) > { > struct rvu_hwinfo *hw = rvu->hw; > - struct rvu_block *block; > int blkaddr, err; > > blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPA, 0); > if (blkaddr < 0) > return 0; > > - block = &hw->block[blkaddr]; > - > /* Initialize admin queue */ > err = npa_aq_init(rvu, &hw->block[blkaddr]); > if (err) > Thanks for the patch. Which GCC version do you use ? Before submitting patches I did test compiling specifically with these "make arch=X86 -j8 -Werror=unused-function -Wunused-but-set-variable" but that didn't throw these warnings. Thanks, Sunil.
[PATCH net-next] octeontx2-af: Remove set but not used variable 'block'
Fixes gcc '-Wunused-but-set-variable' warning: drivers/net/ethernet/marvell/octeontx2/af/rvu_npa.c: In function 'rvu_npa_init': drivers/net/ethernet/marvell/octeontx2/af/rvu_npa.c:446:20: warning: variable 'block' set but not used [-Wunused-but-set-variable] It has never been used since its introduction in commit 7a37245ef23f ("octeontx2-af: NPA block admin queue init") Signed-off-by: YueHaibing --- drivers/net/ethernet/marvell/octeontx2/af/rvu_npa.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npa.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npa.c index 0e43a69..7531fdc 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npa.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npa.c @@ -443,15 +443,12 @@ static int npa_aq_init(struct rvu *rvu, struct rvu_block *block) int rvu_npa_init(struct rvu *rvu) { struct rvu_hwinfo *hw = rvu->hw; - struct rvu_block *block; int blkaddr, err; blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPA, 0); if (blkaddr < 0) return 0; - block = &hw->block[blkaddr]; - /* Initialize admin queue */ err = npa_aq_init(rvu, &hw->block[blkaddr]); if (err)
[PATCH net-next] igc: Remove set but not used variable 'pci_using_dac'
Fixes gcc '-Wunused-but-set-variable' warning: drivers/net/ethernet/intel/igc/igc_main.c: In function 'igc_probe': drivers/net/ethernet/intel/igc/igc_main.c:3535:11: warning: variable 'pci_using_dac' set but not used [-Wunused-but-set-variable] It has never been used since its introduction in commit d89f88419f99 ("igc: Add skeletal frame for Intel(R) 2.5G Ethernet Controller support") Signed-off-by: YueHaibing --- drivers/net/ethernet/intel/igc/igc_main.c | 5 + 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 9d85707..06a4afbe 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -3532,19 +3532,16 @@ static int igc_probe(struct pci_dev *pdev, struct net_device *netdev; struct igc_hw *hw; const struct igc_info *ei = igc_info_tbl[ent->driver_data]; - int err, pci_using_dac; + int err; err = pci_enable_device_mem(pdev); if (err) return err; - pci_using_dac = 0; err = dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)); if (!err) { err = dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64)); - if (!err) - pci_using_dac = 1; } else { err = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32)); if (err) {