Re: [PATCH net] net/act_pedit: fix an error code

2017-06-14 Thread Amir Vadai
On Wed, Jun 14, 2017 at 01:29:31PM +0300, Dan Carpenter wrote:
> I'm reviewing static checker warnings where we do ERR_PTR(0), which is
> the same as NULL.  I'm pretty sure we intended to return ERR_PTR(-EINVAL)
> here.  Sometimes these bugs lead to a NULL dereference but I don't
> immediately see that problem here.
> 
> Fixes: 71d0ed7079df ("net/act_pedit: Support using offset relative to the 
> conventional network headers")
> Signed-off-by: Dan Carpenter <dan.carpen...@oracle.com>
> 
You are right, it was intended to be -EINVAL. Thanks.

Acked-by: Amir Vadai <a...@vadai.me>


[PATCH iproute2 master 3/4] pedit: Check for extended capability in protocol parser

2017-05-14 Thread Amir Vadai
Do not allow using eth and udp header types if non-extended pedit kABI
is being used. Other protocol parsers already have this check.

Signed-off-by: Amir Vadai <a...@vadai.me>
---
 tc/p_eth.c | 3 +++
 tc/p_udp.c | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/tc/p_eth.c b/tc/p_eth.c
index ad3e28f80eb6..2d2f96ca2f0f 100644
--- a/tc/p_eth.c
+++ b/tc/p_eth.c
@@ -34,6 +34,9 @@ parse_eth(int *argc_p, char ***argv_p,
if (argc < 2)
return -1;
 
+   if (!sel->extended)
+   return -1;
+
tkey->htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
 
if (strcmp(*argv, "type") == 0) {
diff --git a/tc/p_udp.c b/tc/p_udp.c
index a56a1b519254..3916d9586040 100644
--- a/tc/p_udp.c
+++ b/tc/p_udp.c
@@ -34,6 +34,9 @@ parse_udp(int *argc_p, char ***argv_p,
if (argc < 2)
return -1;
 
+   if (!sel->extended)
+   return -1;
+
tkey->htype = TCA_PEDIT_KEY_EX_HDR_TYPE_UDP;
 
if (strcmp(*argv, "sport") == 0) {
-- 
2.12.2



[PATCH iproute2 master 2/4] pedit: Do not allow using retain for too big fields

2017-05-14 Thread Amir Vadai
Using retain for fields longer than 32 bits is not supported.
Do not allow user to do it.

Signed-off-by: Amir Vadai <a...@vadai.me>
---
 man/man8/tc-pedit.8 | 3 ++-
 tc/m_pedit.c| 6 ++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/man/man8/tc-pedit.8 b/man/man8/tc-pedit.8
index 7f482eafc6c7..9c4d57b972cc 100644
--- a/man/man8/tc-pedit.8
+++ b/man/man8/tc-pedit.8
@@ -266,7 +266,8 @@ Keep the addressed data as is.
 .BI retain " RVAL"
 This optional extra part of
 .I CMD_SPEC
-allows to exclude bits from being changed.
+allows to exclude bits from being changed. Supported only for 32 bits fields
+or smaller.
 .TP
 .I CONTROL
 The following keywords allow to control how the tree of qdisc, classes,
diff --git a/tc/m_pedit.c b/tc/m_pedit.c
index 7ef2acc52bce..9b74c965932e 100644
--- a/tc/m_pedit.c
+++ b/tc/m_pedit.c
@@ -353,6 +353,12 @@ int parse_cmd(int *argc_p, char ***argv_p, __u32 len, int 
type, __u32 retain,
argv++;
}
 
+   if (len > 4 && retain != ~0) {
+   fprintf(stderr,
+   "retain is not supported for fields longer the 32 
bits\n");
+   return -1;
+   }
+
if (type == TMAC) {
res = pack_mac(sel, tkey, (__u8 *)val);
goto done;
-- 
2.12.2



[PATCH iproute2 master 1/4] pedit: Fix a typo in warning

2017-05-14 Thread Amir Vadai
'ex' attribute should be placed after 'action pedit' and not after
'munge'.

Signed-off-by: Amir Vadai <a...@vadai.me>
---
 tc/m_pedit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tc/m_pedit.c b/tc/m_pedit.c
index 6498dd91b471..7ef2acc52bce 100644
--- a/tc/m_pedit.c
+++ b/tc/m_pedit.c
@@ -146,7 +146,7 @@ int pack_key(struct m_pedit_sel *_sel, struct m_pedit_key 
*tkey)
if (tkey->htype != TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK ||
tkey->cmd != TCA_PEDIT_KEY_EX_CMD_SET) {
fprintf(stderr,
-   "Munge parameters not supported. Use 'munge 
ex'.\n");
+   "Munge parameters not supported. Use 'pedit ex 
munge ...'.\n");
return -1;
}
}
-- 
2.12.2



[PATCH iproute2 master 4/4] pedit: Introduce ipv6 support

2017-05-14 Thread Amir Vadai
Add support for modifying IPv6 headers using pedit.

Signed-off-by: Amir Vadai <a...@vadai.me>
---
 man/man8/tc-pedit.8 | 30 ++
 tc/Makefile |  1 +
 tc/m_pedit.c| 43 +++--
 tc/p_ip.c   | 17 +-
 tc/p_ip6.c  | 91 +
 5 files changed, 164 insertions(+), 18 deletions(-)
 create mode 100644 tc/p_ip6.c

diff --git a/man/man8/tc-pedit.8 b/man/man8/tc-pedit.8
index 9c4d57b972cc..82d4217bc958 100644
--- a/man/man8/tc-pedit.8
+++ b/man/man8/tc-pedit.8
@@ -33,6 +33,8 @@ pedit - generic packet editor action
 |
 .BI ip " EX_IPHDR_FIELD"
 |
+.BI ip6 " IP6HDR_FIELD"
+|
 .BI tcp " TCPHDR_FIELD"
 |
 .BI udp " UDPHDR_FIELD"
@@ -55,6 +57,12 @@ pedit - generic packet editor action
 .IR EX_IPHDR_FIELD " := { "
 .BR ttl " }"
 
+
+.ti -8
+.IR IP6HDR_FIELD " := { "
+.BR src " | " dst " | " flow_lbl " | " payload_len " | " nexthdr " |"
+.BR hoplimit " }"
+
 .ti -8
 .IR TCPHDR_FIELD " := { "
 .BR sport " | " dport " | " flags " }"
@@ -211,6 +219,25 @@ are:
 .B ttl
 .RE
 .TP
+.BI ip6 " IP6HDR_FIELD"
+The supported keywords for
+.I IP6HDR_FIELD
+are:
+.RS
+.TP
+.B src
+.TQ
+.B dst
+.TQ
+.B flow_lbl
+.TQ
+.B payload_len
+.TQ
+.B nexthdr
+.TQ
+.B hoplimit
+.RE
+.TP
 .BI tcp " TCPHDR_FIELD"
 The supported keywords for
 .I TCPHDR_FIELD
@@ -331,6 +358,9 @@ tc filter add dev eth0 parent : u32 \\
action pedit ex munge ip dst set 192.168.1.199
 tc filter add dev eth0 parent : u32 \\
match ip sport 22 0x \\
+   action pedit ex munge ip6 dst set fe80::dacb:8aff:fec7:320e
+tc filter add dev eth0 parent : u32 \\
+   match ip sport 22 0x \\
action pedit ex munge eth dst set 11:22:33:44:55:66
 tc filter add dev eth0 parent : u32 \\
match ip dport 23 0x \\
diff --git a/tc/Makefile b/tc/Makefile
index 446a11391ad7..9a6bb1ddea57 100644
--- a/tc/Makefile
+++ b/tc/Makefile
@@ -53,6 +53,7 @@ TCMODULES += m_bpf.o
 TCMODULES += m_tunnel_key.o
 TCMODULES += m_sample.o
 TCMODULES += p_ip.o
+TCMODULES += p_ip6.o
 TCMODULES += p_icmp.o
 TCMODULES += p_eth.o
 TCMODULES += p_tcp.o
diff --git a/tc/m_pedit.c b/tc/m_pedit.c
index 9b74c965932e..dfa6b2c4835e 100644
--- a/tc/m_pedit.c
+++ b/tc/m_pedit.c
@@ -257,6 +257,32 @@ static int pack_mac(struct m_pedit_sel *sel, struct 
m_pedit_key *tkey,
return ret;
 }
 
+static int pack_ipv6(struct m_pedit_sel *sel, struct m_pedit_key *tkey,
+__u32 *ipv6)
+{
+   int ret = 0;
+   int i;
+
+   if (tkey->off & 0x3) {
+   fprintf(stderr,
+   "pack_ipv6: IPv6 offsets must begin in 32bit 
boundaries\n");
+   return -1;
+   }
+
+   for (i = 0; i < 4; i++) {
+   tkey->mask = 0;
+   tkey->val = ntohl(ipv6[i]);
+
+   ret = pack_key32(~0, sel, tkey);
+   if (ret)
+   return ret;
+
+   tkey->off += 4;
+   }
+
+   return 0;
+}
+
 int parse_val(int *argc_p, char ***argv_p, __u32 *val, int type)
 {
int argc = *argc_p;
@@ -281,8 +307,16 @@ int parse_val(int *argc_p, char ***argv_p, __u32 *val, int 
type)
return 0;
}
 
-   if (type == TIPV6)
-   return -1; /* not implemented yet */
+   if (type == TIPV6) {
+   inet_prefix addr;
+
+   if (get_prefix_1(, *argv, AF_INET6))
+   return -1;
+
+   memcpy(val, addr.data, addr.bytelen);
+
+   return 0;
+   }
 
if (type == TMAC) {
 #define MAC_ALEN 6
@@ -364,6 +398,11 @@ int parse_cmd(int *argc_p, char ***argv_p, __u32 len, int 
type, __u32 retain,
goto done;
}
 
+   if (type == TIPV6) {
+   res = pack_ipv6(sel, tkey, val);
+   goto done;
+   }
+
tkey->val = *v;
tkey->mask = *m;
 
diff --git a/tc/p_ip.c b/tc/p_ip.c
index 22fe6505e427..0272a6eaaf48 100644
--- a/tc/p_ip.c
+++ b/tc/p_ip.c
@@ -1,5 +1,5 @@
 /*
- * m_pedit.c   packet editor: IPV4/6 header
+ * p_ip.c  packet editor: IPV4 header
  *
  * This program is free software; you can distribute it and/or
  * modify it under the terms of the GNU General Public License
@@ -156,23 +156,8 @@ done:
return res;
 }
 
-static int
-parse_ip6(int *argc_p, char ***argv_p,
- struct m_pedit_sel *sel, struct m_pedit_key *tkey)
-{
-   int res = -1;
-   return res;
-}
-
 struct m_pedit_util p_pedit_ip = {
NULL,
"ip",
parse_ip,
 };
-
-
-struct m_pedit_util p_pedit_ip6 = {
-   NULL,
-   "ip6",
-   parse_ip6,
-};
diff --git a

[PATCH iproute2 master 0/4] pedit: Introduce IPv6 support + some minor fixes

2017-05-14 Thread Amir Vadai
Hi,

This patchset introduces pedit IPv6 support.
Almost all IPv6 header fields are editable now (src, dst, flow_lbl,
payload_len, nexthdr and hoplimit).
The patch uses the new extended pedit netlink and will fail the operation if
the kernel has no support or the user didn't use the 'ex' keyword.
In addition to this patch, 3 more patches fix some minor UI issues:
- some typos
- 'retain' can't be used with fields > 32 bits. It does unexpected things
  when used on such fields. Fixing this limitation requires some changes (in
  tc user space only) that are out of the scope of this patchset, so I added a
  patch to prevent the user from using retain on those fields.


Thanks,
Amir    

Amir Vadai (4):
  pedit: Fix a typo in warning
  pedit: Do not allow using retain for too big fields
  pedit: Check for extended capability in protocol parser
  pedit: Introduce ipv6 support

 man/man8/tc-pedit.8 | 33 ++-
 tc/Makefile |  1 +
 tc/m_pedit.c| 51 --
 tc/p_eth.c  |  3 ++
 tc/p_ip.c   | 17 +-
 tc/p_ip6.c  | 91 +
 tc/p_udp.c  |  3 ++
 7 files changed, 179 insertions(+), 20 deletions(-)
 create mode 100644 tc/p_ip6.c

-- 
2.12.2



Re: [PATCH iproute2 net 0/8] tc/act_pedit: Support offset relative to conventional header

2017-05-03 Thread Amir Vadai
On Mon, May 01, 2017 at 09:26:25AM -0700, Stephen Hemminger wrote:
> On Sun, 23 Apr 2017 15:53:48 +0300
> Amir Vadai <a...@vadai.me> wrote:
> 
> > Hi Stephen,
> > 
> > This patchset extends pedit to support modifying a field in an offset
> > relative to the conventional network headers (kernel support was added [1]
> > in 4.11 rc1).
> > Without the extended pedit, user could specify fields in TCP and ICMP
> > headers, but the kernel code was using an offset relative to the beginning
> > of the IP header. This will break if IP header length is greater than the
> > minimal value of 20, or if L3 is not IPv4.
> > 
> > It also introduces support in manipulating ETH, TCP, UDP and IP.ttl fields 
> > and
> > a new command to increase/decrease the value of a field (current use case 
> > is IP.ttl).
> > 
> > Since there might be deployments already using pedit, special consideration 
> > was
> > taken, not to break those scripts - only by specifying the special keyword
> > 'ex', the extended capabilities are available, thus there should be no 
> > impact
> > on existing scripts.
> > Also, the new code can live together with rules added by the old code. It
> > supports both the old netlink and the new one.
> > 
> > This patchset is against the master and not net-next as the functionality 
> > was
> > added in 4.11
> > 
> > Thanks,
> > Amir
> > 
> > [1] - 71d0ed7079df ("net/act_pedit: Support using offset relative to the
> >  conventional network headers")

[...]

> 
> Applied. Then I cleaned up long lines

Thanks. Will make sure to clean up long lines in future patches.


Re: [PATCH iproute2 net 3/8] tc/pedit: Introduce 'add' operation

2017-04-24 Thread Amir Vadai
On Sun, Apr 23, 2017 at 01:44:51PM -0400, Jamal Hadi Salim wrote:
> 
> Thanks for the excellent work.
> 
> On 17-04-23 08:53 AM, Amir Vadai wrote:
> > This command could be useful to increase/decrease fields value.
> > 
> 
> Does this contradict the "retain" feature? Example rule to
> retain the second nibble but set the first to 0xA
> (essentially it X-ORs):
> tc filter add dev lo parent : protocol ip prio 10 \
> u32 match ip dst 127.0.0.1/32 flowid 1:1 \
> action pedit munge offset 0 u8 set 0x0A retain 0xF0

You mean, for example, increasing a single nibble in an IPv4 address?
If so, then it should work:
# tc filter add dev veth0 parent :  u32 \
match ip dport 23 0x \
action pedit ex \
  munge ip src add 1.0.0.0 retain 0xff00

# tc -s filter show dev veth0 parent :
filter protocol all pref 49151 u32
filter protocol all pref 49151 u32 fh 801: ht divisor 1
filter protocol all pref 49151 u32 fh 801::800 order 2048 key ht 801 bkt 0 
terminal flowid ???  (rule hit 0 success 0)
  match 0017/ at 20 (success 0 )
action order 1:  pedit action pass keys 1
 index 48 ref 1 bind 1 installed 1 sec used 1 sec
 key #0  at ipv4+12: add 0100 mask 00ff
Action statistics:
Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
backlog 0b 0p requeues 0

> 
> Mostly curious about hardware handling.
As to hardware handling, Or did the implementation, so he can answer
better. AFAIK, the current implementation doesn't allow partial field
setting/adding, so such a rule can't be offloaded. I think this is only to
keep things simple and because there was no use case for it.
To my knowledge, the hardware should support such a thing if needed.

Amir

> 
> cheers,
> jamal
> 


[PATCH iproute2 net 8/8] tc/pedit: p_udp: introduce pedit udp support

2017-04-23 Thread Amir Vadai
From: Or Gerlitz <ogerl...@mellanox.com>

For example, forward udp traffic destined to port 999 to veth0 and set
udp port to 888:
$ tc filter add dev enp0s9 protocol ip parent : \
flower \
  ip_proto udp \
  dst_port 999 \
action pedit ex munge \
  udp dport set 888 \
action mirred egress \
  redirect dev veth0

Signed-off-by: Or Gerlitz <ogerl...@mellanox.com>
Signed-off-by: Amir Vadai <a...@vadai.me>
---
 man/man8/tc-pedit.8 | 18 ++
 tc/p_udp.c  | 27 +++
 2 files changed, 45 insertions(+)

diff --git a/man/man8/tc-pedit.8 b/man/man8/tc-pedit.8
index ad1929592660..7f482eafc6c7 100644
--- a/man/man8/tc-pedit.8
+++ b/man/man8/tc-pedit.8
@@ -34,6 +34,8 @@ pedit - generic packet editor action
 .BI ip " EX_IPHDR_FIELD"
 |
 .BI tcp " TCPHDR_FIELD"
+|
+.BI udp " UDPHDR_FIELD"
 .RI } " CMD_SPEC"
 
 .ti -8
@@ -58,6 +60,10 @@ pedit - generic packet editor action
 .BR sport " | " dport " | " flags " }"
 
 .ti -8
+.IR UDPHDR_FIELD " := { "
+.BR sport " | " dport " }"
+
+.ti -8
 .IR CMD_SPEC " := {"
 .BR clear " | " invert " | " set
 .IR VAL " | "
@@ -219,6 +225,18 @@ Source or destination TCP port number, a 16-bit value.
 .B flags
 .RE
 .TP
+.BI udp " UDPHDR_FIELD"
+The supported keywords for
+.I UDPHDR_FIELD
+are:
+.RS
+.TP
+.B sport
+.TQ
+.B dport
+Source or destination TCP port number, a 16-bit value.
+.RE
+.TP
 .B clear
 Clear the addressed data (i.e., set it to zero).
 .TP
diff --git a/tc/p_udp.c b/tc/p_udp.c
index 3a86ba382391..a56a1b519254 100644
--- a/tc/p_udp.c
+++ b/tc/p_udp.c
@@ -28,6 +28,33 @@ parse_udp(int *argc_p, char ***argv_p,
  struct m_pedit_sel *sel, struct m_pedit_key *tkey)
 {
int res = -1;
+   int argc = *argc_p;
+   char **argv = *argv_p;
+
+   if (argc < 2)
+   return -1;
+
+   tkey->htype = TCA_PEDIT_KEY_EX_HDR_TYPE_UDP;
+
+   if (strcmp(*argv, "sport") == 0) {
+   NEXT_ARG();
+   tkey->off = 0;
+   res = parse_cmd(, , 2, TU32, RU16, sel, tkey);
+   goto done;
+   }
+
+   if (strcmp(*argv, "dport") == 0) {
+   NEXT_ARG();
+   tkey->off = 2;
+   res = parse_cmd(, , 2, TU32, RU16, sel, tkey);
+   goto done;
+   }
+
+   return -1;
+
+done:
+   *argc_p = argc;
+   *argv_p = argv;
return res;
 }
 
-- 
2.12.0



[PATCH iproute2 net 7/8] tc/pedit: p_tcp: introduce pedit tcp support

2017-04-23 Thread Amir Vadai
For example, forward tcp traffic destined to port 80 to veth0 and set
tcp port to 8080:
$ tc filter add dev enp0s9 protocol ip parent : \
flower \
  ip_proto tcp \
  dst_port 80 \
action pedit ex munge \
  tcp dport set 8080 \
action mirred egress \
  redirect dev veth0

Signed-off-by: Amir Vadai <a...@vadai.me>
---
 man/man8/tc-pedit.8 | 23 +++
 tc/p_tcp.c  | 37 +
 2 files changed, 60 insertions(+)

diff --git a/man/man8/tc-pedit.8 b/man/man8/tc-pedit.8
index 8febdfe23f6e..ad1929592660 100644
--- a/man/man8/tc-pedit.8
+++ b/man/man8/tc-pedit.8
@@ -32,6 +32,8 @@ pedit - generic packet editor action
 .BI ip " IPHDR_FIELD"
 |
 .BI ip " EX_IPHDR_FIELD"
+|
+.BI tcp " TCPHDR_FIELD"
 .RI } " CMD_SPEC"
 
 .ti -8
@@ -52,6 +54,10 @@ pedit - generic packet editor action
 .BR ttl " }"
 
 .ti -8
+.IR TCPHDR_FIELD " := { "
+.BR sport " | " dport " | " flags " }"
+
+.ti -8
 .IR CMD_SPEC " := {"
 .BR clear " | " invert " | " set
 .IR VAL " | "
@@ -199,6 +205,20 @@ are:
 .B ttl
 .RE
 .TP
+.BI tcp " TCPHDR_FIELD"
+The supported keywords for
+.I TCPHDR_FIELD
+are:
+.RS
+.TP
+.B sport
+.TQ
+.B dport
+Source or destination TCP port number, a 16-bit value.
+.TP
+.B flags
+.RE
+.TP
 .B clear
 Clear the addressed data (i.e., set it to zero).
 .TP
@@ -293,6 +313,9 @@ tc filter add dev eth0 parent : u32 \\
 tc filter add dev eth0 parent : u32 \\
match ip sport 22 0x \\
action pedit ex munge eth dst set 11:22:33:44:55:66
+tc filter add dev eth0 parent : u32 \\
+   match ip dport 23 0x \\
+   action pedit ex munge tcp dport set 22
 .EE
 .RE
 .SH SEE ALSO
diff --git a/tc/p_tcp.c b/tc/p_tcp.c
index 53ee9842160b..cf14574c9c3e 100644
--- a/tc/p_tcp.c
+++ b/tc/p_tcp.c
@@ -28,6 +28,43 @@ parse_tcp(int *argc_p, char ***argv_p,
  struct m_pedit_sel *sel, struct m_pedit_key *tkey)
 {
int res = -1;
+   int argc = *argc_p;
+   char **argv = *argv_p;
+
+   if (argc < 2)
+   return -1;
+
+   if (!sel->extended)
+   return -1;
+
+   tkey->htype = TCA_PEDIT_KEY_EX_HDR_TYPE_TCP;
+
+   if (strcmp(*argv, "sport") == 0) {
+   NEXT_ARG();
+   tkey->off = 0;
+   res = parse_cmd(, , 2, TU32, RU16, sel, tkey);
+   goto done;
+   }
+
+   if (strcmp(*argv, "dport") == 0) {
+   NEXT_ARG();
+   tkey->off = 2;
+   res = parse_cmd(, , 2, TU32, RU16, sel, tkey);
+   goto done;
+   }
+
+   if (strcmp(*argv, "flags") == 0) {
+   NEXT_ARG();
+   tkey->off = 13;
+   res = parse_cmd(, , 1, TU32, RU8, sel, tkey);
+   goto done;
+   }
+
+   return -1;
+
+done:
+   *argc_p = argc;
+   *argv_p = argv;
return res;
 }
 struct m_pedit_util p_pedit_tcp = {
-- 
2.12.0



[PATCH iproute2 net 5/8] tc/pedit: Support fields bigger than 32 bits

2017-04-23 Thread Amir Vadai
Make parse_val() accept fields up to 128 bits long, this should be
enough for current use cases and involves a minimal change to code.

Signed-off-by: Amir Vadai <a...@vadai.me>
---
 tc/m_pedit.c | 13 -
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/tc/m_pedit.c b/tc/m_pedit.c
index 7af074a5a97c..d982c91a2585 100644
--- a/tc/m_pedit.c
+++ b/tc/m_pedit.c
@@ -256,7 +256,10 @@ int parse_val(int *argc_p, char ***argv_p, __u32 *val, int 
type)
 int parse_cmd(int *argc_p, char ***argv_p, __u32 len, int type, __u32 retain,
  struct m_pedit_sel *sel, struct m_pedit_key *tkey)
 {
-   __u32 mask = 0, val = 0;
+   __u32 mask[4] = { 0 };
+   __u32 val[4] = { 0 };
+   __u32 *m = [0];
+   __u32 *v = [0];
__u32 o = 0xFF;
int res = -1;
int argc = *argc_p;
@@ -275,7 +278,7 @@ int parse_cmd(int *argc_p, char ***argv_p, __u32 len, int 
type, __u32 retain,
o = 0x;
 
if (matches(*argv, "invert") == 0) {
-   val = mask = o;
+   *v = *m = o;
} else if (matches(*argv, "set") == 0 ||
   matches(*argv, "add") == 0) {
if (matches(*argv, "add") == 0)
@@ -287,7 +290,7 @@ int parse_cmd(int *argc_p, char ***argv_p, __u32 len, int 
type, __u32 retain,
}
 
NEXT_ARG();
-   if (parse_val(, , , type))
+   if (parse_val(, , val, type))
return -1;
} else if (matches(*argv, "preserve") == 0) {
retain = 0;
@@ -307,8 +310,8 @@ int parse_cmd(int *argc_p, char ***argv_p, __u32 len, int 
type, __u32 retain,
argv++;
}
 
-   tkey->val = val;
-   tkey->mask = mask;
+   tkey->val = *v;
+   tkey->mask = *m;
 
if (type == TIPV4)
tkey->val = ntohl(tkey->val);
-- 
2.12.0



[PATCH iproute2 net 6/8] tc/pedit: p_eth: ETH header editor

2017-04-23 Thread Amir Vadai
For example, forward tcp traffic to veth0 and set
destination mac address to 11:22:33:44:55:66 :
$ tc filter add dev enp0s9 protocol ip parent : \
flower \
  ip_proto tcp \
action pedit ex munge \
  eth dst set 11:22:33:44:55:66 \
action mirred egress \
  redirect dev veth0

Signed-off-by: Amir Vadai <a...@vadai.me>
---
 man/man8/tc-pedit.8 | 24 ++
 tc/Makefile |  1 +
 tc/m_pedit.c| 46 ++
 tc/m_pedit.h|  1 +
 tc/p_eth.c  | 72 +
 5 files changed, 144 insertions(+)
 create mode 100644 tc/p_eth.c

diff --git a/man/man8/tc-pedit.8 b/man/man8/tc-pedit.8
index c98d95cb0021..8febdfe23f6e 100644
--- a/man/man8/tc-pedit.8
+++ b/man/man8/tc-pedit.8
@@ -27,12 +27,18 @@ pedit - generic packet editor action
 
 .ti -8
 .IR EXTENDED_LAYERED_OP " := { "
+.BI eth " ETHHDR_FIELD"
+|
 .BI ip " IPHDR_FIELD"
 |
 .BI ip " EX_IPHDR_FIELD"
 .RI } " CMD_SPEC"
 
 .ti -8
+.IR ETHHDR_FIELD " := { "
+.BR src " | " dst " | " type " }"
+
+.ti -8
 .IR IPHDR_FIELD " := { "
 .BR src " | " dst " | " tos " | " dsfield " | " ihl " | " protocol " |"
 .BR precedence " | " nofrag " | " firstfrag " | " ce " | " df " }"
@@ -103,6 +109,21 @@ and right-shifted by
 before adding it to
 .IR OFFSET .
 .TP
+.BI eth " ETHHDR_FIELD"
+Change an ETH header field. The supported keywords for
+.I ETHHDR_FIELD
+are:
+.RS
+.TP
+.B src
+.TQ
+.B dst
+Source or destination MAC address in the standard format: XX:XX:XX:XX:XX:XX
+.TP
+.B type
+Ether-type in numeric value
+.RE
+.TP
 .BI ip " IPHDR_FIELD"
 Change an IPv4 header field. The supported keywords for
 .I IPHDR_FIELD
@@ -269,6 +290,9 @@ tc filter add dev eth0 parent : u32 \\
 tc filter add dev eth0 parent : u32 \\
match ip sport 22 0x \\
action pedit ex munge ip dst set 192.168.1.199
+tc filter add dev eth0 parent : u32 \\
+   match ip sport 22 0x \\
+   action pedit ex munge eth dst set 11:22:33:44:55:66
 .EE
 .RE
 .SH SEE ALSO
diff --git a/tc/Makefile b/tc/Makefile
index 3f7fc939e194..446a11391ad7 100644
--- a/tc/Makefile
+++ b/tc/Makefile
@@ -54,6 +54,7 @@ TCMODULES += m_tunnel_key.o
 TCMODULES += m_sample.o
 TCMODULES += p_ip.o
 TCMODULES += p_icmp.o
+TCMODULES += p_eth.o
 TCMODULES += p_tcp.o
 TCMODULES += p_udp.o
 TCMODULES += em_nbyte.o
diff --git a/tc/m_pedit.c b/tc/m_pedit.c
index d982c91a2585..0be42343ac88 100644
--- a/tc/m_pedit.c
+++ b/tc/m_pedit.c
@@ -28,6 +28,7 @@
 #include "utils.h"
 #include "tc_util.h"
 #include "m_pedit.h"
+#include "rt_names.h"
 
 static struct m_pedit_util *pedit_list;
 static int pedit_debug;
@@ -223,6 +224,38 @@ int pack_key8(__u32 retain, struct m_pedit_sel *sel, 
struct m_pedit_key *tkey)
return pack_key(sel, tkey);
 }
 
+static int pack_mac(struct m_pedit_sel *sel, struct m_pedit_key *tkey,
+   __u8 *mac)
+{
+   int ret = 0;
+
+   if (!(tkey->off & 0x3)) {
+   tkey->mask = 0;
+   tkey->val = ntohl(*((__u32 *)mac));
+   ret |= pack_key32(~0, sel, tkey);
+
+   tkey->off += 4;
+   tkey->mask = 0;
+   tkey->val = ntohs(*((__u16 *)[4]));
+   ret |= pack_key16(~0, sel, tkey);
+   } else if (!(tkey->off & 0x1)) {
+   tkey->mask = 0;
+   tkey->val = ntohs(*((__u16 *)mac));
+   ret |= pack_key16(~0, sel, tkey);
+
+   tkey->off += 4;
+   tkey->mask = 0;
+   tkey->val = ntohl(*((__u32 *)(mac + 2)));
+   ret |= pack_key32(~0, sel, tkey);
+   } else {
+   fprintf(stderr,
+   "pack_mac: mac offsets must begin in 32bit or 16bit 
boundaries\n");
+   return -1;
+   }
+
+   return ret;
+}
+
 int parse_val(int *argc_p, char ***argv_p, __u32 *val, int type)
 {
int argc = *argc_p;
@@ -250,6 +283,14 @@ int parse_val(int *argc_p, char ***argv_p, __u32 *val, int 
type)
if (type == TIPV6)
return -1; /* not implemented yet */
 
+   if (type == TMAC) {
+#define MAC_ALEN 6
+   int ret = ll_addr_a2n((char *)val, MAC_ALEN, *argv);
+
+   if (ret == MAC_ALEN)
+   return 0;
+   }
+
return -1;
 }
 
@@ -310,6 +351,11 @@ int parse_cmd(int *argc_p, char ***argv_p, __u32 len, int 
type, __u32 retain,
argv++;
}
 
+   if (type == TMAC) {
+   res = pack_mac(sel, tkey, (__u8 *)val);
+   goto done;
+   }
+
tkey->val = *v;
tkey->mask = *m;
 
diff --git a/tc/m_pedit.

[PATCH iproute2 net 0/8] tc/act_pedit: Support offset relative to conventional header

2017-04-23 Thread Amir Vadai
Hi Stephen,

This patchset extends pedit to support modifying a field in an offset relative
to the conventional network headers (kernel support was added [1] in 4.11 rc1).
Without the extended pedit, user could specify fields in TCP and ICMP headers,
but the kernel code was using an offset relative to the beginning of the IP
header. This will break if IP header length is greater than the minimal value
of 20, or if L3 is not IPv4.

It also introduces support in manipulating ETH, TCP, UDP and IP.ttl fields and
a new command to increase/decrease the value of a field (current use case is 
IP.ttl).

Since there might be deployments already using pedit, special consideration was
taken, not to break those scripts - only by specifying the special keyword
'ex', the extended capabilities are available, thus there should be no impact
on existing scripts.
Also, the new code can live together with rules added by the old code. It
supports both the old netlink and the new one.

This patchset is against the master and not net-next as the functionality was
added in 4.11

Thanks,
Amir

[1] - 71d0ed7079df ("net/act_pedit: Support using offset relative to the
 conventional network headers")

Amir Vadai (7):
  tc/pedit: Fix a typo in pedit usage message
  tc/pedit: Extend pedit to specify offset relative to mac/transport
headers
  tc/pedit: Introduce 'add' operation
  tc/pedit: p_ip: introduce editing ttl header
  tc/pedit: Support fields bigger than 32 bits
  tc/pedit: p_eth: ETH header editor
  tc/pedit: p_tcp: introduce pedit tcp support

Or Gerlitz (1):
  tc/pedit: p_udp: introduce pedit udp support

 man/man8/tc-pedit.8 | 126 +--
 tc/Makefile |   1 +
 tc/m_pedit.c| 290 ++--
 tc/m_pedit.h|  44 ++--
 tc/p_eth.c  |  72 +
 tc/p_icmp.c |   3 +-
 tc/p_ip.c   |  21 +++-
 tc/p_tcp.c  |  40 +++-
 tc/p_udp.c  |  30 +-
 9 files changed, 572 insertions(+), 55 deletions(-)
 create mode 100644 tc/p_eth.c

-- 
2.12.0



[PATCH iproute2 net 3/8] tc/pedit: Introduce 'add' operation

2017-04-23 Thread Amir Vadai
This command could be useful to increase/decrease a field's value.

Signed-off-by: Amir Vadai <a...@vadai.me>
---
 man/man8/tc-pedit.8 | 13 -
 tc/m_pedit.c| 18 +++---
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/man/man8/tc-pedit.8 b/man/man8/tc-pedit.8
index 761d5c8ee2d5..6bba741956f1 100644
--- a/man/man8/tc-pedit.8
+++ b/man/man8/tc-pedit.8
@@ -43,6 +43,8 @@ pedit - generic packet editor action
 .IR CMD_SPEC " := {"
 .BR clear " | " invert " | " set
 .IR VAL " | "
+.BR add
+.IR VAL " | "
 .BR preserve " } [ " retain
 .IR RVAL " ]"
 
@@ -63,7 +65,9 @@ only for IPv4 headers.
 .B ex
 Use extended pedit.
 .I EXTENDED_LAYERED_OP
-is allowed only in this mode.
+and the add
+.I CMD_SPEC
+are allowed only in this mode.
 .TP
 .BI offset " OFFSET " "\fR{ \fBu32 \fR| \fBu16 \fR| \fBu8 \fR}"
 Specify the offset at which to change data.
@@ -173,6 +177,13 @@ keywords in
 or the size of the addressed header field in
 .IR LAYERED_OP .
 .TP
+.BI add " VAL"
+Add the addressed data by a specific value. The size of
+.I VAL
+is defined by the size of the addressed header field in
+.IR EXTENDED_LAYERED_OP .
+This operation is supported only for extended layered op.
+.TP
 .B preserve
 Keep the addressed data as is.
 .TP
diff --git a/tc/m_pedit.c b/tc/m_pedit.c
index a26fd3e5bc5e..7af074a5a97c 100644
--- a/tc/m_pedit.c
+++ b/tc/m_pedit.c
@@ -41,7 +41,7 @@ static void explain(void)
"\t\tATC:= at  offmask  shift \n"
"\t\tNOTE: offval is byte offset, must be multiple of 4\n"
"\t\tNOTE: maskval is a 32 bit hex number\n \t\tNOTE: shiftval 
is a shift value\n"
-   "\t\tCMD:= clear | invert | set | retain\n"
+   "\t\tCMD:= clear | invert | set | add  | 
retain\n"
"\t:= ip  | ip6 \n"
" \t\t| udp  | tcp  | icmp \n"
"\tCONTROL:= reclassify | pipe | drop | continue | pass\n"
@@ -276,7 +276,16 @@ int parse_cmd(int *argc_p, char ***argv_p, __u32 len, int 
type, __u32 retain,
 
if (matches(*argv, "invert") == 0) {
val = mask = o;
-   } else if (matches(*argv, "set") == 0) {
+   } else if (matches(*argv, "set") == 0 ||
+  matches(*argv, "add") == 0) {
+   if (matches(*argv, "add") == 0)
+   tkey->cmd = TCA_PEDIT_KEY_EX_CMD_ADD;
+
+   if (!sel->extended && tkey->cmd) {
+   fprintf(stderr, "Non extended mode. only 'set' command 
is supported\n");
+   return -1;
+   }
+
NEXT_ARG();
if (parse_val(, , , type))
return -1;
@@ -690,9 +699,11 @@ int print_pedit(struct action_util *au, FILE *f, struct 
rtattr *arg)
for (i = 0; i < sel->nkeys; i++, key++) {
enum pedit_header_type htype =
TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK;
+   enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET;
 
if (keys_ex) {
htype = key_ex->htype;
+   cmd = key_ex->cmd;
 
key_ex++;
}
@@ -703,7 +714,8 @@ int print_pedit(struct action_util *au, FILE *f, struct 
rtattr *arg)
 
print_pedit_location(f, htype, key->off);
 
-   fprintf(f, ": val %08x mask %08x",
+   fprintf(f, ": %s %08x mask %08x",
+   cmd ? "add" : "val",
(unsigned int)ntohl(key->val),
(unsigned int)ntohl(key->mask));
}
-- 
2.12.0



[PATCH iproute2 net 4/8] tc/pedit: p_ip: introduce editing ttl header

2017-04-23 Thread Amir Vadai
Enable user to edit IP header ttl field.

For example, to forward any TCP packet and decrease its TTL by one:
$ tc filter add dev enp0s9 protocol ip parent : \
flower \
  ip_proto tcp \
action pedit ex munge \
  ip ttl add 0xff pipe \
action mirred egress \
  redirect dev veth0

Signed-off-by: Amir Vadai <a...@vadai.me>
---
 man/man8/tc-pedit.8 | 17 +
 tc/p_ip.c   |  6 ++
 2 files changed, 23 insertions(+)

diff --git a/man/man8/tc-pedit.8 b/man/man8/tc-pedit.8
index 6bba741956f1..c98d95cb0021 100644
--- a/man/man8/tc-pedit.8
+++ b/man/man8/tc-pedit.8
@@ -28,6 +28,8 @@ pedit - generic packet editor action
 .ti -8
 .IR EXTENDED_LAYERED_OP " := { "
 .BI ip " IPHDR_FIELD"
+|
+.BI ip " EX_IPHDR_FIELD"
 .RI } " CMD_SPEC"
 
 .ti -8
@@ -40,6 +42,10 @@ pedit - generic packet editor action
 .BR dport " | " sport " | " icmp_type " | " icmp_code " }"
 
 .ti -8
+.IR EX_IPHDR_FIELD " := { "
+.BR ttl " }"
+
+.ti -8
 .IR CMD_SPEC " := {"
 .BR clear " | " invert " | " set
 .IR VAL " | "
@@ -161,6 +167,17 @@ If it is not or the latter is bigger than the minimum of 
20 bytes, this will do
 unexpected things. These fields are eight-bit values.
 .RE
 .TP
+.BI ip " EX_IPHDR_FIELD"
+Supported only when
+.I ex
+is used. The supported keywords for
+.I EX_IPHDR_FIELD
+are:
+.RS
+.TP
+.B ttl
+.RE
+.TP
 .B clear
 Clear the addressed data (i.e., set it to zero).
 .TP
diff --git a/tc/p_ip.c b/tc/p_ip.c
index e56eb39317ba..22fe6505e427 100644
--- a/tc/p_ip.c
+++ b/tc/p_ip.c
@@ -66,6 +66,12 @@ parse_ip(int *argc_p, char ***argv_p,
res = parse_cmd(, , 1, TU32, 0x0f, sel, tkey);
goto done;
}
+   if (strcmp(*argv, "ttl") == 0) {
+   NEXT_ARG();
+   tkey->off = 8;
+   res = parse_cmd(, , 1, TU32, RU8, sel, tkey);
+   goto done;
+   }
if (strcmp(*argv, "protocol") == 0) {
NEXT_ARG();
tkey->off = 9;
-- 
2.12.0



[PATCH iproute2 net 2/8] tc/pedit: Extend pedit to specify offset relative to mac/transport headers

2017-04-23 Thread Amir Vadai
Utilize the extended pedit netlink to set an offset relative to a
specific header type. The old netlink only enabled the user to set an
approximate offset relative to the IPv4 header.

To use this extended functionality, the 'ex' keyword must be specified
after 'pedit' and before any 'munge'.
e.g:
$ tc filter add dev ens9 protocol ip parent : \
flower \
  ip_proto udp \
  dst_port 80 \
action pedit ex munge \
  ip dst set 1.1.1.1 \
  pipe \
action mirred egress redirect dev veth0

Signed-off-by: Amir Vadai <a...@vadai.me>
---
 man/man8/tc-pedit.8 |  41 +++---
 tc/m_pedit.c| 213 +---
 tc/m_pedit.h|  43 ---
 tc/p_icmp.c |   3 +-
 tc/p_ip.c   |  15 +++-
 tc/p_tcp.c  |   3 +-
 tc/p_udp.c  |   3 +-
 7 files changed, 270 insertions(+), 51 deletions(-)

diff --git a/man/man8/tc-pedit.8 b/man/man8/tc-pedit.8
index c34520c046a6..761d5c8ee2d5 100644
--- a/man/man8/tc-pedit.8
+++ b/man/man8/tc-pedit.8
@@ -5,8 +5,8 @@ pedit - generic packet editor action
 .SH SYNOPSIS
 .in +8
 .ti -8
-.BR tc " ... " "action pedit munge " {
-.IR RAW_OP " | " LAYERED_OP " } [ " CONTROL " ]"
+.BR tc " ... " "action pedit [ex] munge " {
+.IR RAW_OP " | " LAYERED_OP " | " EXTENDED_LAYERED_OP " } [ " CONTROL " ]"
 
 .ti -8
 .IR RAW_OP " := "
@@ -22,20 +22,22 @@ pedit - generic packet editor action
 .IR LAYERED_OP " := { "
 .BI ip " IPHDR_FIELD"
 |
-.BI ip6 " IP6HDR_FIELD"
-|
-.BI udp " UDPHDR_FIELD"
-|
-.BI tcp " TCPHDR_FIELD"
-|
-.BI icmp " ICMPHDR_FIELD"
+.BI ip " BEYOND_IPHDR_FIELD"
+.RI } " CMD_SPEC"
+
+.ti -8
+.IR EXTENDED_LAYERED_OP " := { "
+.BI ip " IPHDR_FIELD"
 .RI } " CMD_SPEC"
 
 .ti -8
 .IR IPHDR_FIELD " := { "
 .BR src " | " dst " | " tos " | " dsfield " | " ihl " | " protocol " |"
-.BR precedence " | " nofrag " | " firstfrag " | " ce " | " df " |"
-.BR mf " | " dport " | " sport " | " icmp_type " | " icmp_code " }"
+.BR precedence " | " nofrag " | " firstfrag " | " ce " | " df " }"
+
+.ti -8
+.IR BEYOND_IPHDR_FIELD " := { "
+.BR dport " | " sport " | " icmp_type " | " icmp_code " }"
 
 .ti -8
 .IR CMD_SPEC " := {"
@@ -58,6 +60,11 @@ chosen automatically based on the header field size. 
Currently this is supported
 only for IPv4 headers.
 .SH OPTIONS
 .TP
+.B ex
+Use extended pedit.
+.I EXTENDED_LAYERED_OP
+is allowed only in this mode.
+.TP
 .BI offset " OFFSET " "\fR{ \fBu32 \fR| \fBu16 \fR| \fBu8 \fR}"
 Specify the offset at which to change data.
 .I OFFSET
@@ -123,6 +130,15 @@ Change IP header flags. Note that the value to pass to the
 .B set
 command is not just a bit value, but the full byte including the flags field.
 Though only the relevant bits of that value are respected, the rest ignored.
+.RE
+.TP
+.BI ip " BEYOND_IPHDR_FIELD"
+Supported only for non-extended layered op. It is passed to the kernel as
+offsets relative to the beginning of the IP header and assumes the IP header is
+of minimum size (20 bytes). The supported keywords for
+.I BEYOND_IPHDR_FIELD
+are:
+.RS
 .TP
 .B dport
 .TQ
@@ -222,6 +238,9 @@ tc filter add dev eth0 parent 1: u32 \\
 tc filter add dev eth0 parent : u32 \\
match ip sport 22 0x \\
action pedit pedit munge ip sport set 23
+tc filter add dev eth0 parent : u32 \\
+   match ip sport 22 0x \\
+   action pedit ex munge ip dst set 192.168.1.199
 .EE
 .RE
 .SH SEE ALSO
diff --git a/tc/m_pedit.c b/tc/m_pedit.c
index 939a6a1455a5..a26fd3e5bc5e 100644
--- a/tc/m_pedit.c
+++ b/tc/m_pedit.c
@@ -34,7 +34,7 @@ static int pedit_debug;
 
 static void explain(void)
 {
-   fprintf(stderr, "Usage: ... pedit munge  [CONTROL]\n");
+   fprintf(stderr, "Usage: ... pedit munge [ex]  [CONTROL]\n");
fprintf(stderr,
"Where: MUNGE := |\n"
"\t:= [ATC]\n \t\tOFFSETC:= offset  
<u8|u16|u32>\n"
@@ -45,6 +45,7 @@ static void explain(void)
"\t:= ip  | ip6 \n"
" \t\t| udp  | tcp  | icmp \n"
"\tCONTROL:= reclassify | pipe | drop | continue | pass\n"
+   "\tNOTE: if 'ex' is set, extended functionality will be 
supported (kernel >= 4.11)\n"
"For Example usage look at the examples directory\n");
 
 }
@@ -56,8 +57,8 @@ static void usage(void)
 }
 
 static int pedit_parse_nopopt(int *argc_p, char ***

[PATCH iproute2 net 1/8] tc/pedit: Fix a typo in pedit usage message

2017-04-23 Thread Amir Vadai
Signed-off-by: Amir Vadai <a...@vadai.me>
---
 tc/m_pedit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tc/m_pedit.c b/tc/m_pedit.c
index 8e9bf0720dfe..939a6a1455a5 100644
--- a/tc/m_pedit.c
+++ b/tc/m_pedit.c
@@ -40,7 +40,7 @@ static void explain(void)
"\t:= [ATC]\n \t\tOFFSETC:= offset  
<u8|u16|u32>\n"
"\t\tATC:= at  offmask  shift \n"
"\t\tNOTE: offval is byte offset, must be multiple of 4\n"
-   "\t\tNOTE: maskval is a 32 bit hex number\n \t\tNOTE: shiftval 
is a is a shift value\n"
+   "\t\tNOTE: maskval is a 32 bit hex number\n \t\tNOTE: shiftval 
is a shift value\n"
"\t\tCMD:= clear | invert | set | retain\n"
"\t:= ip  | ip6 \n"
" \t\t| udp  | tcp  | icmp \n"
-- 
2.12.0



[PATCH net-next V3 1/3] net/skbuff: Introduce skb_mac_offset()

2017-02-06 Thread Amir Vadai
Introduce skb_mac_offset() that could be used to get mac header offset.

Signed-off-by: Amir Vadai <a...@vadai.me>
Reviewed-by: Or Gerlitz <ogerl...@mellanox.com>
---
 include/linux/skbuff.h | 5 +
 1 file changed, 5 insertions(+)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c6a78e1892b6..a1b73b794a38 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2182,6 +2182,11 @@ static inline unsigned char *skb_mac_header(const struct 
sk_buff *skb)
return skb->head + skb->mac_header;
 }
 
+static inline int skb_mac_offset(const struct sk_buff *skb)
+{
+   return skb_mac_header(skb) - skb->data;
+}
+
 static inline int skb_mac_header_was_set(const struct sk_buff *skb)
 {
return skb->mac_header != (typeof(skb->mac_header))~0U;
-- 
2.11.0



[PATCH net-next V3 2/3] net/act_pedit: Support using offset relative to the conventional network headers

2017-02-06 Thread Amir Vadai
Extend pedit to enable the user setting offset relative to network
headers. This change would enable to work with more complex header
schemes (vs the simple IPv4 case) where setting a fixed offset relative
to the network header is not enough.

After this patch, the action has information about the exact header type
and field inside this header. This information could be used later on
for hardware offloading of pedit.

Backward compatibility is kept:
1. Old kernel <-> new userspace
2. New kernel <-> old userspace
3. add rule using new userspace <-> dump using old userspace
4. add rule using old userspace <-> dump using new userspace

When using the extended api, new netlink attributes are being used. This
way, operation will fail in (1) and (3) - and no malformed rule will be
added or dumped. Of course, new user space that doesn't need the new
functionality can use the old netlink attributes and operation will
succeed.
Since action can support both api's, (2) should work, and it is easy to
write the new user space to have (4) work.

The action strictly checks that only header types and commands it can
handle are accepted. This way future additions will be much
easier.

Usage example:
$ tc filter add dev enp0s9 protocol ip parent ffff: \
  flower \
ip_proto tcp \
dst_port 80 \
  action pedit munge tcp dport set 8080 pipe \
  action mirred egress redirect dev veth0

Will forward TCP packets whose original dest port is 80, while modifying
the destination port to 8080.

Signed-off-by: Amir Vadai <a...@vadai.me>
Reviewed-by: Or Gerlitz <ogerl...@mellanox.com>
---
 include/net/tc_act/tc_pedit.h|   5 +
 include/uapi/linux/tc_act/tc_pedit.h |  23 
 net/sched/act_pedit.c| 196 ---
 3 files changed, 208 insertions(+), 16 deletions(-)

diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h
index 29e38d6823df..e076f22035a5 100644
--- a/include/net/tc_act/tc_pedit.h
+++ b/include/net/tc_act/tc_pedit.h
@@ -3,11 +3,16 @@
 
 #include 
 
+struct tcf_pedit_key_ex {
+   enum pedit_header_type htype;
+};
+
 struct tcf_pedit {
struct tc_actioncommon;
unsigned char   tcfp_nkeys;
unsigned char   tcfp_flags;
struct tc_pedit_key *tcfp_keys;
+   struct tcf_pedit_key_ex *tcfp_keys_ex;
 };
 #define to_pedit(a) ((struct tcf_pedit *)a)
 
diff --git a/include/uapi/linux/tc_act/tc_pedit.h 
b/include/uapi/linux/tc_act/tc_pedit.h
index 6389959a5157..22f19eeda997 100644
--- a/include/uapi/linux/tc_act/tc_pedit.h
+++ b/include/uapi/linux/tc_act/tc_pedit.h
@@ -11,10 +11,33 @@ enum {
TCA_PEDIT_TM,
TCA_PEDIT_PARMS,
TCA_PEDIT_PAD,
+   TCA_PEDIT_PARMS_EX,
+   TCA_PEDIT_KEYS_EX,
+   TCA_PEDIT_KEY_EX,
__TCA_PEDIT_MAX
 };
 #define TCA_PEDIT_MAX (__TCA_PEDIT_MAX - 1)

 
+enum {
+   TCA_PEDIT_KEY_EX_HTYPE = 1,
+   __TCA_PEDIT_KEY_EX_MAX
+};
+#define TCA_PEDIT_KEY_EX_MAX (__TCA_PEDIT_KEY_EX_MAX - 1)
+
+ /* TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK is a special case for legacy users. It
+  * means no specific header type - offset is relative to the network layer
+  */
+enum pedit_header_type {
+   TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK = 0,
+   TCA_PEDIT_KEY_EX_HDR_TYPE_ETH = 1,
+   TCA_PEDIT_KEY_EX_HDR_TYPE_IP4 = 2,
+   TCA_PEDIT_KEY_EX_HDR_TYPE_IP6 = 3,
+   TCA_PEDIT_KEY_EX_HDR_TYPE_TCP = 4,
+   TCA_PEDIT_KEY_EX_HDR_TYPE_UDP = 5,
+   __PEDIT_HDR_TYPE_MAX,
+};
+#define TCA_PEDIT_HDR_TYPE_MAX (__PEDIT_HDR_TYPE_MAX - 1)
+
 struct tc_pedit_key {
__u32   mask;  /* AND */
__u32   val;   /*XOR */
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index b27c4daec88f..fdd012bd3602 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define PEDIT_TAB_MASK 15
 
@@ -30,18 +31,112 @@ static struct tc_action_ops act_pedit_ops;
 
 static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = {
[TCA_PEDIT_PARMS]   = { .len = sizeof(struct tc_pedit) },
+   [TCA_PEDIT_KEYS_EX]   = { .type = NLA_NESTED },
 };
 
+static const struct nla_policy pedit_key_ex_policy[TCA_PEDIT_KEY_EX_MAX + 1] = 
{
+   [TCA_PEDIT_KEY_EX_HTYPE]  = { .type = NLA_U16 },
+};
+
+static struct tcf_pedit_key_ex *tcf_pedit_keys_ex_parse(struct nlattr *nla,
+   u8 n)
+{
+   struct tcf_pedit_key_ex *keys_ex;
+   struct tcf_pedit_key_ex *k;
+   const struct nlattr *ka;
+   int err = -EINVAL;
+   int rem;
+
+   if (!nla || !n)
+   return NULL;
+
+   keys_ex = kcalloc(n, sizeof(*k), GFP_KERNEL);
+   if (!keys_ex)
+   return ERR_PTR(-ENOMEM);
+
+   k = keys_ex;
+
+   nla_for_each_nested(ka, nla, rem)

[PATCH net-next V3 0/3] net/sched: act_pedit: Use offset relative to conventional network headers

2017-02-06 Thread Amir Vadai
Hi Dave,

Some FW/HW parser APIs are such that they need to get the specific header type
(e.g. IPv4 or IPv6, TCP or UDP) and not only the networking level (e.g. network
or transport).

Enhancing the UAPI to allow for specifying that would allow the same flows to
be set into both SW and HW.

This patchset also makes pedit more robust. Currently fields offset is specified
by offset relative to the ip header, while using negative offsets for 
MAC layer fields.

This series enables the user to set offset relative to the relevant header.

Usage example:
$ tc filter add dev enp0s9 protocol ip parent : \
   flower \
 ip_proto tcp \
dst_port 80 \
   action \
   pedit munge ip ttl add 0xff \
   pedit munge tcp dport set 8080 \
 pipe action mirred egress redirect dev veth0

Will forward traffic destined to tcp dport 80, while modifying the
destination port to 8080, and decreasing the ttl by one.

I've uploaded a draft for the userspace [2] to make it easier to review and
test the patchset.

[1] - http://patchwork.ozlabs.org/patch/700909/
[2] - git: https://bitbucket.org/av42/iproute2.git
  branch: pedit

Patchset was tested and applied on top of upstream commit bd092ad1463c ("Merge
branch 'remove-__napi_complete_done'")

Thanks,
Amir

Changes since V2:
- Instead of reusing unused bits in existing uapi fields, using new netlink
attributes for the new information. This way new/old user space and 
new/old
kernel can live together without having misunderstandings.

Changes since V1:
- No changes - V1 was sent and didn't make it for 4.10.
- You asked me [1] why did I use specific header names instead of layers (L2,
L3...), and I explained that it is on purpose, this extra information is
planned to be used by hardware drivers to offload the action.


Amir Vadai (3):
  net/skbuff: Introduce skb_mac_offset()
  net/act_pedit: Support using offset relative to the conventional
network headers
  net/act_pedit: Introduce 'add' operation

 include/linux/skbuff.h   |   5 +
 include/net/tc_act/tc_pedit.h|   6 +
 include/uapi/linux/tc_act/tc_pedit.h |  31 +
 net/sched/act_pedit.c| 220 ---
 4 files changed, 245 insertions(+), 17 deletions(-)

-- 
2.11.0



[PATCH net-next V3 3/3] net/act_pedit: Introduce 'add' operation

2017-02-06 Thread Amir Vadai
This command could be useful to inc/dec fields.

For example, to forward any TCP packet and decrease its TTL:
$ tc filter add dev enp0s9 protocol ip parent : \
flower ip_proto tcp \
action pedit munge ip ttl add 0xff pipe \
action mirred egress redirect dev veth0

In the example above, adding 0xff to this u8 field is actually
decreasing it by one, since the operation is masked.

Signed-off-by: Amir Vadai <a...@vadai.me>
Reviewed-by: Or Gerlitz <ogerl...@mellanox.com>
---
 include/net/tc_act/tc_pedit.h|  1 +
 include/uapi/linux/tc_act/tc_pedit.h |  8 
 net/sched/act_pedit.c| 30 ++
 3 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h
index e076f22035a5..dfbd6ee0bc7c 100644
--- a/include/net/tc_act/tc_pedit.h
+++ b/include/net/tc_act/tc_pedit.h
@@ -5,6 +5,7 @@
 
 struct tcf_pedit_key_ex {
enum pedit_header_type htype;
+   enum pedit_cmd cmd;
 };
 
 struct tcf_pedit {
diff --git a/include/uapi/linux/tc_act/tc_pedit.h 
b/include/uapi/linux/tc_act/tc_pedit.h
index 22f19eeda997..143d2b31a316 100644
--- a/include/uapi/linux/tc_act/tc_pedit.h
+++ b/include/uapi/linux/tc_act/tc_pedit.h
@@ -20,6 +20,7 @@ enum {

 
 enum {
TCA_PEDIT_KEY_EX_HTYPE = 1,
+   TCA_PEDIT_KEY_EX_CMD = 2,
__TCA_PEDIT_KEY_EX_MAX
 };
 #define TCA_PEDIT_KEY_EX_MAX (__TCA_PEDIT_KEY_EX_MAX - 1)
@@ -38,6 +39,13 @@ enum pedit_header_type {
 };
 #define TCA_PEDIT_HDR_TYPE_MAX (__PEDIT_HDR_TYPE_MAX - 1)
 
+enum pedit_cmd {
+   TCA_PEDIT_KEY_EX_CMD_SET = 0,
+   TCA_PEDIT_KEY_EX_CMD_ADD = 1,
+   __PEDIT_CMD_MAX,
+};
+#define TCA_PEDIT_CMD_MAX (__PEDIT_CMD_MAX - 1)
+
 struct tc_pedit_key {
__u32   mask;  /* AND */
__u32   val;   /*XOR */
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index fdd012bd3602..c1310472f620 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -36,6 +36,7 @@ static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 
1] = {
 
 static const struct nla_policy pedit_key_ex_policy[TCA_PEDIT_KEY_EX_MAX + 1] = 
{
[TCA_PEDIT_KEY_EX_HTYPE]  = { .type = NLA_U16 },
+   [TCA_PEDIT_KEY_EX_CMD]= { .type = NLA_U16 },
 };
 
 static struct tcf_pedit_key_ex *tcf_pedit_keys_ex_parse(struct nlattr *nla,
@@ -75,14 +76,17 @@ static struct tcf_pedit_key_ex 
*tcf_pedit_keys_ex_parse(struct nlattr *nla,
if (err)
goto err_out;
 
-   if (!tb[TCA_PEDIT_KEY_EX_HTYPE]) {
+   if (!tb[TCA_PEDIT_KEY_EX_HTYPE] ||
+   !tb[TCA_PEDIT_KEY_EX_CMD]) {
err = -EINVAL;
goto err_out;
}
 
k->htype = nla_get_u16(tb[TCA_PEDIT_KEY_EX_HTYPE]);
+   k->cmd = nla_get_u16(tb[TCA_PEDIT_KEY_EX_CMD]);
 
-   if (k->htype > TCA_PEDIT_HDR_TYPE_MAX) {
+   if (k->htype > TCA_PEDIT_HDR_TYPE_MAX ||
+   k->cmd > TCA_PEDIT_CMD_MAX) {
err = -EINVAL;
goto err_out;
}
@@ -110,7 +114,8 @@ static int tcf_pedit_key_ex_dump(struct sk_buff *skb,
 
key_start = nla_nest_start(skb, TCA_PEDIT_KEY_EX);
 
-   if (nla_put_u16(skb, TCA_PEDIT_KEY_EX_HTYPE, keys_ex->htype)) {
+   if (nla_put_u16(skb, TCA_PEDIT_KEY_EX_HTYPE, keys_ex->htype) ||
+   nla_put_u16(skb, TCA_PEDIT_KEY_EX_CMD, keys_ex->cmd)) {
nlmsg_trim(skb, keys_start);
return -EINVAL;
}
@@ -280,15 +285,19 @@ static int tcf_pedit(struct sk_buff *skb, const struct 
tc_action *a,
struct tc_pedit_key *tkey = p->tcfp_keys;
struct tcf_pedit_key_ex *tkey_ex = p->tcfp_keys_ex;
enum pedit_header_type htype = 
TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK;
+   enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET;
 
for (i = p->tcfp_nkeys; i > 0; i--, tkey++) {
u32 *ptr, _data;
int offset = tkey->off;
int hoffset;
+   u32 val;
int rc;
 
if (tkey_ex) {
htype = tkey_ex->htype;
+   cmd = tkey_ex->cmd;
+
tkey_ex++;
}
 
@@ -330,7 +339,20 @@ static int tcf_pedit(struct sk_buff *skb, const struct 
tc_action *a,
if (!ptr)
goto bad;
/* just do it, baby */
-   *ptr = ((*ptr & tkey->mask) ^ tkey->val);
+   switch (cmd) {
+

Re: [PATCH net-next V2 0/3] net/sched: act_pedit: Use offset relative to conventional network headers

2017-01-08 Thread Amir Vadai
On Fri, Jan 06, 2017 at 08:51:09PM -0500, David Miller wrote:
> From: Amir Vadai <a...@vadai.me>
> Date: Thu,  5 Jan 2017 11:54:51 +0200
> 
> > Enhancing the UAPI to allow for specifying that would allow the same
> > flows to be set into both SW and HW.
> 
> This is actually not backward compatible.
> 
> When pedit rules are dumped, older tools will not know about the
> type field and therefore will completely misinterpret the rule.
> 
> You must extend this the proper way, which is to add a new attribute
> or something along those lines.  The presence of a new attribute
> is an explicit communication to older tools that something they
> might not support and understand is going on.

Sorry, I missed this scenario. Going back to the drawing board.



Re: [PATCH net-next V2 0/3] net/sched: act_pedit: Use offset relative to conventional network headers

2017-01-05 Thread Amir Vadai
On Thu, Jan 05, 2017 at 12:54:14PM +0100, Jiri Benc wrote:
> On Thu,  5 Jan 2017 11:54:51 +0200, Amir Vadai wrote:
> > You asked me [1] why did I use specific header names instead of layers (L2, 
> > L3...),
> > and I explained that it is on purpose, this extra information is planned to 
> > be used
> > by hardware drivers to offload the action.
> > 
> > Some FW/HW parser APIs are such that they need to get the specific header 
> > type (e.g
> > IPV4 or IPV6, TCP or UDP) and not only the networking level (e.g network or 
> > transport).
> 
> Don't we need better API specification (and enforcement) then, though?
> See below.
> 
> > Usage example:
> > $ tc filter add dev enp0s9 protocol ip parent : \
> >flower \
> >  ip_proto tcp \
> > dst_port 80 \
> >action \
> >pedit munge ip ttl add 0xff \
> >pedit munge tcp dport set 8080 \
> >  pipe action mirred egress redirect dev veth0
> 
> What happens when one does:
> 
> tc filter add ... flower ip_proto udp action pedit munge tcp ...
> 
> ?
This is a simple action. It is not foolproof - it prevents the user
from getting out of packet bounds, but it is the user's responsibility to
provide valid rules.

> 
>  Jiri


[PATCH net-next V2 3/3] net/act_pedit: Introduce 'add' operation

2017-01-05 Thread Amir Vadai
This command could be useful to inc/dec fields.
Command type is embedded in unused bits of the existing shift field,
therefore UAPI backward compatibility is kept.

For example, to forward any TCP packet and decrease its TTL:
$ tc filter add dev enp0s9 protocol ip parent : \
flower ip_proto tcp \
action pedit munge ip ttl add 0xff pipe \
action mirred egress redirect dev veth0

In the example above, adding 0xff to this u8 field is actually
decreasing it by one, since the operation is masked.

Signed-off-by: Amir Vadai <a...@vadai.me>
Reviewed-by: Or Gerlitz <ogerl...@mellanox.com>
---
 include/uapi/linux/tc_act/tc_pedit.h | 10 ++
 net/sched/act_pedit.c| 16 +++-
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/tc_act/tc_pedit.h 
b/include/uapi/linux/tc_act/tc_pedit.h
index 604e6729ad38..80028cd0bb1b 100644
--- a/include/uapi/linux/tc_act/tc_pedit.h
+++ b/include/uapi/linux/tc_act/tc_pedit.h
@@ -35,8 +35,13 @@ struct tc_pedit_sel {
 #define PEDIT_TYPE_SHIFT 24
 #define PEDIT_TYPE_MASK 0xff
 
+#define PEDIT_CMD_SHIFT 16
+#define PEDIT_CMD_MASK 0xff
+
 #define PEDIT_TYPE_GET(_val) \
(((_val) >> PEDIT_TYPE_SHIFT) & PEDIT_TYPE_MASK)
+#define PEDIT_CMD_GET(_val) \
+   (((_val) >> PEDIT_CMD_SHIFT) & PEDIT_CMD_MASK)
 #define PEDIT_SHIFT_GET(_val) ((_val) & 0xff)
 
 enum pedit_header_type {
@@ -49,4 +54,9 @@ enum pedit_header_type {
PEDIT_HDR_TYPE_UDP = 5,
 };
 
+enum pedit_cmd {
+   PEDIT_CMD_SET = 0,
+   PEDIT_CMD_ADD = 1,
+};
+
 #endif
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 4b9c7184c752..aa137d51bf7f 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -169,6 +169,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct 
tc_action *a,
u32 *ptr, _data;
int offset = tkey->off;
int hoffset;
+   u32 val;
int rc;
enum pedit_header_type htype =
PEDIT_TYPE_GET(tkey->shift);
@@ -214,7 +215,20 @@ static int tcf_pedit(struct sk_buff *skb, const struct 
tc_action *a,
if (!ptr)
goto bad;
/* just do it, baby */
-   *ptr = ((*ptr & tkey->mask) ^ tkey->val);
+   switch (PEDIT_CMD_GET(tkey->shift)) {
+   case PEDIT_CMD_SET:
+   val = tkey->val;
+   break;
+   case PEDIT_CMD_ADD:
+   val = (*ptr + tkey->val) & ~tkey->mask;
+   break;
+   default:
+   pr_info("tc filter pedit bad command (%d)\n",
+   PEDIT_CMD_GET(tkey->shift));
+   goto bad;
+   }
+
+   *ptr = ((*ptr & tkey->mask) ^ val);
if (ptr == &_data)
skb_store_bits(skb, hoffset + offset, ptr, 4);
}
-- 
2.11.0



[PATCH net-next V2 0/3] net/sched: act_pedit: Use offset relative to conventional network headers

2017-01-05 Thread Amir Vadai
Hi Dave,

This is a respin of the patchset. V1 was sent and didn't make it for 4.10.

You asked me [1] why did I use specific header names instead of layers (L2, 
L3...),
and I explained that it is on purpose, this extra information is planned to be 
used
by hardware drivers to offload the action.

Some FW/HW parser APIs are such that they need to get the specific header type 
(e.g
IPV4 or IPV6, TCP or UDP) and not only the networking level (e.g network or 
transport).

Enhancing the UAPI to allow for specifying that would allow the same flows to be
set into both SW and HW.

This patchset also makes pedit more robust. Currently fields offset is specified
by offset relative to the ip header, while using negative offsets for 
MAC layer fields.

This series enables the user to set offset relative to the relevant header.

This patch is reusing existing fields in a way where backward UAPI 
compatibility is being kept.

Usage example:
$ tc filter add dev enp0s9 protocol ip parent : \
   flower \
 ip_proto tcp \
dst_port 80 \
   action \
   pedit munge ip ttl add 0xff \
   pedit munge tcp dport set 8080 \
 pipe action mirred egress redirect dev veth0

Will forward traffic destined to tcp dport 80, while modifying the
destination port to 8080, and decreasing the ttl by one.

I've uploaded a draft for the userspace [2] to make it easier to review and
test the patchset.

[1] - http://patchwork.ozlabs.org/patch/700909/
[2] - git: https://bitbucket.org/av42/iproute2.git
  branch: pedit

Patchset was tested and applied on top of upstream commit 57ea884b0dcf
("packet: fix panic in __packet_set_timestamp on tpacket_v3 in tx mode")

Thanks,
Amir

Amir Vadai (3):
  net/skbuff: Introduce skb_mac_offset()
  net/act_pedit: Support using offset relative to the conventional
network headers
  net/act_pedit: Introduce 'add' operation

 include/linux/skbuff.h   |  5 +++
 include/uapi/linux/tc_act/tc_pedit.h | 27 
 net/sched/act_pedit.c| 81 ++--
 3 files changed, 100 insertions(+), 13 deletions(-)

-- 
2.11.0



[PATCH net-next V2 2/3] net/act_pedit: Support using offset relative to the conventional network headers

2017-01-05 Thread Amir Vadai
Extend pedit to enable the user setting offset relative to network
headers. This change would enable to work with more complex header
schemes (vs the simple IPv4 case) where setting a fixed offset relative
to the network header is not enough. It is also forward looking to
enable hardware offloading of pedit.

The header type is embedded in the 8 MSB of the u32 key->shift which
were never used till now. Therefore backward compatibility is being
kept.

Usage example:
$ tc filter add dev enp0s9 protocol ip parent : \
  flower \
ip_proto tcp \
dst_port 80 \
  action pedit munge tcp dport set 8080 pipe \
  action mirred egress redirect dev veth0

Will forward TCP packets whose original dest port is 80, while modifying
the destination port to 8080.

Signed-off-by: Amir Vadai <a...@vadai.me>
Reviewed-by: Or Gerlitz <ogerl...@mellanox.com>
---
 include/uapi/linux/tc_act/tc_pedit.h | 17 ++
 net/sched/act_pedit.c| 65 +---
 2 files changed, 70 insertions(+), 12 deletions(-)

diff --git a/include/uapi/linux/tc_act/tc_pedit.h 
b/include/uapi/linux/tc_act/tc_pedit.h
index 6389959a5157..604e6729ad38 100644
--- a/include/uapi/linux/tc_act/tc_pedit.h
+++ b/include/uapi/linux/tc_act/tc_pedit.h
@@ -32,4 +32,21 @@ struct tc_pedit_sel {
 };
 #define tc_pedit tc_pedit_sel
 
+#define PEDIT_TYPE_SHIFT 24
+#define PEDIT_TYPE_MASK 0xff
+
+#define PEDIT_TYPE_GET(_val) \
+   (((_val) >> PEDIT_TYPE_SHIFT) & PEDIT_TYPE_MASK)
+#define PEDIT_SHIFT_GET(_val) ((_val) & 0xff)
+
+enum pedit_header_type {
+   PEDIT_HDR_TYPE_RAW = 0,
+
+   PEDIT_HDR_TYPE_ETH = 1,
+   PEDIT_HDR_TYPE_IP4 = 2,
+   PEDIT_HDR_TYPE_IP6 = 3,
+   PEDIT_HDR_TYPE_TCP = 4,
+   PEDIT_HDR_TYPE_UDP = 5,
+};
+
 #endif
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index b27c4daec88f..4b9c7184c752 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -119,18 +119,45 @@ static bool offset_valid(struct sk_buff *skb, int offset)
return true;
 }
 
+static int pedit_skb_hdr_offset(struct sk_buff *skb,
+   enum pedit_header_type htype, int *hoffset)
+{
+   int ret = -1;
+
+   switch (htype) {
+   case PEDIT_HDR_TYPE_ETH:
+   if (skb_mac_header_was_set(skb)) {
+   *hoffset = skb_mac_offset(skb);
+   ret = 0;
+   }
+   break;
+   case PEDIT_HDR_TYPE_RAW:
+   case PEDIT_HDR_TYPE_IP4:
+   case PEDIT_HDR_TYPE_IP6:
+   *hoffset = skb_network_offset(skb);
+   ret = 0;
+   break;
+   case PEDIT_HDR_TYPE_TCP:
+   case PEDIT_HDR_TYPE_UDP:
+   if (skb_transport_header_was_set(skb)) {
+   *hoffset = skb_transport_offset(skb);
+   ret = 0;
+   }
+   break;
+   };
+
+   return ret;
+}
+
 static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
 struct tcf_result *res)
 {
struct tcf_pedit *p = to_pedit(a);
int i;
-   unsigned int off;
 
if (skb_unclone(skb, GFP_ATOMIC))
return p->tcf_action;
 
-   off = skb_network_offset(skb);
-
spin_lock(>tcf_lock);
 
tcf_lastuse_update(>tcf_tm);
@@ -141,20 +168,32 @@ static int tcf_pedit(struct sk_buff *skb, const struct 
tc_action *a,
for (i = p->tcfp_nkeys; i > 0; i--, tkey++) {
u32 *ptr, _data;
int offset = tkey->off;
+   int hoffset;
+   int rc;
+   enum pedit_header_type htype =
+   PEDIT_TYPE_GET(tkey->shift);
+
+   rc = pedit_skb_hdr_offset(skb, htype, );
+   if (rc) {
+   pr_info("tc filter pedit bad header type 
specified (0x%x)\n",
+   htype);
+   goto bad;
+   }
 
if (tkey->offmask) {
char *d, _d;
 
-   if (!offset_valid(skb, off + tkey->at)) {
+   if (!offset_valid(skb, hoffset + tkey->at)) {
pr_info("tc filter pedit 'at' offset %d 
out of bounds\n",
-   off + tkey->at);
+   hoffset + tkey->at);
goto bad;
}
-   d = skb_header_pointer(skb, off + tkey->at, 1,
-  &_d);
+   d = skb_header_pointer(skb,
+   

[PATCH net-next V2 1/3] net/skbuff: Introduce skb_mac_offset()

2017-01-05 Thread Amir Vadai
Introduce skb_mac_offset() that could be used to get mac header offset.

Signed-off-by: Amir Vadai <a...@vadai.me>
Reviewed-by: Or Gerlitz <ogerl...@mellanox.com>
---
 include/linux/skbuff.h | 5 +
 1 file changed, 5 insertions(+)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b53c0cfd417e..3d8f81f39c2b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2178,6 +2178,11 @@ static inline unsigned char *skb_mac_header(const struct 
sk_buff *skb)
return skb->head + skb->mac_header;
 }
 
+static inline int skb_mac_offset(const struct sk_buff *skb)
+{
+   return skb_mac_header(skb) - skb->data;
+}
+
 static inline int skb_mac_header_was_set(const struct sk_buff *skb)
 {
return skb->mac_header != (typeof(skb->mac_header))~0U;
-- 
2.11.0



[PATCH iproute2 V5 0/3] tc: Support for ip tunnel metadata set/unset/classify

2016-12-02 Thread Amir Vadai
Hi,

This short series adds support for matching and setting metadata for ip tunnel
shared device using the TC system, introduced in kernel 4.9 [1].

Applied and tested on top of commit b6c7fc61faab ("ss: print new tcp_info
fields: busy, rwnd-limited, sndbuf-limited times")


Example usage:

$ tc filter add dev vxlan0 protocol ip parent : \
flower \
  enc_src_ip 11.11.0.2 \
  enc_dst_ip 11.11.0.1 \
  enc_key_id 11 \
  dst_ip 11.11.11.1 \
action mirred egress redirect dev vnet0

$ tc filter add dev net0 protocol ip parent : \
flower \
  ip_proto 1 \
  dst_ip 11.11.11.2 \
action tunnel_key set \
  src_ip 11.11.0.1 \
  dst_ip 11.11.0.2 \
  id 11 \
action mirred egress redirect dev vxlan0

[1] - d1ba24feb466 ("Merge branch 'act_tunnel_key'")

Thanks,
Amir

Changes from V4:
- Fix rebase conflicts for net-next

Changes from V3:
- Fix bad wording in the man page about the use of the 'unset' operation

Changes from V2:
- Use const where needed
- Don't lose return value
- Introduce rta_getattr_be16() and rta_getattr_be32()

Changes from V1:
- Updated Patch 2/2 ("tc/act_tunnel: Introduce ip tunnel action") commit log
and the man page tc-tunnel_key to reflect the fact that the 'unset'
operation is not mandatory.
And describe when it might be needed.
- Rename the 'release' operation to 'unset'

Amir Vadai (3):
  libnetlink: Introduce rta_getattr_be*()
  tc/cls_flower: Classify packet in ip tunnels
  tc/act_tunnel: Introduce ip tunnel action

Amir Vadai (3):
  libnetlink: Introduce rta_getattr_be*()
  tc/cls_flower: Classify packet in ip tunnels
  tc/act_tunnel: Introduce ip tunnel action

 bridge/fdb.c |   4 +-
 include/libnetlink.h |   9 ++
 include/linux/tc_act/tc_tunnel_key.h |  42 ++
 ip/iplink_geneve.c   |   2 +-
 ip/iplink_vxlan.c|   2 +-
 man/man8/tc-flower.8 |  17 ++-
 man/man8/tc-tunnel_key.8 | 112 +++
 tc/Makefile  |   1 +
 tc/f_flower.c|  84 +++-
 tc/m_tunnel_key.c| 258 +++
 10 files changed, 522 insertions(+), 9 deletions(-)
 create mode 100644 include/linux/tc_act/tc_tunnel_key.h
 create mode 100644 man/man8/tc-tunnel_key.8
 create mode 100644 tc/m_tunnel_key.c

-- 
2.10.2



[PATCH iproute2 V5 3/3] tc/act_tunnel: Introduce ip tunnel action

2016-12-02 Thread Amir Vadai
This action could be used before redirecting packets to a shared tunnel
device, or when redirecting packets arriving from such a device.

The 'unset' action is optional. It is used to explicitly unset the
metadata created by the tunnel device during decap. If not used, the
metadata will be released automatically by the kernel.
The 'set' operation, will set the metadata with the specified values for
the encap.

For example, the following flower filter will forward all ICMP packets
destined to 11.11.11.2 through the shared vxlan device 'vxlan0'. Before
redirecting, metadata for the vxlan tunnel is created using the
tunnel_key action and its arguments:

$ tc filter add dev net0 protocol ip parent : \
flower \
  ip_proto 1 \
  dst_ip 11.11.11.2 \
action tunnel_key set \
  src_ip 11.11.0.1 \
  dst_ip 11.11.0.2 \
  id 11 \
action mirred egress redirect dev vxlan0

Signed-off-by: Amir Vadai <a...@vadai.me>
---
 include/linux/tc_act/tc_tunnel_key.h |  42 ++
 man/man8/tc-tunnel_key.8 | 112 +++
 tc/Makefile  |   1 +
 tc/m_tunnel_key.c| 258 +++
 4 files changed, 413 insertions(+)
 create mode 100644 include/linux/tc_act/tc_tunnel_key.h
 create mode 100644 man/man8/tc-tunnel_key.8
 create mode 100644 tc/m_tunnel_key.c

diff --git a/include/linux/tc_act/tc_tunnel_key.h 
b/include/linux/tc_act/tc_tunnel_key.h
new file mode 100644
index ..f9ddf5369a45
--- /dev/null
+++ b/include/linux/tc_act/tc_tunnel_key.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2016, Amir Vadai <a...@vadai.me>
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __LINUX_TC_TUNNEL_KEY_H
+#define __LINUX_TC_TUNNEL_KEY_H
+
+#include 
+
+#define TCA_ACT_TUNNEL_KEY 17
+
+#define TCA_TUNNEL_KEY_ACT_SET 1
+#define TCA_TUNNEL_KEY_ACT_RELEASE  2
+
+struct tc_tunnel_key {
+   tc_gen;
+   int t_action;
+};
+
+enum {
+   TCA_TUNNEL_KEY_UNSPEC,
+   TCA_TUNNEL_KEY_TM,
+   TCA_TUNNEL_KEY_PARMS,
+   TCA_TUNNEL_KEY_ENC_IPV4_SRC,/* be32 */
+   TCA_TUNNEL_KEY_ENC_IPV4_DST,/* be32 */
+   TCA_TUNNEL_KEY_ENC_IPV6_SRC,/* struct in6_addr */
+   TCA_TUNNEL_KEY_ENC_IPV6_DST,/* struct in6_addr */
+   TCA_TUNNEL_KEY_ENC_KEY_ID,  /* be64 */
+   TCA_TUNNEL_KEY_PAD,
+   __TCA_TUNNEL_KEY_MAX,
+};
+
+#define TCA_TUNNEL_KEY_MAX (__TCA_TUNNEL_KEY_MAX - 1)
+
+#endif
+
diff --git a/man/man8/tc-tunnel_key.8 b/man/man8/tc-tunnel_key.8
new file mode 100644
index ..17b15b9b34b9
--- /dev/null
+++ b/man/man8/tc-tunnel_key.8
@@ -0,0 +1,112 @@
+.TH "Tunnel metadata manipulation action in tc" 8 "10 Nov 2016" "iproute2" 
"Linux"
+
+.SH NAME
+tunnel_key - Tunnel metadata manipulation
+.SH SYNOPSIS
+.in +8
+.ti -8
+.BR tc " ... " "action tunnel_key" " { " unset " | "
+.IR SET " }"
+
+.ti -8
+.IR SET " := "
+.BR set " " src_ip
+.IR ADDRESS
+.BR dst_ip
+.IR ADDRESS
+.BI id " KEY_ID"
+
+.SH DESCRIPTION
+The
+.B tunnel_key
+action combined with a shared IP tunnel device, allows to perform IP tunnel en-
+or decapsulation on a packet, reflected by
+the operation modes
+.IR UNSET " and " SET .
+The
+.I UNSET
+mode is optional - even without using it, the metadata information will be
+released automatically when packet processing will be finished.
+.IR UNSET
+function could be used in cases when traffic is forwarded between two tunnels,
+where the metadata from the first tunnel will be used for encapsulation done by
+the second tunnel.
+.IR SET
+mode requires the source and destination ip
+.I ADDRESS
+and the tunnel key id
+.I KEY_ID
+which will be used by the ip tunnel shared device to create the tunnel header. 
The
+.B tunnel_key
+action is useful only in combination with a
+.B mirred redirect
+action to a shared IP tunnel device which will use the metadata (for
+.I SET
+) and unset the metadata created by it (for
+.I UNSET
+).
+
+.SH OPTIONS
+.TP
+.B unset
+Unset the tunnel metadata created by the IP tunnel device.  This function is
+not mandatory and might be used only in some specific use cases (as explained
+above).
+.TP
+.B set
+Set tunnel metadata to be used by the IP tunnel device. Requires
+.B id
+,
+.B src_ip
+and
+.B dst_ip
+options.
+.RS
+.TP
+.B id
+Tunnel ID (for example VNI in VXLAN tunnel)
+.TP
+.B src_ip
+Outer header source IP address (IPv4 or IPv6)
+.TP
+.B dst_ip
+Outer header destination IP address (IPv4 or IPv6)
+.RE
+.SH EXAMPLES
+The following example encapsulates incoming ICMP packets on eth0 into a vxlan
+tunnel, by s

[PATCH iproute2 V5 2/3] tc/cls_flower: Classify packet in ip tunnels

2016-12-02 Thread Amir Vadai
Introduce classifying by metadata extracted by the tunnel device.
Outer header fields - source/dest ip and tunnel id, are extracted from
the metadata when classifying.

For example, the following will add a filter on the ingress Qdisc of the shared
vxlan device named 'vxlan0', matching packets with outer src ip
11.11.0.2, dst ip 11.11.0.1 and tunnel id 11. The matching packets will be
forwarded to tap device 'vnet0':

$ tc filter add dev vxlan0 protocol ip parent : \
flower \
  enc_src_ip 11.11.0.2 \
  enc_dst_ip 11.11.0.1 \
  enc_key_id 11 \
  dst_ip 11.11.11.1 \
action mirred egress redirect dev vnet0

Signed-off-by: Amir Vadai <a...@vadai.me>
---
 man/man8/tc-flower.8 | 17 ++-
 tc/f_flower.c| 82 ++--
 2 files changed, 95 insertions(+), 4 deletions(-)

diff --git a/man/man8/tc-flower.8 b/man/man8/tc-flower.8
index 16ef261797ab..dd3564917dcc 100644
--- a/man/man8/tc-flower.8
+++ b/man/man8/tc-flower.8
@@ -34,7 +34,11 @@ flower \- flow based traffic control filter
 .BR dst_ip " | " src_ip " } { "
 .IR ipv4_address " | " ipv6_address " } | { "
 .BR dst_port " | " src_port " } "
-.IR port_number " }"
+.IR port_number " } | "
+.B enc_key_id
+.IR KEY-ID " | {"
+.BR enc_dst_ip " | " enc_src_ip " } { "
+.IR ipv4_address " | " ipv6_address " } | "
 .SH DESCRIPTION
 The
 .B flower
@@ -112,6 +116,17 @@ which has to be specified in beforehand.
 Match on layer 4 protocol source or destination port number. Only available for
 .BR ip_proto " values " udp " and " tcp ,
 which has to be specified in beforehand.
+.TP
+.BI enc_key_id " NUMBER"
+.TQ
+.BI enc_dst_ip " ADDRESS"
+.TQ
+.BI enc_src_ip " ADDRESS"
+Match on IP tunnel metadata. Key id
+.I NUMBER
+is a 32 bit tunnel key id (e.g. VNI for VXLAN tunnel).
+.I ADDRESS
+must be a valid IPv4 or IPv6 address.
 .SH NOTES
 As stated above where applicable, matches of a certain layer implicitly depend
 on the matches of the next lower layer. Precisely, layer one and two matches (
diff --git a/tc/f_flower.c b/tc/f_flower.c
index e132974e0d1d..7e7f4c92a947 100644
--- a/tc/f_flower.c
+++ b/tc/f_flower.c
@@ -41,7 +41,10 @@ static void explain(void)
"   dst_ip [ IPV4-ADDR | IPV6-ADDR ] |\n"
"   src_ip [ IPV4-ADDR | IPV6-ADDR ] |\n"
"   dst_port PORT-NUMBER |\n"
-   "   src_port PORT-NUMBER }\n"
+   "   src_port PORT-NUMBER |\n"
+   "   enc_dst_ip [ IPV4-ADDR | IPV6-ADDR ] 
|\n"
+   "   enc_src_ip [ IPV4-ADDR | IPV6-ADDR ] 
|\n"
+   "   enc_key_id [ KEY-ID ] }\n"
"   FILTERID := X:Y:Z\n"
"   ACTION-SPEC := ... look at individual actions\n"
"\n"
@@ -125,8 +128,9 @@ static int flower_parse_ip_addr(char *str, __be16 eth_type,
family = AF_INET;
} else if (eth_type == htons(ETH_P_IPV6)) {
family = AF_INET6;
+   } else if (!eth_type) {
+   family = AF_UNSPEC;
} else {
-   fprintf(stderr, "Illegal \"eth_type\" for ip address\n");
return -1;
}
 
@@ -134,8 +138,10 @@ static int flower_parse_ip_addr(char *str, __be16 eth_type,
if (ret)
return -1;
 
-   if (addr.family != family)
+   if (family && (addr.family != family)) {
+   fprintf(stderr, "Illegal \"eth_type\" for ip address\n");
return -1;
+   }
 
addattr_l(n, MAX_MSG, addr.family == AF_INET ? addr4_type : addr6_type,
  addr.data, addr.bytelen);
@@ -197,6 +203,18 @@ static int flower_parse_port(char *str, __u8 ip_port, bool 
is_src,
return 0;
 }
 
+static int flower_parse_key_id(const char *str, int type, struct nlmsghdr *n)
+{
+   int ret;
+   __be32 key_id;
+
+   ret = get_be32(_id, str, 10);
+   if (!ret)
+   addattr32(n, MAX_MSG, type, key_id);
+
+   return ret;
+}
+
 static int flower_parse_opt(struct filter_util *qu, char *handle,
int argc, char **argv, struct nlmsghdr *n)
 {
@@ -354,6 +372,38 @@ static int flower_parse_opt(struct filter_util *qu, char 
*handle,
fprintf(stderr, "Illegal \"src_port\"\n");
return -1;
}
+   } else if (matches(*argv,

[PATCH iproute2 V5 1/3] libnetlink: Introduce rta_getattr_be*()

2016-12-02 Thread Amir Vadai
Add the utility functions rta_getattr_be16() and rta_getattr_be32(), and
change existing code to use it.

Signed-off-by: Amir Vadai <a...@vadai.me>
---
 bridge/fdb.c | 4 ++--
 include/libnetlink.h | 9 +
 ip/iplink_geneve.c   | 2 +-
 ip/iplink_vxlan.c| 2 +-
 tc/f_flower.c| 2 +-
 5 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/bridge/fdb.c b/bridge/fdb.c
index 90f4b154c5dc..a91521776e99 100644
--- a/bridge/fdb.c
+++ b/bridge/fdb.c
@@ -168,10 +168,10 @@ int print_fdb(const struct sockaddr_nl *who, struct 
nlmsghdr *n, void *arg)
if (tb[NDA_PORT]) {
if (jw_global)
jsonw_uint_field(jw_global, "port",
-ntohs(rta_getattr_u16(tb[NDA_PORT])));
+rta_getattr_be16(tb[NDA_PORT]));
else
fprintf(fp, "port %d ",
-   ntohs(rta_getattr_u16(tb[NDA_PORT])));
+   rta_getattr_be16(tb[NDA_PORT]));
}
 
if (tb[NDA_VNI]) {
diff --git a/include/libnetlink.h b/include/libnetlink.h
index 483509ca9635..751ebf186dd4 100644
--- a/include/libnetlink.h
+++ b/include/libnetlink.h
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include 
+#include 
 
 struct rtnl_handle {
int fd;
@@ -140,10 +141,18 @@ static inline __u16 rta_getattr_u16(const struct rtattr 
*rta)
 {
return *(__u16 *)RTA_DATA(rta);
 }
+static inline __be16 rta_getattr_be16(const struct rtattr *rta)
+{
+   return ntohs(rta_getattr_u16(rta));
+}
 static inline __u32 rta_getattr_u32(const struct rtattr *rta)
 {
return *(__u32 *)RTA_DATA(rta);
 }
+static inline __be32 rta_getattr_be32(const struct rtattr *rta)
+{
+   return ntohl(rta_getattr_u32(rta));
+}
 static inline __u64 rta_getattr_u64(const struct rtattr *rta)
 {
__u64 tmp;
diff --git a/ip/iplink_geneve.c b/ip/iplink_geneve.c
index 3bfba91c644c..1e6669d07d60 100644
--- a/ip/iplink_geneve.c
+++ b/ip/iplink_geneve.c
@@ -234,7 +234,7 @@ static void geneve_print_opt(struct link_util *lu, FILE *f, 
struct rtattr *tb[])
 
if (tb[IFLA_GENEVE_PORT])
fprintf(f, "dstport %u ",
-   ntohs(rta_getattr_u16(tb[IFLA_GENEVE_PORT])));
+   rta_getattr_be16(tb[IFLA_GENEVE_PORT]));
 
if (tb[IFLA_GENEVE_COLLECT_METADATA])
fputs("external ", f);
diff --git a/ip/iplink_vxlan.c b/ip/iplink_vxlan.c
index 93af979a1e97..6d02bb47b2f0 100644
--- a/ip/iplink_vxlan.c
+++ b/ip/iplink_vxlan.c
@@ -413,7 +413,7 @@ static void vxlan_print_opt(struct link_util *lu, FILE *f, 
struct rtattr *tb[])
 
if (tb[IFLA_VXLAN_PORT])
fprintf(f, "dstport %u ",
-   ntohs(rta_getattr_u16(tb[IFLA_VXLAN_PORT])));
+   rta_getattr_be16(tb[IFLA_VXLAN_PORT]));
 
if (tb[IFLA_VXLAN_LEARNING] &&
!rta_getattr_u8(tb[IFLA_VXLAN_LEARNING]))
diff --git a/tc/f_flower.c b/tc/f_flower.c
index 1555764b9996..e132974e0d1d 100644
--- a/tc/f_flower.c
+++ b/tc/f_flower.c
@@ -511,7 +511,7 @@ static void flower_print_ip_addr(FILE *f, char *name, 
__be16 eth_type,
 
 static void flower_print_port(FILE *f, char *name, struct rtattr *attr)
 {
-   fprintf(f, "\n  %s %d", name, ntohs(rta_getattr_u16(attr)));
+   fprintf(f, "\n  %s %d", name, rta_getattr_be16(attr));
 }
 
 static int flower_print_opt(struct filter_util *qu, FILE *f,
-- 
2.10.2



Re: [PATCH net-next 2/3] net/act_pedit: Support using offset relative to the conventional network headers

2016-12-02 Thread Amir Vadai
On Thu, Dec 01, 2016 at 02:41:14PM -0500, David Miller wrote:
> From: Amir Vadai <a...@vadai.me>
> Date: Wed, 30 Nov 2016 11:09:27 +0200
> 
> > @@ -119,18 +119,45 @@ static bool offset_valid(struct sk_buff *skb, int 
> > offset)
> > return true;
> >  }
> >  
> > +static int pedit_skb_hdr_offset(struct sk_buff *skb,
> > +   enum pedit_header_type htype, int *hoffset)
> > +{
> > +   int ret = -1;
> > +
> > +   switch (htype) {
> > +   case PEDIT_HDR_TYPE_ETH:
> > +   if (skb_mac_header_was_set(skb)) {
> > +   *hoffset = skb_mac_offset(skb);
> > +   ret = 0;
> > +   }
> > +   break;
> > +   case PEDIT_HDR_TYPE_RAW:
> > +   case PEDIT_HDR_TYPE_IP4:
> > +   case PEDIT_HDR_TYPE_IP6:
> > +   *hoffset = skb_network_offset(skb);
> > +   ret = 0;
> > +   break;
> > +   case PEDIT_HDR_TYPE_TCP:
> > +   case PEDIT_HDR_TYPE_UDP:
> > +   if (skb_transport_header_was_set(skb)) {
> > +   *hoffset = skb_transport_offset(skb);
> > +   ret = 0;
> > +   }
> > +   break;
> > +   };
> > +
> > +   return ret;
> > +}
> > +
> 
> The only distinction between the cases is "L2", "L3", and "L4".
> 
> Therefore I don't see any reason to break it down into IP4 vs. IP6 vs.
> RAW, for example.  They all map to the same thing.
> 
> So why not just have PEDIT_HDR_TYPE_L2, PEDIT_HDR_TYPE_L3, and
> PEDIT_HDR_TYPE_L4?  It definitely seems more straightforward
> and cleaner that way.
Yeah, it isn't by mistake. The next step will be to implement hardware
offloading of the action, and for that we would like to keep the
information about the specific header type.

> 
> Thanks.


[PATCH iproute2 V4 2/3] tc/cls_flower: Classify packet in ip tunnels

2016-12-01 Thread Amir Vadai
Introduce classifying by metadata extracted by the tunnel device.
Outer header fields - source/dest ip and tunnel id, are extracted from
the metadata when classifying.

For example, the following will add a filter on the ingress Qdisc of the shared
vxlan device named 'vxlan0', matching packets with outer src ip
11.11.0.2, dst ip 11.11.0.1 and tunnel id 11. The matching packets will be
forwarded to tap device 'vnet0':

$ tc filter add dev vxlan0 protocol ip parent : \
flower \
  enc_src_ip 11.11.0.2 \
  enc_dst_ip 11.11.0.1 \
  enc_key_id 11 \
  dst_ip 11.11.11.1 \
action mirred egress redirect dev vnet0

Signed-off-by: Amir Vadai <a...@vadai.me>
---
 man/man8/tc-flower.8 | 17 ++-
 tc/f_flower.c| 84 +---
 2 files changed, 96 insertions(+), 5 deletions(-)

diff --git a/man/man8/tc-flower.8 b/man/man8/tc-flower.8
index 74f76647753b..0e0b0cf4bb72 100644
--- a/man/man8/tc-flower.8
+++ b/man/man8/tc-flower.8
@@ -36,7 +36,11 @@ flower \- flow based traffic control filter
 .BR dst_ip " | " src_ip " } { "
 .IR ipv4_address " | " ipv6_address " } | { "
 .BR dst_port " | " src_port " } "
-.IR port_number " }"
+.IR port_number " } | "
+.B enc_key_id
+.IR KEY-ID " | {"
+.BR enc_dst_ip " | " enc_src_ip " } { "
+.IR ipv4_address " | " ipv6_address " } | "
 .SH DESCRIPTION
 The
 .B flower
@@ -121,6 +125,17 @@ which has to be specified in beforehand.
 Match on layer 4 protocol source or destination port number. Only available for
 .BR ip_proto " values " udp " and " tcp ,
 which has to be specified in beforehand.
+.TP
+.BI enc_key_id " NUMBER"
+.TQ
+.BI enc_dst_ip " ADDRESS"
+.TQ
+.BI enc_src_ip " ADDRESS"
+Match on IP tunnel metadata. Key id
+.I NUMBER
+is a 32 bit tunnel key id (e.g. VNI for VXLAN tunnel).
+.I ADDRESS
+must be a valid IPv4 or IPv6 address.
 .SH NOTES
 As stated above where applicable, matches of a certain layer implicitly depend
 on the matches of the next lower layer. Precisely, layer one and two matches (
diff --git a/tc/f_flower.c b/tc/f_flower.c
index 2d31d1aa832d..173cfc20f90b 100644
--- a/tc/f_flower.c
+++ b/tc/f_flower.c
@@ -41,7 +41,10 @@ static void explain(void)
fprintf(stderr, "   dst_ip [ IPV4-ADDR | IPV6-ADDR 
] |\n");
fprintf(stderr, "   src_ip [ IPV4-ADDR | IPV6-ADDR 
] |\n");
fprintf(stderr, "   dst_port PORT-NUMBER |\n");
-   fprintf(stderr, "   src_port PORT-NUMBER }\n");
+   fprintf(stderr, "   src_port PORT-NUMBER |\n");
+   fprintf(stderr, "   enc_dst_ip [ IPV4-ADDR | 
IPV6-ADDR ] |\n");
+   fprintf(stderr, "   enc_src_ip [ IPV4-ADDR | 
IPV6-ADDR ] |\n");
+   fprintf(stderr, "   enc_key_id [ KEY-ID ] }\n");
fprintf(stderr, "   FILTERID := X:Y:Z\n");
fprintf(stderr, "   ACTION-SPEC := ... look at individual 
actions\n");
fprintf(stderr, "\n");
@@ -121,8 +124,9 @@ static int flower_parse_ip_addr(char *str, __be16 eth_type,
family = AF_INET;
} else if (eth_type == htons(ETH_P_IPV6)) {
family = AF_INET6;
+   } else if (!eth_type) {
+   family = AF_UNSPEC;
} else {
-   fprintf(stderr, "Illegal \"eth_type\" for ip address\n");
return -1;
}
 
@@ -130,8 +134,10 @@ static int flower_parse_ip_addr(char *str, __be16 eth_type,
if (ret)
return -1;
 
-   if (addr.family != family)
+   if (family && (addr.family != family)) {
+   fprintf(stderr, "Illegal \"eth_type\" for ip address\n");
return -1;
+   }
 
addattr_l(n, MAX_MSG, addr.family == AF_INET ? addr4_type : addr6_type,
  addr.data, addr.bytelen);
@@ -181,6 +187,18 @@ static int flower_parse_port(char *str, __u8 ip_port,
return 0;
 }
 
+static int flower_parse_key_id(const char *str, int type, struct nlmsghdr *n)
+{
+   int ret;
+   __be32 key_id;
+
+   ret = get_be32(_id, str, 10);
+   if (!ret)
+   addattr32(n, MAX_MSG, type, key_id);
+
+   return ret;
+}
+
 static int flower_parse_opt(struct filter_util *qu, char *handle,
int argc, char **argv, struct nlmsghdr *n)
 {
@@ -339,6 +357,38 @@ static int flower_parse_opt(struct filter_util *qu, char 
*handle,
fprintf(stderr, "Illegal \"src_port\"\n");
  

[PATCH iproute2 V4 0/3] tc: Support for ip tunnel metadata set/unset/classify

2016-12-01 Thread Amir Vadai
Hi,

This short series adds support for matching and setting metadata for ip tunnel
shared device using the TC system, introduced in kernel 4.9 [1].

Applied and tested on top of commit f3f339e9590a ("cleanup debris from revert")

Example usage:

$ tc filter add dev vxlan0 protocol ip parent : \
flower \
  enc_src_ip 11.11.0.2 \
  enc_dst_ip 11.11.0.1 \
  enc_key_id 11 \
  dst_ip 11.11.11.1 \
action mirred egress redirect dev vnet0

$ tc filter add dev net0 protocol ip parent : \
flower \
  ip_proto 1 \
  dst_ip 11.11.11.2 \
action tunnel_key set \
  src_ip 11.11.0.1 \
  dst_ip 11.11.0.2 \
  id 11 \
action mirred egress redirect dev vxlan0

[1] - d1ba24feb466 ("Merge branch 'act_tunnel_key'")

Thanks,
Amir

Changes from V3:
- Fix bad wording in the man page about the use of the 'unset' operation

Changes from V2:
- Use const where needed
- Don't lose return value
- Introduce rta_getattr_be16() and rta_getattr_be32()

Changes from V1:
- Updated Patch 2/2 ("tc/act_tunnel: Introduce ip tunnel action") commit log
  and the man page tc-tunnel_key to reflect the fact that the 'unset'
  operation is not mandatory, and describe when it might be needed.
- Rename the 'release' operation to 'unset'

Amir Vadai (3):
  libnetlink: Introduce rta_getattr_be*()
  tc/cls_flower: Classify packet in ip tunnels
  tc/act_tunnel: Introduce ip tunnel action

 bridge/fdb.c |   4 +-
 include/libnetlink.h |   9 ++
 include/linux/tc_act/tc_tunnel_key.h |  42 ++
 ip/iplink_geneve.c   |   2 +-
 ip/iplink_vxlan.c|   2 +-
 man/man8/tc-flower.8 |  17 ++-
 man/man8/tc-tunnel_key.8 | 112 +++
 tc/Makefile  |   1 +
 tc/f_flower.c|  84 +++-
 tc/m_tunnel_key.c| 258 +++
 10 files changed, 522 insertions(+), 9 deletions(-)
 create mode 100644 include/linux/tc_act/tc_tunnel_key.h
 create mode 100644 man/man8/tc-tunnel_key.8
 create mode 100644 tc/m_tunnel_key.c

-- 
2.10.2



[PATCH iproute2 V4 1/3] libnetlink: Introduce rta_getattr_be*()

2016-12-01 Thread Amir Vadai
Add the utility functions rta_getattr_be16() and rta_getattr_be32(), and
change existing code to use it.

Signed-off-by: Amir Vadai <a...@vadai.me>
---
 bridge/fdb.c | 4 ++--
 include/libnetlink.h | 9 +
 ip/iplink_geneve.c   | 2 +-
 ip/iplink_vxlan.c| 2 +-
 4 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/bridge/fdb.c b/bridge/fdb.c
index 90f4b154c5dc..a91521776e99 100644
--- a/bridge/fdb.c
+++ b/bridge/fdb.c
@@ -168,10 +168,10 @@ int print_fdb(const struct sockaddr_nl *who, struct 
nlmsghdr *n, void *arg)
if (tb[NDA_PORT]) {
if (jw_global)
jsonw_uint_field(jw_global, "port",
-ntohs(rta_getattr_u16(tb[NDA_PORT])));
+rta_getattr_be16(tb[NDA_PORT]));
else
fprintf(fp, "port %d ",
-   ntohs(rta_getattr_u16(tb[NDA_PORT])));
+   rta_getattr_be16(tb[NDA_PORT]));
}
 
if (tb[NDA_VNI]) {
diff --git a/include/libnetlink.h b/include/libnetlink.h
index 483509ca9635..751ebf186dd4 100644
--- a/include/libnetlink.h
+++ b/include/libnetlink.h
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include 
+#include 
 
 struct rtnl_handle {
int fd;
@@ -140,10 +141,18 @@ static inline __u16 rta_getattr_u16(const struct rtattr 
*rta)
 {
return *(__u16 *)RTA_DATA(rta);
 }
+static inline __be16 rta_getattr_be16(const struct rtattr *rta)
+{
+   return ntohs(rta_getattr_u16(rta));
+}
 static inline __u32 rta_getattr_u32(const struct rtattr *rta)
 {
return *(__u32 *)RTA_DATA(rta);
 }
+static inline __be32 rta_getattr_be32(const struct rtattr *rta)
+{
+   return ntohl(rta_getattr_u32(rta));
+}
 static inline __u64 rta_getattr_u64(const struct rtattr *rta)
 {
__u64 tmp;
diff --git a/ip/iplink_geneve.c b/ip/iplink_geneve.c
index 3bfba91c644c..1e6669d07d60 100644
--- a/ip/iplink_geneve.c
+++ b/ip/iplink_geneve.c
@@ -234,7 +234,7 @@ static void geneve_print_opt(struct link_util *lu, FILE *f, 
struct rtattr *tb[])
 
if (tb[IFLA_GENEVE_PORT])
fprintf(f, "dstport %u ",
-   ntohs(rta_getattr_u16(tb[IFLA_GENEVE_PORT])));
+   rta_getattr_be16(tb[IFLA_GENEVE_PORT]));
 
if (tb[IFLA_GENEVE_COLLECT_METADATA])
fputs("external ", f);
diff --git a/ip/iplink_vxlan.c b/ip/iplink_vxlan.c
index 93af979a1e97..6d02bb47b2f0 100644
--- a/ip/iplink_vxlan.c
+++ b/ip/iplink_vxlan.c
@@ -413,7 +413,7 @@ static void vxlan_print_opt(struct link_util *lu, FILE *f, 
struct rtattr *tb[])
 
if (tb[IFLA_VXLAN_PORT])
fprintf(f, "dstport %u ",
-   ntohs(rta_getattr_u16(tb[IFLA_VXLAN_PORT])));
+   rta_getattr_be16(tb[IFLA_VXLAN_PORT]));
 
if (tb[IFLA_VXLAN_LEARNING] &&
!rta_getattr_u8(tb[IFLA_VXLAN_LEARNING]))
-- 
2.10.2



[PATCH iproute2 V4 3/3] tc/act_tunnel: Introduce ip tunnel action

2016-12-01 Thread Amir Vadai
This action could be used before redirecting packets to a shared tunnel
device, or when redirecting packets arriving from such a device.

The 'unset' action is optional. It is used to explicitly unset the
metadata created by the tunnel device during decap. If not used, the
metadata will be released automatically by the kernel.
The 'set' operation, will set the metadata with the specified values for
the encap.

For example, the following flower filter will forward all ICMP packets
destined to 11.11.11.2 through the shared vxlan device 'vxlan0'. Before
redirecting, metadata for the vxlan tunnel is created using the
tunnel_key action and its arguments:

$ tc filter add dev net0 protocol ip parent : \
flower \
  ip_proto 1 \
  dst_ip 11.11.11.2 \
action tunnel_key set \
  src_ip 11.11.0.1 \
  dst_ip 11.11.0.2 \
  id 11 \
action mirred egress redirect dev vxlan0

Signed-off-by: Amir Vadai <a...@vadai.me>
---
 include/linux/tc_act/tc_tunnel_key.h |  42 ++
 man/man8/tc-tunnel_key.8 | 112 +++
 tc/Makefile  |   1 +
 tc/m_tunnel_key.c| 258 +++
 4 files changed, 413 insertions(+)
 create mode 100644 include/linux/tc_act/tc_tunnel_key.h
 create mode 100644 man/man8/tc-tunnel_key.8
 create mode 100644 tc/m_tunnel_key.c

diff --git a/include/linux/tc_act/tc_tunnel_key.h 
b/include/linux/tc_act/tc_tunnel_key.h
new file mode 100644
index ..f9ddf5369a45
--- /dev/null
+++ b/include/linux/tc_act/tc_tunnel_key.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2016, Amir Vadai <a...@vadai.me>
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __LINUX_TC_TUNNEL_KEY_H
+#define __LINUX_TC_TUNNEL_KEY_H
+
+#include 
+
+#define TCA_ACT_TUNNEL_KEY 17
+
+#define TCA_TUNNEL_KEY_ACT_SET 1
+#define TCA_TUNNEL_KEY_ACT_RELEASE  2
+
+struct tc_tunnel_key {
+   tc_gen;
+   int t_action;
+};
+
+enum {
+   TCA_TUNNEL_KEY_UNSPEC,
+   TCA_TUNNEL_KEY_TM,
+   TCA_TUNNEL_KEY_PARMS,
+   TCA_TUNNEL_KEY_ENC_IPV4_SRC,/* be32 */
+   TCA_TUNNEL_KEY_ENC_IPV4_DST,/* be32 */
+   TCA_TUNNEL_KEY_ENC_IPV6_SRC,/* struct in6_addr */
+   TCA_TUNNEL_KEY_ENC_IPV6_DST,/* struct in6_addr */
+   TCA_TUNNEL_KEY_ENC_KEY_ID,  /* be64 */
+   TCA_TUNNEL_KEY_PAD,
+   __TCA_TUNNEL_KEY_MAX,
+};
+
+#define TCA_TUNNEL_KEY_MAX (__TCA_TUNNEL_KEY_MAX - 1)
+
+#endif
+
diff --git a/man/man8/tc-tunnel_key.8 b/man/man8/tc-tunnel_key.8
new file mode 100644
index ..17b15b9b34b9
--- /dev/null
+++ b/man/man8/tc-tunnel_key.8
@@ -0,0 +1,112 @@
+.TH "Tunnel metadata manipulation action in tc" 8 "10 Nov 2016" "iproute2" 
"Linux"
+
+.SH NAME
+tunnel_key - Tunnel metadata manipulation
+.SH SYNOPSIS
+.in +8
+.ti -8
+.BR tc " ... " "action tunnel_key" " { " unset " | "
+.IR SET " }"
+
+.ti -8
+.IR SET " := "
+.BR set " " src_ip
+.IR ADDRESS
+.BR dst_ip
+.IR ADDRESS
+.BI id " KEY_ID"
+
+.SH DESCRIPTION
+The
+.B tunnel_key
+action combined with a shared IP tunnel device, allows to perform IP tunnel en-
+or decapsulation on a packet, reflected by
+the operation modes
+.IR UNSET " and " SET .
+The
+.I UNSET
+mode is optional - even without using it, the metadata information will be
+released automatically when packet processing will be finished.
+.IR UNSET
+function could be used in cases when traffic is forwarded between two tunnels,
+where the metadata from the first tunnel will be used for encapsulation done by
+the second tunnel.
+.IR SET
+mode requires the source and destination ip
+.I ADDRESS
+and the tunnel key id
+.I KEY_ID
+which will be used by the ip tunnel shared device to create the tunnel header. 
The
+.B tunnel_key
+action is useful only in combination with a
+.B mirred redirect
+action to a shared IP tunnel device which will use the metadata (for
+.I SET
+) and unset the metadata created by it (for
+.I UNSET
+).
+
+.SH OPTIONS
+.TP
+.B unset
+Unset the tunnel metadata created by the IP tunnel device.  This function is
+not mandatory and might be used only in some specific use cases (as explained
+above).
+.TP
+.B set
+Set tunnel metadata to be used by the IP tunnel device. Requires
+.B id
+,
+.B src_ip
+and
+.B dst_ip
+options.
+.RS
+.TP
+.B id
+Tunnel ID (for example VNI in VXLAN tunnel)
+.TP
+.B src_ip
+Outer header source IP address (IPv4 or IPv6)
+.TP
+.B dst_ip
+Outer header destination IP address (IPv4 or IPv6)
+.RE
+.SH EXAMPLES
+The following example encapsulates incoming ICMP packets on eth0 into a vxlan
+tunnel, by s

Re: [PATCH iproute2 V3 3/3] tc/act_tunnel: Introduce ip tunnel action

2016-12-01 Thread Amir Vadai
On Wed, Nov 30, 2016 at 03:44:53PM +0100, Jiri Benc wrote:
> On Wed, 30 Nov 2016 09:38:40 +0200, Amir Vadai wrote:
> > +The
> > +.I UNSET
> > +mode is optional - even without using it, the metadata information will be
> > +released automatically when packet processing will be finished.
> > +.IR UNSET
> > +function could be used in cases when traffic is forwarded between two 
> > tunnels,
> > +where the metadata from the first tunnel will be used for encapsulation 
> > done by
> > +the second tunnel.
> 
> This looks good.
:)

> 
> > +It must be used for offloaded filters, such that hardware drivers can
> > +realize they need to program the HW to do decapsulation.
> 
> However, this is wrong. The hardware offloading must be transparent.
> The same configuration that works when processed in software must work
> in hardware if the hardware has the necessary capabilities. Requiring
> the user to alter the configuration to accommodate hardware
> peculiarities is not acceptable.
> 
> Or maybe I'm misunderstanding what you mean here. In which case it's
> not documented properly :-)
You understood it correctly. We should not force the user to use the
'unset' operation for offloading only. I will remove it from the text
here.

> 
> > +.IR SET
> > +mode requires the source and destination ip
> > +.I ADDRESS
> > +and the tunnel key id
> > +.I KEY_ID
> > +which will be used by the ip tunnel shared device to create the tunnel 
> > header. The
> > +.B tunnel_key
> > +action is useful only in combination with a
> > +.B mirred redirect
> > +action to a shared IP tunnel device which will use the metadata (for
> > +.I SET
> > +) and unset the metadata created by it (for
> > +.I UNSET
> > +).
> > +
> > +.SH OPTIONS
> > +.TP
> > +.B unset
> > +Decapsulation mode, no further arguments allowed. This function is not
> > +mandatory and might be used only in some specific use cases.
> 
> This is NOT decapsulation. The packet is decapsulated at this point in
> any case, whether or not set/unset or whatever is used. These actions
> are only and solely about metadata associated with the packet. The
> actual encapsulation and decapsulation happens at the target netdevice.
> 
> Calling this "decapsulation" is wrong. And if it's implemented as such
> in your hardware offloading, then it's doubly wrong as it doesn't match
> software processing and hence you must not do that and you must change
> that.
Got it. Bad wording by me - will fix it and make sure offloading will
really be transparent to the user.

> 
> > +.TP
> > +.B set
> > +Encapsulation mode. Requires
> 
> Likewise, this is not encapsulation. It just sets metadata.
ack

Thanks,
Amir
> 
>  Jiri


[PATCH net-next 2/3] net/act_pedit: Support using offset relative to the conventional network headers

2016-11-30 Thread Amir Vadai
Extend pedit to enable the user to use an offset relative to network
headers.  This change makes it possible to work with more complex header
schemes (vs the simple IPv4 case) where setting a fixed offset relative
to the network header is not enough. It is also forward looking, making
hardware offloading of pedit easier.

The header type is embedded in the 8 MSB of the u32 key->shift which
were never used till now. Therefore backward compatibility is being
kept.

Usage example:
$ tc filter add dev enp0s9 protocol ip parent : \
  flower \
ip_proto tcp \
src_port 80 \
  action pedit munge tcp dport set 8080 pipe \
  action mirred egress redirect dev veth0

Will forward traffic to tcp port 80, and modify the destination port to
8080.

Change-Id: Ibd7bbbe0b8c2f6adae0591868bb6892c55e75732
Signed-off-by: Amir Vadai <a...@vadai.me>
---
 include/uapi/linux/tc_act/tc_pedit.h | 17 ++
 net/sched/act_pedit.c| 65 +---
 2 files changed, 70 insertions(+), 12 deletions(-)

diff --git a/include/uapi/linux/tc_act/tc_pedit.h 
b/include/uapi/linux/tc_act/tc_pedit.h
index 6389959a5157..604e6729ad38 100644
--- a/include/uapi/linux/tc_act/tc_pedit.h
+++ b/include/uapi/linux/tc_act/tc_pedit.h
@@ -32,4 +32,21 @@ struct tc_pedit_sel {
 };
 #define tc_pedit tc_pedit_sel
 
+#define PEDIT_TYPE_SHIFT 24
+#define PEDIT_TYPE_MASK 0xff
+
+#define PEDIT_TYPE_GET(_val) \
+   (((_val) >> PEDIT_TYPE_SHIFT) & PEDIT_TYPE_MASK)
+#define PEDIT_SHIFT_GET(_val) ((_val) & 0xff)
+
+enum pedit_header_type {
+   PEDIT_HDR_TYPE_RAW = 0,
+
+   PEDIT_HDR_TYPE_ETH = 1,
+   PEDIT_HDR_TYPE_IP4 = 2,
+   PEDIT_HDR_TYPE_IP6 = 3,
+   PEDIT_HDR_TYPE_TCP = 4,
+   PEDIT_HDR_TYPE_UDP = 5,
+};
+
 #endif
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index b27c4daec88f..4b9c7184c752 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -119,18 +119,45 @@ static bool offset_valid(struct sk_buff *skb, int offset)
return true;
 }
 
+static int pedit_skb_hdr_offset(struct sk_buff *skb,
+   enum pedit_header_type htype, int *hoffset)
+{
+   int ret = -1;
+
+   switch (htype) {
+   case PEDIT_HDR_TYPE_ETH:
+   if (skb_mac_header_was_set(skb)) {
+   *hoffset = skb_mac_offset(skb);
+   ret = 0;
+   }
+   break;
+   case PEDIT_HDR_TYPE_RAW:
+   case PEDIT_HDR_TYPE_IP4:
+   case PEDIT_HDR_TYPE_IP6:
+   *hoffset = skb_network_offset(skb);
+   ret = 0;
+   break;
+   case PEDIT_HDR_TYPE_TCP:
+   case PEDIT_HDR_TYPE_UDP:
+   if (skb_transport_header_was_set(skb)) {
+   *hoffset = skb_transport_offset(skb);
+   ret = 0;
+   }
+   break;
+   };
+
+   return ret;
+}
+
 static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
 struct tcf_result *res)
 {
struct tcf_pedit *p = to_pedit(a);
int i;
-   unsigned int off;
 
if (skb_unclone(skb, GFP_ATOMIC))
return p->tcf_action;
 
-   off = skb_network_offset(skb);
-
spin_lock(>tcf_lock);
 
tcf_lastuse_update(>tcf_tm);
@@ -141,20 +168,32 @@ static int tcf_pedit(struct sk_buff *skb, const struct 
tc_action *a,
for (i = p->tcfp_nkeys; i > 0; i--, tkey++) {
u32 *ptr, _data;
int offset = tkey->off;
+   int hoffset;
+   int rc;
+   enum pedit_header_type htype =
+   PEDIT_TYPE_GET(tkey->shift);
+
+   rc = pedit_skb_hdr_offset(skb, htype, );
+   if (rc) {
+   pr_info("tc filter pedit bad header type 
specified (0x%x)\n",
+   htype);
+   goto bad;
+   }
 
if (tkey->offmask) {
char *d, _d;
 
-   if (!offset_valid(skb, off + tkey->at)) {
+   if (!offset_valid(skb, hoffset + tkey->at)) {
pr_info("tc filter pedit 'at' offset %d 
out of bounds\n",
-   off + tkey->at);
+   hoffset + tkey->at);
goto bad;
}
-   d = skb_header_pointer(skb, off + tkey->at, 1,
-  &_d);
+   d = skb_header_pointer(skb,
+  hoffset + tkey->at,
+

[PATCH net-next 3/3] net/act_pedit: Introduce 'add' operation

2016-11-30 Thread Amir Vadai
This command could be useful to inc/dec fields.

For example, to forward any TCP packet and decrease its TTL:
$ tc filter add dev enp0s9 protocol ip parent : \
flower ip_proto tcp \
action pedit munge ip ttl add 0xff pipe \
action mirred egress redirect dev veth0

In the example above, adding 0xff to this u8 field is actually
decreasing it by one, since the operation is masked.

Signed-off-by: Amir Vadai <a...@vadai.me>
---
 include/uapi/linux/tc_act/tc_pedit.h | 10 ++
 net/sched/act_pedit.c| 16 +++-
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/tc_act/tc_pedit.h 
b/include/uapi/linux/tc_act/tc_pedit.h
index 604e6729ad38..80028cd0bb1b 100644
--- a/include/uapi/linux/tc_act/tc_pedit.h
+++ b/include/uapi/linux/tc_act/tc_pedit.h
@@ -35,8 +35,13 @@ struct tc_pedit_sel {
 #define PEDIT_TYPE_SHIFT 24
 #define PEDIT_TYPE_MASK 0xff
 
+#define PEDIT_CMD_SHIFT 16
+#define PEDIT_CMD_MASK 0xff
+
 #define PEDIT_TYPE_GET(_val) \
(((_val) >> PEDIT_TYPE_SHIFT) & PEDIT_TYPE_MASK)
+#define PEDIT_CMD_GET(_val) \
+   (((_val) >> PEDIT_CMD_SHIFT) & PEDIT_CMD_MASK)
 #define PEDIT_SHIFT_GET(_val) ((_val) & 0xff)
 
 enum pedit_header_type {
@@ -49,4 +54,9 @@ enum pedit_header_type {
PEDIT_HDR_TYPE_UDP = 5,
 };
 
+enum pedit_cmd {
+   PEDIT_CMD_SET = 0,
+   PEDIT_CMD_ADD = 1,
+};
+
 #endif
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 4b9c7184c752..aa137d51bf7f 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -169,6 +169,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct 
tc_action *a,
u32 *ptr, _data;
int offset = tkey->off;
int hoffset;
+   u32 val;
int rc;
enum pedit_header_type htype =
PEDIT_TYPE_GET(tkey->shift);
@@ -214,7 +215,20 @@ static int tcf_pedit(struct sk_buff *skb, const struct 
tc_action *a,
if (!ptr)
goto bad;
/* just do it, baby */
-   *ptr = ((*ptr & tkey->mask) ^ tkey->val);
+   switch (PEDIT_CMD_GET(tkey->shift)) {
+   case PEDIT_CMD_SET:
+   val = tkey->val;
+   break;
+   case PEDIT_CMD_ADD:
+   val = (*ptr + tkey->val) & ~tkey->mask;
+   break;
+   default:
+   pr_info("tc filter pedit bad command (%d)\n",
+   PEDIT_CMD_GET(tkey->shift));
+   goto bad;
+   }
+
+   *ptr = ((*ptr & tkey->mask) ^ val);
if (ptr == &_data)
skb_store_bits(skb, hoffset + offset, ptr, 4);
}
-- 
2.10.2



[PATCH net-next 0/3] net/sched: act_pedit: Support using offset relative to the conventional network headers

2016-11-30 Thread Amir Vadai
Hi,

Patch 1/3 ("net/skbuff: Introduce skb_mac_offset()") adds a utility function to
get mac header offset.

Patch 2/3 ("net/act_pedit: Support using offset relative to the conventional
network headers") extends pedit to enable the user to set offset relative to
MAC/IPv4/IPv6/TCP network headers.
This makes it possible to work with more complex header schemes (vs the simple
IPv4 case) where setting a fixed offset relative to the network header is not
enough. It is also forward looking, as it makes hardware offloading of pedit
easier.

The header type is embedded in the 8 MSB of the u32 key->shift which
were never used till now. Therefore backward compatibility is being
kept.

Patch 3/3 ("net/act_pedit: Introduce 'add' operation") adds a new operation to
increase the value of a header field. The operation is passed in another free
8 bits of key->shift.

Usage example:
$ tc filter add dev enp0s9 protocol ip parent ffff: \
  flower \
ip_proto tcp \
src_port 80 \
  action \
  pedit munge ip ttl add 0xff \
  pedit munge tcp dport set 8080 \
pipe action mirred egress redirect dev veth0

Will forward TCP traffic with source port 80, modify the destination port to
8080, and decrease the TTL by 1.

I've uploaded a draft for the userspace [2] to make it easier to review and
test the patchset.

The patchset will conflict if already accepted patch [1] from net is missing.
It was applied and tested with [1] on top of commit 93ba5504 ("hv_netvsc:
remove excessive logging on MTU change").

[1] - 95c2027bfeda ("net/sched: pedit: make sure that offset is valid")
[2] - git: https://bitbucket.org/av42/iproute2.git
  branch: pedit

Thanks,
Amir

Amir Vadai (3):
  net/skbuff: Introduce skb_mac_offset()
  net/act_pedit: Support using offset relative to the conventional
network headers
  net/act_pedit: Introduce 'add' operation

 include/linux/skbuff.h   |  5 +++
 include/uapi/linux/tc_act/tc_pedit.h | 27 
 net/sched/act_pedit.c| 81 ++--
 3 files changed, 100 insertions(+), 13 deletions(-)

-- 
2.10.2



[PATCH net-next 1/3] net/skbuff: Introduce skb_mac_offset()

2016-11-30 Thread Amir Vadai
Introduce skb_mac_offset() that could be used to get mac header offset.

Signed-off-by: Amir Vadai <a...@vadai.me>
---
 include/linux/skbuff.h | 5 +
 1 file changed, 5 insertions(+)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9c535fbccf2c..395eb5111df0 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2169,6 +2169,11 @@ static inline unsigned char *skb_mac_header(const struct 
sk_buff *skb)
return skb->head + skb->mac_header;
 }
 
+static inline int skb_mac_offset(const struct sk_buff *skb)
+{
+   return skb_mac_header(skb) - skb->data;
+}
+
 static inline int skb_mac_header_was_set(const struct sk_buff *skb)
 {
return skb->mac_header != (typeof(skb->mac_header))~0U;
-- 
2.10.2



[PATCH iproute2 V3 2/3] tc/cls_flower: Classify packet in ip tunnels

2016-11-29 Thread Amir Vadai
Introduce classifying by metadata extracted by the tunnel device.
Outer header fields - source/dest ip and tunnel id, are extracted from
the metadata when classifying.

For example, the following will add a filter on the ingress Qdisc of shared
vxlan device named 'vxlan0'. To forward packets with outer src ip
11.11.0.2, dst ip 11.11.0.1 and tunnel id 11. The packets will be
forwarded to tap device 'vnet0':

$ tc filter add dev vxlan0 protocol ip parent ffff: \
flower \
  enc_src_ip 11.11.0.2 \
  enc_dst_ip 11.11.0.1 \
  enc_key_id 11 \
  dst_ip 11.11.11.1 \
action mirred egress redirect dev vnet0

Signed-off-by: Amir Vadai <a...@vadai.me>
---
 man/man8/tc-flower.8 | 17 ++-
 tc/f_flower.c| 84 +---
 2 files changed, 96 insertions(+), 5 deletions(-)

diff --git a/man/man8/tc-flower.8 b/man/man8/tc-flower.8
index 74f76647753b..0e0b0cf4bb72 100644
--- a/man/man8/tc-flower.8
+++ b/man/man8/tc-flower.8
@@ -36,7 +36,11 @@ flower \- flow based traffic control filter
 .BR dst_ip " | " src_ip " } { "
 .IR ipv4_address " | " ipv6_address " } | { "
 .BR dst_port " | " src_port " } "
-.IR port_number " }"
+.IR port_number " } | "
+.B enc_key_id
+.IR KEY-ID " | {"
+.BR enc_dst_ip " | " enc_src_ip " } { "
+.IR ipv4_address " | " ipv6_address " } | "
 .SH DESCRIPTION
 The
 .B flower
@@ -121,6 +125,17 @@ which has to be specified in beforehand.
 Match on layer 4 protocol source or destination port number. Only available for
 .BR ip_proto " values " udp " and " tcp ,
 which has to be specified in beforehand.
+.TP
+.BI enc_key_id " NUMBER"
+.TQ
+.BI enc_dst_ip " ADDRESS"
+.TQ
+.BI enc_src_ip " ADDRESS"
+Match on IP tunnel metadata. Key id
+.I NUMBER
+is a 32 bit tunnel key id (e.g. VNI for VXLAN tunnel).
+.I ADDRESS
+must be a valid IPv4 or IPv6 address.
 .SH NOTES
 As stated above where applicable, matches of a certain layer implicitly depend
 on the matches of the next lower layer. Precisely, layer one and two matches (
diff --git a/tc/f_flower.c b/tc/f_flower.c
index 2d31d1aa832d..173cfc20f90b 100644
--- a/tc/f_flower.c
+++ b/tc/f_flower.c
@@ -41,7 +41,10 @@ static void explain(void)
fprintf(stderr, "   dst_ip [ IPV4-ADDR | IPV6-ADDR 
] |\n");
fprintf(stderr, "   src_ip [ IPV4-ADDR | IPV6-ADDR 
] |\n");
fprintf(stderr, "   dst_port PORT-NUMBER |\n");
-   fprintf(stderr, "   src_port PORT-NUMBER }\n");
+   fprintf(stderr, "   src_port PORT-NUMBER |\n");
+   fprintf(stderr, "   enc_dst_ip [ IPV4-ADDR | 
IPV6-ADDR ] |\n");
+   fprintf(stderr, "   enc_src_ip [ IPV4-ADDR | 
IPV6-ADDR ] |\n");
+   fprintf(stderr, "   enc_key_id [ KEY-ID ] }\n");
fprintf(stderr, "   FILTERID := X:Y:Z\n");
fprintf(stderr, "   ACTION-SPEC := ... look at individual 
actions\n");
fprintf(stderr, "\n");
@@ -121,8 +124,9 @@ static int flower_parse_ip_addr(char *str, __be16 eth_type,
family = AF_INET;
} else if (eth_type == htons(ETH_P_IPV6)) {
family = AF_INET6;
+   } else if (!eth_type) {
+   family = AF_UNSPEC;
} else {
-   fprintf(stderr, "Illegal \"eth_type\" for ip address\n");
return -1;
}
 
@@ -130,8 +134,10 @@ static int flower_parse_ip_addr(char *str, __be16 eth_type,
if (ret)
return -1;
 
-   if (addr.family != family)
+   if (family && (addr.family != family)) {
+   fprintf(stderr, "Illegal \"eth_type\" for ip address\n");
return -1;
+   }
 
addattr_l(n, MAX_MSG, addr.family == AF_INET ? addr4_type : addr6_type,
  addr.data, addr.bytelen);
@@ -181,6 +187,18 @@ static int flower_parse_port(char *str, __u8 ip_port,
return 0;
 }
 
+static int flower_parse_key_id(const char *str, int type, struct nlmsghdr *n)
+{
+   int ret;
+   __be32 key_id;
+
+   ret = get_be32(&key_id, str, 10);
+   if (!ret)
+   addattr32(n, MAX_MSG, type, key_id);
+
+   return ret;
+}
+
 static int flower_parse_opt(struct filter_util *qu, char *handle,
int argc, char **argv, struct nlmsghdr *n)
 {
@@ -339,6 +357,38 @@ static int flower_parse_opt(struct filter_util *qu, char 
*handle,
fprintf(stderr, "Illegal \"src_port\"\n");
  

[PATCH iproute2 V3 3/3] tc/act_tunnel: Introduce ip tunnel action

2016-11-29 Thread Amir Vadai
This action could be used before redirecting packets to a shared tunnel
device, or when redirecting packets arriving from such a device.

The 'unset' action is optional. It is used to explicitly unset the
metadata created by the tunnel device during decap. If not used, the
metadata will be released automatically by the kernel.
The 'set' operation, will set the metadata with the specified values for
the encap.

For example, the following flower filter will forward all ICMP packets
destined to 11.11.11.2 through the shared vxlan device 'vxlan0'. Before
redirecting, a metadata for the vxlan tunnel is created using the
tunnel_key action and its arguments:

$ tc filter add dev net0 protocol ip parent ffff: \
flower \
  ip_proto 1 \
  dst_ip 11.11.11.2 \
action tunnel_key set \
  src_ip 11.11.0.1 \
  dst_ip 11.11.0.2 \
  id 11 \
action mirred egress redirect dev vxlan0

Signed-off-by: Amir Vadai <a...@vadai.me>
---
 include/linux/tc_act/tc_tunnel_key.h |  42 ++
 man/man8/tc-tunnel_key.8 | 113 +++
 tc/Makefile  |   1 +
 tc/m_tunnel_key.c| 258 +++
 4 files changed, 414 insertions(+)
 create mode 100644 include/linux/tc_act/tc_tunnel_key.h
 create mode 100644 man/man8/tc-tunnel_key.8
 create mode 100644 tc/m_tunnel_key.c

diff --git a/include/linux/tc_act/tc_tunnel_key.h 
b/include/linux/tc_act/tc_tunnel_key.h
new file mode 100644
index 000000000000..f9ddf5369a45
--- /dev/null
+++ b/include/linux/tc_act/tc_tunnel_key.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2016, Amir Vadai <a...@vadai.me>
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __LINUX_TC_TUNNEL_KEY_H
+#define __LINUX_TC_TUNNEL_KEY_H
+
+#include <linux/pkt_cls.h>
+
+#define TCA_ACT_TUNNEL_KEY 17
+
+#define TCA_TUNNEL_KEY_ACT_SET 1
+#define TCA_TUNNEL_KEY_ACT_RELEASE  2
+
+struct tc_tunnel_key {
+   tc_gen;
+   int t_action;
+};
+
+enum {
+   TCA_TUNNEL_KEY_UNSPEC,
+   TCA_TUNNEL_KEY_TM,
+   TCA_TUNNEL_KEY_PARMS,
+   TCA_TUNNEL_KEY_ENC_IPV4_SRC,/* be32 */
+   TCA_TUNNEL_KEY_ENC_IPV4_DST,/* be32 */
+   TCA_TUNNEL_KEY_ENC_IPV6_SRC,/* struct in6_addr */
+   TCA_TUNNEL_KEY_ENC_IPV6_DST,/* struct in6_addr */
+   TCA_TUNNEL_KEY_ENC_KEY_ID,  /* be64 */
+   TCA_TUNNEL_KEY_PAD,
+   __TCA_TUNNEL_KEY_MAX,
+};
+
+#define TCA_TUNNEL_KEY_MAX (__TCA_TUNNEL_KEY_MAX - 1)
+
+#endif
+
diff --git a/man/man8/tc-tunnel_key.8 b/man/man8/tc-tunnel_key.8
new file mode 100644
index 000000000000..d0c333d27158
--- /dev/null
+++ b/man/man8/tc-tunnel_key.8
@@ -0,0 +1,113 @@
+.TH "Tunnel metadata manipulation action in tc" 8 "10 Nov 2016" "iproute2" 
"Linux"
+
+.SH NAME
+tunnel_key - Tunnel metadata manipulation
+.SH SYNOPSIS
+.in +8
+.ti -8
+.BR tc " ... " "action tunnel_key" " { " unset " | "
+.IR SET " }"
+
+.ti -8
+.IR SET " := "
+.BR set " " src_ip
+.IR ADDRESS
+.BR dst_ip
+.IR ADDRESS
+.BI id " KEY_ID"
+
+.SH DESCRIPTION
+The
+.B tunnel_key
+action combined with a shared IP tunnel device, allows to perform IP tunnel en-
+or decapsulation on a packet, reflected by
+the operation modes
+.IR UNSET " and " SET .
+The
+.I UNSET
+mode is optional - even without using it, the metadata information will be
+released automatically when packet processing will be finished.
+.IR UNSET
+function could be used in cases when traffic is forwarded between two tunnels,
+where the metadata from the first tunnel will be used for encapsulation done by
+the second tunnel.
+It must be used for offloaded filters, such that hardware drivers can
+realize they need to program the HW to do decapsulation.
+.IR SET
+mode requires the source and destination ip
+.I ADDRESS
+and the tunnel key id
+.I KEY_ID
+which will be used by the ip tunnel shared device to create the tunnel header. 
The
+.B tunnel_key
+action is useful only in combination with a
+.B mirred redirect
+action to a shared IP tunnel device which will use the metadata (for
+.I SET
+) and unset the metadata created by it (for
+.I UNSET
+).
+
+.SH OPTIONS
+.TP
+.B unset
+Decapsulation mode, no further arguments allowed. This function is not
+mandatory and might be used only in some specific use cases.
+.TP
+.B set
+Encapsulation mode. Requires
+.B id
+,
+.B src_ip
+and
+.B dst_ip
+options.
+.RS
+.TP
+.B id
+Tunnel ID (for example VNI in VXLAN tunnel)
+.TP
+.B src_ip
+Outer header source IP address (IPv4 or IPv6)
+.TP
+.B dst_ip
+Outer header destination IP address (IPv4 or IPv6)
+.RE
+.SH EXAMPLES
+The following example en

[PATCH iproute2 V3 0/3] tc: Support for ip tunnel metadata set/unset/classify

2016-11-29 Thread Amir Vadai
Hi,

This short series adds support for matching and setting metadata for ip tunnel
shared device using the TC system, introduced in kernel 4.9 [1].

Applied and tested on top of commit f3f339e9590a ("cleanup debris from revert")

Example usage:

$ tc filter add dev vxlan0 protocol ip parent ffff: \
flower \
  enc_src_ip 11.11.0.2 \
  enc_dst_ip 11.11.0.1 \
  enc_key_id 11 \
  dst_ip 11.11.11.1 \
action mirred egress redirect dev vnet0

$ tc filter add dev net0 protocol ip parent ffff: \
flower \
  ip_proto 1 \
  dst_ip 11.11.11.2 \
action tunnel_key set \
  src_ip 11.11.0.1 \
  dst_ip 11.11.0.2 \
  id 11 \
action mirred egress redirect dev vxlan0

[1] - d1ba24feb466 ("Merge branch 'act_tunnel_key'")

Thanks,
Amir

Changes from V2:
- Use const where needed
- Don't lose return value
- Introduce rta_getattr_be16() and rta_getattr_be32()

Changes from V1:
- Updated Patch 2/2 ("tc/act_tunnel: Introduce ip tunnel action") commit log
and the man page tc-tunnel_key to reflect the fact that the 'unset'
operation is not mandatory, and to describe when it might be needed.
- Rename the 'release' operation to 'unset'

Amir Vadai (3):
  libnetlink: Introduce rta_getattr_be*()
  tc/cls_flower: Classify packet in ip tunnels
  tc/act_tunnel: Introduce ip tunnel action

 bridge/fdb.c |   4 +-
 include/libnetlink.h |   9 ++
 include/linux/tc_act/tc_tunnel_key.h |  42 ++
 ip/iplink_geneve.c   |   2 +-
 ip/iplink_vxlan.c|   2 +-
 man/man8/tc-flower.8 |  17 ++-
 man/man8/tc-tunnel_key.8 | 113 +++
 tc/Makefile  |   1 +
 tc/f_flower.c|  84 +++-
 tc/m_tunnel_key.c| 258 +++
 10 files changed, 523 insertions(+), 9 deletions(-)
 create mode 100644 include/linux/tc_act/tc_tunnel_key.h
 create mode 100644 man/man8/tc-tunnel_key.8
 create mode 100644 tc/m_tunnel_key.c

-- 
2.10.2



[PATCH iproute2 V3 1/3] libnetlink: Introduce rta_getattr_be*()

2016-11-29 Thread Amir Vadai
Add the utility functions rta_getattr_be16() and rta_getattr_be32(), and
change existing code to use it.

Signed-off-by: Amir Vadai <a...@vadai.me>
---
 bridge/fdb.c | 4 ++--
 include/libnetlink.h | 9 +
 ip/iplink_geneve.c   | 2 +-
 ip/iplink_vxlan.c| 2 +-
 4 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/bridge/fdb.c b/bridge/fdb.c
index 90f4b154c5dc..a91521776e99 100644
--- a/bridge/fdb.c
+++ b/bridge/fdb.c
@@ -168,10 +168,10 @@ int print_fdb(const struct sockaddr_nl *who, struct 
nlmsghdr *n, void *arg)
if (tb[NDA_PORT]) {
if (jw_global)
jsonw_uint_field(jw_global, "port",
-ntohs(rta_getattr_u16(tb[NDA_PORT])));
+rta_getattr_be16(tb[NDA_PORT]));
else
fprintf(fp, "port %d ",
-   ntohs(rta_getattr_u16(tb[NDA_PORT])));
+   rta_getattr_be16(tb[NDA_PORT]));
}
 
if (tb[NDA_VNI]) {
diff --git a/include/libnetlink.h b/include/libnetlink.h
index 483509ca9635..751ebf186dd4 100644
--- a/include/libnetlink.h
+++ b/include/libnetlink.h
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include 
+#include 
 
 struct rtnl_handle {
int fd;
@@ -140,10 +141,18 @@ static inline __u16 rta_getattr_u16(const struct rtattr 
*rta)
 {
return *(__u16 *)RTA_DATA(rta);
 }
+static inline __be16 rta_getattr_be16(const struct rtattr *rta)
+{
+   return ntohs(rta_getattr_u16(rta));
+}
 static inline __u32 rta_getattr_u32(const struct rtattr *rta)
 {
return *(__u32 *)RTA_DATA(rta);
 }
+static inline __be32 rta_getattr_be32(const struct rtattr *rta)
+{
+   return ntohl(rta_getattr_u32(rta));
+}
 static inline __u64 rta_getattr_u64(const struct rtattr *rta)
 {
__u64 tmp;
diff --git a/ip/iplink_geneve.c b/ip/iplink_geneve.c
index 3bfba91c644c..1e6669d07d60 100644
--- a/ip/iplink_geneve.c
+++ b/ip/iplink_geneve.c
@@ -234,7 +234,7 @@ static void geneve_print_opt(struct link_util *lu, FILE *f, 
struct rtattr *tb[])
 
if (tb[IFLA_GENEVE_PORT])
fprintf(f, "dstport %u ",
-   ntohs(rta_getattr_u16(tb[IFLA_GENEVE_PORT])));
+   rta_getattr_be16(tb[IFLA_GENEVE_PORT]));
 
if (tb[IFLA_GENEVE_COLLECT_METADATA])
fputs("external ", f);
diff --git a/ip/iplink_vxlan.c b/ip/iplink_vxlan.c
index 93af979a1e97..6d02bb47b2f0 100644
--- a/ip/iplink_vxlan.c
+++ b/ip/iplink_vxlan.c
@@ -413,7 +413,7 @@ static void vxlan_print_opt(struct link_util *lu, FILE *f, 
struct rtattr *tb[])
 
if (tb[IFLA_VXLAN_PORT])
fprintf(f, "dstport %u ",
-   ntohs(rta_getattr_u16(tb[IFLA_VXLAN_PORT])));
+   rta_getattr_be16(tb[IFLA_VXLAN_PORT]));
 
if (tb[IFLA_VXLAN_LEARNING] &&
!rta_getattr_u8(tb[IFLA_VXLAN_LEARNING]))
-- 
2.10.2



Re: [PATCH iproute2 V2 1/2] tc/cls_flower: Classify packet in ip tunnels

2016-11-29 Thread Amir Vadai
On Wed, Nov 30, 2016 at 9:17 AM, Amir Vadai <a...@vadai.me> wrote:
> On Tue, Nov 29, 2016 at 07:26:58PM -0800, Stephen Hemminger wrote:
(Sending again since I just discovered that Google Inbox is adding an
HTML part...)

>> The overall design is fine, just a couple nits with the code.
>>
>> > diff --git a/tc/f_flower.c b/tc/f_flower.c
>> > index 2d31d1aa832d..1cf0750b5b83 100644
>> > --- a/tc/f_flower.c
>> > +++ b/tc/f_flower.c
>>
>> >
>> > +static int flower_parse_key_id(char *str, int type, struct nlmsghdr *n)
>>
>> str is not modified, therefore use: const char *str
> ack
>
>>
>> > +{
>> > +   int ret;
>> > +   __be32 key_id;
>> > +
>> > +   ret = get_be32(&key_id, str, 10);
>> > +   if (ret)
>> > +   return -1;
>>
>> Traditionally netlink attributes are in host order, why was flower
>> chosen to be different?
> I don't know, maybe Jiri (cc'ed) can explain, but it is all over the
> flower code.
Now the right Jiri (Pirko) is CC'ed

>
>>
>> > +
>> > +   addattr32(n, MAX_MSG, type, key_id);
>> > +
>> > +   return 0;
>>
>>
>> Why lose the return value here?  Why not:
>>
>>   ret = get_be32(&key_id, str, 10);
>>   if (!ret)
>>   addattr32(n, MAX_MSG, type, key_id);
> The get_*() function can return only -1 or 0. But you are right, and it
> is better the way you suggested.  Changing accordingly in V3.
>
>>
>> > +}
>> > +
>> >  static int flower_parse_opt(struct filter_util *qu, char *handle,
>> > int argc, char **argv, struct nlmsghdr *n)
>> >  {
>> > @@ -339,6 +359,38 @@ static int flower_parse_opt(struct filter_util *qu, 
>> > char *handle,
>> > fprintf(stderr, "Illegal \"src_port\"\n");
>> > return -1;
>> > }
>> > +   } else if (matches(*argv, "enc_dst_ip") == 0) {
>> > +   NEXT_ARG();
>> > +   ret = flower_parse_ip_addr(*argv, 0,
>> > +  TCA_FLOWER_KEY_ENC_IPV4_DST,
>> > +  
>> > TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,
>> > +  TCA_FLOWER_KEY_ENC_IPV6_DST,
>> > +  
>> > TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,
>> > +  n);
>> > +   if (ret < 0) {
>> > +   fprintf(stderr, "Illegal \"enc_dst_ip\"\n");
>> > +   return -1;
>> > +   }
>> > +   } else if (matches(*argv, "enc_src_ip") == 0) {
>> > +   NEXT_ARG();
>> > +   ret = flower_parse_ip_addr(*argv, 0,
>> > +  TCA_FLOWER_KEY_ENC_IPV4_SRC,
>> > +  
>> > TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
>> > +  TCA_FLOWER_KEY_ENC_IPV6_SRC,
>> > +  
>> > TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,
>> > +  n);
>> > +   if (ret < 0) {
>> > +   fprintf(stderr, "Illegal \"enc_src_ip\"\n");
>> > +   return -1;
>> > +   }
>> > +   } else if (matches(*argv, "enc_key_id") == 0) {
>> > +   NEXT_ARG();
>> > +   ret = flower_parse_key_id(*argv,
>> > + TCA_FLOWER_KEY_ENC_KEY_ID, 
>> > n);
>> > +   if (ret < 0) {
>> > +   fprintf(stderr, "Illegal \"enc_key_id\"\n");
>> > +   return -1;
>> > +   }
>> > } else if (matches(*argv, "action") == 0) {
>> > NEXT_ARG();
>> > ret = parse_action(&argc, &argv, TCA_FLOWER_ACT, n);
>> > @@ -509,6 +561,14 @@ static void flower_print_port(FILE *f, char *name, 
>> > __u8 ip_proto,
>> > fprintf(f, "\n  %s %d", name, ntohs(rta_getattr_u16(attr)));
>> >  }
>> >
>> > +static void flower_print_key_id(FILE *f, char *name,
>> > +   struct rtattr *attr)
>>
>> const char *name?
> ack
>
>>
>>
>> > +{
>> > +   if (!attr)
>> > +   return;
>> > +   fprintf(f, "\n  %s %d", name, ntohl(rta_getattr_u32(attr)));
>> > +}
>> > +
>>
>> Why short circuit, just change the order:
>>
>>   if (attr)
>>   fprintf(f, "\n  %s %s", name, ntohl(rta_getattr_u32(attr));
>>
>> You might also want to introduce rta_getattr_be32()
> ack
>
>>
>> Please change, retest and resubmit both patches.
> ack
>
> Thanks for reviewing,
> Amir


Re: [PATCH iproute2 V2 1/2] tc/cls_flower: Classify packet in ip tunnels

2016-11-29 Thread Amir Vadai
On Tue, Nov 29, 2016 at 07:26:58PM -0800, Stephen Hemminger wrote:
> The overall design is fine, just a couple nits with the code.
> 
> > diff --git a/tc/f_flower.c b/tc/f_flower.c
> > index 2d31d1aa832d..1cf0750b5b83 100644
> > --- a/tc/f_flower.c
> > +++ b/tc/f_flower.c
> 
> >  
> > +static int flower_parse_key_id(char *str, int type, struct nlmsghdr *n)
> 
> str is not modified, therefore use: const char *str
ack

> 
> > +{
> > +   int ret;
> > +   __be32 key_id;
> > +
> > +   ret = get_be32(&key_id, str, 10);
> > +   if (ret)
> > +   return -1;
> 
> Traditionally netlink attributes are in host order, why was flower
> chosen to be different?
I don't know, maybe Jiri (cc'ed) can explain, but it is all over the
flower code.

> 
> > +
> > +   addattr32(n, MAX_MSG, type, key_id);
> > +
> > +   return 0;
> 
> 
> Why lose the return value here?  Why not:
> 
>   ret = get_be32(&key_id, str, 10);
>   if (!ret)
>   addattr32(n, MAX_MSG, type, key_id);
The get_*() function can return only -1 or 0. But you are right, and it
is better the way you suggested.  Changing accordingly in V3.

> 
> > +}
> > +
> >  static int flower_parse_opt(struct filter_util *qu, char *handle,
> > int argc, char **argv, struct nlmsghdr *n)
> >  {
> > @@ -339,6 +359,38 @@ static int flower_parse_opt(struct filter_util *qu, 
> > char *handle,
> > fprintf(stderr, "Illegal \"src_port\"\n");
> > return -1;
> > }
> > +   } else if (matches(*argv, "enc_dst_ip") == 0) {
> > +   NEXT_ARG();
> > +   ret = flower_parse_ip_addr(*argv, 0,
> > +  TCA_FLOWER_KEY_ENC_IPV4_DST,
> > +  
> > TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,
> > +  TCA_FLOWER_KEY_ENC_IPV6_DST,
> > +  
> > TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,
> > +  n);
> > +   if (ret < 0) {
> > +   fprintf(stderr, "Illegal \"enc_dst_ip\"\n");
> > +   return -1;
> > +   }
> > +   } else if (matches(*argv, "enc_src_ip") == 0) {
> > +   NEXT_ARG();
> > +   ret = flower_parse_ip_addr(*argv, 0,
> > +  TCA_FLOWER_KEY_ENC_IPV4_SRC,
> > +  
> > TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
> > +  TCA_FLOWER_KEY_ENC_IPV6_SRC,
> > +  
> > TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,
> > +  n);
> > +   if (ret < 0) {
> > +   fprintf(stderr, "Illegal \"enc_src_ip\"\n");
> > +   return -1;
> > +   }
> > +   } else if (matches(*argv, "enc_key_id") == 0) {
> > +   NEXT_ARG();
> > +   ret = flower_parse_key_id(*argv,
> > + TCA_FLOWER_KEY_ENC_KEY_ID, n);
> > +   if (ret < 0) {
> > +   fprintf(stderr, "Illegal \"enc_key_id\"\n");
> > +   return -1;
> > +   }
> > } else if (matches(*argv, "action") == 0) {
> > NEXT_ARG();
> > ret = parse_action(&argc, &argv, TCA_FLOWER_ACT, n);
> > @@ -509,6 +561,14 @@ static void flower_print_port(FILE *f, char *name, 
> > __u8 ip_proto,
> > fprintf(f, "\n  %s %d", name, ntohs(rta_getattr_u16(attr)));
> >  }
> >  
> > +static void flower_print_key_id(FILE *f, char *name,
> > +   struct rtattr *attr)
> 
> const char *name?
ack

> 
> 
> > +{
> > +   if (!attr)
> > +   return;
> > +   fprintf(f, "\n  %s %d", name, ntohl(rta_getattr_u32(attr)));
> > +}
> > +
> 
> Why short circuit, just change the order:
> 
>   if (attr)
>   fprintf(f, "\n  %s %s", name, ntohl(rta_getattr_u32(attr));
> 
> You might also want to introduce rta_getattr_be32()
ack

> 
> Please change, retest and resubmit both patches.
ack

Thanks for reviewing,
Amir


Re: [PATCH net V2] net/sched: pedit: make sure that offset is valid

2016-11-29 Thread Amir Vadai
On Tue, Nov 29, 2016 at 05:27:05PM +0800, zhuyj wrote:
>  Thanks a lot.
> When will offset become -1?
offset is supplied by userspace. For example iproute2 tc tool.
It is valid to supply a negative value, since offset is relative to the
networking layer, one might want to edit a MAC layer field.

Please use bottom posting when replying to mails...

> 
> On Tue, Nov 29, 2016 at 3:14 PM, Amir Vadai <a...@vadai.me> wrote:
> > On Tue, Nov 29, 2016 at 10:32:05AM +0800, zhuyj wrote:
> >>  +   if (offset > 0 && offset > skb->len)
> >>
> >> offset > skb->len is enough?
> > offset is signed and skb->len is unsigned. Therefore for example if
> > offset=-1 and skb->len=10, the actual comparison is 0xff...>10
> >
> >>
> >> On Mon, Nov 28, 2016 at 6:56 PM, Amir Vadai <a...@vadai.me> wrote:
> >> > Add a validation function to make sure offset is valid:
> >> > 1. Not below skb head (could happen when offset is negative).
> >> > 2. Validate both 'offset' and 'at'.
> >> >
> >> > Signed-off-by: Amir Vadai <a...@vadai.me>
> >> > ---
> >> > Hi Dave,
> >> >
> >> > Please pull to -stable branches.
> >> >
> >> > Changes from V0:
> >> > - Add a validation to the 'at' value (this is used as an offset too)
> >> > - Instead of validating the output of skb_header_pointer(), make sure 
> >> > that the
> >> > offset is good before calling it.
> >> >
> >> > Thanks,
> >> > Amir
> >> >  net/sched/act_pedit.c | 24 
> >> >  1 file changed, 20 insertions(+), 4 deletions(-)
> >> >
> >> > diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
> >> > index b54d56d4959b..cf9b2fe8eac6 100644
> >> > --- a/net/sched/act_pedit.c
> >> > +++ b/net/sched/act_pedit.c
> >> > @@ -108,6 +108,17 @@ static void tcf_pedit_cleanup(struct tc_action *a, 
> >> > int bind)
> >> > kfree(keys);
> >> >  }
> >> >
> >> > +static bool offset_valid(struct sk_buff *skb, int offset)
> >> > +{
> >> > +   if (offset > 0 && offset > skb->len)
> >> > +   return false;
> >> > +
> >> > +   if  (offset < 0 && -offset > skb_headroom(skb))
> >> > +   return false;
> >> > +
> >> > +   return true;
> >> > +}
> >> > +
> >> >  static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
> >> >  struct tcf_result *res)
> >> >  {
> >> > @@ -134,6 +145,11 @@ static int tcf_pedit(struct sk_buff *skb, const 
> >> > struct tc_action *a,
> >> > if (tkey->offmask) {
> >> > char *d, _d;
> >> >
> >> > +   if (!offset_valid(skb, off + tkey->at)) {
> >> > +   pr_info("tc filter pedit 'at' 
> >> > offset %d out of bounds\n",
> >> > +   off + tkey->at);
> >> > +   goto bad;
> >> > +   }
> >> > d = skb_header_pointer(skb, off + 
> >> > tkey->at, 1,
> >> >&_d);
> >> > if (!d)
> >> > @@ -146,10 +162,10 @@ static int tcf_pedit(struct sk_buff *skb, const 
> >> > struct tc_action *a,
> >> > " offset must be on 32 bit 
> >> > boundaries\n");
> >> > goto bad;
> >> > }
> >> > -   if (offset > 0 && offset > skb->len) {
> >> > -   pr_info("tc filter pedit"
> >> > -   " offset %d can't exceed pkt 
> >> > length %d\n",
> >> > -  offset, skb->len);
> >> > +
> >> > +   if (!offset_valid(skb, off + offset)) {
> >> > +   pr_info("tc filter pedit offset %d out 
> >> > of bounds\n",
> >> > +   offset);
> >> > goto bad;
> >> > }
> >> >
> >> > --
> >> > 2.10.2
> >> >


Re: [PATCH net V2] net/sched: pedit: make sure that offset is valid

2016-11-28 Thread Amir Vadai
On Tue, Nov 29, 2016 at 10:32:05AM +0800, zhuyj wrote:
>  +   if (offset > 0 && offset > skb->len)
> 
> offset > skb->len is enough?
offset is signed and skb->len is unsigned. Therefore for example if
offset=-1 and skb->len=10, the actual comparison is 0xff...>10

> 
> On Mon, Nov 28, 2016 at 6:56 PM, Amir Vadai <a...@vadai.me> wrote:
> > Add a validation function to make sure offset is valid:
> > 1. Not below skb head (could happen when offset is negative).
> > 2. Validate both 'offset' and 'at'.
> >
> > Signed-off-by: Amir Vadai <a...@vadai.me>
> > ---
> > Hi Dave,
> >
> > Please pull to -stable branches.
> >
> > Changes from V0:
> > - Add a validation to the 'at' value (this is used as an offset too)
> > - Instead of validating the output of skb_header_pointer(), make sure that 
> > the
> > offset is good before calling it.
> >
> > Thanks,
> > Amir
> >  net/sched/act_pedit.c | 24 
> >  1 file changed, 20 insertions(+), 4 deletions(-)
> >
> > diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
> > index b54d56d4959b..cf9b2fe8eac6 100644
> > --- a/net/sched/act_pedit.c
> > +++ b/net/sched/act_pedit.c
> > @@ -108,6 +108,17 @@ static void tcf_pedit_cleanup(struct tc_action *a, int 
> > bind)
> > kfree(keys);
> >  }
> >
> > +static bool offset_valid(struct sk_buff *skb, int offset)
> > +{
> > +   if (offset > 0 && offset > skb->len)
> > +   return false;
> > +
> > +   if  (offset < 0 && -offset > skb_headroom(skb))
> > +   return false;
> > +
> > +   return true;
> > +}
> > +
> >  static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
> >  struct tcf_result *res)
> >  {
> > @@ -134,6 +145,11 @@ static int tcf_pedit(struct sk_buff *skb, const struct 
> > tc_action *a,
> > if (tkey->offmask) {
> > char *d, _d;
> >
> > +   if (!offset_valid(skb, off + tkey->at)) {
> > +   pr_info("tc filter pedit 'at' 
> > offset %d out of bounds\n",
> > +   off + tkey->at);
> > +   goto bad;
> > +   }
> > d = skb_header_pointer(skb, off + tkey->at, 
> > 1,
> >&_d);
> > if (!d)
> > @@ -146,10 +162,10 @@ static int tcf_pedit(struct sk_buff *skb, const 
> > struct tc_action *a,
> > " offset must be on 32 bit 
> > boundaries\n");
> > goto bad;
> > }
> > -   if (offset > 0 && offset > skb->len) {
> > -   pr_info("tc filter pedit"
> > -   " offset %d can't exceed pkt length 
> > %d\n",
> > -  offset, skb->len);
> > +
> > +   if (!offset_valid(skb, off + offset)) {
> > +   pr_info("tc filter pedit offset %d out of 
> > bounds\n",
> > +   offset);
> > goto bad;
> > }
> >
> > --
> > 2.10.2
> >


Re: [patch net] sched: cls_flower: remove from hashtable only in case skip sw flag is not set

2016-11-28 Thread Amir Vadai
On Mon, Nov 28, 2016 at 03:40:13PM +0100, Jiri Pirko wrote:
> From: Jiri Pirko <j...@mellanox.com>
> 
> Be symmetric to hashtable insert and remove filter from hashtable only
> in case skip sw flag is not set.
> 
> Fixes: e69985c67c33 ("net/sched: cls_flower: Introduce support in SKIP SW 
> flag")
> Signed-off-by: Jiri Pirko <j...@mellanox.com>
> ---
Reviewed-by: Amir Vadai <a...@vadai.me>


[PATCH iproute2 V2 2/2] tc/act_tunnel: Introduce ip tunnel action

2016-11-28 Thread Amir Vadai
This action could be used before redirecting packets to a shared tunnel
device, or when redirecting packets arriving from such a device.

The 'unset' action is optional. It is used to explicitly unset the
metadata created by the tunnel device during decap. If not used, the
metadata will be released automatically by the kernel.
The 'set' operation, will set the metadata with the specified values for
the encap.

For example, the following flower filter will forward all ICMP packets
destined to 11.11.11.2 through the shared vxlan device 'vxlan0'. Before
redirecting, a metadata for the vxlan tunnel is created using the
tunnel_key action and its arguments:

$ tc filter add dev net0 protocol ip parent ffff: \
flower \
  ip_proto 1 \
  dst_ip 11.11.11.2 \
action tunnel_key set \
  src_ip 11.11.0.1 \
  dst_ip 11.11.0.2 \
  id 11 \
action mirred egress redirect dev vxlan0

Signed-off-by: Amir Vadai <a...@vadai.me>
---
 include/linux/tc_act/tc_tunnel_key.h |  42 ++
 man/man8/tc-tunnel_key.8 | 113 +++
 tc/Makefile  |   1 +
 tc/m_tunnel_key.c| 259 +++
 4 files changed, 415 insertions(+)
 create mode 100644 include/linux/tc_act/tc_tunnel_key.h
 create mode 100644 man/man8/tc-tunnel_key.8
 create mode 100644 tc/m_tunnel_key.c

diff --git a/include/linux/tc_act/tc_tunnel_key.h 
b/include/linux/tc_act/tc_tunnel_key.h
new file mode 100644
index 000000000000..f9ddf5369a45
--- /dev/null
+++ b/include/linux/tc_act/tc_tunnel_key.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2016, Amir Vadai <a...@vadai.me>
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __LINUX_TC_TUNNEL_KEY_H
+#define __LINUX_TC_TUNNEL_KEY_H
+
+#include 
+
+#define TCA_ACT_TUNNEL_KEY 17
+
+#define TCA_TUNNEL_KEY_ACT_SET 1
+#define TCA_TUNNEL_KEY_ACT_RELEASE  2
+
+struct tc_tunnel_key {
+   tc_gen;
+   int t_action;
+};
+
+enum {
+   TCA_TUNNEL_KEY_UNSPEC,
+   TCA_TUNNEL_KEY_TM,
+   TCA_TUNNEL_KEY_PARMS,
+   TCA_TUNNEL_KEY_ENC_IPV4_SRC,/* be32 */
+   TCA_TUNNEL_KEY_ENC_IPV4_DST,/* be32 */
+   TCA_TUNNEL_KEY_ENC_IPV6_SRC,/* struct in6_addr */
+   TCA_TUNNEL_KEY_ENC_IPV6_DST,/* struct in6_addr */
+   TCA_TUNNEL_KEY_ENC_KEY_ID,  /* be64 */
+   TCA_TUNNEL_KEY_PAD,
+   __TCA_TUNNEL_KEY_MAX,
+};
+
+#define TCA_TUNNEL_KEY_MAX (__TCA_TUNNEL_KEY_MAX - 1)
+
+#endif
+
diff --git a/man/man8/tc-tunnel_key.8 b/man/man8/tc-tunnel_key.8
new file mode 100644
index ..d0c333d27158
--- /dev/null
+++ b/man/man8/tc-tunnel_key.8
@@ -0,0 +1,113 @@
+.TH "Tunnel metadata manipulation action in tc" 8 "10 Nov 2016" "iproute2" 
"Linux"
+
+.SH NAME
+tunnel_key - Tunnel metadata manipulation
+.SH SYNOPSIS
+.in +8
+.ti -8
+.BR tc " ... " "action tunnel_key" " { " unset " | "
+.IR SET " }"
+
+.ti -8
+.IR SET " := "
+.BR set " " src_ip
+.IR ADDRESS
+.BR dst_ip
+.IR ADDRESS
+.BI id " KEY_ID"
+
+.SH DESCRIPTION
+The
+.B tunnel_key
+action combined with a shared IP tunnel device, allows to perform IP tunnel en-
+or decapsulation on a packet, reflected by
+the operation modes
+.IR UNSET " and " SET .
+The
+.I UNSET
+mode is optional - even without using it, the metadata information will be
+released automatically when packet processing will be finished.
+.IR UNSET
+function could be used in cases when traffic is forwarded between two tunnels,
+where the metadata from the first tunnel will be used for encapsulation done by
+the second tunnel.
+It must be used for offloaded filters, such that hardware drivers can
+realize they need to program the HW to do decapsulation.
+.IR SET
+mode requires the source and destination ip
+.I ADDRESS
+and the tunnel key id
+.I KEY_ID
+which will be used by the ip tunnel shared device to create the tunnel header. 
The
+.B tunnel_key
+action is useful only in combination with a
+.B mirred redirect
+action to a shared IP tunnel device which will use the metadata (for
+.I SET
+) and unset the metadata created by it (for
+.I UNSET
+).
+
+.SH OPTIONS
+.TP
+.B unset
+Decapsulation mode, no further arguments allowed. This function is not
+mandatory and might be used only in some specific use cases.
+.TP
+.B set
+Encapsulation mode. Requires
+.B id
+,
+.B src_ip
+and
+.B dst_ip
+options.
+.RS
+.TP
+.B id
+Tunnel ID (for example VNI in VXLAN tunnel)
+.TP
+.B src_ip
+Outer header source IP address (IPv4 or IPv6)
+.TP
+.B dst_ip
+Outer header destination IP address (IPv4 or IPv6)
+.RE
+.SH EXAMPLES
+The following example en

[PATCH iproute2 V2 1/2] tc/cls_flower: Classify packet in ip tunnels

2016-11-28 Thread Amir Vadai
Introduce classifying by metadata extracted by the tunnel device.
Outer header fields - source/dest ip and tunnel id, are extracted from
the metadata when classifying.

For example, the following will add a filter on the ingress Qdisc of the
shared vxlan device named 'vxlan0'. Packets with outer src ip 11.11.0.2,
dst ip 11.11.0.1 and tunnel id 11 will be forwarded to tap device 'vnet0':

$ tc filter add dev vxlan0 protocol ip parent : \
flower \
  enc_src_ip 11.11.0.2 \
  enc_dst_ip 11.11.0.1 \
  enc_key_id 11 \
  dst_ip 11.11.11.1 \
action mirred egress redirect dev vnet0

Signed-off-by: Amir Vadai <a...@vadai.me>
---
 man/man8/tc-flower.8 | 17 ++-
 tc/f_flower.c| 85 ++--
 2 files changed, 98 insertions(+), 4 deletions(-)

diff --git a/man/man8/tc-flower.8 b/man/man8/tc-flower.8
index 74f76647753b..0e0b0cf4bb72 100644
--- a/man/man8/tc-flower.8
+++ b/man/man8/tc-flower.8
@@ -36,7 +36,11 @@ flower \- flow based traffic control filter
 .BR dst_ip " | " src_ip " } { "
 .IR ipv4_address " | " ipv6_address " } | { "
 .BR dst_port " | " src_port " } "
-.IR port_number " }"
+.IR port_number " } | "
+.B enc_key_id
+.IR KEY-ID " | {"
+.BR enc_dst_ip " | " enc_src_ip " } { "
+.IR ipv4_address " | " ipv6_address " } | "
 .SH DESCRIPTION
 The
 .B flower
@@ -121,6 +125,17 @@ which has to be specified in beforehand.
 Match on layer 4 protocol source or destination port number. Only available for
 .BR ip_proto " values " udp " and " tcp ,
 which has to be specified in beforehand.
+.TP
+.BI enc_key_id " NUMBER"
+.TQ
+.BI enc_dst_ip " ADDRESS"
+.TQ
+.BI enc_src_ip " ADDRESS"
+Match on IP tunnel metadata. Key id
+.I NUMBER
+is a 32 bit tunnel key id (e.g. VNI for VXLAN tunnel).
+.I ADDRESS
+must be a valid IPv4 or IPv6 address.
 .SH NOTES
 As stated above where applicable, matches of a certain layer implicitly depend
 on the matches of the next lower layer. Precisely, layer one and two matches (
diff --git a/tc/f_flower.c b/tc/f_flower.c
index 2d31d1aa832d..1cf0750b5b83 100644
--- a/tc/f_flower.c
+++ b/tc/f_flower.c
@@ -41,7 +41,10 @@ static void explain(void)
fprintf(stderr, "   dst_ip [ IPV4-ADDR | IPV6-ADDR 
] |\n");
fprintf(stderr, "   src_ip [ IPV4-ADDR | IPV6-ADDR 
] |\n");
fprintf(stderr, "   dst_port PORT-NUMBER |\n");
-   fprintf(stderr, "   src_port PORT-NUMBER }\n");
+   fprintf(stderr, "   src_port PORT-NUMBER |\n");
+   fprintf(stderr, "   enc_dst_ip [ IPV4-ADDR | 
IPV6-ADDR ] |\n");
+   fprintf(stderr, "   enc_src_ip [ IPV4-ADDR | 
IPV6-ADDR ] |\n");
+   fprintf(stderr, "   enc_key_id [ KEY-ID ] }\n");
fprintf(stderr, "   FILTERID := X:Y:Z\n");
fprintf(stderr, "   ACTION-SPEC := ... look at individual 
actions\n");
fprintf(stderr, "\n");
@@ -121,8 +124,9 @@ static int flower_parse_ip_addr(char *str, __be16 eth_type,
family = AF_INET;
} else if (eth_type == htons(ETH_P_IPV6)) {
family = AF_INET6;
+   } else if (!eth_type) {
+   family = AF_UNSPEC;
} else {
-   fprintf(stderr, "Illegal \"eth_type\" for ip address\n");
return -1;
}
 
@@ -130,8 +134,10 @@ static int flower_parse_ip_addr(char *str, __be16 eth_type,
if (ret)
return -1;
 
-   if (addr.family != family)
+   if (family && (addr.family != family)) {
+   fprintf(stderr, "Illegal \"eth_type\" for ip address\n");
return -1;
+   }
 
addattr_l(n, MAX_MSG, addr.family == AF_INET ? addr4_type : addr6_type,
  addr.data, addr.bytelen);
@@ -181,6 +187,20 @@ static int flower_parse_port(char *str, __u8 ip_port,
return 0;
 }
 
+static int flower_parse_key_id(char *str, int type, struct nlmsghdr *n)
+{
+   int ret;
+   __be32 key_id;
+
+   ret = get_be32(&key_id, str, 10);
+   if (ret)
+   return -1;
+
+   addattr32(n, MAX_MSG, type, key_id);
+
+   return 0;
+}
+
 static int flower_parse_opt(struct filter_util *qu, char *handle,
int argc, char **argv, struct nlmsghdr *n)
 {
@@ -339,6 +359,38 @@ static int flower_parse_opt(struct filter_util *qu, char 
*handle,
fprintf(stderr, "Illegal \"src_port\"\n");
  

[PATCH iproute2 V2 0/2] tc/cls_flower: Support for ip tunnel metadata set/unset/classify

2016-11-28 Thread Amir Vadai
Hi,

This short series adds support for matching and setting metadata for ip tunnel
shared device using the TC system, introduced in kernel 4.9 [1].

Applied and tested on top of commit f3f339e9590a ("cleanup debris from revert")

Example usage:

$ tc filter add dev vxlan0 protocol ip parent : \
flower \
  enc_src_ip 11.11.0.2 \
  enc_dst_ip 11.11.0.1 \
  enc_key_id 11 \
  dst_ip 11.11.11.1 \
action mirred egress redirect dev vnet0

$ tc filter add dev net0 protocol ip parent : \
flower \
  ip_proto 1 \
  dst_ip 11.11.11.2 \
action tunnel_key set \
  src_ip 11.11.0.1 \
  dst_ip 11.11.0.2 \
  id 11 \
action mirred egress redirect dev vxlan0

[1] - d1ba24feb466 ("Merge branch 'act_tunnel_key'")

Thanks,
Amir

Changes from V1:
- Updated Patch 2/2 ("tc/act_tunnel: Introduce ip tunnel action") commit log
  and the man page tc-tunnel_key to reflect the fact that the 'unset'
  operation is not mandatory, and to describe when it might be needed.
- Rename the 'release' operation to 'unset'

Amir Vadai (2):
  tc/cls_flower: Classify packet in ip tunnels
  tc/act_tunnel: Introduce ip tunnel action

 include/linux/tc_act/tc_tunnel_key.h |  42 ++
 man/man8/tc-flower.8 |  17 ++-
 man/man8/tc-tunnel_key.8 | 113 +++
 tc/Makefile  |   1 +
 tc/f_flower.c|  85 +++-
 tc/m_tunnel_key.c| 259 +++
 6 files changed, 513 insertions(+), 4 deletions(-)
 create mode 100644 include/linux/tc_act/tc_tunnel_key.h
 create mode 100644 man/man8/tc-tunnel_key.8
 create mode 100644 tc/m_tunnel_key.c

-- 
2.10.2



[PATCH net V2] net/sched: pedit: make sure that offset is valid

2016-11-28 Thread Amir Vadai
Add a validation function to make sure offset is valid:
1. Not below skb head (could happen when offset is negative).
2. Validate both 'offset' and 'at'.

Signed-off-by: Amir Vadai <a...@vadai.me>
---
Hi Dave,

Please pull to -stable branches.

Changes from V0:
- Add a validation to the 'at' value (this is used as an offset too)
- Instead of validating the output of skb_header_pointer(), make sure that the
offset is good before calling it.

Thanks,
Amir
 net/sched/act_pedit.c | 24 
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index b54d56d4959b..cf9b2fe8eac6 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -108,6 +108,17 @@ static void tcf_pedit_cleanup(struct tc_action *a, int 
bind)
kfree(keys);
 }
 
+static bool offset_valid(struct sk_buff *skb, int offset)
+{
+   if (offset > 0 && offset > skb->len)
+   return false;
+
+   if  (offset < 0 && -offset > skb_headroom(skb))
+   return false;
+
+   return true;
+}
+
 static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
 struct tcf_result *res)
 {
@@ -134,6 +145,11 @@ static int tcf_pedit(struct sk_buff *skb, const struct 
tc_action *a,
if (tkey->offmask) {
char *d, _d;
 
+   if (!offset_valid(skb, off + tkey->at)) {
+   pr_info("tc filter pedit 'at' offset %d 
out of bounds\n",
+   off + tkey->at);
+   goto bad;
+   }
d = skb_header_pointer(skb, off + tkey->at, 1,
   &_d);
if (!d)
@@ -146,10 +162,10 @@ static int tcf_pedit(struct sk_buff *skb, const struct 
tc_action *a,
" offset must be on 32 bit 
boundaries\n");
goto bad;
}
-   if (offset > 0 && offset > skb->len) {
-   pr_info("tc filter pedit"
-   " offset %d can't exceed pkt length 
%d\n",
-  offset, skb->len);
+
+   if (!offset_valid(skb, off + offset)) {
+   pr_info("tc filter pedit offset %d out of 
bounds\n",
+   offset);
goto bad;
}
 
-- 
2.10.2



Re: [PATCH net] net/sched: act_pedit: limit negative offset

2016-11-27 Thread Amir Vadai
On Mon, Nov 28, 2016 at 12:49:36AM -0500, David Miller wrote:
> From: Cong Wang <xiyou.wangc...@gmail.com>
> Date: Sun, 27 Nov 2016 21:39:33 -0800
> 
> > On Sun, Nov 27, 2016 at 7:58 AM, Amir Vadai <a...@vadai.me> wrote:
> >> Should not allow setting a negative offset that goes below the skb head.
> > ...
> >> diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
> >> index b54d56d4959b..e79e8a88f2d2 100644
> >> --- a/net/sched/act_pedit.c
> >> +++ b/net/sched/act_pedit.c
> >> @@ -154,8 +154,11 @@ static int tcf_pedit(struct sk_buff *skb, const 
> >> struct tc_action *a,
> >> }
> >>
> >> ptr = skb_header_pointer(skb, off + offset, 4, 
> >> &_data);
> >> -   if (!ptr)
> >> +   if ((unsigned char *)ptr < skb->head) {
> > 
> > 
> > ptr returned could be &_data, which is on stack, so why this comparison
> > makes sense for this case?
> 
> Indeed, this will definitely do the wrong thing when the on-stack area
> passed back to ptr.
yes - my bad. will correct it and send v1


[PATCH net] net/sched: act_pedit: limit negative offset

2016-11-27 Thread Amir Vadai
Should not allow setting a negative offset that goes below the skb head.

Signed-off-by: Amir Vadai <a...@vadai.me>
---
Hi Dave,

Please pull to -stable branches.

Thanks,
Amir

 net/sched/act_pedit.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index b54d56d4959b..e79e8a88f2d2 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -154,8 +154,11 @@ static int tcf_pedit(struct sk_buff *skb, const struct 
tc_action *a,
}
 
ptr = skb_header_pointer(skb, off + offset, 4, &_data);
-   if (!ptr)
+   if ((unsigned char *)ptr < skb->head) {
+   pr_info("tc filter pedit offset out of 
bounds\n");
goto bad;
+   }
+
/* just do it, baby */
*ptr = ((*ptr & tkey->mask) ^ tkey->val);
if (ptr == &_data)
-- 
2.10.2



Re: [PATCH iproute2 0/2] tc/cls_flower: Support for ip tunnel metadata set/release/classify

2016-11-27 Thread Amir Vadai
On Thu, Nov 24, 2016 at 04:33:55PM +0100, Jiri Benc wrote:
> On Thu, 24 Nov 2016 17:06:33 +0200, Amir Vadai wrote:
> > So you mean to just unconditionally call skb_dst_drop() from
> > act_mirred()?
> 
> That's one option. Or just leave the dst there, it shouldn't matter?
> (Except for forwarding to a different tunnel but as I said, it's a
> corner case and we may have a "tunnel_key unset" action for that.)
Ok, so I will write in the docs that it is optional to use the "unset"
operation (and will rename it from "release" to "unset")

> 
> > The use case we already have that uses the release action is the
> > hardware offload support, which is already in the kernel.
> > It is using the "tunnel_key release" action to signal the hardware to
> > strip off the ip tunnel headers.
> 
> The tunnel headers must be removed upon reception on the tunnel
> interface without specifying anything, because that's how the Linux
> kernel behaves currently. If this is offloaded, this behavior must be
> preserved. I don't see how "tunnel_key release" might be used for
> stripping the headers.
Maybe I didn't express myself right: I need to tell the hardware
explicitly during the filter initialization to redirect the packets
arriving from one interface to another and to strip off the tunnel
headers. This is what happens when a "tunnel_key unset" action is
created and offloaded - it configures the hardware respectively.
So this is one use case where this operation is needed - and yes, in this
use case the actual skb_dst_drop() is not important or needed, but I
don't think it does any harm.
In the tunnel dev to tunnel dev use case, the operation could be
meaningful, if the user doesn't want to reuse the metadata created by the
origin tunnel dev.

> 
>  Jiri


Re: [PATCH iproute2 0/2] tc/cls_flower: Support for ip tunnel metadata set/release/classify

2016-11-24 Thread Amir Vadai
On Thu, Nov 24, 2016 at 02:38:56PM +0100, Jiri Benc wrote:
> On Mon, 21 Nov 2016 12:20:54 +0200, Amir Vadai wrote:
> > $ tc filter add dev vxlan0 protocol ip parent : \
> > flower \
> >   enc_src_ip 11.11.0.2 \
> >   enc_dst_ip 11.11.0.1 \
> >   enc_key_id 11 \
> >   dst_ip 11.11.11.1 \
> > action tunnel_key release \
> > action mirred egress redirect dev vnet0
> 
> I really hate the "action tunnel_key release". This just exposes the
> kernel internal implementation detail (dst_metadata) to the user. Why
> should the user care about explicit releasing of the tunnel key? This
> should happen automatically. Users do not care about our internal
> implementation.
I see.
So you mean to just unconditionally call skb_dst_drop() from
act_mirred()?

> 
> > $ tc filter add dev net0 protocol ip parent : \
> > flower \
> >   ip_proto 1 \
> >   dst_ip 11.11.11.2 \
> > action tunnel_key set \
> >   src_ip 11.11.0.1 \
> >   dst_ip 11.11.0.2 \
> >   id 11 \
> > action mirred egress redirect dev vxlan0
> 
> Do you see the asymmetry? This is not called "alloc tunnel_key", and
> rightly so. It's very reasonable to call this "set", as it is what the
> action looks like to the user.
> 
> The only argument for the existence of an explicit "release" (we should
> rather call it "unset" in such case, though) is forwarding between two
> tunnels, where metadata from the first tunnel will be used for
> encapsulation done by the second tunnel. Or a similar case when there's
> classification based on the tunnel metadata done on the mirred
> interface. Somewhat corner cases, though. If we want to support them,
> then let's call the action "unset" and not "release". And in any case,
> it should not be mandatory to specify it, which should be made clear
> in the documentation (including examples where it is needed - basically
> only when forwarding between tunnels).
The use case we already have that uses the release action is the
hardware offload support, which is already in the kernel.
It is using the "tunnel_key release" action to signal the hardware to
strip off the ip tunnel headers.
I need to go over this again and see how we can make it work without the
release/unset action.

> 
>  Jiri


Re: [PATCH iproute2 2/2] tc/act_tunnel: Introduce ip tunnel action

2016-11-21 Thread Amir Vadai
On Mon, Nov 21, 2016 at 11:50:03PM +, Rosen, Rami wrote:
> Hi, Amir,
> 
> Following are three minor comments:
> 
> Seems that TCA_TUNNEL_KEY_PAD used anywhere:
I assume you meant that it is _NOT_ used anywhere:
This attribute type is used in the kernel side only - for padding 64bit
attributes. The userspace enum should match the kernel include/uapi one.

>  
> + TCA_TUNNEL_KEY_PAD,
> + __TCA_TUNNEL_KEY_MAX,
> +};
> 
> 
> Should be "and destination IP 11.11.0.2" instead of  "and destination IP 
> 11.11.0.1":
ack

> 
> +Tunnel ID (for example VNI in VXLAN tunnel) .TP .B src_ip Outer header 
> +source IP address (IPv4 or IPv6) .TP .B dst_ip Outer header destination 
> +IP address (IPv4 or IPv6) .RE .SH EXAMPLES The following example 
> +encapsulates incoming ICMP packets on eth0 into a vxlan tunnel by 
> +setting metadata to VNI 11, source IP 11.11.0.1 and destination IP
> +11.11.0.1 by forwarding the skb with the metadata to device vxlan0, 
> +which will prepare the VXLAN headers:
> +
> +.RS
> +.EX
> +#tc qdisc add dev eth0 handle : ingress #tc filter add dev eth0 
> +protocol ip parent : \\
> +  flower \\
> +ip_proto icmp \\
> +  action tunnel_key set \\
> +src_ip 11.11.0.1 \\
> +dst_ip 11.11.0.2 \\
> +id 11 \\
> 
> 
> Typo: should be "ip tunnel" instead of "ip tunel":
ack

> 
> + * m_tunnel_key.cip tunel manipulation module
> + *
> + *  This program is free software; you can redistribute it and/or
> 
> Keep on the good work!
Thanks for reviewing,
Amir

> 
> Regards,
> Rami Rosen
> Intel Corporation


[PATCH iproute2 1/2] tc/cls_flower: Classify packet in ip tunnels

2016-11-21 Thread Amir Vadai
Introduce classifying by metadata extracted by the tunnel device.
Outer header fields - source/dest ip and tunnel id, are extracted from
the metadata when classifying.

For example, the following will add a filter on the ingress Qdisc of the
shared vxlan device named 'vxlan0'. Packets with outer src ip 11.11.0.2,
dst ip 11.11.0.1 and tunnel id 11 will be forwarded to tap device 'vnet0'
(after metadata is released):

$ tc filter add dev vxlan0 protocol ip parent : \
flower \
  enc_src_ip 11.11.0.2 \
  enc_dst_ip 11.11.0.1 \
  enc_key_id 11 \
  dst_ip 11.11.11.1 \
action tunnel_key release \
action mirred egress redirect dev vnet0

The action tunnel_key will be introduced in the next patch in this
series.

Signed-off-by: Amir Vadai <a...@vadai.me>
---
 man/man8/tc-flower.8 | 17 ++-
 tc/f_flower.c| 85 ++--
 2 files changed, 98 insertions(+), 4 deletions(-)

diff --git a/man/man8/tc-flower.8 b/man/man8/tc-flower.8
index 74f76647753b..0e0b0cf4bb72 100644
--- a/man/man8/tc-flower.8
+++ b/man/man8/tc-flower.8
@@ -36,7 +36,11 @@ flower \- flow based traffic control filter
 .BR dst_ip " | " src_ip " } { "
 .IR ipv4_address " | " ipv6_address " } | { "
 .BR dst_port " | " src_port " } "
-.IR port_number " }"
+.IR port_number " } | "
+.B enc_key_id
+.IR KEY-ID " | {"
+.BR enc_dst_ip " | " enc_src_ip " } { "
+.IR ipv4_address " | " ipv6_address " } | "
 .SH DESCRIPTION
 The
 .B flower
@@ -121,6 +125,17 @@ which has to be specified in beforehand.
 Match on layer 4 protocol source or destination port number. Only available for
 .BR ip_proto " values " udp " and " tcp ,
 which has to be specified in beforehand.
+.TP
+.BI enc_key_id " NUMBER"
+.TQ
+.BI enc_dst_ip " ADDRESS"
+.TQ
+.BI enc_src_ip " ADDRESS"
+Match on IP tunnel metadata. Key id
+.I NUMBER
+is a 32 bit tunnel key id (e.g. VNI for VXLAN tunnel).
+.I ADDRESS
+must be a valid IPv4 or IPv6 address.
 .SH NOTES
 As stated above where applicable, matches of a certain layer implicitly depend
 on the matches of the next lower layer. Precisely, layer one and two matches (
diff --git a/tc/f_flower.c b/tc/f_flower.c
index 2d31d1aa832d..1cf0750b5b83 100644
--- a/tc/f_flower.c
+++ b/tc/f_flower.c
@@ -41,7 +41,10 @@ static void explain(void)
fprintf(stderr, "   dst_ip [ IPV4-ADDR | IPV6-ADDR 
] |\n");
fprintf(stderr, "   src_ip [ IPV4-ADDR | IPV6-ADDR 
] |\n");
fprintf(stderr, "   dst_port PORT-NUMBER |\n");
-   fprintf(stderr, "   src_port PORT-NUMBER }\n");
+   fprintf(stderr, "   src_port PORT-NUMBER |\n");
+   fprintf(stderr, "   enc_dst_ip [ IPV4-ADDR | 
IPV6-ADDR ] |\n");
+   fprintf(stderr, "   enc_src_ip [ IPV4-ADDR | 
IPV6-ADDR ] |\n");
+   fprintf(stderr, "   enc_key_id [ KEY-ID ] }\n");
fprintf(stderr, "   FILTERID := X:Y:Z\n");
fprintf(stderr, "   ACTION-SPEC := ... look at individual 
actions\n");
fprintf(stderr, "\n");
@@ -121,8 +124,9 @@ static int flower_parse_ip_addr(char *str, __be16 eth_type,
family = AF_INET;
} else if (eth_type == htons(ETH_P_IPV6)) {
family = AF_INET6;
+   } else if (!eth_type) {
+   family = AF_UNSPEC;
} else {
-   fprintf(stderr, "Illegal \"eth_type\" for ip address\n");
return -1;
}
 
@@ -130,8 +134,10 @@ static int flower_parse_ip_addr(char *str, __be16 eth_type,
if (ret)
return -1;
 
-   if (addr.family != family)
+   if (family && (addr.family != family)) {
+   fprintf(stderr, "Illegal \"eth_type\" for ip address\n");
return -1;
+   }
 
addattr_l(n, MAX_MSG, addr.family == AF_INET ? addr4_type : addr6_type,
  addr.data, addr.bytelen);
@@ -181,6 +187,20 @@ static int flower_parse_port(char *str, __u8 ip_port,
return 0;
 }
 
+static int flower_parse_key_id(char *str, int type, struct nlmsghdr *n)
+{
+   int ret;
+   __be32 key_id;
+
+   ret = get_be32(&key_id, str, 10);
+   if (ret)
+   return -1;
+
+   addattr32(n, MAX_MSG, type, key_id);
+
+   return 0;
+}
+
 static int flower_parse_opt(struct filter_util *qu, char *handle,
int argc, char **argv, struct nlmsghdr *n)
 {
@@ -339,6 +359,38 @@ static int flower_parse_opt(struct filter_util *qu, char 
*handle,
fprintf

[PATCH iproute2 2/2] tc/act_tunnel: Introduce ip tunnel action

2016-11-21 Thread Amir Vadai
This action could be used before redirecting packets to a shared tunnel
device, or when redirecting packets arriving from such a device.

The action will release the metadata created by the tunnel device
(decap), or set the metadata with the specified values for encap
operation.

For example, the following flower filter will forward all ICMP packets
destined to 11.11.11.2 through the shared vxlan device 'vxlan0'. Before
redirecting, a metadata for the vxlan tunnel is created using the
tunnel_key action and its arguments:

$ tc filter add dev net0 protocol ip parent : \
flower \
  ip_proto 1 \
  dst_ip 11.11.11.2 \
action tunnel_key set \
  src_ip 11.11.0.1 \
  dst_ip 11.11.0.2 \
  id 11 \
action mirred egress redirect dev vxlan0

Signed-off-by: Amir Vadai <a...@vadai.me>
---
 include/linux/tc_act/tc_tunnel_key.h |  42 ++
 man/man8/tc-tunnel_key.8 | 105 ++
 tc/Makefile  |   1 +
 tc/m_tunnel_key.c| 259 +++
 4 files changed, 407 insertions(+)
 create mode 100644 include/linux/tc_act/tc_tunnel_key.h
 create mode 100644 man/man8/tc-tunnel_key.8
 create mode 100644 tc/m_tunnel_key.c

diff --git a/include/linux/tc_act/tc_tunnel_key.h 
b/include/linux/tc_act/tc_tunnel_key.h
new file mode 100644
index ..f9ddf5369a45
--- /dev/null
+++ b/include/linux/tc_act/tc_tunnel_key.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2016, Amir Vadai <a...@vadai.me>
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __LINUX_TC_TUNNEL_KEY_H
+#define __LINUX_TC_TUNNEL_KEY_H
+
+#include 
+
+#define TCA_ACT_TUNNEL_KEY 17
+
+#define TCA_TUNNEL_KEY_ACT_SET 1
+#define TCA_TUNNEL_KEY_ACT_RELEASE  2
+
+struct tc_tunnel_key {
+   tc_gen;
+   int t_action;
+};
+
+enum {
+   TCA_TUNNEL_KEY_UNSPEC,
+   TCA_TUNNEL_KEY_TM,
+   TCA_TUNNEL_KEY_PARMS,
+   TCA_TUNNEL_KEY_ENC_IPV4_SRC,/* be32 */
+   TCA_TUNNEL_KEY_ENC_IPV4_DST,/* be32 */
+   TCA_TUNNEL_KEY_ENC_IPV6_SRC,/* struct in6_addr */
+   TCA_TUNNEL_KEY_ENC_IPV6_DST,/* struct in6_addr */
+   TCA_TUNNEL_KEY_ENC_KEY_ID,  /* be64 */
+   TCA_TUNNEL_KEY_PAD,
+   __TCA_TUNNEL_KEY_MAX,
+};
+
+#define TCA_TUNNEL_KEY_MAX (__TCA_TUNNEL_KEY_MAX - 1)
+
+#endif
+
diff --git a/man/man8/tc-tunnel_key.8 b/man/man8/tc-tunnel_key.8
new file mode 100644
index ..c3b21e7d040e
--- /dev/null
+++ b/man/man8/tc-tunnel_key.8
@@ -0,0 +1,105 @@
+.TH "Tunnel metadata manipulation action in tc" 8 "10 Nov 2016" "iproute2" 
"Linux"
+
+.SH NAME
+tunnel_key - Tunnel metadata manipulation
+.SH SYNOPSIS
+.in +8
+.ti -8
+.BR tc " ... " "action tunnel_key" " { " release " | "
+.IR SET " }"
+
+.ti -8
+.IR SET " := "
+.BR set " " src_ip
+.IR ADDRESS
+.BR dst_ip
+.IR ADDRESS
+.BI id " KEY_ID"
+
+.SH DESCRIPTION
+The
+.B tunnel_key
+action allows to perform IP tunnel en- or decapsulation on a packet, reflected 
by
+the operation modes
+.IR RELEASE " and " SET .
+The
+.I RELEASE
+mode is simple, as no further information is required to just drop the
+metadata attached to the skb. The
+.IR SET
+mode requires the source and destination ip
+.I ADDRESS
+and the tunnel key id
+.I KEY_ID
+which will be used by the ip tunnel shared device to create the tunnel header. 
The
+.B tunnel_key
+action is useful only in combination with a
+.B mirred redirect
+action to a shared IP tunnel device which will use the metadata (for
+.I SET
+) and release the metadata created by it (for
+.I RELEASE
+).
+
+.SH OPTIONS
+.TP
+.B release
+Decapsulation mode, no further arguments allowed.
+.TP
+.B set
+Encapsulation mode. Requires
+.B id
+,
+.B src_ip
+and
+.B dst_ip
+options.
+.RS
+.TP
+.B id
+Tunnel ID (for example VNI in VXLAN tunnel)
+.TP
+.B src_ip
+Outer header source IP address (IPv4 or IPv6)
+.TP
+.B dst_ip
+Outer header destination IP address (IPv4 or IPv6)
+.RE
+.SH EXAMPLES
+The following example encapsulates incoming ICMP packets on eth0 into a vxlan
+tunnel by setting metadata to VNI 11, source IP 11.11.0.1 and destination IP
+11.11.0.1 by forwarding the skb with the metadata to device vxlan0, which will
+prepare the VXLAN headers:
+
+.RS
+.EX
+#tc qdisc add dev eth0 handle : ingress
+#tc filter add dev eth0 protocol ip parent : \\
+  flower \\
+ip_proto icmp \\
+  action tunnel_key set \\
+src_ip 11.11.0.1 \\
+dst_ip 11.11.0.2 \\
+id 11 \\
+  action mirred egress redirect dev vxlan0
+.EE
+.RE
+
+Here is an example of the
+.B release
+function: Incoming VXLAN packets on v

[PATCH iproute2 0/2] tc/cls_flower: Support for ip tunnel metadata set/release/classify

2016-11-21 Thread Amir Vadai
Hi,

This short series adds support for matching and setting metadata for ip tunnel
shared device using the TC system, introduced in kernel 4.9 [1].

Applied and tested on top of commit f3f339e9590a ("cleanup debris from revert")

Example usage:

$ tc filter add dev vxlan0 protocol ip parent : \
flower \
  enc_src_ip 11.11.0.2 \
  enc_dst_ip 11.11.0.1 \
  enc_key_id 11 \
  dst_ip 11.11.11.1 \
action tunnel_key release \
action mirred egress redirect dev vnet0

$ tc filter add dev net0 protocol ip parent : \
flower \
  ip_proto 1 \
  dst_ip 11.11.11.2 \
action tunnel_key set \
  src_ip 11.11.0.1 \
  dst_ip 11.11.0.2 \
  id 11 \
action mirred egress redirect dev vxlan0

[1] - d1ba24feb466 ("Merge branch 'act_tunnel_key'")

Thanks,
Amir

Amir Vadai (2):
  tc/cls_flower: Classify packet in ip tunnels
  tc/act_tunnel: Introduce ip tunnel action

 include/linux/tc_act/tc_tunnel_key.h |  42 ++
 man/man8/tc-flower.8 |  17 ++-
 man/man8/tc-tunnel_key.8 | 105 ++
 tc/Makefile  |   1 +
 tc/f_flower.c|  85 +++-
 tc/m_tunnel_key.c| 259 +++
 6 files changed, 505 insertions(+), 4 deletions(-)
 create mode 100644 include/linux/tc_act/tc_tunnel_key.h
 create mode 100644 man/man8/tc-tunnel_key.8
 create mode 100644 tc/m_tunnel_key.c

-- 
2.10.2



Re: [PATCH net v2] flow_dissector: Check skb for VLAN only if skb specified.

2016-10-19 Thread Amir Vadai
On Tue, Oct 18, 2016 at 4:59 PM, Or Gerlitz <gerlitz...@gmail.com> wrote:
> On Mon, Oct 17, 2016 at 11:30 PM, Eric Garver <e...@erig.me> wrote:
>> Fixes a panic when calling eth_get_headlen(). Noticed on i40e driver.
>>
>> Fixes: d5709f7ab776 ("flow_dissector: For stripped vlan, get vlan info from 
>> skb->vlan_tci")
>> Signed-off-by: Eric Garver <e...@erig.me>
>
> Dave,
>
> Hadar is OOO and I have asked Amir to look on the fix, will appreciate
> if we can have 24 hours to respond
>
> Or.

Reviewed and tested.

Acked-by: Amir Vadai <a...@vadai.me>


Re: [PATCH net-next] net/sched: cls_flower: Use a proper mask value for enc key id parameter

2016-09-27 Thread Amir Vadai
On Tue, Sep 27, 2016 at 11:21:18AM +0300, Hadar Hen Zion wrote:
> The current code use the encapsulation key id value as the mask of that
> parameter which is wrong. Fix that by using a full mask.
> 
> Fixes: bc3103f1ed40 ('net/sched: cls_flower: Classify packet in ip tunnels')
> Signed-off-by: Hadar Hen Zion <had...@mellanox.com>
> ---

Acked-by: Amir Vadai <a...@vadai.me>


Re: [PATCH net-next V3 4/4] net/sched: Introduce act_tunnel_key

2016-08-30 Thread Amir Vadai
On Tue, Aug 30, 2016 at 08:05:03AM -0400, Jamal Hadi Salim wrote:
> On 16-08-30 07:03 AM, Amir Vadai wrote:
> > On Sun, Aug 28, 2016 at 10:04:21PM -0700, Cong Wang wrote:
> > > On Fri, Aug 26, 2016 at 12:16 PM, Eric Dumazet <eric.duma...@gmail.com> 
> > > wrote:
> > > > On Fri, 2016-08-26 at 11:26 -0700, Cong Wang wrote:
> 
> 
> > Regarding the specific action in this patchset, correct me if I'm wrong,
> > but I think that the lock could be removed safely.
> > 
> 
> From what Eric suggested (refer to my posting on skbmod),
> this becomes:
> 
> +struct tcf_tunnel_key_p {
> + int tcft_action;
> + struct metadata_dst *tcft_enc_metadata;
> +};
> 
> /* rcu protected */
> +struct tcf_tunnel_key {
> + struct tc_actioncommon;
> +   struct tcf_tunnel_key_p *p;
> +};
> 
> At init() - always alloc struct tcf_tunnel_key_p, new
> 
> old = rtnl_dereference(mykey->p);
> if (ovr)
> spin_lock_bh(>tcf_lock);
Thanks for the detailed example :)

What are we protecting with this spin lock here? Aren't concurrent init()
calls protected by the rtnl lock?


> ... update all params here ..
> rcu_assign_pointer(mykey->p, new);
> if (ovr) {
>  spin_unlock_bh(>tcf_lock);
>  synchronize_rcu();
> }
> 
> kfree(old);
> 
> at act():
> 
> rcu_read_lock();
> struct tcf_tunnel_key_p *p = rcu_dereference(mykey->p);
> ... use p here ...
> rcu_read_unlock();
> 
> Cong was looking to do something more generic for all actions.
> 
> cheers,
> jamal


Re: [PATCH net-next V3 4/4] net/sched: Introduce act_tunnel_key

2016-08-30 Thread Amir Vadai
On Tue, Aug 30, 2016 at 02:03:08PM +0300, Amir Vadai wrote:
> On Sun, Aug 28, 2016 at 10:04:21PM -0700, Cong Wang wrote:
> > On Fri, Aug 26, 2016 at 12:16 PM, Eric Dumazet <eric.duma...@gmail.com> 
> > wrote:
> > > On Fri, 2016-08-26 at 11:26 -0700, Cong Wang wrote:
> > >> 1) Currently there are only a few actions using lockless, and they are
> > >> questionable, as we already discussed before, there could be some
> > >> race condition when you modify an existing action.
> > >
> > > There is no fundamental issue with a race condition.
> > 
> > For mirred action, maybe. As we already discussed, the more
> > complex an action is, the harder to make it lockless in your
> > way (that is, not using RCU)
> > 
> > >
> > > Sure, there are races, but they have no serious effect.
> > >
> > > Feel free to send a fix if you really have time to spare.
> > 
> > It's because the code is written by you?
> > 
> > I am surprised how you try to hide your own problem in
> > such a way...
> > 
> > 
> > >
> > >>
> > >> 2) We need to change the tc action API in order to fully support RCU,
> > >> which is what I have been working on these days. I should come up
> > >> with something next Monday (if not this weekend).
> > >>
> > >> So for this patchset, using spinlock is fine, just as many other actions.
> > >> I will take care of it later.
> > >
> > > This is _not_ fine.
> > 
> > 
> > OK, so where are your patches to make the rest actions
> > lockless?
> > 
> > 
> > >
> > > We are in 2016, not in 1995 anymore.
> > >
> > 
> > Fair enough, sounds like all actions are already lockless in
> > fast path now in 2016, you know this is not true...
> > 
> > 
> > > We are not adding a spinlock in a hot path unless absolutely needed.
> > 
> > > If it is bug-free, yes, I am totally with you. I care about correctness
> > more than any performance.
> > 
> > 
> > >
> > > With multi queue NIC, this spinlock is going to hurt performance so much
> > > that this action wont be used by any serious user.
> > 
> > We have used mirred action even before you make it lockless.
> > 
> > 
> > >
> > > Here, it is absolutely trivial to use RCU and/or percpu counters.
> > 
> > Sounds like we don't need any API change, why not go ahead
> > and try it? Please do teach me how to modify an existing
> > action in a lockless way without changing any API (and of course
> > needs to be bug-free), I am very happy to learn your "trivial" way
> > to fix this, since I don't have any trivial fix.
> > 
> > Please, stop bullsh*t, show me your trivial code.
> 
> Regarding the specific action in this patchset, correct me if I'm wrong,
> but I think that the lock could be removed safely.
> 
> When the action is modified during traffic, an existing tcf_enc_metadata
> is not changed, but a new metadata is allocated and the pointer is
> replaced to point to the new one.
> I just need to make sure that when changing an action from 'release'
> into 'set' - tcf_enc_metadata will be set before the action type is
> changed - change the order of operations and add a memory barrier.
> Here is a pseudo code to explain:
> 
> metadata_new = new allocated metadata
> metadata_old = t->tcft_enc_metadata
> 

Oh - I had a typo here:
Need to set the metadata and only after that, set the action:

t->tcft_enc_metadata = metadata_new
wmb()
t->tcft_action = encapdecap

> t->tcft_action = encapdecap
> 
> /* make sure the compiler won't swap the setting of tcft_action with
>  * tcft_enc_metadata
>  */
> wmb()
> 
> t->tcft_enc_metadata = metadata_new
> release metadata_old
> 
> 
> This way, no need for lock between the init() and act() operations.
> 
> Please let me know if you see a problem with this approach.
> I will also change the stats to be percpu.
> 
> Thanks,
> Amir
> 


Re: [PATCH net-next V3 4/4] net/sched: Introduce act_tunnel_key

2016-08-30 Thread Amir Vadai
On Sun, Aug 28, 2016 at 10:04:21PM -0700, Cong Wang wrote:
> On Fri, Aug 26, 2016 at 12:16 PM, Eric Dumazet  wrote:
> > On Fri, 2016-08-26 at 11:26 -0700, Cong Wang wrote:
> >> 1) Currently there are only a few actions using lockless, and they are
> >> questionable, as we already discussed before, there could be some
> >> race condition when you modify an existing action.
> >
> > There is no fundamental issue with a race condition.
> 
> For mirred action, maybe. As we already discussed, the more
> complex an action is, the harder to make it lockless in your
> way (that is, not using RCU)
> 
> >
> > Sure, there are races, but they have no serious effect.
> >
> > Feel free to send a fix if you really have time to spare.
> 
> It's because the code is written by you?
> 
> I am surprised how you try to hide your own problem in
> such a way...
> 
> 
> >
> >>
> >> 2) We need to change the tc action API in order to fully support RCU,
> >> which is what I have been working on these days. I should come up
> >> with something next Monday (if not this weekend).
> >>
> >> So for this patchset, using spinlock is fine, just as many other actions.
> >> I will take care of it later.
> >
> > This is _not_ fine.
> 
> 
> OK, so where are your patches to make the rest actions
> lockless?
> 
> 
> >
> > We are in 2016, not in 1995 anymore.
> >
> 
> Fair enough, sounds like all actions are already lockless in
> fast path now in 2016, you know this is not true...
> 
> 
> > We are not adding a spinlock in a hot path unless absolutely needed.
> 
> If it is bug-free, yes, I am totally with you. I care about correctness
> more than any performance.
> 
> 
> >
> > With multi queue NIC, this spinlock is going to hurt performance so much
> > that this action wont be used by any serious user.
> 
> We have used mirred action even before you make it lockless.
> 
> 
> >
> > Here, it is absolutely trivial to use RCU and/or percpu counters.
> 
> Sounds like we don't need any API change, why not go ahead
> and try it? Please do teach me how to modify an existing
> action in a lockless way without changing any API (and of course
> needs to be bug-free), I am very happy to learn your "trivial" way
> to fix this, since I don't have any trivial fix.
> 
> Please, stop bullsh*t, show me your trivial code.

Regarding the specific action in this patchset, correct me if I'm wrong,
but I think that the lock could be removed safely.

When the action is modified during traffic, an existing tcf_enc_metadata
is not changed, but a new metadata is allocated and the pointer is
replaced to point to the new one.
I just need to make sure that when changing an action from 'release'
into 'set' - tcf_enc_metadata will be set before the action type is
changed - change the order of operations and add a memory barrier.
Here is a pseudo code to explain:

metadata_new = new allocated metadata
metadata_old = t->tcft_enc_metadata

t->tcft_action = encapdecap

/* make sure the compiler won't swap the setting of tcft_action with
 * tcft_enc_metadata
 */
wmb()

t->tcft_enc_metadata = metadata_new
release metadata_old


This way, no need for lock between the init() and act() operations.

Please let me know if you see a problem with this approach.
I will also change the stats to be percpu.

Thanks,
Amir



[PATCH net-next V2 3/4] net/sched: cls_flower: Classify packet in ip tunnels

2016-08-24 Thread Amir Vadai
Introduce classifying by metadata extracted by the tunnel device.
Outer header fields - source/dest ip and tunnel id, are extracted from
the metadata when classifying.

For example, the following will add a filter on the ingress Qdisc of shared
vxlan device named 'vxlan0'. To forward packets with outer src ip
11.11.0.2, dst ip 11.11.0.1 and tunnel id 11. The packets will be
forwarded to tap device 'vnet0' (after metadata is released):

$ filter add dev vxlan0 protocol ip parent ffff: \
flower \
  enc_src_ip 11.11.0.2 \
  enc_dst_ip 11.11.0.1 \
  enc_key_id 11 \
  dst_ip 11.11.11.1 \
action tunnel_key release \
action mirred egress redirect dev vnet0

The action tunnel_key, will be introduced in the next patch in this series.

Signed-off-by: Amir Vadai <a...@vadai.me>
---
 include/uapi/linux/pkt_cls.h | 11 ++
 net/sched/cls_flower.c   | 89 +++-
 2 files changed, 98 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 51b5b247fb5a..f9c287c67eae 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -431,6 +431,17 @@ enum {
TCA_FLOWER_KEY_VLAN_ID,
TCA_FLOWER_KEY_VLAN_PRIO,
TCA_FLOWER_KEY_VLAN_ETH_TYPE,
+
+   TCA_FLOWER_KEY_ENC_KEY_ID,  /* be32 */
+   TCA_FLOWER_KEY_ENC_IPV4_SRC,/* be32 */
+   TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,/* be32 */
+   TCA_FLOWER_KEY_ENC_IPV4_DST,/* be32 */
+   TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,/* be32 */
+   TCA_FLOWER_KEY_ENC_IPV6_SRC,/* struct in6_addr */
+   TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,/* struct in6_addr */
+   TCA_FLOWER_KEY_ENC_IPV6_DST,/* struct in6_addr */
+   TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,/* struct in6_addr */
+
__TCA_FLOWER_MAX,
 };
 
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 1e11e57e6947..90dd776ae6aa 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -23,6 +23,9 @@
 #include 
 #include 
 
+#include 
+#include 
+
 struct fl_flow_key {
int indev_ifindex;
struct flow_dissector_key_control control;
@@ -35,6 +38,9 @@ struct fl_flow_key {
struct flow_dissector_key_ipv6_addrs ipv6;
};
struct flow_dissector_key_ports tp;
+   struct flow_dissector_key_keyid enc_key_id;
+   struct flow_dissector_key_ipv4_addrs enc_ipv4;
+   struct flow_dissector_key_ipv6_addrs enc_ipv6;
 } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. 
*/
 
 struct fl_flow_mask_range {
@@ -124,11 +130,31 @@ static int fl_classify(struct sk_buff *skb, const struct 
tcf_proto *tp,
struct cls_fl_filter *f;
struct fl_flow_key skb_key;
struct fl_flow_key skb_mkey;
+   struct ip_tunnel_info *info;
 
if (!atomic_read(&head->ht.nelems))
return -1;
 
fl_clear_masked_range(&skb_key, &head->mask);
+
+   info = skb_tunnel_info(skb);
+   if (info) {
+   struct ip_tunnel_key *key = &info->key;
+
+   switch (ip_tunnel_info_af(info)) {
+   case AF_INET:
+   skb_key.enc_ipv4.src = key->u.ipv4.src;
+   skb_key.enc_ipv4.dst = key->u.ipv4.dst;
+   break;
+   case AF_INET6:
+   skb_key.enc_ipv6.src = key->u.ipv6.src;
+   skb_key.enc_ipv6.dst = key->u.ipv6.dst;
+   break;
+   }
+
+   skb_key.enc_key_id.keyid = tunnel_id_to_key32(key->tun_id);
+   }
+
skb_key.indev_ifindex = skb->skb_iif;
/* skb_flow_dissect() does not set n_proto in case an unknown protocol,
 * so do it rather here.
@@ -297,7 +323,15 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 
1] = {
[TCA_FLOWER_KEY_VLAN_ID]= { .type = NLA_U16 },
[TCA_FLOWER_KEY_VLAN_PRIO]  = { .type = NLA_U8 },
[TCA_FLOWER_KEY_VLAN_ETH_TYPE]  = { .type = NLA_U16 },
-
+   [TCA_FLOWER_KEY_ENC_KEY_ID] = { .type = NLA_U32 },
+   [TCA_FLOWER_KEY_ENC_IPV4_SRC]   = { .type = NLA_U32 },
+   [TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK] = { .type = NLA_U32 },
+   [TCA_FLOWER_KEY_ENC_IPV4_DST]   = { .type = NLA_U32 },
+   [TCA_FLOWER_KEY_ENC_IPV4_DST_MASK] = { .type = NLA_U32 },
+   [TCA_FLOWER_KEY_ENC_IPV6_SRC]   = { .len = sizeof(struct in6_addr) },
+   [TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK] = { .len = sizeof(struct in6_addr) },
+   [TCA_FLOWER_KEY_ENC_IPV6_DST]   = { .len = sizeof(struct in6_addr) },
+   [TCA_FLOWER_KEY_ENC_IPV6_DST_MASK] = { .len = sizeof(struct in6_addr) },
 };
 
 static void fl_set_key_val(struct nlattr **tb,
@@ -345,7 +379,6 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
mask->indev_ifindex = 0xffffffff;
}
 #endif
-
fl_set_key_val(tb, key->eth.dst, TCA_FLOWER_KEY_ETH_DS

[PATCH net-next V2 2/4] net/dst: Utility functions to build dst_metadata without supplying an skb

2016-08-24 Thread Amir Vadai
Extract _ip_tun_rx_dst() and _ipv6_tun_rx_dst() out of ip_tun_rx_dst()
and ipv6_tun_rx_dst(), to be used without supplying an skb.

Signed-off-by: Amir Vadai <a...@vadai.me>
---
 include/net/dst_metadata.h | 45 -
 1 file changed, 32 insertions(+), 13 deletions(-)

diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index 5db9f5910428..892b3b69fb8d 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -112,12 +112,10 @@ static inline struct ip_tunnel_info 
*skb_tunnel_info_unclone(struct sk_buff *skb
return &new_md->u.tun_info;
 }
 
-static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb,
-__be16 flags,
-__be64 tunnel_id,
-int md_size)
+static inline struct metadata_dst *
+_ip_tun_rx_dst(__be32 saddr, __be32 daddr, __u8 tos, __u8 ttl,
+  __be16 flags, __be64 tunnel_id, int md_size)
 {
-   const struct iphdr *iph = ip_hdr(skb);
struct metadata_dst *tun_dst;
 
tun_dst = tun_rx_dst(md_size);
@@ -125,17 +123,27 @@ static inline struct metadata_dst *ip_tun_rx_dst(struct 
sk_buff *skb,
return NULL;
 
ip_tunnel_key_init(&tun_dst->u.tun_info.key,
-  iph->saddr, iph->daddr, iph->tos, iph->ttl,
+  saddr, daddr, tos, ttl,
   0, 0, 0, tunnel_id, flags);
return tun_dst;
 }
 
-static inline struct metadata_dst *ipv6_tun_rx_dst(struct sk_buff *skb,
+static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb,
 __be16 flags,
 __be64 tunnel_id,
 int md_size)
 {
-   const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+   const struct iphdr *iph = ip_hdr(skb);
+
+   return _ip_tun_rx_dst(iph->saddr, iph->daddr, iph->tos, iph->ttl,
+ flags, tunnel_id, md_size);
+}
+
+static inline struct metadata_dst *
+_ipv6_tun_rx_dst(struct in6_addr saddr, struct in6_addr daddr,
+__u8 tos, __u8 ttl, __be32 label,
+__be16 flags, __be64 tunnel_id, int md_size)
+{
struct metadata_dst *tun_dst;
struct ip_tunnel_info *info;
 
@@ -150,14 +158,25 @@ static inline struct metadata_dst *ipv6_tun_rx_dst(struct 
sk_buff *skb,
info->key.tp_src = 0;
info->key.tp_dst = 0;
 
-   info->key.u.ipv6.src = ip6h->saddr;
-   info->key.u.ipv6.dst = ip6h->daddr;
+   info->key.u.ipv6.src = saddr;
+   info->key.u.ipv6.dst = daddr;
 
-   info->key.tos = ipv6_get_dsfield(ip6h);
-   info->key.ttl = ip6h->hop_limit;
-   info->key.label = ip6_flowlabel(ip6h);
+   info->key.tos = tos;
+   info->key.ttl = ttl;
+   info->key.label = label;
 
return tun_dst;
 }
 
+static inline struct metadata_dst *
+ipv6_tun_rx_dst(struct sk_buff *skb, __be16 flags, __be64 tunnel_id,
+   int md_size)
+{
+   const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+
+   return _ipv6_tun_rx_dst(ip6h->saddr, ip6h->daddr,
+   ipv6_get_dsfield(ip6h), ip6h->hop_limit,
+   ip6_flowlabel(ip6h), flags, tunnel_id,
+   md_size);
+}
 #endif /* __NET_DST_METADATA_H */
-- 
2.9.0



[PATCH net-next V2 4/4] net/sched: Introduce act_tunnel_key

2016-08-24 Thread Amir Vadai
This action could be used before redirecting packets to a shared tunnel
device, or when redirecting packets arriving from such a device.

The action will release the metadata created by the tunnel device
(decap), or set the metadata with the specified values for encap
operation.

For example, the following flower filter will forward all ICMP packets
destined to 11.11.11.2 through the shared vxlan device 'vxlan0'. Before
redirecting, a metadata for the vxlan tunnel is created using the
tunnel_key action and its arguments:

$ filter add dev net0 protocol ip parent ffff: \
flower \
  ip_proto 1 \
  dst_ip 11.11.11.2 \
action tunnel_key set\
  src_ip 11.11.0.1 \
  dst_ip 11.11.0.2 \
  id 11 \
action mirred egress redirect dev vxlan0

Signed-off-by: Amir Vadai <a...@vadai.me>
---
 include/net/tc_act/tc_tunnel_key.h|  25 +++
 include/uapi/linux/tc_act/tc_tunnel_key.h |  42 
 net/sched/Kconfig |  11 ++
 net/sched/Makefile|   1 +
 net/sched/act_tunnel_key.c| 314 ++
 5 files changed, 393 insertions(+)
 create mode 100644 include/net/tc_act/tc_tunnel_key.h
 create mode 100644 include/uapi/linux/tc_act/tc_tunnel_key.h
 create mode 100644 net/sched/act_tunnel_key.c

diff --git a/include/net/tc_act/tc_tunnel_key.h 
b/include/net/tc_act/tc_tunnel_key.h
new file mode 100644
index 000000000000..18d5950059cb
--- /dev/null
+++ b/include/net/tc_act/tc_tunnel_key.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2016, Amir Vadai <a...@vadai.me>
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __NET_TC_TUNNEL_KEY_H
+#define __NET_TC_TUNNEL_KEY_H
+
+#include 
+
+struct tcf_tunnel_key {
+   struct tc_actioncommon;
+   int tcft_action;
+   struct metadata_dst *tcft_enc_metadata;
+};
+
+#define to_tunnel_key(a) ((struct tcf_tunnel_key *)a)
+
+#endif /* __NET_TC_TUNNEL_KEY_H */
+
diff --git a/include/uapi/linux/tc_act/tc_tunnel_key.h 
b/include/uapi/linux/tc_act/tc_tunnel_key.h
new file mode 100644
index 000000000000..f9ddf5369a45
--- /dev/null
+++ b/include/uapi/linux/tc_act/tc_tunnel_key.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2016, Amir Vadai <a...@vadai.me>
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __LINUX_TC_TUNNEL_KEY_H
+#define __LINUX_TC_TUNNEL_KEY_H
+
+#include 
+
+#define TCA_ACT_TUNNEL_KEY 17
+
+#define TCA_TUNNEL_KEY_ACT_SET 1
+#define TCA_TUNNEL_KEY_ACT_RELEASE  2
+
+struct tc_tunnel_key {
+   tc_gen;
+   int t_action;
+};
+
+enum {
+   TCA_TUNNEL_KEY_UNSPEC,
+   TCA_TUNNEL_KEY_TM,
+   TCA_TUNNEL_KEY_PARMS,
+   TCA_TUNNEL_KEY_ENC_IPV4_SRC,/* be32 */
+   TCA_TUNNEL_KEY_ENC_IPV4_DST,/* be32 */
+   TCA_TUNNEL_KEY_ENC_IPV6_SRC,/* struct in6_addr */
+   TCA_TUNNEL_KEY_ENC_IPV6_DST,/* struct in6_addr */
+   TCA_TUNNEL_KEY_ENC_KEY_ID,  /* be64 */
+   TCA_TUNNEL_KEY_PAD,
+   __TCA_TUNNEL_KEY_MAX,
+};
+
+#define TCA_TUNNEL_KEY_MAX (__TCA_TUNNEL_KEY_MAX - 1)
+
+#endif
+
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index ccf931b3b94c..f9f602d57f2a 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -761,6 +761,17 @@ config NET_ACT_IFE
  To compile this code as a module, choose M here: the
  module will be called act_ife.
 
+config NET_ACT_TUNNEL_KEY
+tristate "IP tunnel metadata manipulation"
+depends on NET_CLS_ACT
+---help---
+ Say Y here to set/release ip tunnel metadata.
+
+ If unsure, say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_tunnel_key.
+
 config NET_IFE_SKBMARK
 tristate "Support to encoding decoding skb mark on IFE action"
 depends on NET_ACT_IFE
diff --git a/net/sched/Makefile b/net/sched/Makefile
index ae088a5a9d95..b9d046b9535a 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -22,6 +22,7 @@ obj-$(CONFIG_NET_ACT_CONNMARK)+= act_connmark.o
 obj-$(CONFIG_NET_ACT_IFE)  += act_ife.o
 obj-$(CONFIG_NET_IFE_SKBMARK)  += act_meta_mark.o
 obj-$(CONFIG_NET_IFE_SKBPRIO)  += act_meta_skbprio.o
+obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o
 obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o
 obj-$(CONFIG_NET_SCH_CBQ)  += sch_cbq.o
 obj-$(CONFIG_NET_SCH_HTB)  += sch_htb.o
diff --git a/

[PATCH net-next V2 0/4] net/sched: ip tunnel metadata set/release/classify by using TC

2016-08-24 Thread Amir Vadai
Hi,

This patchset introduces ip tunnel manipulation support using the TC subsystem.

In the decap flow, it enables the user to redirect packets from a shared tunnel
device and classify by outer and inner headers. The outer headers are extracted
from the metadata and used by the flower filter. A new action act_tunnel_key,
releases the metadata.

In the encap flow, act_tunnel_key creates a metadata object to be used by the
shared tunnel device. The actual redirection to the tunnel device is done using
act_mirred.

For example:
$ tc qdisc add dev vnet0 ingress
$ tc filter add dev vnet0 protocol ip parent ffff: \
flower \
  ip_proto 1 \
action tunnel_key set \
  src_ip 11.11.0.1 \
dst_ip 11.11.0.2 \
id 11 \
action mirred egress redirect dev vxlan0
  
$ tc qdisc add dev vxlan0 ingress
$ tc filter add dev vxlan0 protocol ip parent ffff: \
flower \
  enc_src_ip 11.11.0.2 \
enc_dst_ip 11.11.0.1 \
enc_key_id 11 \
action tunnel_key release \
  action mirred egress redirect dev vnet0

Amir

Changes from V0:
- More cleanups to key32_to_tunnel_id() and tunnel_id_to_key32()
- IPv6 Support added
- Set TUNNEL_KEY flag to make GRE work
- Handle zero tunnel id properly in act_tunnel_key
- Don't leave junk in decap action
- Fix bug in act_tunnel_key initialization where (exists & ocr) is true
- Remove BUG() from code
- Rename action to tunnel_key
- Improve grep-ability of code
- Reuse code from ip_tun_rx_dst() and ipv6_tun_rx_dst()

Changes from RFC:
- Add a new action instead of making mirred too complex
- No need to specify UDP port in action - it is already in the tunnel device
configuration
- Added a decap operation to drop tunnel metadata

Amir Vadai (4):
  net/ip_tunnels: Introduce tunnel_id_to_key32() and
key32_to_tunnel_id()
  net/dst: Utility functions to build dst_metadata without supplying an
skb
  net/sched: cls_flower: Classify packet in ip tunnels
  net/sched: Introduce act_tunnel_key

 drivers/net/vxlan.c   |   4 +-
 include/net/dst_metadata.h|  45 +++--
 include/net/ip_tunnels.h  |  19 ++
 include/net/tc_act/tc_tunnel_key.h|  25 +++
 include/net/vxlan.h   |  18 --
 include/uapi/linux/pkt_cls.h  |  11 ++
 include/uapi/linux/tc_act/tc_tunnel_key.h |  42 
 net/ipv4/ip_gre.c |  23 +--
 net/sched/Kconfig |  11 ++
 net/sched/Makefile|   1 +
 net/sched/act_tunnel_key.c| 314 ++
 net/sched/cls_flower.c|  89 -
 12 files changed, 546 insertions(+), 56 deletions(-)
 create mode 100644 include/net/tc_act/tc_tunnel_key.h
 create mode 100644 include/uapi/linux/tc_act/tc_tunnel_key.h
 create mode 100644 net/sched/act_tunnel_key.c

-- 
2.9.0



[PATCH net-next V2 1/4] net/ip_tunnels: Introduce tunnel_id_to_key32() and key32_to_tunnel_id()

2016-08-24 Thread Amir Vadai
Add utility functions to convert a 32 bits key into a 64 bits tunnel and
vice versa.
These functions will be used instead of cloning code in GRE and VXLAN,
and in tc act_iptunnel which will be introduced in a following patch in
this patchset.

Signed-off-by: Amir Vadai <a...@vadai.me>
---
 drivers/net/vxlan.c  |  4 ++--
 include/net/ip_tunnels.h | 19 +++
 include/net/vxlan.h  | 18 --
 net/ipv4/ip_gre.c| 23 ++-
 4 files changed, 23 insertions(+), 41 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index c0dda6fc0921..b1ddf8f756d4 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1294,7 +1294,7 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
struct metadata_dst *tun_dst;
 
tun_dst = udp_tun_rx_dst(skb, vxlan_get_sk_family(vs), 
TUNNEL_KEY,
-vxlan_vni_to_tun_id(vni), sizeof(*md));
+key32_to_tunnel_id(vni), sizeof(*md));
 
if (!tun_dst)
goto drop;
@@ -1948,7 +1948,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct 
net_device *dev,
goto drop;
}
dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port;
-   vni = vxlan_tun_id_to_vni(info->key.tun_id);
+   vni = tunnel_id_to_key32(info->key.tun_id);
remote_ip.sa.sa_family = ip_tunnel_info_af(info);
if (remote_ip.sa.sa_family == AF_INET) {
remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst;
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index a5e7035fb93f..e598c639aa6f 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -222,6 +222,25 @@ static inline unsigned short ip_tunnel_info_af(const 
struct ip_tunnel_info
return tun_info->mode & IP_TUNNEL_INFO_IPV6 ? AF_INET6 : AF_INET;
 }
 
+static inline __be64 key32_to_tunnel_id(__be32 key)
+{
+#ifdef __BIG_ENDIAN
+   return (__force __be64)key;
+#else
+   return (__force __be64)((__force u64)key << 32);
+#endif
+}
+
+/* Returns the least-significant 32 bits of a __be64. */
+static inline __be32 tunnel_id_to_key32(__be64 tun_id)
+{
+#ifdef __BIG_ENDIAN
+   return (__force __be32)tun_id;
+#else
+   return (__force __be32)((__force u64)tun_id >> 32);
+#endif
+}
+
 #ifdef CONFIG_INET
 
 int ip_tunnel_init(struct net_device *dev);
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index b96d0360c095..0255613a54a4 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -350,24 +350,6 @@ static inline __be32 vxlan_vni_field(__be32 vni)
 #endif
 }
 
-static inline __be32 vxlan_tun_id_to_vni(__be64 tun_id)
-{
-#if defined(__BIG_ENDIAN)
-   return (__force __be32)tun_id;
-#else
-   return (__force __be32)((__force u64)tun_id >> 32);
-#endif
-}
-
-static inline __be64 vxlan_vni_to_tun_id(__be32 vni)
-{
-#if defined(__BIG_ENDIAN)
-   return (__force __be64)vni;
-#else
-   return (__force __be64)((u64)(__force u32)vni << 32);
-#endif
-}
-
 static inline size_t vxlan_rco_start(__be32 vni_field)
 {
return be32_to_cpu(vni_field & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 113cc43df789..576f705d8180 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -246,25 +246,6 @@ static void gre_err(struct sk_buff *skb, u32 info)
ipgre_err(skb, info, &tpi);
 }
 
-static __be64 key_to_tunnel_id(__be32 key)
-{
-#ifdef __BIG_ENDIAN
-   return (__force __be64)((__force u32)key);
-#else
-   return (__force __be64)((__force u64)key << 32);
-#endif
-}
-
-/* Returns the least-significant 32 bits of a __be64. */
-static __be32 tunnel_id_to_key(__be64 x)
-{
-#ifdef __BIG_ENDIAN
-   return (__force __be32)x;
-#else
-   return (__force __be32)((__force u64)x >> 32);
-#endif
-}
-
 static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
   struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
 {
@@ -290,7 +271,7 @@ static int __ipgre_rcv(struct sk_buff *skb, const struct 
tnl_ptk_info *tpi,
__be64 tun_id;
 
flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
-   tun_id = key_to_tunnel_id(tpi->key);
+   tun_id = key32_to_tunnel_id(tpi->key);
tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
if (!tun_dst)
return PACKET_REJECT;
@@ -446,7 +427,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct 
net_device *dev,
 
flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
gre_build_header(skb, tunnel_hlen, flags, proto,
-tunnel_id_to_key(tun_info->key

Re: [PATCH net-next 3/3] net/sched: Introduce act_iptunnel

2016-08-23 Thread Amir Vadai
On Tue, Aug 23, 2016 at 08:37:07AM -0400, Jamal Hadi Salim wrote:
> On 16-08-22 10:38 AM, Amir Vadai wrote:
> > This action could be used before redirecting packets to a shared tunnel
> > device, or when redirecting packets arriving from a such a device
> > 
> > The action will release the metadata created by the tunnel device
> > (decap), or set the metadata with the specified values for encap
> > operation.
> > 
> > For example, the following flower filter will forward all ICMP packets
> > destined to 11.11.11.2 through the shared vxlan device 'vxlan0'. Before
> > redirecting, a metadata for the vxlan tunnel is created using the
> > iptunnel action and it's arguments:
> > 
> > $ filter add dev net0 protocol ip parent : \
> > flower \
> >   ip_proto 1 \
> >   dst_ip 11.11.11.2 \
> > action iptunnel encap \
> >   src_ip 11.11.0.1 \
> >   dst_ip 11.11.0.2 \
> >   id 11 \
> > action mirred egress redirect dev vxlan0
> 
> The noun "ip tunnel" is a little misleading. Unless you can use
> this for other types of tunnels (ipip, etc). If this is specific
> for just vxlan and metadata setting then some name like vxlanmeta
> or something else that signifies both metadata de/encap + vxlan would
> be helpful.
Yeh, this name is not the best...
The action is not vxlan specific, it should be good for all the
ip tunnel interfaces that use metadata for the outer headers.
I will rename it to something like mdtunnel - unless someone has a
better suggestion.

> 
> 
> > +static int tcf_iptunnel(struct sk_buff *skb, const struct tc_action *a,
> > +   struct tcf_result *res)
> 
> Can you rename this function to something more grep-able
> like tcf_iptunnel_run or tcf_iptunnel_exec?
> You already have a data structure called tcf_iptunnel
ack

> 
> > +{
> > +   struct tcf_iptunnel *t = to_iptunnel(a);
> > +   int action;
> > +
> > +   spin_lock(>tcf_lock);
> > +   tcf_lastuse_update(>tcf_tm);
> > +   bstats_update(>tcf_bstats, skb);
> > +   action = t->tcf_action;
> > +
> > +   switch (t->tcft_action) {
> > +   case TCA_IPTUNNEL_ACT_DECAP:
> > +   skb_dst_set_noref(skb, NULL);
> > +   break;
> 
> 
> So the real decap is going to be at the vxlan dev?
Yes, the action here just cleans up after the tunnel device has peeled
off the outer headers and placed them in the metadata. That metadata was
used by the classifier and can now be released.

> 
> 
> > +static struct metadata_dst *iptunnel_alloc(struct tcf_iptunnel *t,
> > +  __be32 saddr, __be32 daddr,
> > +  __be64 key_id)
> > +{
> > +   struct ip_tunnel_info *tun_info;
> > +   struct metadata_dst *metadata;
> > +
> > +   metadata = metadata_dst_alloc(0, GFP_KERNEL);
> > +   if (!metadata)
> > +   return ERR_PTR(-ENOMEM);
> > +
> > +   tun_info = >u.tun_info;
> > +   tun_info->mode = IP_TUNNEL_INFO_TX;
> > +
> 
> More grep-ability tun_info sounds and i think is used by tun netdev.
ack

> 
> Otherwise looks good (although i still think this wouldve scaled better
> if you didnt depend on presence of vxlan dev).
now I start to have some regrets :)
But I don't see a good enough reason to duplicate code, since I can't
put my finger on a performance problem with using the existing code.

Thanks,
Amir

> 
> cheers,
> jamal


Re: [PATCH net-next 3/3] net/sched: Introduce act_iptunnel

2016-08-23 Thread Amir Vadai
On Tue, Aug 23, 2016 at 05:33:49PM +0200, Jiri Benc wrote:
> On Tue, 23 Aug 2016 18:28:05 +0300, Amir Vadai wrote:
> > On Mon, Aug 22, 2016 at 08:51:37PM +0200, Jiri Benc wrote:
> > > 2. We may run into problems like tx path seeing the metadata_dst that
> > >it should not see. This means either this situation or such
> > >configuration must be prevented somehow.
> [...]
> > Anyway, this issue is orthogonal to this patchset...
> 
> Not really. If it's indeed (2) then such configuration needs to be
> rejected. 
The configuration that needs to be rejected is when act_iptunnel is not
used. So, I guess the fix won't be part of it...

> Or metadata_dst freed at an appropriate place. Thus it's
> something that needs to be handled by this patchset before the uAPI is
> set in stone.
It is already there - a user can use act_mirred to redirect skb's with
metadata ever since shared tunnel devices were introduced.
The only thing added here is the ability for the user to drop the
metadata, which I think we agree is ok.

But I agree with you that I must understand the life cycle of the metadata
and dst better. I will try to understand it better and explain/fix accordingly.

Again, I would be happy if someone could chime in and give some hints on
whether it was a bug that a user could redirect skb's with metadata, or
something harmless.

Thanks,
Amir
> 
>  Jiri


Re: [PATCH net-next 3/3] net/sched: Introduce act_iptunnel

2016-08-23 Thread Amir Vadai
On Mon, Aug 22, 2016 at 08:51:37PM +0200, Jiri Benc wrote:
> On Mon, 22 Aug 2016 21:15:41 +0300, Or Gerlitz wrote:
> > Jiri B > I understand the motivation for the decap action. However, what 
> > would
> > Jiri B > happen if someone does not include it?
> > 
> > The MD set by the (say) vxlan device will not be "consumed" (cleared)
> > and would be keep travelling with the SKB
> 
> Of course it would. That's not what I meant by the question :-)
> 
> There are three options:
> 
> 1. It does not matter, as the metadata_dst will be freed anyway before
>it reaches tx path. This means we do not need the 'decap' action.
> 
> 2. We may run into problems like tx path seeing the metadata_dst that
>it should not see. This means either this situation or such
>configuration must be prevented somehow.
> 
> 3. The metadata_dst can reach the tx path but it doesn't matter, as it
>would just mean the packet is encapsulated into the same outer
>headers it was received with or the metadata_dst would be ignored
>(for non-tunnel interfaces).
> 
> Which one is it? Quickly looking into the code, tcf_mirred calls
> dev_queue_xmit which indicates it's either 2 or 3. If it's 3., it
> should be explained in the patch description (especially the non-tunnel
> interface case) and documented.
First, as you suspected, it is (2) or (3). AFAIK the skb is injected by
act_mirred into the tx path as is, with the metadata.
I couldn't find a case where having the metadata on the skb matters.
Still, I would be very happy to hear what other people have to say about
it.

Anyway, this issue is orthogonal to this patchset...


> 
>  Jiri


Re: [PATCH net-next 0/3] net/sched: iptunnel encap/decap/classify using TC

2016-08-23 Thread Amir Vadai
On Mon, Aug 22, 2016 at 03:23:45PM -0700, Tom Herbert wrote:
> On Mon, Aug 22, 2016 at 7:38 AM, Amir Vadai <a...@vadai.me> wrote:
> > Hi,
> >
> > This patchset introduces iptunnel support using the TC subsystem.
> >
> > In the decap flow, it enables the user to redirect packets from a shared 
> > tunnel
> > device and classify by outer and inner headers. The outer headers are 
> > extracted
> > from the metadata and used by the flower filter. A new action act_iptunnel,
> > releases the metadata.
> >
> > In the encap flow, act_iptunnel creates a metadata object to be used by the
> > shared tunnel device. The actual redirection to the tunnel device is done 
> > using
> > act_mirred.
> >
> > For example:
> > $ tc qdisc add dev vnet0 ingress
> > $ tc filter add dev vnet0 protocol ip parent : \
> > flower \
> >   ip_proto 1 \
> > action iptunnel encap \
> >   src_ip 11.11.0.1 \
> > dst_ip 11.11.0.2 \
> > id 11 \
> > action mirred egress redirect dev vxlan0
> >
> Is the device required to be a tunnel device? Consider that with LWT
> we can perform this sort of encapsulation without requiring a special
> device...
> 
> Tom
> 
Yes and no. This action is relevant only for a shared tunnel device,
like the one you get with:
$ ip link add vxlan0 type vxlan dstport 4789 external
A user can add metadata using this action and redirect to any netdev,
but only the shared tunnel netdev will do something with it.

Regarding LWT, in our use case we need to have classification in
addition to the routing, have both encap and decap operations and be
ready to add offloading to the API. For that, the TC subsystem looked
the most suitable.

Thanks,
Amir

[...]


Re: [PATCH net-next 3/3] net/sched: Introduce act_iptunnel

2016-08-23 Thread Amir Vadai
On Mon, Aug 22, 2016 at 08:57:06PM +0300, Shmulik Ladkani wrote:
> Hi,
> 
> On Mon, 22 Aug 2016 17:38:34 +0300 Amir Vadai <a...@vadai.me> wrote:
> > +static struct metadata_dst *iptunnel_alloc(struct tcf_iptunnel *t,
> > +  __be32 saddr, __be32 daddr,
> > +  __be64 key_id)
> > +{
> > +   struct ip_tunnel_info *tun_info;
> > +   struct metadata_dst *metadata;
> > +
> > +   metadata = metadata_dst_alloc(0, GFP_KERNEL);
> > +   if (!metadata)
> > +   return ERR_PTR(-ENOMEM);
> > +
> > +   tun_info = >u.tun_info;
> > +   tun_info->mode = IP_TUNNEL_INFO_TX;
> > 
> > +   ip_tunnel_key_init(_info->key, saddr, daddr, 0, 0, 0, 0, 0,
> > +  key_id, 0);
> 
> Seems key.tun_flags should be armed with TUNNEL_KEY.
> This will make things work with GRE as well.
> Pass it in the 'tun_flags' parameter.
ack

> 
> > +
> > +   return metadata;
> > +}
> > +
> > +static int tcf_iptunnel_init(struct net *net, struct nlattr *nla,
> > +struct nlattr *est, struct tc_action **a,
> > +int ovr, int bind)
> > +{
> > +   struct tc_action_net *tn = net_generic(net, iptunnel_net_id);
> > +   struct nlattr *tb[TCA_IPTUNNEL_MAX + 1];
> > +   struct metadata_dst *metadata;
> > +   struct tc_iptunnel *parm;
> > +   struct tcf_iptunnel *t;
> > +   __be32 saddr = 0;
> > +   __be32 daddr = 0;
> > +   __be64 key_id = 0;
> > +   int encapdecap;
> > +   bool exists = false;
> > +   int ret = -EINVAL;
> > +   int err;
> > +
> > +   if (!nla)
> > +   return -EINVAL;
> > +
> > +   err = nla_parse_nested(tb, TCA_IPTUNNEL_MAX, nla, iptunnel_policy);
> > +   if (err < 0)
> > +   return err;
> > +
> > +   if (!tb[TCA_IPTUNNEL_PARMS])
> > +   return -EINVAL;
> > +   parm = nla_data(tb[TCA_IPTUNNEL_PARMS]);
> > +   exists = tcf_hash_check(tn, parm->index, a, bind);
> > +   if (exists && bind)
> > +   return 0;
> > +
> > +   encapdecap = parm->t_action;
> > +
> > +   switch (encapdecap) {
> > +   case TCA_IPTUNNEL_ACT_DECAP:
> > +   break;
> > +   case TCA_IPTUNNEL_ACT_ENCAP:
> > +   if (tb[TCA_IPTUNNEL_ENC_IPV4_SRC])
> > +   saddr = nla_get_be32(tb[TCA_IPTUNNEL_ENC_IPV4_SRC]);
> > +   if (tb[TCA_IPTUNNEL_ENC_IPV4_DST])
> > +   daddr = nla_get_be32(tb[TCA_IPTUNNEL_ENC_IPV4_DST]);
> > +   if (tb[TCA_IPTUNNEL_ENC_KEY_ID])
> > +   key_id = 
> > key32_to_tunnel_id(nla_get_be32(tb[TCA_IPTUNNEL_ENC_KEY_ID]));
> > +
> > +   if (!saddr || !daddr || !key_id) {
> 
> A zero tunnel ID is legit.
ack

> 
> > +   ret = -EINVAL;
> > +   goto err_out;
> > +   }
> > +
> > +   metadata = iptunnel_alloc(t, saddr, daddr, key_id);
> > +   if (IS_ERR(metadata)) {
> > +   ret = PTR_ERR(metadata);
> > +   goto err_out;
> > +   }
> > +
> > +   break;
> > +   default:
> > +   goto err_out;
> > +   }
> > +
> > +   if (!exists) {
> > +   ret = tcf_hash_create(tn, parm->index, est, a,
> > + _iptunnel_ops, bind, false);
> > +   if (ret)
> > +   return ret;
> > +
> > +   ret = ACT_P_CREATED;
> > +   } else {
> > +   tcf_hash_release(*a, bind);
> > +   if (!ovr)
> > +   return -EEXIST;
> > +   }
> > +
> > +   t = to_iptunnel(*a);
> > +
> > +   spin_lock_bh(>tcf_lock);
> > +
> > +   t->tcf_action = parm->action;
> > +
> > +   t->tcft_action = encapdecap;
> > +   t->tcft_enc_metadata = metadata;
> 
> Although tcft_enc_metadata is not used in TCA_IPTUNNEL_ACT_DECAP, still
> prefer to nullify it instead of initializing it to stack junk.
good catch. strange that the compiler/sparse didn't catch it

> 
> > +
> > +   spin_unlock_bh(>tcf_lock);
> > +
> > +   if (ret == ACT_P_CREATED)
> > +   tcf_hash_insert(tn, *a);
> > +
> > +   return ret;
> 
> In the (exists && ovr) case, 'ret' seems to be left as '-EINVAL' as was
> initialized. Initialize 'ret' to zero instead.
another good catch - thanks.

> 
> > +
> > +err_out:
> > +   if (exists)
> > +   tcf_hash_release(*a, bind);
> > +   return ret;
> > +}
> > +
> 
> 


Re: [PATCH net-next 1/3] net/ip_tunnels: Introduce tunnel_id_to_key32() and key32_to_tunnel_id()

2016-08-23 Thread Amir Vadai
On Mon, Aug 22, 2016 at 07:00:27PM +0200, Jiri Benc wrote:
> While cleaning this up, you may as well take the best of both
> implementations.
> 
> On Mon, 22 Aug 2016 17:38:32 +0300, Amir Vadai wrote:
> > +static inline __be64 key32_to_tunnel_id(__be32 key)
> > +{
> > +#ifdef __BIG_ENDIAN
> > +   return (__force __be64)((__force u32)key);
> 
> The inner cast seems to be superfluous?
seems so. will check.

> 
> > +#else
> > +   return (__force __be64)((__force u64)key << 32);
> > +#endif
> > +}
> > +
> > +/* Returns the least-significant 32 bits of a __be64. */
> > +static inline __be32 tunnel_id_to_key32(__be64 x)
> 
> Please use a more descriptive name than "x". "tunnel_id" or "tun_id"
> seems to be more appropriate.
ack

> 
> > +{
> > +#ifdef __BIG_ENDIAN
> > +   return (__force __be32)x;
> > +#else
> > +   return (__force __be32)((__force u64)x >> 32);
> > +#endif
> > +}
> 
> Looks good otherwise.
> 
> Thanks,
> 
>  Jiri


[PATCH net-next 3/3] net/sched: Introduce act_iptunnel

2016-08-22 Thread Amir Vadai
This action could be used before redirecting packets to a shared tunnel
device, or when redirecting packets arriving from such a device.

The action will release the metadata created by the tunnel device
(decap), or set the metadata with the specified values for encap
operation.

For example, the following flower filter will forward all ICMP packets
destined to 11.11.11.2 through the shared vxlan device 'vxlan0'. Before
redirecting, a metadata for the vxlan tunnel is created using the
iptunnel action and its arguments:

$ filter add dev net0 protocol ip parent : \
flower \
  ip_proto 1 \
  dst_ip 11.11.11.2 \
action iptunnel encap \
  src_ip 11.11.0.1 \
  dst_ip 11.11.0.2 \
  id 11 \
action mirred egress redirect dev vxlan0

Signed-off-by: Amir Vadai <a...@vadai.me>
---
 include/net/tc_act/tc_iptunnel.h|  24 +++
 include/uapi/linux/tc_act/tc_iptunnel.h |  40 +
 net/sched/Kconfig   |  11 ++
 net/sched/Makefile  |   1 +
 net/sched/act_iptunnel.c| 292 
 5 files changed, 368 insertions(+)
 create mode 100644 include/net/tc_act/tc_iptunnel.h
 create mode 100644 include/uapi/linux/tc_act/tc_iptunnel.h
 create mode 100644 net/sched/act_iptunnel.c

diff --git a/include/net/tc_act/tc_iptunnel.h b/include/net/tc_act/tc_iptunnel.h
new file mode 100644
index ..a325081478e7
--- /dev/null
+++ b/include/net/tc_act/tc_iptunnel.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2016, Amir Vadai <a...@vadai.me>
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __NET_TC_IPTUNNEL_H
+#define __NET_TC_IPTUNNEL_H
+
+#include 
+
+struct tcf_iptunnel {
+   struct tc_actioncommon;
+   int tcft_action;
+   struct metadata_dst *tcft_enc_metadata;
+};
+
+#define to_iptunnel(a) ((struct tcf_iptunnel *)a)
+
+#endif /* __NET_TC_IPTUNNEL_H */
+
diff --git a/include/uapi/linux/tc_act/tc_iptunnel.h 
b/include/uapi/linux/tc_act/tc_iptunnel.h
new file mode 100644
index ..a9b688c1f28b
--- /dev/null
+++ b/include/uapi/linux/tc_act/tc_iptunnel.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, Amir Vadai <a...@vadai.me>
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __LINUX_TC_IPTUNNEL_H
+#define __LINUX_TC_IPTUNNEL_H
+
+#include 
+
+#define TCA_ACT_IPTUNNEL 17
+
+#define TCA_IPTUNNEL_ACT_ENCAP 1
+#define TCA_IPTUNNEL_ACT_DECAP 2
+
+struct tc_iptunnel {
+   tc_gen;
+   int t_action;
+};
+
+enum {
+   TCA_IPTUNNEL_UNSPEC,
+   TCA_IPTUNNEL_TM,
+   TCA_IPTUNNEL_PARMS,
+   TCA_IPTUNNEL_ENC_IPV4_SRC,  /* be32 */
+   TCA_IPTUNNEL_ENC_IPV4_DST,  /* be32 */
+   TCA_IPTUNNEL_ENC_KEY_ID,/* be64 */
+   TCA_IPTUNNEL_PAD,
+   __TCA_IPTUNNEL_MAX,
+};
+
+#define TCA_IPTUNNEL_MAX (__TCA_IPTUNNEL_MAX - 1)
+
+#endif
+
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index ccf931b3b94c..a8a5ac4edb2e 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -761,6 +761,17 @@ config NET_ACT_IFE
  To compile this code as a module, choose M here: the
  module will be called act_ife.
 
+config NET_ACT_IPTUNNEL
+tristate "IP tunnel manipulation"
+depends on NET_CLS_ACT
+---help---
+ Say Y here to set/release ip tunnel metadata.
+
+ If unsure, say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_iptunnel.
+
 config NET_IFE_SKBMARK
 tristate "Support to encoding decoding skb mark on IFE action"
 depends on NET_ACT_IFE
diff --git a/net/sched/Makefile b/net/sched/Makefile
index ae088a5a9d95..c1287b95b574 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -22,6 +22,7 @@ obj-$(CONFIG_NET_ACT_CONNMARK)+= act_connmark.o
 obj-$(CONFIG_NET_ACT_IFE)  += act_ife.o
 obj-$(CONFIG_NET_IFE_SKBMARK)  += act_meta_mark.o
 obj-$(CONFIG_NET_IFE_SKBPRIO)  += act_meta_skbprio.o
+obj-$(CONFIG_NET_ACT_IPTUNNEL) += act_iptunnel.o
 obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o
 obj-$(CONFIG_NET_SCH_CBQ)  += sch_cbq.o
 obj-$(CONFIG_NET_SCH_HTB)  += sch_htb.o
diff --git a/net/sched/act_iptunnel.c b/net/sched/act_iptunnel.c
new file mode 100644
index ..37640bd11b62
--- /dev/null
+++ b/net/sched/act_iptunnel.c
@@ -0,0 +1,292 @@
+/*
+ * Copyright (c) 2016, Amir Vadai <a...@v

[PATCH net-next 0/3] net/sched: iptunnel encap/decap/classify using TC

2016-08-22 Thread Amir Vadai
Hi,

This patchset introduces iptunnel support using the TC subsystem.

In the decap flow, it enables the user to redirect packets from a shared tunnel
device and classify by outer and inner headers. The outer headers are extracted
from the metadata and used by the flower filter. A new action act_iptunnel,
releases the metadata.

In the encap flow, act_iptunnel creates a metadata object to be used by the
shared tunnel device. The actual redirection to the tunnel device is done using
act_mirred.

For example:
$ tc qdisc add dev vnet0 ingress
$ tc filter add dev vnet0 protocol ip parent : \
flower \
  ip_proto 1 \
action iptunnel encap \
  src_ip 11.11.0.1 \
dst_ip 11.11.0.2 \
id 11 \
action mirred egress redirect dev vxlan0
  
$ tc qdisc add dev vxlan0 ingress
$ tc filter add dev vxlan0 protocol ip parent : \
flower \
  enc_src_ip 11.11.0.2 \
enc_dst_ip 11.11.0.1 \
enc_key_id 11 \
action iptunnel decap \
  action mirred egress redirect dev vnet0

note: Current implementation supports ipv4 only, but it should be easy to add
  ipv6 later on.

Amir

Changes from RFC:
- Add a new action instead of making mirred too complex
- No need to specify UDP port in action - it is already in the tunnel device
configuration
- Added a decap operation to drop tunnel metadata

Amir Vadai (3):
  net/ip_tunnels: Introduce tunnel_id_to_key32() and
key32_to_tunnel_id()
  net/sched: cls_flower: Classify packet in ip tunnels
  net/sched: Introduce act_iptunnel

 drivers/net/vxlan.c |   4 +-
 include/net/ip_tunnels.h|  19 +++
 include/net/tc_act/tc_iptunnel.h|  24 +++
 include/net/vxlan.h |  18 --
 include/uapi/linux/pkt_cls.h|  11 ++
 include/uapi/linux/tc_act/tc_iptunnel.h |  40 +
 net/ipv4/ip_gre.c   |  23 +--
 net/sched/Kconfig   |  11 ++
 net/sched/Makefile  |   1 +
 net/sched/act_iptunnel.c| 292 
 net/sched/cls_flower.c  |  59 ++-
 11 files changed, 459 insertions(+), 43 deletions(-)
 create mode 100644 include/net/tc_act/tc_iptunnel.h
 create mode 100644 include/uapi/linux/tc_act/tc_iptunnel.h
 create mode 100644 net/sched/act_iptunnel.c

-- 
2.9.0



[PATCH net-next 1/3] net/ip_tunnels: Introduce tunnel_id_to_key32() and key32_to_tunnel_id()

2016-08-22 Thread Amir Vadai
Add utility functions to convert a 32-bit key into a 64-bit tunnel ID and
vice versa.
These functions will be used instead of cloning code in GRE and VXLAN,
and in tc act_iptunnel which will be introduced in a following patch in
this patchset.

Signed-off-by: Amir Vadai <a...@vadai.me>
---
 drivers/net/vxlan.c  |  4 ++--
 include/net/ip_tunnels.h | 19 +++
 include/net/vxlan.h  | 18 --
 net/ipv4/ip_gre.c| 23 ++-
 4 files changed, 23 insertions(+), 41 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index c0dda6fc0921..b1ddf8f756d4 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1294,7 +1294,7 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
struct metadata_dst *tun_dst;
 
tun_dst = udp_tun_rx_dst(skb, vxlan_get_sk_family(vs), 
TUNNEL_KEY,
-vxlan_vni_to_tun_id(vni), sizeof(*md));
+key32_to_tunnel_id(vni), sizeof(*md));
 
if (!tun_dst)
goto drop;
@@ -1948,7 +1948,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct 
net_device *dev,
goto drop;
}
dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port;
-   vni = vxlan_tun_id_to_vni(info->key.tun_id);
+   vni = tunnel_id_to_key32(info->key.tun_id);
remote_ip.sa.sa_family = ip_tunnel_info_af(info);
if (remote_ip.sa.sa_family == AF_INET) {
remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst;
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index a5e7035fb93f..d8afe4400373 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -222,6 +222,25 @@ static inline unsigned short ip_tunnel_info_af(const 
struct ip_tunnel_info
return tun_info->mode & IP_TUNNEL_INFO_IPV6 ? AF_INET6 : AF_INET;
 }
 
+static inline __be64 key32_to_tunnel_id(__be32 key)
+{
+#ifdef __BIG_ENDIAN
+   return (__force __be64)((__force u32)key);
+#else
+   return (__force __be64)((__force u64)key << 32);
+#endif
+}
+
+/* Returns the least-significant 32 bits of a __be64. */
+static inline __be32 tunnel_id_to_key32(__be64 x)
+{
+#ifdef __BIG_ENDIAN
+   return (__force __be32)x;
+#else
+   return (__force __be32)((__force u64)x >> 32);
+#endif
+}
+
 #ifdef CONFIG_INET
 
 int ip_tunnel_init(struct net_device *dev);
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index b96d0360c095..0255613a54a4 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -350,24 +350,6 @@ static inline __be32 vxlan_vni_field(__be32 vni)
 #endif
 }
 
-static inline __be32 vxlan_tun_id_to_vni(__be64 tun_id)
-{
-#if defined(__BIG_ENDIAN)
-   return (__force __be32)tun_id;
-#else
-   return (__force __be32)((__force u64)tun_id >> 32);
-#endif
-}
-
-static inline __be64 vxlan_vni_to_tun_id(__be32 vni)
-{
-#if defined(__BIG_ENDIAN)
-   return (__force __be64)vni;
-#else
-   return (__force __be64)((u64)(__force u32)vni << 32);
-#endif
-}
-
 static inline size_t vxlan_rco_start(__be32 vni_field)
 {
return be32_to_cpu(vni_field & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 113cc43df789..576f705d8180 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -246,25 +246,6 @@ static void gre_err(struct sk_buff *skb, u32 info)
ipgre_err(skb, info, );
 }
 
-static __be64 key_to_tunnel_id(__be32 key)
-{
-#ifdef __BIG_ENDIAN
-   return (__force __be64)((__force u32)key);
-#else
-   return (__force __be64)((__force u64)key << 32);
-#endif
-}
-
-/* Returns the least-significant 32 bits of a __be64. */
-static __be32 tunnel_id_to_key(__be64 x)
-{
-#ifdef __BIG_ENDIAN
-   return (__force __be32)x;
-#else
-   return (__force __be32)((__force u64)x >> 32);
-#endif
-}
-
 static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
   struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
 {
@@ -290,7 +271,7 @@ static int __ipgre_rcv(struct sk_buff *skb, const struct 
tnl_ptk_info *tpi,
__be64 tun_id;
 
flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
-   tun_id = key_to_tunnel_id(tpi->key);
+   tun_id = key32_to_tunnel_id(tpi->key);
tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
if (!tun_dst)
return PACKET_REJECT;
@@ -446,7 +427,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct 
net_device *dev,
 
flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
gre_build_header(skb, tunnel_hlen, flags, proto,
-tunnel_id_to_key(tun_info->key

[PATCH net-next 2/3] net/sched: cls_flower: Classify packet in ip tunnels

2016-08-22 Thread Amir Vadai
Introduce classifying by metadata extracted by the tunnel device.
Outer header fields - source/dest ip and tunnel id, are extracted from
the metadata when classifying.

For example, the following will add a filter on the ingress qdisc of a shared
vxlan device named 'vxlan0', to match packets with outer src ip
11.11.0.2, dst ip 11.11.0.1 and tunnel id 11. The packets will be
forwarded to tap device 'vnet0' (after metadata is released):

$ filter add dev vxlan0 protocol ip parent : \
flower \
  enc_src_ip 11.11.0.2 \
  enc_dst_ip 11.11.0.1 \
  enc_key_id 11 \
  dst_ip 11.11.11.1 \
action iptunnel decap \
action mirred egress redirect dev vnet0

The iptunnel action will be introduced in the next patch in this
series.

Signed-off-by: Amir Vadai <a...@vadai.me>
---
 include/uapi/linux/pkt_cls.h | 11 +
 net/sched/cls_flower.c   | 59 ++--
 2 files changed, 68 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 51b5b247fb5a..f9c287c67eae 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -431,6 +431,17 @@ enum {
TCA_FLOWER_KEY_VLAN_ID,
TCA_FLOWER_KEY_VLAN_PRIO,
TCA_FLOWER_KEY_VLAN_ETH_TYPE,
+
+   TCA_FLOWER_KEY_ENC_KEY_ID,  /* be32 */
+   TCA_FLOWER_KEY_ENC_IPV4_SRC,/* be32 */
+   TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,/* be32 */
+   TCA_FLOWER_KEY_ENC_IPV4_DST,/* be32 */
+   TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,/* be32 */
+   TCA_FLOWER_KEY_ENC_IPV6_SRC,/* struct in6_addr */
+   TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,/* struct in6_addr */
+   TCA_FLOWER_KEY_ENC_IPV6_DST,/* struct in6_addr */
+   TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,/* struct in6_addr */
+
__TCA_FLOWER_MAX,
 };
 
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 1e11e57e6947..75f719944fa8 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -23,6 +23,9 @@
 #include 
 #include 
 
+#include 
+#include 
+
 struct fl_flow_key {
int indev_ifindex;
struct flow_dissector_key_control control;
@@ -35,6 +38,8 @@ struct fl_flow_key {
struct flow_dissector_key_ipv6_addrs ipv6;
};
struct flow_dissector_key_ports tp;
+   struct flow_dissector_key_ipv4_addrs enc_ipv4;
+   struct flow_dissector_key_keyid enc_key_id;
 } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. 
*/
 
 struct fl_flow_mask_range {
@@ -124,11 +129,22 @@ static int fl_classify(struct sk_buff *skb, const struct 
tcf_proto *tp,
struct cls_fl_filter *f;
struct fl_flow_key skb_key;
struct fl_flow_key skb_mkey;
+   struct ip_tunnel_info *info;
 
if (!atomic_read(>ht.nelems))
return -1;
 
fl_clear_masked_range(_key, >mask);
+
+   info = skb_tunnel_info(skb);
+   if (info) {
+   struct ip_tunnel_key *key = >key;
+
+   skb_key.enc_ipv4.src = key->u.ipv4.src;
+   skb_key.enc_ipv4.dst = key->u.ipv4.dst;
+   skb_key.enc_key_id.keyid = tunnel_id_to_key32(key->tun_id);
+   }
+
skb_key.indev_ifindex = skb->skb_iif;
/* skb_flow_dissect() does not set n_proto in case an unknown protocol,
 * so do it rather here.
@@ -297,7 +313,11 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 
1] = {
[TCA_FLOWER_KEY_VLAN_ID]= { .type = NLA_U16 },
[TCA_FLOWER_KEY_VLAN_PRIO]  = { .type = NLA_U8 },
[TCA_FLOWER_KEY_VLAN_ETH_TYPE]  = { .type = NLA_U16 },
-
+   [TCA_FLOWER_KEY_ENC_KEY_ID] = { .type = NLA_U32 },
+   [TCA_FLOWER_KEY_ENC_IPV4_SRC]   = { .type = NLA_U32 },
+   [TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK] = { .type = NLA_U32 },
+   [TCA_FLOWER_KEY_ENC_IPV4_DST]   = { .type = NLA_U32 },
+   [TCA_FLOWER_KEY_ENC_IPV4_DST_MASK] = { .type = NLA_U32 },
 };
 
 static void fl_set_key_val(struct nlattr **tb,
@@ -345,7 +365,6 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
mask->indev_ifindex = 0x;
}
 #endif
-
fl_set_key_val(tb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST,
   mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK,
   sizeof(key->eth.dst));
@@ -408,6 +427,29 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
   sizeof(key->tp.dst));
}
 
+   if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] ||
+   tb[TCA_FLOWER_KEY_ENC_IPV4_DST] ||
+   tb[TCA_FLOWER_KEY_ENC_KEY_ID]) {
+   fl_set_key_val(tb, >enc_ipv4.src,
+  TCA_FLOWER_KEY_ENC_IPV4_SRC,
+  >enc_ipv4.src,
+  TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
+  sizeof(key->enc_ipv4.src)

Re: [RFC net-next 0/2] net/sched: cls_flower, act_mirred: VXLAN redirect using TC

2016-08-15 Thread Amir Vadai
On Mon, Aug 15, 2016 at 02:34:00PM +0200, Jiri Pirko wrote:
> Mon, Aug 15, 2016 at 12:08:10PM CEST, j...@mojatatu.com wrote:
> >On 16-08-15 05:08 AM, Amir Vadai wrote:
> >> On Mon, Aug 15, 2016 at 11:17:40AM +0300, Amir Vadai wrote:
> >> > On Mon, Aug 15, 2016 at 09:11:22AM +0200, Jiri Pirko wrote:
> >> > > Sun, Aug 14, 2016 at 07:53:30PM CEST, xiyou.wangc...@gmail.com wrote:
> >
> >> > 
> >> > Thanks,
> >> > Amir
> >> 
> >> Any objection to the following?
> >> 
> >> # ENCAP rule
> >> tc filter add dev $ETH protocol ip parent : prio 10 \
> >>flower ip_proto 1 \
> >>action set_tunnel_key src_ip 11.11.0.1 dst_ip 11.11.0.2 key_id 
> >> 11 dst_port 4789 \
> >>action mirred egress redirect dev $VXLAN
> >
> >Assuming $VXLAN is actually not a linux netdev of type vxlan?
> >then the action does vxlan encap redirect sends it to the $VXLAN
> >dev with encapsulation in place.
> >Sounds to me like a name like "vxlan" would be more usable. Example:
> 
> I believe those are generic tunelling data
> 
> 
> >
> >tc filter add dev $ETH protocol ip parent : prio 10 ..
> >action vxlan encap src_ip 11.11.0.1 dst_ip 11.11.0.2 key_id 11 
> >action mirred egress redirect dev eth0
> >
> >> 
> >> # DECAP rule
> >> tc filter add dev $VXLAN protocol ip parent : prio 10 \
> >>flower \
> >>enc_src_ip 11.11.0.2 enc_dst_ip 11.11.0.1 enc_key_id 11 
> >> \
> >>ip_proto 1 \
> >>action mirred egress redirect dev $ETH
> >> 
> >
> >And a decap would be of the form:
> >tc filter add dev $ETH protocol ip parent : prio 10 ..
> >action vxlan decap
> 
> That's right. Amir, don't you need decap here to drop the tunnel
> metadata?
Right. will add a decap that will release it.

> 
> 
> >
> >i.e there is no redirect needed here, no?
> >
> >cheers,
> >jamal


Re: [RFC net-next 0/2] net/sched: cls_flower, act_mirred: VXLAN redirect using TC

2016-08-15 Thread Amir Vadai
On Mon, Aug 15, 2016 at 06:41:14AM -0400, Jamal Hadi Salim wrote:
> On 16-08-15 06:24 AM, Shmulik Ladkani wrote:
> > On Mon, 15 Aug 2016 06:08:10 -0400, j...@mojatatu.com wrote:
> 
> > > Assuming $VXLAN is actually not a linux netdev of type vxlan?
> > > then the action does vxlan encap redirect sends it to the $VXLAN
> > > dev with encapsulation in place.
> > 
> > I assume Amir refers to vxlan netdev in VXLAN_F_COLLECT_METADATA mode,
> > using the tun_info metadata found in skb_metadata_dst.
> > The action is supposed to assign the tun metadata.
> > 
> 
> I see - so you let the vxlan netdev do the encap?
> Would it still scale to a _very large_ number of tunnels?
> How many netdevs are you going to use? I am assuming you will hit
> a nasty lock somewhere(qdisc?) if you use only one.
Having a netdev per tunnel is problematic in its memory use [1].
The user can take either approach: have a shared netdev, but with
some contention on the qdisc lock, or create a vxlan dev per VNI
and increase memory use.
Once offloading is added, a shared netdev will enjoy the best of both
worlds - low memory use and no lock contention.


[1] - 
http://www.netdevconf.org/1.1/proceedings/slides/ahern-aleksandrov-prabhu-scaling-network-cumulus.pdf

> 
> cheers,
> jamal


Re: [RFC net-next 0/2] net/sched: cls_flower, act_mirred: VXLAN redirect using TC

2016-08-15 Thread Amir Vadai
On Mon, Aug 15, 2016 at 12:50:39PM +0300, Shmulik Ladkani wrote:
> On Mon, 15 Aug 2016 12:08:04 +0300, a...@vadai.me wrote:
> > 
> > Any objection to the following?
> > 
> > # ENCAP rule
> > tc filter add dev $ETH protocol ip parent : prio 10 \
> > flower ip_proto 1 \
> > action set_tunnel_key src_ip 11.11.0.1 dst_ip 11.11.0.2 key_id 
> > 11 dst_port 4789 \
> 
> Ability to control few tun_flags (e.g. TUNNEL_CSUM, TUNNEL_DONT_FRAGMENT)
> might be useful too.
I guess it should be added when needed. Currently I don't have a use case
for that.

> 
> > # DECAP rule
> > tc filter add dev $VXLAN protocol ip parent : prio 10 \
> > flower \
> > enc_src_ip 11.11.0.2 enc_dst_ip 11.11.0.1 enc_key_id 11 
> > \
> > ip_proto 1 \
> 
> You might want to match the tunnel's udp port as well, for symmetry.
actually, now that you raise it, the udp port is already an attribute of
the vxlan device. So I think it should be omitted in both encap and
decap. Selecting the udp port will be done when creating the vxlan
device.

Thanks,
Amir



Re: [RFC net-next 0/2] net/sched: cls_flower, act_mirred: VXLAN redirect using TC

2016-08-15 Thread Amir Vadai
On Mon, Aug 15, 2016 at 11:17:40AM +0300, Amir Vadai wrote:
> On Mon, Aug 15, 2016 at 09:11:22AM +0200, Jiri Pirko wrote:
> > Sun, Aug 14, 2016 at 07:53:30PM CEST, xiyou.wangc...@gmail.com wrote:
> > >On Sun, Aug 14, 2016 at 7:06 AM, Amir Vadai <a...@vadai.me> wrote:
> > >> tc qdisc add dev $ETH ingress
> > >>
> > >> # ENCAP rule for ARP
> > >> tc filter add dev $ETH protocol 0x806 parent : prio 11 \
> > >> flower \
> > >> action mirred egress redirect dev $VXLAN enc_src_ip 
> > >> 11.11.0.1 enc_dst_ip 11.11.0.2 enc_key_id 11 enc_dst_port 4789
> > >>
> > >> # ENCAP rule for ICMP
> > >> tc filter add dev $ETH protocol ip parent : prio 10 \
> > >> flower ip_proto 1 \
> > >> action mirred egress redirect dev $VXLAN enc_src_ip 
> > >> 11.11.0.1 enc_dst_ip 11.11.0.2 enc_key_id 11 enc_dst_port 4789
> > >>
> > >
> > >I don't like this. This makes mirred action unnecessarily
> > >complex, it should really just mirror or redirect packets as
> > >it is, why it should be aware of tunnel information?
> > >
> > >I think you probably need to introduce a new tc action
> > >for these tunnel information and pipe it to mirred.
> > 
> > that is the first thing that I thinked of when I saw the patch. I think
> > you can introduce act_vxlan similar to act_vlan.
> 
> introducing a new action was the first thing I thought of, but it felt
> problematic because the actual encap is done by the redirection to the
> vxlan device. This action is only responsible to supply the metadata and
> work tightly with the mirred. It is not exactly like vlan that the
> push/pop actions can live without mirroring/redirecting.
> But still as all of you said, it makes mirred complex with stuff that
> shouldn't be there. And between the two options it is better to
> introduce a new action.
> 
> I will go in this direction.
> 
> Thanks,
> Amir

Any objection to the following?

# ENCAP rule
tc filter add dev $ETH protocol ip parent ffff: prio 10 \
flower ip_proto 1 \
action set_tunnel_key src_ip 11.11.0.1 dst_ip 11.11.0.2 key_id 
11 dst_port 4789 \
action mirred egress redirect dev $VXLAN 

# DECAP rule
tc filter add dev $VXLAN protocol ip parent ffff: prio 10 \
flower \
enc_src_ip 11.11.0.2 enc_dst_ip 11.11.0.1 enc_key_id 11 
\
ip_proto 1 \
action mirred egress redirect dev $ETH


Re: [RFC net-next 0/2] net/sched: cls_flower, act_mirred: VXLAN redirect using TC

2016-08-15 Thread Amir Vadai
On Mon, Aug 15, 2016 at 09:11:22AM +0200, Jiri Pirko wrote:
> Sun, Aug 14, 2016 at 07:53:30PM CEST, xiyou.wangc...@gmail.com wrote:
> >On Sun, Aug 14, 2016 at 7:06 AM, Amir Vadai <a...@vadai.me> wrote:
> >> tc qdisc add dev $ETH ingress
> >>
> >> # ENCAP rule for ARP
> >> tc filter add dev $ETH protocol 0x806 parent : prio 11 \
> >> flower \
> >> action mirred egress redirect dev $VXLAN enc_src_ip 
> >> 11.11.0.1 enc_dst_ip 11.11.0.2 enc_key_id 11 enc_dst_port 4789
> >>
> >> # ENCAP rule for ICMP
> >> tc filter add dev $ETH protocol ip parent : prio 10 \
> >> flower ip_proto 1 \
> >> action mirred egress redirect dev $VXLAN enc_src_ip 
> >> 11.11.0.1 enc_dst_ip 11.11.0.2 enc_key_id 11 enc_dst_port 4789
> >>
> >
> >I don't like this. This makes mirred action unnecessarily
> >complex, it should really just mirror or redirect packets as
> >it is, why it should be aware of tunnel information?
> >
> >I think you probably need to introduce a new tc action
> >for these tunnel information and pipe it to mirred.
> 
> that is the first thing that I thinked of when I saw the patch. I think
> you can introduce act_vxlan similar to act_vlan.

introducing a new action was the first thing I thought of, but it felt
problematic because the actual encap is done by the redirection to the
vxlan device. This action is only responsible to supply the metadata and
work tightly with the mirred. It is not exactly like vlan that the
push/pop actions can live without mirroring/redirecting.
But still as all of you said, it makes mirred complex with stuff that
shouldn't be there. And between the two options it is better to
introduce a new action.

I will go in this direction.

Thanks,
Amir


[RFC net-next 2/2] net/sched: act_mirred: Introduce vxlan support

2016-08-14 Thread Amir Vadai
From: Amir Vadai <ami...@mellanox.com>

Signed-off-by: Amir Vadai <ami...@mellanox.com>
---
 include/net/tc_act/tc_mirred.h|  5 +++
 include/uapi/linux/tc_act/tc_mirred.h |  7 
 net/sched/act_mirred.c| 79 +++
 3 files changed, 91 insertions(+)

diff --git a/include/net/tc_act/tc_mirred.h b/include/net/tc_act/tc_mirred.h
index 62770add15bd..43704c5550ab 100644
--- a/include/net/tc_act/tc_mirred.h
+++ b/include/net/tc_act/tc_mirred.h
@@ -11,6 +11,11 @@ struct tcf_mirred {
int tcfm_ok_push;
struct net_device __rcu *tcfm_dev;
struct list_headtcfm_list;
+   struct metadata_dst *tun_dst;
+   __be32  tcf_enc_saddr;
+   __be32  tcf_enc_daddr;
+   __be32  tcf_enc_key_id;
+   __be16  tcf_enc_port;
 };
 #define to_mirred(a) ((struct tcf_mirred *)a)
 
diff --git a/include/uapi/linux/tc_act/tc_mirred.h 
b/include/uapi/linux/tc_act/tc_mirred.h
index 3d7a2b352a62..89ae754d8f5e 100644
--- a/include/uapi/linux/tc_act/tc_mirred.h
+++ b/include/uapi/linux/tc_act/tc_mirred.h
@@ -21,6 +21,13 @@ enum {
TCA_MIRRED_TM,
TCA_MIRRED_PARMS,
TCA_MIRRED_PAD,
+
+   TCA_MIRRED_ENC_IPV4_SRC,/* be32 */
+   TCA_MIRRED_ENC_IPV4_DST,/* be32 */
+   TCA_MIRRED_ENC_IPV6_SRC,/* struct in6_addr */
+   TCA_MIRRED_ENC_IPV6_DST,/* struct in6_addr */
+   TCA_MIRRED_ENC_KEY_ID,  /* be32 */
+   TCA_MIRRED_ENC_DST_PORT,/* be16 */
__TCA_MIRRED_MAX
 };
 #define TCA_MIRRED_MAX (__TCA_MIRRED_MAX - 1)
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 6038c85d92f5..3aff8d8b2744 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -26,6 +26,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 
 #include 
 
@@ -38,6 +41,11 @@ static void tcf_mirred_release(struct tc_action *a, int bind)
struct tcf_mirred *m = to_mirred(a);
struct net_device *dev;
 
+if (m->tun_dst) {
+printk("%s:%d - releasing dst: %p\n", __func__, __LINE__, 
m->tun_dst);
+dst_release((struct dst_entry *)m->tun_dst);
+}
+
/* We could be called either in a RCU callback or with RTNL lock held. 
*/
spin_lock_bh(_list_lock);
list_del(>tcfm_list);
@@ -49,11 +57,67 @@ static void tcf_mirred_release(struct tc_action *a, int 
bind)
 
 static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = {
[TCA_MIRRED_PARMS]  = { .len = sizeof(struct tc_mirred) },
+   [TCA_MIRRED_ENC_IPV4_SRC]   = { .type = NLA_U32 },
+   [TCA_MIRRED_ENC_IPV4_DST]   = { .type = NLA_U32 },
+   [TCA_MIRRED_ENC_KEY_ID] = { .type = NLA_U32 },
+   [TCA_MIRRED_ENC_DST_PORT]   = { .type = NLA_U16 },
 };
 
 static int mirred_net_id;
 static struct tc_action_ops act_mirred_ops;
 
+static int tunnel_alloc(struct tcf_mirred *m, struct nlattr **tb)
+{
+   struct ip_tunnel_info *tun_info;
+   struct metadata_dst *tun_dst;
+struct vxlan_metadata md = { 0 };
+u8 tos = 0;
+u8 ttl = 0;
+__be16 tun_flags = TUNNEL_VXLAN_OPT;
+int err;
+
+   m->tcf_enc_saddr = nla_get_be32(tb[TCA_MIRRED_ENC_IPV4_SRC]);
+   m->tcf_enc_daddr = nla_get_be32(tb[TCA_MIRRED_ENC_IPV4_DST]);
+   m->tcf_enc_key_id = nla_get_be32(tb[TCA_MIRRED_ENC_KEY_ID]);
+   m->tcf_enc_port = nla_get_be32(tb[TCA_MIRRED_ENC_DST_PORT]);
+
+   if (!m->tcf_enc_saddr || !m->tcf_enc_daddr ||
+   !m->tcf_enc_key_id || !m->tcf_enc_port)
+   return 0;
+
+   tun_dst = metadata_dst_alloc(sizeof(md), GFP_KERNEL);
+   if (!tun_dst)
+   return -ENOMEM;
+printk("%s:%d allocated dst: %p\n", __func__, __LINE__, tun_dst);
+
+   printk("%s:%d mirred vxlan saddr: %pI4 daddr: %pI4 key_id: %d port: 
%d\n",
+  __func__, __LINE__,
+  >tcf_enc_saddr, >tcf_enc_daddr,
+  be32_to_cpu(m->tcf_enc_key_id), be16_to_cpu(m->tcf_enc_port));
+
+   err = dst_cache_init(_dst->u.tun_info.dst_cache, GFP_KERNEL);
+   if (err) {
+   dst_release((struct dst_entry *)tun_dst);
+   return err;
+   }
+
+   tun_info = _dst->u.tun_info;
+   tun_info->mode = IP_TUNNEL_INFO_TX;
+
+ip_tunnel_key_init(_info->key,
+   m->tcf_enc_saddr, m->tcf_enc_daddr,
+  tos, ttl,
+   0, 0,
+  m->tcf_enc_port,
+  vxlan_vni_to_tun_id(m->tcf_enc_key_id),
+  tun_flags);
+ip_tunnel_info_opts_set(tun_info, , sizeof(md));
+
+   m->tun_dst = tun_dst;
+
+return 0;
+}
+
 static int tcf_mi

[RFC net-next 1/2] net/sched: cls_flower: Introduce classify by vxlan outer headers

2016-08-14 Thread Amir Vadai
From: Amir Vadai <ami...@mellanox.com>

Signed-off-by: Amir Vadai <ami...@mellanox.com>
---
 include/uapi/linux/pkt_cls.h | 11 +
 net/sched/cls_flower.c   | 53 
 2 files changed, 64 insertions(+)

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index d1c1ccaba787..a192195a5516 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -428,6 +428,17 @@ enum {
TCA_FLOWER_KEY_UDP_DST, /* be16 */
 
TCA_FLOWER_FLAGS,
+
+   TCA_FLOWER_KEY_ENC_IPV4_SRC,/* be32 */
+   TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,/* be32 */
+   TCA_FLOWER_KEY_ENC_IPV4_DST,/* be32 */
+   TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,/* be32 */
+   TCA_FLOWER_KEY_ENC_IPV6_SRC,/* struct in6_addr */
+   TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,   /* struct in6_addr */
+   TCA_FLOWER_KEY_ENC_IPV6_DST,/* struct in6_addr */
+   TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,   /* struct in6_addr */
+   TCA_FLOWER_KEY_ENC_KEY_ID,  /* be32 */
+
__TCA_FLOWER_MAX,
 };
 
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 5060801a2f6d..26436dd34e21 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -23,12 +23,18 @@
 #include 
 #include 
 
+#include 
+#include 
+#include 
+
 struct fl_flow_key {
int indev_ifindex;
struct flow_dissector_key_control control;
struct flow_dissector_key_basic basic;
struct flow_dissector_key_eth_addrs eth;
struct flow_dissector_key_addrs ipaddrs;
+   struct flow_dissector_key_ipv4_addrs enc_ipv4;
+   struct flow_dissector_key_keyid enc_key_id;
union {
struct flow_dissector_key_ipv4_addrs ipv4;
struct flow_dissector_key_ipv6_addrs ipv6;
@@ -123,11 +129,27 @@ static int fl_classify(struct sk_buff *skb, const struct 
tcf_proto *tp,
struct cls_fl_filter *f;
struct fl_flow_key skb_key;
struct fl_flow_key skb_mkey;
+   struct ip_tunnel_info *info;
 
if (!atomic_read(>ht.nelems))
return -1;
 
fl_clear_masked_range(_key, >mask);
+
+   info = skb_tunnel_info(skb);
+   if (info) {
+   struct ip_tunnel_key *key = >key;
+   netdev_err(skb->dev, "%s:%d saddr: %pI4, daddr: %pI4 vni: %d 
tos: %#x ttl: %#x src_port: %d dst_port: %d\n",
+ __func__, __LINE__,
+ >u.ipv4.src, >u.ipv4.dst,
+ be32_to_cpu(vxlan_tun_id_to_vni(key->tun_id)),
+ key->tos, key->ttl,
+ ntohs(key->tp_src), ntohs(key->tp_dst));
+   skb_key.enc_ipv4.src = key->u.ipv4.src;
+   skb_key.enc_ipv4.dst = key->u.ipv4.dst;
+   skb_key.enc_key_id.keyid = vxlan_tun_id_to_vni(key->tun_id);
+   }
+
skb_key.indev_ifindex = skb->skb_iif;
/* skb_flow_dissect() does not set n_proto in case an unknown protocol,
 * so do it rather here.
@@ -293,6 +315,12 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 
1] = {
[TCA_FLOWER_KEY_TCP_DST]= { .type = NLA_U16 },
[TCA_FLOWER_KEY_UDP_SRC]= { .type = NLA_U16 },
[TCA_FLOWER_KEY_UDP_DST]= { .type = NLA_U16 },
+
+   [TCA_FLOWER_KEY_ENC_IPV4_SRC]   = { .type = NLA_U32 },
+   [TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK] = { .type = NLA_U32 },
+   [TCA_FLOWER_KEY_ENC_IPV4_DST]   = { .type = NLA_U32 },
+   [TCA_FLOWER_KEY_ENC_IPV4_DST_MASK] = { .type = NLA_U32 },
+   [TCA_FLOWER_KEY_ENC_KEY_ID] = { .type = NLA_U32 },
 };
 
 static void fl_set_key_val(struct nlattr **tb,
@@ -373,6 +401,20 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
   sizeof(key->tp.dst));
}
 
+   if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] ||
+   tb[TCA_FLOWER_KEY_ENC_IPV4_DST] ||
+   tb[TCA_FLOWER_KEY_ENC_KEY_ID]) {
+   fl_set_key_val(tb, >enc_ipv4.src, 
TCA_FLOWER_KEY_ENC_IPV4_SRC,
+  >enc_ipv4.src, 
TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
+  sizeof(key->enc_ipv4.src));
+   fl_set_key_val(tb, >enc_ipv4.dst, 
TCA_FLOWER_KEY_ENC_IPV4_DST,
+  >enc_ipv4.dst, 
TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,
+  sizeof(key->enc_ipv4.dst));
+   fl_set_key_val(tb, >enc_key_id, TCA_FLOWER_KEY_ENC_KEY_ID,
+  >enc_key_id, TCA_FLOWER_KEY_ENC_KEY_ID,
+  sizeof(key->enc_key_id));
+   }
+
return 0;
 }
 
@@ -753,6 +795,17 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, 
unsigned long fh,
  sizeof(key->tp.dst
  

[RFC net-next 0/2] net/sched: cls_flower, act_mirred: VXLAN redirect using TC

2016-08-14 Thread Amir Vadai
From: Amir Vadai <ami...@mellanox.com>

Hi,

I would like to make it possible to manage VXLAN encap/decap using the flower
classifier, mirred action and vxlan device.
In order to make the solution scaleable, I'm using a shared vxlan device, with
encapsulation information packed in the metadata - by the mirred action in the
encap flow, and used in the decap flow, by the flower classifier.

For example for virt use case:
# [uplink NIC] --{cls_flower & mirred}--> [vxlan dev] --{udp/ip stack}--> [tap]
# [tap dev] --{udp/ip stack}--> [vxlan dev] --{cls_flower & mirred}--> [uplink 
NIC]
# In the example, vxlan tunnel ip's are 11.11.11.* and the real devices ip's
# are: 11.11.0.*

ip link add $VXLAN type vxlan dstport 4789 external

ifconfig $VXLAN up

tc qdisc add dev $ETH ingress

# ENCAP rule for ARP
tc filter add dev $ETH protocol 0x806 parent ffff: prio 11 \
flower \
action mirred egress redirect dev $VXLAN enc_src_ip 11.11.0.1 
enc_dst_ip 11.11.0.2 enc_key_id 11 enc_dst_port 4789

# ENCAP rule for ICMP
tc filter add dev $ETH protocol ip parent ffff: prio 10 \
flower ip_proto 1 \
action mirred egress redirect dev $VXLAN enc_src_ip 11.11.0.1 
enc_dst_ip 11.11.0.2 enc_key_id 11 enc_dst_port 4789

tc qdisc add dev $VXLAN ingress

# DECAP rule for ARP
tc filter add dev $VXLAN protocol 0x806 parent ffff: prio 11 \
flower enc_src_ip 11.11.0.2 enc_dst_ip 11.11.0.1 enc_key_id 11 \
action mirred egress redirect dev $ETH

# DECAP rule for ICMP
tc filter add dev $VXLAN protocol ip parent ffff: prio 10 \
flower enc_src_ip 11.11.0.2 enc_dst_ip 11.11.0.1 enc_key_id 11 \
action mirred egress redirect dev $ETH

Next step will be to enable offloading of those rules.

Following two patches to cls_flower and act_mirred were used to validate and
test this approach, and supplied to make things clearer, they will be modified
before the actual submission.

Thanks,
Amir

Amir Vadai (2):
  net/sched: cls_flower: Introduce classify by vxlan outer headers
  net/sched: act_mirred: Introduce vxlan support

 include/net/tc_act/tc_mirred.h|  5 +++
 include/uapi/linux/pkt_cls.h  | 11 +
 include/uapi/linux/tc_act/tc_mirred.h |  7 
 net/sched/act_mirred.c| 79 +++
 net/sched/cls_flower.c| 53 +++
 5 files changed, 155 insertions(+)

-- 
2.9.0



Re: [Patch net 5/5] net_sched: convert tcf_exts from list to flex_array

2016-08-09 Thread Amir Vadai
On Mon, Aug 8, 2016 at 11:46 PM, Cong Wang  wrote:
> As pointed out by Jamal, an action could be shared by
> multiple filters, so we can't use list to chain them
> any more after we get rid of the original tc_action.
> Instead, we could just save pointers to these actions
> in tcf_exts, since they are refcount'ed, so convert
> the list to a flex array.
>
> The ugly part is the action API still accepts list
> as a parameter, I just introduce a helper function to
> convert the flex array of pointers to a list.
>
> Fixes: a85a970af265 ("net_sched: move tc_action into tcf_common")
> Reported-by: Jamal Hadi Salim 
> Cc: Jamal Hadi Salim 
> Signed-off-by: Cong Wang 
> ---

[...]

> -#define tc_single_action(_exts) \
> -   (list_is_singular(&(_exts)->actions))
> +#define tc_no_actions(_exts)  (&(_exts)->nr_actions == 0)
> +#define tc_single_action(_exts) (&(_exts)->nr_actions == 1)

Should remove the '&' here.

Amir

[...]


Re: [Patch net 3/5] net_sched: fix a typo in tc_for_each_action()

2016-08-09 Thread Amir Vadai
On Mon, Aug 08, 2016 at 01:46:47PM -0700, Cong Wang wrote:
> It is harmless because all users pass 'a' to this macro.
> 
> Fixes: 00175aec941e ("net/sched: Macro instead of CONFIG_NET_CLS_ACT ifdef")
> Cc: Amir Vadai <a...@vadai.me>
> Signed-off-by: Cong Wang <xiyou.wangc...@gmail.com>
> ---

Acked-by: Amir Vadai <a...@vadai.me>

Thanks Cong.



[PATCH iproute2] tc: flower: Add skip_{hw|sw} support

2016-07-04 Thread Amir Vadai
From: Amir Vadai <ami...@mellanox.com>

On devices that support TC flower offloads, these flags enable a filter to be
added only to HW or only to SW. skip_sw and skip_hw are mutually exclusive
flags. By default without any flags, the filter is added to both HW and SW,
but no error checks are done in case of failure to add to HW.
With skip_sw, failure to add to HW is treated as an error.

Here is a sample script that adds 2 filters, one with skip_sw and the other
with skip_hw flag.

   # add ingress qdisc
   tc qdisc add dev enp0s9 ingress

   # enable hw tc offload.
   ethtool -K enp0s9 hw-tc-offload on

   # add a flower filter with skip-sw flag.
   tc filter add dev enp0s9 protocol ip parent : flower \
   ip_proto 1 indev enp0s9 skip_sw \
   action drop

   # add a flower filter with skip-hw flag.
   tc filter add dev enp0s9 protocol ip parent : flower \
   ip_proto 3 indev enp0s9 skip_hw \
   action drop

Signed-off-by: Amir Vadai <ami...@mellanox.com>
---
 man/man8/tc-flower.8 | 11 ++-
 tc/f_flower.c| 17 +
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/man/man8/tc-flower.8 b/man/man8/tc-flower.8
index df4d8e1..9ae10e6 100644
--- a/man/man8/tc-flower.8
+++ b/man/man8/tc-flower.8
@@ -18,7 +18,9 @@ flower \- flow based traffic control filter
 .ti -8
 .IR MATCH " := { "
 .B indev
-.IR ifname " | { "
+.IR ifname " | "
+.BR skip_sw " | " skip_hw
+.R " | { "
 .BR dst_mac " | " src_mac " } "
 .IR mac_address " | "
 .BR eth_type " { " ipv4 " | " ipv6 " | "
@@ -55,6 +57,13 @@ is the name of an interface which must exist at the time of
 .B tc
 invocation.
 .TP
+.BI skip_sw
+Do not process filter by software. If hardware has no offload support for this
+filter, or TC offload is not enabled for the interface, operation will fail.
+.TP
+.BI skip_hw
+Do not process filter by hardware.
+.TP
 .BI dst_mac " mac_address"
 .TQ
 .BI src_mac " mac_address"
diff --git a/tc/f_flower.c b/tc/f_flower.c
index fd2014b..7b46ceb 100644
--- a/tc/f_flower.c
+++ b/tc/f_flower.c
@@ -25,6 +25,7 @@
 static void explain(void)
 {
fprintf(stderr, "Usage: ... flower [ MATCH-LIST ]\n");
+   fprintf(stderr, "  [ skip_sw | skip_hw ]\n");
fprintf(stderr, "  [ action ACTION-SPEC ] [ classid 
CLASSID ]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Where: MATCH-LIST := [ MATCH-LIST ] MATCH\n");
@@ -167,6 +168,7 @@ static int flower_parse_opt(struct filter_util *qu, char 
*handle,
struct rtattr *tail;
__be16 eth_type = TC_H_MIN(t->tcm_info);
__u8 ip_proto = 0xff;
+   __u32 flags = 0;
 
if (handle) {
ret = get_u32(>tcm_handle, handle, 0);
@@ -196,6 +198,10 @@ static int flower_parse_opt(struct filter_util *qu, char 
*handle,
return -1;
}
addattr_l(n, MAX_MSG, TCA_FLOWER_CLASSID, , 4);
+   } else if (matches(*argv, "skip_hw") == 0) {
+   flags |= TCA_CLS_FLAGS_SKIP_HW;
+   } else if (matches(*argv, "skip_sw") == 0) {
+   flags |= TCA_CLS_FLAGS_SKIP_SW;
} else if (matches(*argv, "indev") == 0) {
char ifname[IFNAMSIZ];
 
@@ -294,6 +300,8 @@ static int flower_parse_opt(struct filter_util *qu, char 
*handle,
}
 
 parse_done:
+   addattr32(n, MAX_MSG, TCA_FLOWER_FLAGS, flags);
+
ret = addattr16(n, MAX_MSG, TCA_FLOWER_KEY_ETH_TYPE, eth_type);
if (ret) {
fprintf(stderr, "Illegal \"eth_type\"(0x%x)\n",
@@ -498,6 +506,15 @@ static int flower_print_opt(struct filter_util *qu, FILE 
*f,
  tb[TCA_FLOWER_KEY_TCP_SRC],
  tb[TCA_FLOWER_KEY_UDP_SRC]);
 
+   if (tb[TCA_FLOWER_FLAGS])  {
+   __u32 flags = rta_getattr_u32(tb[TCA_FLOWER_FLAGS]);
+
+   if (flags & TCA_CLS_FLAGS_SKIP_HW)
+   fprintf(f, "\n  skip_hw");
+   if (flags & TCA_CLS_FLAGS_SKIP_SW)
+   fprintf(f, "\n  skip_sw");
+   }
+
if (tb[TCA_FLOWER_ACT]) {
tc_print_action(f, tb[TCA_FLOWER_ACT]);
}
-- 
2.9.0



[PATCH net-next] net/sched: flower: Return error when hw can't offload and skip_sw is set

2016-06-13 Thread Amir Vadai
From: Amir Vadai <ami...@mellanox.com>

When skip_sw is set and hardware fails to apply filter, return error to
user. This will make error propagation logic similar to the one
currently used in u32 classifier.
Also, changed code to use tc_skip_sw() utility function.

Signed-off-by: Amir Vadai <ami...@mellanox.com>
---
 net/sched/cls_flower.c | 42 +-
 1 file changed, 25 insertions(+), 17 deletions(-)

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 1ea6f76..5060801 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -140,7 +140,7 @@ static int fl_classify(struct sk_buff *skb, const struct 
tcf_proto *tp,
f = rhashtable_lookup_fast(>ht,
   fl_key_get_start(_mkey, >mask),
   head->ht_params);
-   if (f && !(f->flags & TCA_CLS_FLAGS_SKIP_SW)) {
+   if (f && !tc_skip_sw(f->flags)) {
*res = f->res;
return tcf_exts_exec(skb, >exts, res);
}
@@ -187,19 +187,20 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, 
unsigned long cookie)
dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, );
 }
 
-static void fl_hw_replace_filter(struct tcf_proto *tp,
-struct flow_dissector *dissector,
-struct fl_flow_key *mask,
-struct fl_flow_key *key,
-struct tcf_exts *actions,
-unsigned long cookie, u32 flags)
+static int fl_hw_replace_filter(struct tcf_proto *tp,
+   struct flow_dissector *dissector,
+   struct fl_flow_key *mask,
+   struct fl_flow_key *key,
+   struct tcf_exts *actions,
+   unsigned long cookie, u32 flags)
 {
struct net_device *dev = tp->q->dev_queue->dev;
struct tc_cls_flower_offload offload = {0};
struct tc_to_netdev tc;
+   int err;
 
if (!tc_should_offload(dev, tp, flags))
-   return;
+   return tc_skip_sw(flags) ? -EINVAL : 0;
 
offload.command = TC_CLSFLOWER_REPLACE;
offload.cookie = cookie;
@@ -211,7 +212,12 @@ static void fl_hw_replace_filter(struct tcf_proto *tp,
tc.type = TC_SETUP_CLSFLOWER;
tc.cls_flower = 
 
-   dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, );
+   err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, 
);
+
+   if (tc_skip_sw(flags))
+   return err;
+
+   return 0;
 }
 
 static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
@@ -572,20 +578,22 @@ static int fl_change(struct net *net, struct sk_buff 
*in_skb,
if (err)
goto errout;
 
-   if (!(fnew->flags & TCA_CLS_FLAGS_SKIP_SW)) {
+   if (!tc_skip_sw(fnew->flags)) {
err = rhashtable_insert_fast(>ht, >ht_node,
 head->ht_params);
if (err)
goto errout;
}
 
-   fl_hw_replace_filter(tp,
->dissector,
-,
->key,
->exts,
-(unsigned long)fnew,
-fnew->flags);
+   err = fl_hw_replace_filter(tp,
+  >dissector,
+  ,
+  >key,
+  >exts,
+  (unsigned long)fnew,
+  fnew->flags);
+   if (err)
+   goto errout;
 
if (fold) {
rhashtable_remove_fast(>ht, >ht_node,
-- 
2.8.3



Re: [PATCH net-next] net/sched: cls_flower: Introduce support in SKIP SW flag

2016-06-13 Thread Amir Vadai
On Mon, Jun 13, 2016 at 11:58:12AM +0300, Amir Vadai wrote:
> From: Amir Vadai <ami...@mellanox.com>
> 
> In order to make a filter processed only by hardware, skip_sw flag
> should be supplied. This is an addition to the already existing skip_hw
> flag (filter will be processed by software only). If no flag is
> specified, filter will be processed by both software and hardware.
> 
> If only hardware offloaded filters exist, fl_classify() will return
> without doing anything.
> 
> A following userspace patch will be sent once kernel patch is accepted.
> 
> Example:
> 
> tc filter add dev enp0s9 protocol ip prio 20 parent : \
>   flower \
>   ip_proto 6 \
>   indev enp0s9 \
>   skip_sw \
>   action skbedit mark 0x1234
> 
> Signed-off-by: Amir Vadai <ami...@mellanox.com>
> ---

Please ignore this mail - wrong patch sent.

Amir


[PATCH net-next] net/sched: cls_flower: Introduce support in SKIP SW flag

2016-06-13 Thread Amir Vadai
From: Amir Vadai <ami...@mellanox.com>

In order to make a filter processed only by hardware, skip_sw flag
should be supplied. This is an addition to the already existing skip_hw
flag (filter will be processed by software only). If no flag is
specified, filter will be processed by both software and hardware.

If only hardware offloaded filters exist, fl_classify() will return
without doing anything.

A following userspace patch will be sent once kernel patch is accepted.

Example:

tc filter add dev enp0s9 protocol ip prio 20 parent ffff: \
flower \
ip_proto 6 \
indev enp0s9 \
skip_sw \
action skbedit mark 0x1234

Signed-off-by: Amir Vadai <ami...@mellanox.com>
---
 net/sched/cls_flower.c | 31 ++-
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 730aaca..d737492 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -66,6 +66,7 @@ struct cls_fl_filter {
struct fl_flow_key key;
struct list_head list;
u32 handle;
+   u32 flags;
struct rcu_head rcu;
 };
 
@@ -123,6 +124,9 @@ static int fl_classify(struct sk_buff *skb, const struct 
tcf_proto *tp,
struct fl_flow_key skb_key;
struct fl_flow_key skb_mkey;
 
+   if (!atomic_read(>ht.nelems))
+   return -1;
+
fl_clear_masked_range(_key, >mask);
skb_key.indev_ifindex = skb->skb_iif;
/* skb_flow_dissect() does not set n_proto in case an unknown protocol,
@@ -136,7 +140,7 @@ static int fl_classify(struct sk_buff *skb, const struct 
tcf_proto *tp,
f = rhashtable_lookup_fast(>ht,
   fl_key_get_start(_mkey, >mask),
   head->ht_params);
-   if (f) {
+   if (f && !(f->flags & TCA_CLS_FLAGS_SKIP_SW)) {
*res = f->res;
return tcf_exts_exec(skb, >exts, res);
}
@@ -524,7 +528,6 @@ static int fl_change(struct net *net, struct sk_buff 
*in_skb,
struct cls_fl_filter *fnew;
struct nlattr *tb[TCA_FLOWER_MAX + 1];
struct fl_flow_mask mask = {};
-   u32 flags = 0;
int err;
 
if (!tca[TCA_OPTIONS])
@@ -552,8 +555,14 @@ static int fl_change(struct net *net, struct sk_buff 
*in_skb,
}
fnew->handle = handle;
 
-   if (tb[TCA_FLOWER_FLAGS])
-   flags = nla_get_u32(tb[TCA_FLOWER_FLAGS]);
+   if (tb[TCA_FLOWER_FLAGS]) {
+   fnew->flags = nla_get_u32(tb[TCA_FLOWER_FLAGS]);
+
+   if (!tc_flags_valid(fnew->flags)) {
+   err = -EINVAL;
+   goto errout;
+   }
+   }
 
err = fl_set_parms(net, tp, fnew, , base, tb, tca[TCA_RATE], ovr);
if (err)
@@ -563,10 +572,12 @@ static int fl_change(struct net *net, struct sk_buff 
*in_skb,
if (err)
goto errout;
 
-   err = rhashtable_insert_fast(>ht, >ht_node,
-head->ht_params);
-   if (err)
-   goto errout;
+   if (!(fnew->flags & TCA_CLS_FLAGS_SKIP_SW)) {
+   err = rhashtable_insert_fast(>ht, >ht_node,
+head->ht_params);
+   if (err)
+   goto errout;
+   }
 
fl_hw_replace_filter(tp,
 >dissector,
@@ -574,7 +585,7 @@ static int fl_change(struct net *net, struct sk_buff 
*in_skb,
 >key,
 >exts,
 (unsigned long)fnew,
-flags);
+fnew->flags);
 
if (fold) {
rhashtable_remove_fast(>ht, >ht_node,
@@ -734,6 +745,8 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, 
unsigned long fh,
  sizeof(key->tp.dst))))
goto nla_put_failure;
 
+   nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags);
+
if (tcf_exts_dump(skb, &f->exts))
goto nla_put_failure;
 
-- 
2.8.3



Re: [PATCH net-next] net/sched: cls_flower: Introduce support in SKIP SW flag

2016-06-08 Thread Amir Vadai
On Tue, Jun 07, 2016 at 08:37:40AM -0700, John Fastabend wrote:
> On 16-06-05 07:11 AM, Amir Vadai wrote:
> > From: Amir Vadai <ami...@mellanox.com>
> > 
> > In order to make a filter processed only by hardware, skip_sw flag
> > should be supplied. This is an addition to the already existing skip_hw
> > flag (filter will be processed by software only). If no flag is
> > specified, filter will be processed by both software and hardware.
> > 
> > If only hardware offloaded filters exist, fl_classify() will return
> > without doing anything.
> > 
> > A following userspace patch will be sent once kernel patch is accepted.
> > 
> > Example:
> > 
> > tc filter add dev enp0s9 protocol ip prio 20 parent ffff: \
> > flower \
> > ip_proto 6 \
> >     indev enp0s9 \
> > skip_sw \
> > action skbedit mark 0x1234
> > 
> > Signed-off-by: Amir Vadai <ami...@mellanox.com>
> > ---
> 
> 
> 
> Looks good to me. Although we need to do the same error propagation in
> flower that Jakub just added to cls_u32.
Thanks John,
I will send a patch to return an error when adding to hw fails and skip_sw
is set.

> 
> Acked-by: John Fastabend <john.r.fastab...@intel.com>
> 


[PATCH net-next] net/sched: cls_flower: Introduce support in SKIP SW flag

2016-06-05 Thread Amir Vadai
From: Amir Vadai <ami...@mellanox.com>

In order to make a filter processed only by hardware, skip_sw flag
should be supplied. This is an addition to the already existing skip_hw
flag (filter will be processed by software only). If no flag is
specified, filter will be processed by both software and hardware.

If only hardware offloaded filters exist, fl_classify() will return
without doing anything.

A following userspace patch will be sent once kernel patch is accepted.

Example:

tc filter add dev enp0s9 protocol ip prio 20 parent ffff: \
flower \
ip_proto 6 \
indev enp0s9 \
skip_sw \
action skbedit mark 0x1234

Signed-off-by: Amir Vadai <ami...@mellanox.com>
---
 net/sched/cls_flower.c | 31 ++-
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 730aaca..d737492 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -66,6 +66,7 @@ struct cls_fl_filter {
struct fl_flow_key key;
struct list_head list;
u32 handle;
+   u32 flags;
struct rcu_head rcu;
 };
 
@@ -123,6 +124,9 @@ static int fl_classify(struct sk_buff *skb, const struct 
tcf_proto *tp,
struct fl_flow_key skb_key;
struct fl_flow_key skb_mkey;
 
+   if (!atomic_read(&head->ht.nelems))
+   return -1;
+
fl_clear_masked_range(&skb_key, &head->mask);
skb_key.indev_ifindex = skb->skb_iif;
/* skb_flow_dissect() does not set n_proto in case an unknown protocol,
@@ -136,7 +140,7 @@ static int fl_classify(struct sk_buff *skb, const struct 
tcf_proto *tp,
f = rhashtable_lookup_fast(&head->ht,
   fl_key_get_start(&skb_mkey, &head->mask),
   head->ht_params);
-   if (f) {
+   if (f && !(f->flags & TCA_CLS_FLAGS_SKIP_SW)) {
*res = f->res;
return tcf_exts_exec(skb, &f->exts, res);
}
@@ -524,7 +528,6 @@ static int fl_change(struct net *net, struct sk_buff 
*in_skb,
struct cls_fl_filter *fnew;
struct nlattr *tb[TCA_FLOWER_MAX + 1];
struct fl_flow_mask mask = {};
-   u32 flags = 0;
int err;
 
if (!tca[TCA_OPTIONS])
@@ -552,8 +555,14 @@ static int fl_change(struct net *net, struct sk_buff 
*in_skb,
}
fnew->handle = handle;
 
-   if (tb[TCA_FLOWER_FLAGS])
-   flags = nla_get_u32(tb[TCA_FLOWER_FLAGS]);
+   if (tb[TCA_FLOWER_FLAGS]) {
+   fnew->flags = nla_get_u32(tb[TCA_FLOWER_FLAGS]);
+
+   if (!tc_flags_valid(fnew->flags)) {
+   err = -EINVAL;
+   goto errout;
+   }
+   }
 
err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr);
if (err)
@@ -563,10 +572,12 @@ static int fl_change(struct net *net, struct sk_buff 
*in_skb,
if (err)
goto errout;
 
-   err = rhashtable_insert_fast(&head->ht, &fnew->ht_node,
-head->ht_params);
-   if (err)
-   goto errout;
+   if (!(fnew->flags & TCA_CLS_FLAGS_SKIP_SW)) {
+   err = rhashtable_insert_fast(&head->ht, &fnew->ht_node,
+head->ht_params);
+   if (err)
+   goto errout;
+   }
 
fl_hw_replace_filter(tp,
 &head->dissector,
@@ -574,7 +585,7 @@ static int fl_change(struct net *net, struct sk_buff 
*in_skb,
 &fnew->key,
 &fnew->exts,
 (unsigned long)fnew,
-flags);
+fnew->flags);
 
if (fold) {
rhashtable_remove_fast(&head->ht, &fold->ht_node,
@@ -734,6 +745,8 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, 
unsigned long fh,
  sizeof(key->tp.dst))))
goto nla_put_failure;
 
+   nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags);
+
if (tcf_exts_dump(skb, &f->exts))
goto nla_put_failure;
 
-- 
2.8.3



Re: [PATCH] mlx5: avoid unused variable warning

2016-05-18 Thread Amir Vadai
On Wed, May 18, 2016 at 04:21:07PM +0200, Arnd Bergmann wrote:
> When CONFIG_NET_CLS_ACT is disabled, we get a new warning in the mlx5
> ethernet driver because the tc_for_each_action() loop never references
> the iterator:
> 
> mellanox/mlx5/core/en_tc.c: In function 'mlx5e_stats_flower':
> mellanox/mlx5/core/en_tc.c:431:20: error: unused variable 'a' 
> [-Werror=unused-variable]
>   struct tc_action *a;
> 
> This changes the dummy tc_for_each_action() macro by adding a
> cast to void, letting the compiler know that the variable is
> intentionally declared but not used here. I could not come up
> with a nicer workaround, but this seems to do the trick.
> 
> Signed-off-by: Arnd Bergmann <a...@arndb.de>
> Fixes: aad7e08d39bd ("net/mlx5e: Hardware offloaded flower filter statistics 
> support")
> Fixes: 00175aec941e ("net/sched: Macro instead of CONFIG_NET_CLS_ACT ifdef")
> ---
Acked-By: Amir Vadai <a...@vadai.me>

Thanks Arnd.


[PATCH net-next 5/8] net/mlx5_core: Firmware commands to support flow counters

2016-05-13 Thread Amir Vadai
From: Amir Vadai <ami...@mellanox.com>

Getting packet/byte statistics on flows is done through flow counters.
Implement the firmware commands to alloc, free and query flow counters.

Signed-off-by: Amir Vadai <ami...@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c|  6 ++
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 66 
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h |  5 ++
 include/linux/mlx5/mlx5_ifc.h| 99 +++-
 4 files changed, 173 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c 
b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index 63cac84..dcd2df6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -294,6 +294,7 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev 
*dev, u16 op,
case MLX5_CMD_OP_DESTROY_FLOW_TABLE:
case MLX5_CMD_OP_DESTROY_FLOW_GROUP:
case MLX5_CMD_OP_DELETE_FLOW_TABLE_ENTRY:
+   case MLX5_CMD_OP_DEALLOC_FLOW_COUNTER:
return MLX5_CMD_STAT_OK;
 
case MLX5_CMD_OP_QUERY_HCA_CAP:
@@ -395,6 +396,8 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev 
*dev, u16 op,
case MLX5_CMD_OP_QUERY_FLOW_GROUP:
case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY:
case MLX5_CMD_OP_QUERY_FLOW_TABLE_ENTRY:
+   case MLX5_CMD_OP_ALLOC_FLOW_COUNTER:
+   case MLX5_CMD_OP_QUERY_FLOW_COUNTER:
*status = MLX5_DRIVER_STATUS_ABORTED;
*synd = MLX5_DRIVER_SYND;
return -EIO;
@@ -539,6 +542,9 @@ const char *mlx5_command_str(int command)
MLX5_COMMAND_STR_CASE(SET_FLOW_TABLE_ENTRY);
MLX5_COMMAND_STR_CASE(QUERY_FLOW_TABLE_ENTRY);
MLX5_COMMAND_STR_CASE(DELETE_FLOW_TABLE_ENTRY);
+   MLX5_COMMAND_STR_CASE(ALLOC_FLOW_COUNTER);
+   MLX5_COMMAND_STR_CASE(DEALLOC_FLOW_COUNTER);
+   MLX5_COMMAND_STR_CASE(QUERY_FLOW_COUNTER);
default: return "unknown command opcode";
}
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c 
b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index 9797768..ccb63a0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -323,3 +323,69 @@ int mlx5_cmd_delete_fte(struct mlx5_core_dev *dev,
 
return err;
 }
+
+int mlx5_cmd_fc_alloc(struct mlx5_core_dev *dev, u16 *id)
+{
+   u32 in[MLX5_ST_SZ_DW(alloc_flow_counter_in)];
+   u32 out[MLX5_ST_SZ_DW(alloc_flow_counter_out)];
+   int err;
+
+   memset(in, 0, sizeof(in));
+   memset(out, 0, sizeof(out));
+
+   MLX5_SET(alloc_flow_counter_in, in, opcode,
+MLX5_CMD_OP_ALLOC_FLOW_COUNTER);
+
+   err = mlx5_cmd_exec_check_status(dev, in, sizeof(in), out,
+sizeof(out));
+   if (err)
+   return err;
+
+   *id = MLX5_GET(alloc_flow_counter_out, out, flow_counter_id);
+
+   return 0;
+}
+
+int mlx5_cmd_fc_free(struct mlx5_core_dev *dev, u16 id)
+{
+   u32 in[MLX5_ST_SZ_DW(dealloc_flow_counter_in)];
+   u32 out[MLX5_ST_SZ_DW(dealloc_flow_counter_out)];
+
+   memset(in, 0, sizeof(in));
+   memset(out, 0, sizeof(out));
+
+   MLX5_SET(dealloc_flow_counter_in, in, opcode,
+MLX5_CMD_OP_DEALLOC_FLOW_COUNTER);
+   MLX5_SET(dealloc_flow_counter_in, in, flow_counter_id, id);
+
+   return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out,
+ sizeof(out));
+}
+
+int mlx5_cmd_fc_query(struct mlx5_core_dev *dev, u16 id,
+ u64 *packets, u64 *bytes)
+{
+   u32 out[MLX5_ST_SZ_BYTES(query_flow_counter_out) +
+   MLX5_ST_SZ_BYTES(traffic_counter)];
+   u32 in[MLX5_ST_SZ_DW(query_flow_counter_in)];
+   void *stats;
+   int err = 0;
+
+   memset(in, 0, sizeof(in));
+   memset(out, 0, sizeof(out));
+
+   MLX5_SET(query_flow_counter_in, in, opcode,
+MLX5_CMD_OP_QUERY_FLOW_COUNTER);
+   MLX5_SET(query_flow_counter_in, in, op_mod, 0);
+   MLX5_SET(query_flow_counter_in, in, flow_counter_id, id);
+
+   err = mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out));
+   if (err)
+   return err;
+
+   stats = MLX5_ADDR_OF(query_flow_counter_out, out, flow_statistics);
+   *packets = MLX5_GET64(traffic_counter, stats, packets);
+   *bytes = MLX5_GET64(traffic_counter, stats, octets);
+
+   return 0;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h 
b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
index c97b4a0..18c111a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
@@ -70,4 +70,9 @@ int mlx5_cmd_delete_fte(struct mlx5_core_dev *dev,
 
 int mlx5_cmd_update_root_ft(struct mlx5_core_dev *dev,

  1   2   3   >