[iproute2-next PATCH v6] tc: flower: Classify packets based port ranges

2018-11-27 Thread Amritha Nambiar
Added support for filtering based on port ranges.
UAPI changes have been accepted into net-next.

Example:
1. Match on a port range:
-
$ tc filter add dev enp4s0 protocol ip parent :\
  prio 1 flower ip_proto tcp dst_port 20-30 skip_hw\
  action drop

$ tc -s filter show dev enp4s0 parent :
filter protocol ip pref 1 flower chain 0
filter protocol ip pref 1 flower chain 0 handle 0x1
  eth_type ipv4
  ip_proto tcp
  dst_port 20-30
  skip_hw
  not_in_hw
action order 1: gact action drop
 random type none pass val 0
 index 1 ref 1 bind 1 installed 85 sec used 3 sec
Action statistics:
Sent 460 bytes 10 pkt (dropped 10, overlimits 0 requeues 0)
backlog 0b 0p requeues 0

2. Match on IP address and port range:
--
$ tc filter add dev enp4s0 protocol ip parent :\
  prio 1 flower dst_ip 192.168.1.1 ip_proto tcp dst_port 100-200\
  skip_hw action drop

$ tc -s filter show dev enp4s0 parent :
filter protocol ip pref 1 flower chain 0 handle 0x2
  eth_type ipv4
  ip_proto tcp
  dst_ip 192.168.1.1
  dst_port 100-200
  skip_hw
  not_in_hw
action order 1: gact action drop
 random type none pass val 0
 index 2 ref 1 bind 1 installed 58 sec used 2 sec
Action statistics:
Sent 920 bytes 20 pkt (dropped 20, overlimits 0 requeues 0)
backlog 0b 0p requeues 0

v6:
Modified to change json output format as object for sport/dport.

 "dst_port":{
   "start":2000,
   "end":6000
 },
 "src_port":{
   "start":50,
   "end":60
 }

v5:
Simplified some code and used 'sscanf' for parsing. Removed
space in output format.

v4:
Added man updates explaining filtering based on port ranges.
Removed 'range' keyword.

v3:
Modified flower_port_range_attr_type calls.

v2:
Addressed Jiri's comment to sync output format with input

Signed-off-by: Amritha Nambiar 
---
 man/man8/tc-flower.8 |   13 +---
 tc/f_flower.c|   85 +-
 2 files changed, 84 insertions(+), 14 deletions(-)

diff --git a/man/man8/tc-flower.8 b/man/man8/tc-flower.8
index 8be8882..adff41e 100644
--- a/man/man8/tc-flower.8
+++ b/man/man8/tc-flower.8
@@ -56,8 +56,9 @@ flower \- flow based traffic control filter
 .IR MASKED_IP_TTL " | { "
 .BR dst_ip " | " src_ip " } "
 .IR PREFIX " | { "
-.BR dst_port " | " src_port " } "
-.IR port_number " } | "
+.BR dst_port " | " src_port " } { "
+.IR port_number " | "
+.IR min_port_number-max_port_number " } | "
 .B tcp_flags
 .IR MASKED_TCP_FLAGS " | "
 .B type
@@ -220,10 +221,12 @@ must be a valid IPv4 or IPv6 address, depending on the 
\fBprotocol\fR
 option to tc filter, optionally followed by a slash and the prefix length.
 If the prefix is missing, \fBtc\fR assumes a full-length host match.
 .TP
-.BI dst_port " NUMBER"
+.IR \fBdst_port " { "  NUMBER " | " " MIN_VALUE-MAX_VALUE "  }
 .TQ
-.BI src_port " NUMBER"
-Match on layer 4 protocol source or destination port number. Only available for
+.IR \fBsrc_port " { "  NUMBER " | " " MIN_VALUE-MAX_VALUE "  }
+Match on layer 4 protocol source or destination port number. Alternatively, the
+minimum and maximum values can be specified to match on a range of layer 4
+protocol source or destination port numbers. Only available for
 .BR ip_proto " values " udp ", " tcp  " and " sctp
 which have to be specified in beforehand.
 .TP
diff --git a/tc/f_flower.c b/tc/f_flower.c
index 65fca04..c563666 100644
--- a/tc/f_flower.c
+++ b/tc/f_flower.c
@@ -473,24 +473,57 @@ static int flower_port_attr_type(__u8 ip_proto, enum 
flower_endpoint endpoint)
return -1;
 }
 
+static int flower_port_range_attr_type(__u8 ip_proto, enum flower_endpoint 
type,
+  __be16 *min_port_type,
+  __be16 *max_port_type)
+{
+   if (ip_proto == IPPROTO_TCP || ip_proto == IPPROTO_UDP ||
+   ip_proto == IPPROTO_SCTP) {
+   if (type == FLOWER_ENDPOINT_SRC) {
+   *min_port_type = TCA_FLOWER_KEY_PORT_SRC_MIN;
+   *max_port_type = TCA_FLOWER_KEY_PORT_SRC_MAX;
+   } else {
+   *min_port_type = TCA_FLOWER_KEY_PORT_DST_MIN;
+   *max_port_type = TCA_FLOWER_KEY_PORT_DST_MAX;
+   }
+   } else {
+   return -1;
+   }
+   return 0;
+}
+
 static int flower_parse_port(char *str, __u8 ip_proto,
 enum flower_endpoint endpoint,
 struct nlmsghdr *n)
 {
+   __u16 min, max;
int ret;
-   int type;
-   __b

[iproute2-next PATCH v5] tc: flower: Classify packets based port ranges

2018-11-26 Thread Amritha Nambiar
Added support for filtering based on port ranges.
UAPI changes have been accepted into net-next.

Example:
1. Match on a port range:
-
$ tc filter add dev enp4s0 protocol ip parent :\
  prio 1 flower ip_proto tcp dst_port 20-30 skip_hw\
  action drop

$ tc -s filter show dev enp4s0 parent :
filter protocol ip pref 1 flower chain 0
filter protocol ip pref 1 flower chain 0 handle 0x1
  eth_type ipv4
  ip_proto tcp
  dst_port 20-30
  skip_hw
  not_in_hw
action order 1: gact action drop
 random type none pass val 0
 index 1 ref 1 bind 1 installed 85 sec used 3 sec
Action statistics:
Sent 460 bytes 10 pkt (dropped 10, overlimits 0 requeues 0)
backlog 0b 0p requeues 0

2. Match on IP address and port range:
--
$ tc filter add dev enp4s0 protocol ip parent :\
  prio 1 flower dst_ip 192.168.1.1 ip_proto tcp dst_port 100-200\
  skip_hw action drop

$ tc -s filter show dev enp4s0 parent :
filter protocol ip pref 1 flower chain 0 handle 0x2
  eth_type ipv4
  ip_proto tcp
  dst_ip 192.168.1.1
  dst_port 100-200
  skip_hw
  not_in_hw
action order 1: gact action drop
 random type none pass val 0
 index 2 ref 1 bind 1 installed 58 sec used 2 sec
Action statistics:
Sent 920 bytes 20 pkt (dropped 20, overlimits 0 requeues 0)
backlog 0b 0p requeues 0

v5:
Simplified some code and used 'sscanf' for parsing. Removed
space in output format.

v4:
Added man updates explaining filtering based on port ranges.
Removed 'range' keyword.

v3:
Modified flower_port_range_attr_type calls.

v2:
Addressed Jiri's comment to sync output format with input

Signed-off-by: Amritha Nambiar 
---
 man/man8/tc-flower.8 |   13 +---
 tc/f_flower.c|   78 --
 2 files changed, 77 insertions(+), 14 deletions(-)

diff --git a/man/man8/tc-flower.8 b/man/man8/tc-flower.8
index 8be8882..adff41e 100644
--- a/man/man8/tc-flower.8
+++ b/man/man8/tc-flower.8
@@ -56,8 +56,9 @@ flower \- flow based traffic control filter
 .IR MASKED_IP_TTL " | { "
 .BR dst_ip " | " src_ip " } "
 .IR PREFIX " | { "
-.BR dst_port " | " src_port " } "
-.IR port_number " } | "
+.BR dst_port " | " src_port " } { "
+.IR port_number " | "
+.IR min_port_number-max_port_number " } | "
 .B tcp_flags
 .IR MASKED_TCP_FLAGS " | "
 .B type
@@ -220,10 +221,12 @@ must be a valid IPv4 or IPv6 address, depending on the 
\fBprotocol\fR
 option to tc filter, optionally followed by a slash and the prefix length.
 If the prefix is missing, \fBtc\fR assumes a full-length host match.
 .TP
-.BI dst_port " NUMBER"
+.IR \fBdst_port " { "  NUMBER " | " " MIN_VALUE-MAX_VALUE "  }
 .TQ
-.BI src_port " NUMBER"
-Match on layer 4 protocol source or destination port number. Only available for
+.IR \fBsrc_port " { "  NUMBER " | " " MIN_VALUE-MAX_VALUE "  }
+Match on layer 4 protocol source or destination port number. Alternatively, the
+minimum and maximum values can be specified to match on a range of layer 4
+protocol source or destination port numbers. Only available for
 .BR ip_proto " values " udp ", " tcp  " and " sctp
 which have to be specified in beforehand.
 .TP
diff --git a/tc/f_flower.c b/tc/f_flower.c
index 65fca04..9a01b4a 100644
--- a/tc/f_flower.c
+++ b/tc/f_flower.c
@@ -473,24 +473,57 @@ static int flower_port_attr_type(__u8 ip_proto, enum 
flower_endpoint endpoint)
return -1;
 }
 
+static int flower_port_range_attr_type(__u8 ip_proto, enum flower_endpoint 
type,
+  __be16 *min_port_type,
+  __be16 *max_port_type)
+{
+   if (ip_proto == IPPROTO_TCP || ip_proto == IPPROTO_UDP ||
+   ip_proto == IPPROTO_SCTP) {
+   if (type == FLOWER_ENDPOINT_SRC) {
+   *min_port_type = TCA_FLOWER_KEY_PORT_SRC_MIN;
+   *max_port_type = TCA_FLOWER_KEY_PORT_SRC_MAX;
+   } else {
+   *min_port_type = TCA_FLOWER_KEY_PORT_DST_MIN;
+   *max_port_type = TCA_FLOWER_KEY_PORT_DST_MAX;
+   }
+   } else {
+   return -1;
+   }
+   return 0;
+}
+
 static int flower_parse_port(char *str, __u8 ip_proto,
 enum flower_endpoint endpoint,
 struct nlmsghdr *n)
 {
+   __u16 min, max;
int ret;
-   int type;
-   __be16 port;
 
-   type = flower_port_attr_type(ip_proto, endpoint);
-   if (type < 0)
-   return -1;
+   ret = sscanf(str, "%hu-%hu", , );
 
-   ret = get_be16(, str, 10);
-   if (ret)
-   return -1;
+   if

[iproute2-next PATCH v4] tc: flower: Classify packets based port ranges

2018-11-20 Thread Amritha Nambiar
Added support for filtering based on port ranges.
UAPI changes have been accepted into net-next.

Example:
1. Match on a port range:
-
$ tc filter add dev enp4s0 protocol ip parent :\
  prio 1 flower ip_proto tcp dst_port range 20-30 skip_hw\
  action drop

$ tc -s filter show dev enp4s0 parent :
filter protocol ip pref 1 flower chain 0
filter protocol ip pref 1 flower chain 0 handle 0x1
  eth_type ipv4
  ip_proto tcp
  dst_port range 20-30
  skip_hw
  not_in_hw
action order 1: gact action drop
 random type none pass val 0
 index 1 ref 1 bind 1 installed 85 sec used 3 sec
Action statistics:
Sent 460 bytes 10 pkt (dropped 10, overlimits 0 requeues 0)
backlog 0b 0p requeues 0

2. Match on IP address and port range:
--
$ tc filter add dev enp4s0 protocol ip parent :\
  prio 1 flower dst_ip 192.168.1.1 ip_proto tcp dst_port range 100-200\
  skip_hw action drop

$ tc -s filter show dev enp4s0 parent :
filter protocol ip pref 1 flower chain 0 handle 0x2
  eth_type ipv4
  ip_proto tcp
  dst_ip 192.168.1.1
  dst_port range 100-200
  skip_hw
  not_in_hw
action order 1: gact action drop
 random type none pass val 0
 index 2 ref 1 bind 1 installed 58 sec used 2 sec
Action statistics:
Sent 920 bytes 20 pkt (dropped 20, overlimits 0 requeues 0)
backlog 0b 0p requeues 0

v4:
Added man updates explaining filtering based on port ranges.
Removed 'range' keyword.

v3:
Modified flower_port_range_attr_type calls.

v2:
Addressed Jiri's comment to sync output format with input

Signed-off-by: Amritha Nambiar 
---
 man/man8/tc-flower.8 |   13 +++--
 tc/f_flower.c|  136 ++
 2 files changed, 134 insertions(+), 15 deletions(-)

diff --git a/man/man8/tc-flower.8 b/man/man8/tc-flower.8
index 8be8882..adff41e 100644
--- a/man/man8/tc-flower.8
+++ b/man/man8/tc-flower.8
@@ -56,8 +56,9 @@ flower \- flow based traffic control filter
 .IR MASKED_IP_TTL " | { "
 .BR dst_ip " | " src_ip " } "
 .IR PREFIX " | { "
-.BR dst_port " | " src_port " } "
-.IR port_number " } | "
+.BR dst_port " | " src_port " } { "
+.IR port_number " | "
+.IR min_port_number-max_port_number " } | "
 .B tcp_flags
 .IR MASKED_TCP_FLAGS " | "
 .B type
@@ -220,10 +221,12 @@ must be a valid IPv4 or IPv6 address, depending on the 
\fBprotocol\fR
 option to tc filter, optionally followed by a slash and the prefix length.
 If the prefix is missing, \fBtc\fR assumes a full-length host match.
 .TP
-.BI dst_port " NUMBER"
+.IR \fBdst_port " { "  NUMBER " | " " MIN_VALUE-MAX_VALUE "  }
 .TQ
-.BI src_port " NUMBER"
-Match on layer 4 protocol source or destination port number. Only available for
+.IR \fBsrc_port " { "  NUMBER " | " " MIN_VALUE-MAX_VALUE "  }
+Match on layer 4 protocol source or destination port number. Alternatively, the
+minimum and maximum values can be specified to match on a range of layer 4
+protocol source or destination port numbers. Only available for
 .BR ip_proto " values " udp ", " tcp  " and " sctp
 which have to be specified in beforehand.
 .TP
diff --git a/tc/f_flower.c b/tc/f_flower.c
index 65fca04..722647d 100644
--- a/tc/f_flower.c
+++ b/tc/f_flower.c
@@ -494,6 +494,68 @@ static int flower_parse_port(char *str, __u8 ip_proto,
return 0;
 }
 
+static int flower_port_range_attr_type(__u8 ip_proto, enum flower_endpoint 
type,
+  __be16 *min_port_type,
+  __be16 *max_port_type)
+{
+   if (ip_proto == IPPROTO_TCP || ip_proto == IPPROTO_UDP ||
+   ip_proto == IPPROTO_SCTP) {
+   if (type == FLOWER_ENDPOINT_SRC) {
+   *min_port_type = TCA_FLOWER_KEY_PORT_SRC_MIN;
+   *max_port_type = TCA_FLOWER_KEY_PORT_SRC_MAX;
+   } else {
+   *min_port_type = TCA_FLOWER_KEY_PORT_DST_MIN;
+   *max_port_type = TCA_FLOWER_KEY_PORT_DST_MAX;
+   }
+   } else {
+   return -1;
+   }
+
+   return 0;
+}
+
+static int flower_parse_port_range(__be16 *min, __be16 *max, __u8 ip_proto,
+  enum flower_endpoint endpoint,
+  struct nlmsghdr *n)
+{
+   __be16 min_port_type, max_port_type;
+
+   if (htons(*max) <= htons(*min)) {
+   fprintf(stderr, "max value should be greater than min value\n");
+   return -1;
+   }
+
+   if (flower_port_range_attr_type(ip_proto, endpoint, _port_type,
+   _port_type))
+   return -1;
+
+   adda

[iproute2-next PATCH v2] man: tc-flower: Add explanation for range option

2018-11-20 Thread Amritha Nambiar
Add details explaining filtering based on port ranges.

v2: Modified description to remove range as standalone option
and updated as part of dst_port/src_port.

Signed-off-by: Amritha Nambiar 
---
 man/man8/tc-flower.8 |   15 ++-
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/man/man8/tc-flower.8 b/man/man8/tc-flower.8
index 8be8882..1d195d0 100644
--- a/man/man8/tc-flower.8
+++ b/man/man8/tc-flower.8
@@ -56,8 +56,10 @@ flower \- flow based traffic control filter
 .IR MASKED_IP_TTL " | { "
 .BR dst_ip " | " src_ip " } "
 .IR PREFIX " | { "
-.BR dst_port " | " src_port " } "
-.IR port_number " } | "
+.BR dst_port " | " src_port " } { "
+.IR port_number " | "
+.B range
+.IR min_port_number-max_port_number " } | "
 .B tcp_flags
 .IR MASKED_TCP_FLAGS " | "
 .B type
@@ -220,10 +222,13 @@ must be a valid IPv4 or IPv6 address, depending on the 
\fBprotocol\fR
 option to tc filter, optionally followed by a slash and the prefix length.
 If the prefix is missing, \fBtc\fR assumes a full-length host match.
 .TP
-.BI dst_port " NUMBER"
+.BR dst_port " { "  \fINUMBER " | " range " \fIMIN_VALUE-MAX_VALUE "  \fR }
 .TQ
-.BI src_port " NUMBER"
-Match on layer 4 protocol source or destination port number. Only available for
+.BR src_port " { "  \fINUMBER " | " range " \fIMIN_VALUE-MAX_VALUE "  \fR }
+Match on layer 4 protocol source or destination port number. Alternatively, the
+\fBrange\fR option can be used to match on a range of layer 4 protocol source
+or destination port numbers by specifying the minimum and maximum values. Only
+available for
 .BR ip_proto " values " udp ", " tcp  " and " sctp
 which have to be specified in beforehand.
 .TP



[iproute2-next PATCH v3 2/2] man: tc-flower: Add explanation for range option

2018-11-15 Thread Amritha Nambiar
Add details explaining filtering based on port ranges.

Signed-off-by: Amritha Nambiar 
---
 man/man8/tc-flower.8 |   12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/man/man8/tc-flower.8 b/man/man8/tc-flower.8
index 8be8882..768bfa1 100644
--- a/man/man8/tc-flower.8
+++ b/man/man8/tc-flower.8
@@ -56,8 +56,10 @@ flower \- flow based traffic control filter
 .IR MASKED_IP_TTL " | { "
 .BR dst_ip " | " src_ip " } "
 .IR PREFIX " | { "
-.BR dst_port " | " src_port " } "
-.IR port_number " } | "
+.BR dst_port " | " src_port " } { "
+.IR port_number " | "
+.B range
+.IR min_port_number-max_port_number " } | "
 .B tcp_flags
 .IR MASKED_TCP_FLAGS " | "
 .B type
@@ -227,6 +229,12 @@ Match on layer 4 protocol source or destination port 
number. Only available for
 .BR ip_proto " values " udp ", " tcp  " and " sctp
 which have to be specified in beforehand.
 .TP
+.BI range " MIN_VALUE-MAX_VALUE"
+Match on a range of layer 4 protocol source or destination port number. Only
+available for
+.BR ip_proto " values " udp ", " tcp  " and " sctp
+which have to be specified in beforehand.
+.TP
 .BI tcp_flags " MASKED_TCP_FLAGS"
 Match on TCP flags represented as 12bit bitfield in in hexadecimal format.
 A mask may be optionally provided to limit the bits which are matched. A mask



[iproute2-next PATCH v3 1/2] tc: flower: Classify packets based port ranges

2018-11-15 Thread Amritha Nambiar
Added support for filtering based on port ranges.
UAPI changes have been accepted into net-next.

Example:
1. Match on a port range:
-
$ tc filter add dev enp4s0 protocol ip parent :\
  prio 1 flower ip_proto tcp dst_port range 20-30 skip_hw\
  action drop

$ tc -s filter show dev enp4s0 parent :
filter protocol ip pref 1 flower chain 0
filter protocol ip pref 1 flower chain 0 handle 0x1
  eth_type ipv4
  ip_proto tcp
  dst_port range 20-30
  skip_hw
  not_in_hw
action order 1: gact action drop
 random type none pass val 0
 index 1 ref 1 bind 1 installed 85 sec used 3 sec
Action statistics:
Sent 460 bytes 10 pkt (dropped 10, overlimits 0 requeues 0)
backlog 0b 0p requeues 0

2. Match on IP address and port range:
--
$ tc filter add dev enp4s0 protocol ip parent :\
  prio 1 flower dst_ip 192.168.1.1 ip_proto tcp dst_port range 100-200\
  skip_hw action drop

$ tc -s filter show dev enp4s0 parent :
filter protocol ip pref 1 flower chain 0 handle 0x2
  eth_type ipv4
  ip_proto tcp
  dst_ip 192.168.1.1
  dst_port range 100-200
  skip_hw
  not_in_hw
action order 1: gact action drop
 random type none pass val 0
 index 2 ref 1 bind 1 installed 58 sec used 2 sec
Action statistics:
Sent 920 bytes 20 pkt (dropped 20, overlimits 0 requeues 0)
backlog 0b 0p requeues 0

v3:
Modified flower_port_range_attr_type calls.

v2:
Addressed Jiri's comment to sync output format with input

Signed-off-by: Amritha Nambiar 
---
 include/uapi/linux/pkt_cls.h |7 ++
 tc/f_flower.c|  143 +++---
 2 files changed, 140 insertions(+), 10 deletions(-)

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 401d0c1..95d0db2 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -485,6 +485,11 @@ enum {
 
TCA_FLOWER_IN_HW_COUNT,
 
+   TCA_FLOWER_KEY_PORT_SRC_MIN,/* be16 */
+   TCA_FLOWER_KEY_PORT_SRC_MAX,/* be16 */
+   TCA_FLOWER_KEY_PORT_DST_MIN,/* be16 */
+   TCA_FLOWER_KEY_PORT_DST_MAX,/* be16 */
+
__TCA_FLOWER_MAX,
 };
 
@@ -518,6 +523,8 @@ enum {
TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1),
 };
 
+#define TCA_FLOWER_MASK_FLAGS_RANGE(1 << 0) /* Range-based match */
+
 /* Match-all classifier */
 
 enum {
diff --git a/tc/f_flower.c b/tc/f_flower.c
index 65fca04..9bddf7b 100644
--- a/tc/f_flower.c
+++ b/tc/f_flower.c
@@ -494,6 +494,68 @@ static int flower_parse_port(char *str, __u8 ip_proto,
return 0;
 }
 
+static int flower_port_range_attr_type(__u8 ip_proto, enum flower_endpoint 
type,
+  __be16 *min_port_type,
+  __be16 *max_port_type)
+{
+   if (ip_proto == IPPROTO_TCP || ip_proto == IPPROTO_UDP ||
+   ip_proto == IPPROTO_SCTP) {
+   if (type == FLOWER_ENDPOINT_SRC) {
+   *min_port_type = TCA_FLOWER_KEY_PORT_SRC_MIN;
+   *max_port_type = TCA_FLOWER_KEY_PORT_SRC_MAX;
+   } else {
+   *min_port_type = TCA_FLOWER_KEY_PORT_DST_MIN;
+   *max_port_type = TCA_FLOWER_KEY_PORT_DST_MAX;
+   }
+   } else {
+   return -1;
+   }
+
+   return 0;
+}
+
+static int flower_parse_port_range(__be16 *min, __be16 *max, __u8 ip_proto,
+  enum flower_endpoint endpoint,
+  struct nlmsghdr *n)
+{
+   __be16 min_port_type, max_port_type;
+
+   if (flower_port_range_attr_type(ip_proto, endpoint, _port_type,
+   _port_type))
+   return -1;
+
+   addattr16(n, MAX_MSG, min_port_type, *min);
+   addattr16(n, MAX_MSG, max_port_type, *max);
+
+   return 0;
+}
+
+static int get_range(__be16 *min, __be16 *max, char *argv)
+{
+   char *r;
+
+   r = strchr(argv, '-');
+   if (r) {
+   *r = '\0';
+   if (get_be16(min, argv, 10)) {
+   fprintf(stderr, "invalid min range\n");
+   return -1;
+   }
+   if (get_be16(max, r + 1, 10)) {
+   fprintf(stderr, "invalid max range\n");
+   return -1;
+   }
+   if (htons(*max) <= htons(*min)) {
+   fprintf(stderr, "max value should be greater than min 
value\n");
+   return -1;
+   }
+   } else {
+   fprintf(stderr, "Illegal range format\n");
+   return -1;
+   }
+   return 0;
+}
+
 #define TCP_FLAGS_MAX_MASK 0xfff
 
 static int flower_parse_tcp_flags(char *str, int flags_type, int mask_type,
@@ -1061,20 +1123,54 @@ static int

[net-next PATCH v4] net: sched: cls_flower: Classify packets using port ranges

2018-11-12 Thread Amritha Nambiar
Added support in tc flower for filtering based on port ranges.

Example:
1. Match on a port range:
-
$ tc filter add dev enp4s0 protocol ip parent :\
  prio 1 flower ip_proto tcp dst_port range 20-30 skip_hw\
  action drop

$ tc -s filter show dev enp4s0 parent :
filter protocol ip pref 1 flower chain 0
filter protocol ip pref 1 flower chain 0 handle 0x1
  eth_type ipv4
  ip_proto tcp
  dst_port range 20-30
  skip_hw
  not_in_hw
action order 1: gact action drop
 random type none pass val 0
 index 1 ref 1 bind 1 installed 85 sec used 3 sec
Action statistics:
Sent 460 bytes 10 pkt (dropped 10, overlimits 0 requeues 0)
backlog 0b 0p requeues 0

2. Match on IP address and port range:
--
$ tc filter add dev enp4s0 protocol ip parent :\
  prio 1 flower dst_ip 192.168.1.1 ip_proto tcp dst_port range 100-200\
  skip_hw action drop

$ tc -s filter show dev enp4s0 parent :
filter protocol ip pref 1 flower chain 0 handle 0x2
  eth_type ipv4
  ip_proto tcp
  dst_ip 192.168.1.1
  dst_port range 100-200
  skip_hw
  not_in_hw
action order 1: gact action drop
 random type none pass val 0
 index 2 ref 1 bind 1 installed 58 sec used 2 sec
Action statistics:
Sent 920 bytes 20 pkt (dropped 20, overlimits 0 requeues 0)
backlog 0b 0p requeues 0

v4:
1. Added condition before setting port key.
2. Organized setting and dumping port range keys into functions
   and added validation of input range.

v3:
1. Moved new fields in UAPI enum to the end of enum.
2. Removed couple of empty lines.

v2:
Addressed Jiri's comments:
1. Added separate functions for dst and src comparisons.
2. Removed endpoint enum.
3. Added new bit TCA_FLOWER_FLAGS_RANGE to decide normal/range
  lookup.
4. Cleaned up fl_lookup function.

Signed-off-by: Amritha Nambiar 
---
 include/uapi/linux/pkt_cls.h |7 ++
 net/sched/cls_flower.c   |  155 --
 2 files changed, 156 insertions(+), 6 deletions(-)

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 401d0c1..95d0db2 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -485,6 +485,11 @@ enum {
 
TCA_FLOWER_IN_HW_COUNT,
 
+   TCA_FLOWER_KEY_PORT_SRC_MIN,/* be16 */
+   TCA_FLOWER_KEY_PORT_SRC_MAX,/* be16 */
+   TCA_FLOWER_KEY_PORT_DST_MIN,/* be16 */
+   TCA_FLOWER_KEY_PORT_DST_MAX,/* be16 */
+
__TCA_FLOWER_MAX,
 };
 
@@ -518,6 +523,8 @@ enum {
TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1),
 };
 
+#define TCA_FLOWER_MASK_FLAGS_RANGE(1 << 0) /* Range-based match */
+
 /* Match-all classifier */
 
 enum {
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index c6c3278..85e9f8e 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -55,6 +55,8 @@ struct fl_flow_key {
struct flow_dissector_key_ip ip;
struct flow_dissector_key_ip enc_ip;
struct flow_dissector_key_enc_opts enc_opts;
+   struct flow_dissector_key_ports tp_min;
+   struct flow_dissector_key_ports tp_max;
 } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. 
*/
 
 struct fl_flow_mask_range {
@@ -65,6 +67,7 @@ struct fl_flow_mask_range {
 struct fl_flow_mask {
struct fl_flow_key key;
struct fl_flow_mask_range range;
+   u32 flags;
struct rhash_head ht_node;
struct rhashtable ht;
struct rhashtable_params filter_ht_params;
@@ -179,13 +182,89 @@ static void fl_clear_masked_range(struct fl_flow_key *key,
memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask));
 }
 
-static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask,
-  struct fl_flow_key *mkey)
+static bool fl_range_port_dst_cmp(struct cls_fl_filter *filter,
+ struct fl_flow_key *key,
+ struct fl_flow_key *mkey)
+{
+   __be16 min_mask, max_mask, min_val, max_val;
+
+   min_mask = htons(filter->mask->key.tp_min.dst);
+   max_mask = htons(filter->mask->key.tp_max.dst);
+   min_val = htons(filter->key.tp_min.dst);
+   max_val = htons(filter->key.tp_max.dst);
+
+   if (min_mask && max_mask) {
+   if (htons(key->tp.dst) < min_val ||
+   htons(key->tp.dst) > max_val)
+   return false;
+
+   /* skb does not have min and max values */
+   mkey->tp_min.dst = filter->mkey.tp_min.dst;
+   mkey->tp_max.dst = filter->mkey.tp_max.dst;
+   }
+   return true;
+}
+
+static bool fl_range_port_src_cmp(struct cls_fl_filter *filter,
+ struct fl_flow_key *key,
+ struct fl_flow_key *mkey)
+{
+   __be16 min_mask, max_m

[net-next PATCH v3] net: sched: cls_flower: Classify packets using port ranges

2018-11-09 Thread Amritha Nambiar
Added support in tc flower for filtering based on port ranges.

Example:
1. Match on a port range:
-
$ tc filter add dev enp4s0 protocol ip parent :\
  prio 1 flower ip_proto tcp dst_port range 20-30 skip_hw\
  action drop

$ tc -s filter show dev enp4s0 parent :
filter protocol ip pref 1 flower chain 0
filter protocol ip pref 1 flower chain 0 handle 0x1
  eth_type ipv4
  ip_proto tcp
  dst_port range 20-30
  skip_hw
  not_in_hw
action order 1: gact action drop
 random type none pass val 0
 index 1 ref 1 bind 1 installed 85 sec used 3 sec
Action statistics:
Sent 460 bytes 10 pkt (dropped 10, overlimits 0 requeues 0)
backlog 0b 0p requeues 0

2. Match on IP address and port range:
--
$ tc filter add dev enp4s0 protocol ip parent :\
  prio 1 flower dst_ip 192.168.1.1 ip_proto tcp dst_port range 100-200\
  skip_hw action drop

$ tc -s filter show dev enp4s0 parent :
filter protocol ip pref 1 flower chain 0 handle 0x2
  eth_type ipv4
  ip_proto tcp
  dst_ip 192.168.1.1
  dst_port range 100-200
  skip_hw
  not_in_hw
action order 1: gact action drop
 random type none pass val 0
 index 2 ref 1 bind 1 installed 58 sec used 2 sec
Action statistics:
Sent 920 bytes 20 pkt (dropped 20, overlimits 0 requeues 0)
backlog 0b 0p requeues 0

v3:
1. Moved new fields in UAPI enum to the end of enum.
2. Removed couple of empty lines.

v2:
Addressed Jiri's comments:
1. Added separate functions for dst and src comparisons.
2. Removed endpoint enum.
3. Added new bit TCA_FLOWER_FLAGS_RANGE to decide normal/range
  lookup.
4. Cleaned up fl_lookup function.

Signed-off-by: Amritha Nambiar 
---
 include/uapi/linux/pkt_cls.h |7 ++
 net/sched/cls_flower.c   |  132 --
 2 files changed, 133 insertions(+), 6 deletions(-)

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 401d0c1..95d0db2 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -485,6 +485,11 @@ enum {
 
TCA_FLOWER_IN_HW_COUNT,
 
+   TCA_FLOWER_KEY_PORT_SRC_MIN,/* be16 */
+   TCA_FLOWER_KEY_PORT_SRC_MAX,/* be16 */
+   TCA_FLOWER_KEY_PORT_DST_MIN,/* be16 */
+   TCA_FLOWER_KEY_PORT_DST_MAX,/* be16 */
+
__TCA_FLOWER_MAX,
 };
 
@@ -518,6 +523,8 @@ enum {
TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1),
 };
 
+#define TCA_FLOWER_MASK_FLAGS_RANGE(1 << 0) /* Range-based match */
+
 /* Match-all classifier */
 
 enum {
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 9aada2d..7780106 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -55,6 +55,8 @@ struct fl_flow_key {
struct flow_dissector_key_ip ip;
struct flow_dissector_key_ip enc_ip;
struct flow_dissector_key_enc_opts enc_opts;
+   struct flow_dissector_key_ports tp_min;
+   struct flow_dissector_key_ports tp_max;
 } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. 
*/
 
 struct fl_flow_mask_range {
@@ -65,6 +67,7 @@ struct fl_flow_mask_range {
 struct fl_flow_mask {
struct fl_flow_key key;
struct fl_flow_mask_range range;
+   u32 flags;
struct rhash_head ht_node;
struct rhashtable ht;
struct rhashtable_params filter_ht_params;
@@ -179,13 +182,89 @@ static void fl_clear_masked_range(struct fl_flow_key *key,
memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask));
 }
 
-static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask,
-  struct fl_flow_key *mkey)
+static bool fl_range_port_dst_cmp(struct cls_fl_filter *filter,
+ struct fl_flow_key *key,
+ struct fl_flow_key *mkey)
+{
+   __be16 min_mask, max_mask, min_val, max_val;
+
+   min_mask = htons(filter->mask->key.tp_min.dst);
+   max_mask = htons(filter->mask->key.tp_max.dst);
+   min_val = htons(filter->key.tp_min.dst);
+   max_val = htons(filter->key.tp_max.dst);
+
+   if (min_mask && max_mask) {
+   if (htons(key->tp.dst) < min_val ||
+   htons(key->tp.dst) > max_val)
+   return false;
+
+   /* skb does not have min and max values */
+   mkey->tp_min.dst = filter->mkey.tp_min.dst;
+   mkey->tp_max.dst = filter->mkey.tp_max.dst;
+   }
+   return true;
+}
+
+static bool fl_range_port_src_cmp(struct cls_fl_filter *filter,
+ struct fl_flow_key *key,
+ struct fl_flow_key *mkey)
+{
+   __be16 min_mask, max_mask, min_val, max_val;
+
+   min_mask = htons(filter->mask->key.tp_min.src);
+   max_mask = htons(filter->mask->key.tp_max.src);
+

[iproute2 PATCH v2] tc: flower: Classify packets based port ranges

2018-11-07 Thread Amritha Nambiar
Added support for filtering based on port ranges.

Example:
1. Match on a port range:
-
$ tc filter add dev enp4s0 protocol ip parent :\
  prio 1 flower ip_proto tcp dst_port range 20-30 skip_hw\
  action drop

$ tc -s filter show dev enp4s0 parent :
filter protocol ip pref 1 flower chain 0
filter protocol ip pref 1 flower chain 0 handle 0x1
  eth_type ipv4
  ip_proto tcp
  dst_port range 20-30
  skip_hw
  not_in_hw
action order 1: gact action drop
 random type none pass val 0
 index 1 ref 1 bind 1 installed 85 sec used 3 sec
Action statistics:
Sent 460 bytes 10 pkt (dropped 10, overlimits 0 requeues 0)
backlog 0b 0p requeues 0

2. Match on IP address and port range:
--
$ tc filter add dev enp4s0 protocol ip parent :\
  prio 1 flower dst_ip 192.168.1.1 ip_proto tcp dst_port range 100-200\
  skip_hw action drop

$ tc -s filter show dev enp4s0 parent :
filter protocol ip pref 1 flower chain 0 handle 0x2
  eth_type ipv4
  ip_proto tcp
  dst_ip 192.168.1.1
  dst_port range 100-200
  skip_hw
  not_in_hw
action order 1: gact action drop
 random type none pass val 0
 index 2 ref 1 bind 1 installed 58 sec used 2 sec
Action statistics:
Sent 920 bytes 20 pkt (dropped 20, overlimits 0 requeues 0)
backlog 0b 0p requeues 0

v2:
Addressed Jiri's comment to sync output format with input

Signed-off-by: Amritha Nambiar 
---
 include/uapi/linux/pkt_cls.h |7 ++
 tc/f_flower.c|  145 +++---
 2 files changed, 142 insertions(+), 10 deletions(-)

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 401d0c1..b63c3cf 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -405,6 +405,11 @@ enum {
TCA_FLOWER_KEY_UDP_SRC, /* be16 */
TCA_FLOWER_KEY_UDP_DST, /* be16 */
 
+   TCA_FLOWER_KEY_PORT_SRC_MIN,/* be16 */
+   TCA_FLOWER_KEY_PORT_SRC_MAX,/* be16 */
+   TCA_FLOWER_KEY_PORT_DST_MIN,/* be16 */
+   TCA_FLOWER_KEY_PORT_DST_MAX,/* be16 */
+
TCA_FLOWER_FLAGS,
TCA_FLOWER_KEY_VLAN_ID, /* be16 */
TCA_FLOWER_KEY_VLAN_PRIO,   /* u8   */
@@ -518,6 +523,8 @@ enum {
TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1),
 };
 
+#define TCA_FLOWER_MASK_FLAGS_RANGE(1 << 0) /* Range-based match */
+
 /* Match-all classifier */
 
 enum {
diff --git a/tc/f_flower.c b/tc/f_flower.c
index 65fca04..7724a1d 100644
--- a/tc/f_flower.c
+++ b/tc/f_flower.c
@@ -494,6 +494,66 @@ static int flower_parse_port(char *str, __u8 ip_proto,
return 0;
 }
 
+static int flower_port_range_attr_type(__u8 ip_proto, enum flower_endpoint type,
+  __be16 *min_port_type,
+  __be16 *max_port_type)
+{
+   if (ip_proto == IPPROTO_TCP || ip_proto == IPPROTO_UDP ||
+   ip_proto == IPPROTO_SCTP) {
+   if (type == FLOWER_ENDPOINT_SRC) {
+   *min_port_type = TCA_FLOWER_KEY_PORT_SRC_MIN;
+   *max_port_type = TCA_FLOWER_KEY_PORT_SRC_MAX;
+   } else {
+   *min_port_type = TCA_FLOWER_KEY_PORT_DST_MIN;
+   *max_port_type = TCA_FLOWER_KEY_PORT_DST_MAX;
+   }
+   } else {
+   return -1;
+   }
+
+   return 0;
+}
+
+static int flower_parse_port_range(__be16 *min, __be16 *max, __u8 ip_proto,
+  enum flower_endpoint endpoint,
+  struct nlmsghdr *n)
+{
+   __be16 min_port_type, max_port_type;
+
+   flower_port_range_attr_type(ip_proto, endpoint, &min_port_type,
+   &max_port_type);
+   addattr16(n, MAX_MSG, min_port_type, *min);
+   addattr16(n, MAX_MSG, max_port_type, *max);
+
+   return 0;
+}
+
+static int get_range(__be16 *min, __be16 *max, char *argv)
+{
+   char *r;
+
+   r = strchr(argv, '-');
+   if (r) {
+   *r = '\0';
+   if (get_be16(min, argv, 10)) {
+   fprintf(stderr, "invalid min range\n");
+   return -1;
+   }
+   if (get_be16(max, r + 1, 10)) {
+   fprintf(stderr, "invalid max range\n");
+   return -1;
+   }
+   if (htons(*max) <= htons(*min)) {
+   fprintf(stderr, "max value should be greater than min value\n");
+   return -1;
+   }
+   } else {
+   fprintf(stderr, "Illegal range format\n");
+   return -1;
+   }
+   return 0;
+}
+
 #define TCP_FLAGS_MAX_MASK 0xfff
 
 static int flower_parse_tcp_flags(char *str, int flags_type, int mask_type,
@@ -1061,20 +1121,5

[net-next PATCH v2] net: sched: cls_flower: Classify packets using port ranges

2018-11-07 Thread Amritha Nambiar
Added support in tc flower for filtering based on port ranges.

Example:
1. Match on a port range:
-
$ tc filter add dev enp4s0 protocol ip parent :\
  prio 1 flower ip_proto tcp dst_port range 20-30 skip_hw\
  action drop

$ tc -s filter show dev enp4s0 parent :
filter protocol ip pref 1 flower chain 0
filter protocol ip pref 1 flower chain 0 handle 0x1
  eth_type ipv4
  ip_proto tcp
  dst_port range 20-30
  skip_hw
  not_in_hw
action order 1: gact action drop
 random type none pass val 0
 index 1 ref 1 bind 1 installed 85 sec used 3 sec
Action statistics:
Sent 460 bytes 10 pkt (dropped 10, overlimits 0 requeues 0)
backlog 0b 0p requeues 0

2. Match on IP address and port range:
--
$ tc filter add dev enp4s0 protocol ip parent :\
  prio 1 flower dst_ip 192.168.1.1 ip_proto tcp dst_port range 100-200\
  skip_hw action drop

$ tc -s filter show dev enp4s0 parent :
filter protocol ip pref 1 flower chain 0 handle 0x2
  eth_type ipv4
  ip_proto tcp
  dst_ip 192.168.1.1
  dst_port range 100-200
  skip_hw
  not_in_hw
action order 1: gact action drop
 random type none pass val 0
 index 2 ref 1 bind 1 installed 58 sec used 2 sec
Action statistics:
Sent 920 bytes 20 pkt (dropped 20, overlimits 0 requeues 0)
backlog 0b 0p requeues 0

v2:
Addressed Jiri's comments:
1. Added separate functions for dst and src comparisons.
2. Removed endpoint enum.
3. Added new bit TCA_FLOWER_FLAGS_RANGE to decide normal/range
  lookup.
4. Cleaned up fl_lookup function.

Signed-off-by: Amritha Nambiar 
---
 include/uapi/linux/pkt_cls.h |7 ++
 net/sched/cls_flower.c   |  133 --
 2 files changed, 134 insertions(+), 6 deletions(-)

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 401d0c1..b63c3cf 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -405,6 +405,11 @@ enum {
TCA_FLOWER_KEY_UDP_SRC, /* be16 */
TCA_FLOWER_KEY_UDP_DST, /* be16 */
 
+   TCA_FLOWER_KEY_PORT_SRC_MIN,/* be16 */
+   TCA_FLOWER_KEY_PORT_SRC_MAX,/* be16 */
+   TCA_FLOWER_KEY_PORT_DST_MIN,/* be16 */
+   TCA_FLOWER_KEY_PORT_DST_MAX,/* be16 */
+
TCA_FLOWER_FLAGS,
TCA_FLOWER_KEY_VLAN_ID, /* be16 */
TCA_FLOWER_KEY_VLAN_PRIO,   /* u8   */
@@ -518,6 +523,8 @@ enum {
TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1),
 };
 
+#define TCA_FLOWER_MASK_FLAGS_RANGE (1 << 0) /* Range-based match */
+
 /* Match-all classifier */
 
 enum {
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 9aada2d..9d2582d 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -55,6 +55,9 @@ struct fl_flow_key {
struct flow_dissector_key_ip ip;
struct flow_dissector_key_ip enc_ip;
struct flow_dissector_key_enc_opts enc_opts;
+
+   struct flow_dissector_key_ports tp_min;
+   struct flow_dissector_key_ports tp_max;
 } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. 
*/
 
 struct fl_flow_mask_range {
@@ -65,6 +68,7 @@ struct fl_flow_mask_range {
 struct fl_flow_mask {
struct fl_flow_key key;
struct fl_flow_mask_range range;
+   u32 flags;
struct rhash_head ht_node;
struct rhashtable ht;
struct rhashtable_params filter_ht_params;
@@ -179,13 +183,89 @@ static void fl_clear_masked_range(struct fl_flow_key *key,
memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask));
 }
 
-static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask,
-  struct fl_flow_key *mkey)
+static bool fl_range_port_dst_cmp(struct cls_fl_filter *filter,
+ struct fl_flow_key *key,
+ struct fl_flow_key *mkey)
+{
+   __be16 min_mask, max_mask, min_val, max_val;
+
+   min_mask = htons(filter->mask->key.tp_min.dst);
+   max_mask = htons(filter->mask->key.tp_max.dst);
+   min_val = htons(filter->key.tp_min.dst);
+   max_val = htons(filter->key.tp_max.dst);
+
+   if (min_mask && max_mask) {
+   if (htons(key->tp.dst) < min_val ||
+   htons(key->tp.dst) > max_val)
+   return false;
+
+   /* skb does not have min and max values */
+   mkey->tp_min.dst = filter->mkey.tp_min.dst;
+   mkey->tp_max.dst = filter->mkey.tp_max.dst;
+   }
+   return true;
+}
+
+static bool fl_range_port_src_cmp(struct cls_fl_filter *filter,
+ struct fl_flow_key *key,
+ struct fl_flow_key *mkey)
+{
+   __be16 min_mask, max_mask, min_val, max_val;
+
+   min_mask = htons(filter->mask->ke

[iproute2 PATCH] tc: flower: Classify packets based port ranges

2018-10-12 Thread Amritha Nambiar
Added support for filtering based on port ranges.

Example:
1. Match on a port range:
-
$ tc filter add dev enp4s0 protocol ip parent :\
  prio 1 flower ip_proto tcp dst_port range 20-30 skip_hw\
  action drop

$ tc -s filter show dev enp4s0 parent :
filter protocol ip pref 1 flower chain 0
filter protocol ip pref 1 flower chain 0 handle 0x1
  eth_type ipv4
  ip_proto tcp
  dst_port_min 20
  dst_port_max 30
  skip_hw
  not_in_hw
action order 1: gact action drop
 random type none pass val 0
 index 1 ref 1 bind 1 installed 181 sec used 5 sec
Action statistics:
Sent 460 bytes 10 pkt (dropped 10, overlimits 0 requeues 0)
backlog 0b 0p requeues 0

2. Match on IP address and port range:
--
$ tc filter add dev enp4s0 protocol ip parent :\
  prio 1 flower dst_ip 192.168.1.1 ip_proto tcp dst_port range 100-200\
  skip_hw action drop

$ tc -s filter show dev enp4s0 parent :
filter protocol ip pref 1 flower chain 0 handle 0x2
  eth_type ipv4
  ip_proto tcp
  dst_ip 192.168.1.1
  dst_port_min 100
  dst_port_max 200
  skip_hw
  not_in_hw
action order 1: gact action drop
 random type none pass val 0
 index 2 ref 1 bind 1 installed 28 sec used 6 sec
Action statistics:
Sent 460 bytes 10 pkt (dropped 10, overlimits 0 requeues 0)
backlog 0b 0p requeues 0

Signed-off-by: Amritha Nambiar 
---
 include/uapi/linux/pkt_cls.h |5 +
 tc/f_flower.c|  145 +++---
 2 files changed, 140 insertions(+), 10 deletions(-)

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index be382fb..3d9727f 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -405,6 +405,11 @@ enum {
TCA_FLOWER_KEY_UDP_SRC, /* be16 */
TCA_FLOWER_KEY_UDP_DST, /* be16 */
 
+   TCA_FLOWER_KEY_PORT_SRC_MIN,/* be16 */
+   TCA_FLOWER_KEY_PORT_SRC_MAX,/* be16 */
+   TCA_FLOWER_KEY_PORT_DST_MIN,/* be16 */
+   TCA_FLOWER_KEY_PORT_DST_MAX,/* be16 */
+
TCA_FLOWER_FLAGS,
TCA_FLOWER_KEY_VLAN_ID, /* be16 */
TCA_FLOWER_KEY_VLAN_PRIO,   /* u8   */
diff --git a/tc/f_flower.c b/tc/f_flower.c
index 59e5f57..1a7bc80 100644
--- a/tc/f_flower.c
+++ b/tc/f_flower.c
@@ -33,6 +33,11 @@ enum flower_endpoint {
FLOWER_ENDPOINT_DST
 };
 
+struct range_type {
+   __be16 min_port_type;
+   __be16 max_port_type;
+};
+
 enum flower_icmp_field {
FLOWER_ICMP_FIELD_TYPE,
FLOWER_ICMP_FIELD_CODE
@@ -493,6 +498,64 @@ static int flower_parse_port(char *str, __u8 ip_proto,
return 0;
 }
 
+static int flower_port_range_attr_type(__u8 ip_proto, enum flower_endpoint type,
+  struct range_type *range)
+{
+   if (ip_proto == IPPROTO_TCP || ip_proto == IPPROTO_UDP ||
+   ip_proto == IPPROTO_SCTP) {
+   if (type == FLOWER_ENDPOINT_SRC) {
+   range->min_port_type = TCA_FLOWER_KEY_PORT_SRC_MIN;
+   range->max_port_type = TCA_FLOWER_KEY_PORT_SRC_MAX;
+   } else {
+   range->min_port_type = TCA_FLOWER_KEY_PORT_DST_MIN;
+   range->max_port_type = TCA_FLOWER_KEY_PORT_DST_MAX;
+   }
+   } else {
+   return -1;
+   }
+
+   return 0;
+}
+
+static int flower_parse_port_range(__be16 *min, __be16 *max, __u8 ip_proto,
+  enum flower_endpoint endpoint,
+  struct nlmsghdr *n)
+{
+   struct range_type range;
+
+   flower_port_range_attr_type(ip_proto, endpoint, &range);
+   addattr16(n, MAX_MSG, range.min_port_type, *min);
+   addattr16(n, MAX_MSG, range.max_port_type, *max);
+
+   return 0;
+}
+
+static int get_range(__be16 *min, __be16 *max, char *argv)
+{
+   char *r;
+
+   r = strchr(argv, '-');
+   if (r) {
+   *r = '\0';
+   if (get_be16(min, argv, 10)) {
+   fprintf(stderr, "invalid min range\n");
+   return -1;
+   }
+   if (get_be16(max, r + 1, 10)) {
+   fprintf(stderr, "invalid max range\n");
+   return -1;
+   }
+   if (htons(*max) <= htons(*min)) {
+   fprintf(stderr, "max value should be greater than min value\n");
+   return -1;
+   }
+   } else {
+   fprintf(stderr, "Illegal range format\n");
+   return -1;
+   }
+   return 0;
+}
+
 #define TCP_FLAGS_MAX_MASK 0xfff
 
 static int flower_parse_tcp_flags(char *str, int flags_type, int mask_type,
@@ -887,20 +950,54 @@ static int flower_parse_opt(struct filter_util *qu, c

[net-next PATCH] net: sched: cls_flower: Classify packets using port ranges

2018-10-12 Thread Amritha Nambiar
Added support in tc flower for filtering based on port ranges.
This is a rework of the RFC patch at:
https://patchwork.ozlabs.org/patch/969595/

Example:
1. Match on a port range:
-
$ tc filter add dev enp4s0 protocol ip parent :\
  prio 1 flower ip_proto tcp dst_port range 20-30 skip_hw\
  action drop

$ tc -s filter show dev enp4s0 parent :
filter protocol ip pref 1 flower chain 0
filter protocol ip pref 1 flower chain 0 handle 0x1
  eth_type ipv4
  ip_proto tcp
  dst_port_min 20
  dst_port_max 30
  skip_hw
  not_in_hw
action order 1: gact action drop
 random type none pass val 0
 index 1 ref 1 bind 1 installed 181 sec used 5 sec
Action statistics:
Sent 460 bytes 10 pkt (dropped 10, overlimits 0 requeues 0)
backlog 0b 0p requeues 0

2. Match on IP address and port range:
--
$ tc filter add dev enp4s0 protocol ip parent :\
  prio 1 flower dst_ip 192.168.1.1 ip_proto tcp dst_port range 100-200\
  skip_hw action drop

$ tc -s filter show dev enp4s0 parent :
filter protocol ip pref 1 flower chain 0 handle 0x2
  eth_type ipv4
  ip_proto tcp
  dst_ip 192.168.1.1
  dst_port_min 100
  dst_port_max 200
  skip_hw
  not_in_hw
action order 1: gact action drop
 random type none pass val 0
 index 2 ref 1 bind 1 installed 28 sec used 6 sec
Action statistics:
Sent 460 bytes 10 pkt (dropped 10, overlimits 0 requeues 0)
backlog 0b 0p requeues 0

Signed-off-by: Amritha Nambiar 
---
 include/uapi/linux/pkt_cls.h |5 ++
 net/sched/cls_flower.c   |  134 --
 2 files changed, 132 insertions(+), 7 deletions(-)

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 401d0c1..b569308 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -405,6 +405,11 @@ enum {
TCA_FLOWER_KEY_UDP_SRC, /* be16 */
TCA_FLOWER_KEY_UDP_DST, /* be16 */
 
+   TCA_FLOWER_KEY_PORT_SRC_MIN,/* be16 */
+   TCA_FLOWER_KEY_PORT_SRC_MAX,/* be16 */
+   TCA_FLOWER_KEY_PORT_DST_MIN,/* be16 */
+   TCA_FLOWER_KEY_PORT_DST_MAX,/* be16 */
+
TCA_FLOWER_FLAGS,
TCA_FLOWER_KEY_VLAN_ID, /* be16 */
TCA_FLOWER_KEY_VLAN_PRIO,   /* u8   */
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 9aada2d..5f135f0 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -55,6 +55,9 @@ struct fl_flow_key {
struct flow_dissector_key_ip ip;
struct flow_dissector_key_ip enc_ip;
struct flow_dissector_key_enc_opts enc_opts;
+
+   struct flow_dissector_key_ports tp_min;
+   struct flow_dissector_key_ports tp_max;
 } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. 
*/
 
 struct fl_flow_mask_range {
@@ -103,6 +106,11 @@ struct cls_fl_filter {
struct net_device *hw_dev;
 };
 
+enum fl_endpoint {
+   FLOWER_ENDPOINT_DST,
+   FLOWER_ENDPOINT_SRC
+};
+
 static const struct rhashtable_params mask_ht_params = {
.key_offset = offsetof(struct fl_flow_mask, key),
.key_len = sizeof(struct fl_flow_key),
@@ -179,11 +187,86 @@ static void fl_clear_masked_range(struct fl_flow_key *key,
memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask));
 }
 
+static int fl_range_compare_params(struct cls_fl_filter *filter,
+  struct fl_flow_key *key,
+  struct fl_flow_key *mkey,
+  enum fl_endpoint endpoint)
+{
+   __be16 min_mask, max_mask, min_val, max_val;
+
+   if (endpoint == FLOWER_ENDPOINT_DST) {
+   min_mask = htons(filter->mask->key.tp_min.dst);
+   max_mask = htons(filter->mask->key.tp_max.dst);
+   min_val = htons(filter->key.tp_min.dst);
+   max_val = htons(filter->key.tp_max.dst);
+
+   if (min_mask && max_mask) {
+   if (htons(key->tp.dst) < min_val ||
+   htons(key->tp.dst) > max_val)
+   return -1;
+
+   /* skb does not have min and max values */
+   mkey->tp_min.dst = filter->mkey.tp_min.dst;
+   mkey->tp_max.dst = filter->mkey.tp_max.dst;
+   }
+   } else {
+   min_mask = htons(filter->mask->key.tp_min.src);
+   max_mask = htons(filter->mask->key.tp_max.src);
+   min_val = htons(filter->key.tp_min.src);
+   max_val = htons(filter->key.tp_max.src);
+
+   if (min_mask && max_mask) {
+   if (htons(key->tp.src) < min_val ||
+   htons(key->tp.src) > max_val)
+   return -1;
+
+  

[iproute2,RFC PATCH] tc: range: Introduce TC range classifier

2018-09-13 Thread Amritha Nambiar
Range classifier is introduced to support filters based
on ranges. Only port-range filters are supported currently.
This can be combined with flower classifier to support a
combination of port-ranges and other parameters based
on existing fields supported by cls_flower.

Example:
1. Match on a port range:
---
$ tc filter add dev enp4s0 protocol ip parent : prio 2 range\
ip_proto tcp dst_port 1-15 skip_hw action drop

$ tc -s filter show dev enp4s0 parent :
filter protocol ip pref 2 range chain 0
filter protocol ip pref 2 range chain 0 handle 0x1
  eth_type ipv4
  ip_proto tcp
  dst_port_min 1
  dst_port_max 15
  skip_hw
  not_in_hw
action order 1: gact action drop
 random type none pass val 0
 index 1 ref 1 bind 1 installed 34 sec used 2 sec
Action statistics:
Sent 1380 bytes 30 pkt (dropped 30, overlimits 0 requeues 0)
backlog 0b 0p requeues 0

2. Match on IP address and port range:
--
$ tc filter add dev enp4s0 protocol ip parent : prio 2 flower\
  dst_ip 192.168.1.1 skip_hw action goto chain 11

$ tc filter add dev enp4s0 protocol ip parent : prio 2 chain 11\
  range ip_proto tcp dst_port 1-15 action drop

$ tc -s filter show dev enp4s0 parent :
filter protocol ip pref 2 flower chain 0
filter protocol ip pref 2 flower chain 0 handle 0x1
  eth_type ipv4
  dst_ip 192.168.1.1
  skip_hw
  not_in_hw
action order 1: gact action goto chain 11
 random type none pass val 0
 index 1 ref 1 bind 1 installed 1426 sec used 2 sec
Action statistics:
Sent 460 bytes 10 pkt (dropped 0, overlimits 0 requeues 0)
backlog 0b 0p requeues 0

filter protocol ip pref 2 range chain 11
filter protocol ip pref 2 range chain 11 handle 0x1
  eth_type ipv4
  ip_proto tcp
  dst_port_min 1
  dst_port_max 15
  not_in_hw
action order 1: gact action drop
 random type none pass val 0
 index 2 ref 1 bind 1 installed 1310 sec used 2 sec
Action statistics:
Sent 460 bytes 10 pkt (dropped 10, overlimits 0 requeues 0)
backlog 0b 0p requeues 0

Signed-off-by: Amritha Nambiar 
---
 include/uapi/linux/pkt_cls.h |   19 ++
 tc/Makefile  |1 
 tc/f_range.c |  369 ++
 3 files changed, 389 insertions(+)
 create mode 100644 tc/f_range.c

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index be382fb..8ef3a5a 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -379,6 +379,25 @@ enum {
 
 #define TCA_BPF_MAX (__TCA_BPF_MAX - 1)
 
+/* RANGE classifier */
+
+enum {
+   TCA_RANGE_UNSPEC,
+   TCA_RANGE_CLASSID,  /* u32 */
+   TCA_RANGE_INDEV,
+   TCA_RANGE_ACT,
+   TCA_RANGE_KEY_ETH_TYPE, /* be16 */
+   TCA_RANGE_KEY_IP_PROTO, /* u8 */
+   TCA_RANGE_KEY_PORT_SRC_MIN, /* be16 */
+   TCA_RANGE_KEY_PORT_SRC_MAX, /* be16 */
+   TCA_RANGE_KEY_PORT_DST_MIN, /* be16 */
+   TCA_RANGE_KEY_PORT_DST_MAX, /* be16 */
+   TCA_RANGE_FLAGS,/* u32 */
+   __TCA_RANGE_MAX,
+};
+
+#define TCA_RANGE_MAX (__TCA_RANGE_MAX - 1)
+
 /* Flower classifier */
 
 enum {
diff --git a/tc/Makefile b/tc/Makefile
index 5a1a7ff..155cabe 100644
--- a/tc/Makefile
+++ b/tc/Makefile
@@ -29,6 +29,7 @@ TCMODULES += f_bpf.o
 TCMODULES += f_flow.o
 TCMODULES += f_cgroup.o
 TCMODULES += f_flower.o
+TCMODULES += f_range.o
 TCMODULES += q_dsmark.o
 TCMODULES += q_gred.o
 TCMODULES += f_tcindex.o
diff --git a/tc/f_range.c b/tc/f_range.c
new file mode 100644
index 000..388b275
--- /dev/null
+++ b/tc/f_range.c
@@ -0,0 +1,369 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * f_range.c   Range Classifier
+ *
+ * This program is free software; you can distribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors:Amritha Nambiar 
+ *
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "utils.h"
+#include "tc_util.h"
+
+enum range_type {
+   RANGE_PORT_SRC,
+   RANGE_PORT_DST
+};
+
+struct range_values {
+   __be16 min_port_type;
+   __be16 max_port_type;
+};
+
+static void explain(void)
+{
+   fprintf(stderr, "Usage: ... range [ MATCH-LIST ]\n");
+   fprintf(stderr, " [skip_sw | skip_hw]\n");
+   fprintf(stderr, " [ action ACTION_SPEC ] [ classid 
CLASSID ]\n");
+   fprintf(stderr, "\n");
+   fprintf(stderr, "Where: SELECTOR := SAMPLE SAMPLE ...\n");
+   fprintf(stderr, "   FILTERID := X:Y:Z\n");
+   fprintf(stderr, "   ACTION_SPEC := ... look at individual 

[net-next, RFC PATCH] net: sched: cls_range: Introduce Range classifier

2018-09-13 Thread Amritha Nambiar
This patch introduces a range classifier to support filtering based
on ranges. Only port-range filters are supported currently. This can
be combined with flower classifier to support filters that are a
combination of port-ranges and other parameters based on existing
fields supported by cls_flower.

Example:
1. Match on a port range:
---
$ tc filter add dev enp4s0 protocol ip parent : prio 2 range\
ip_proto tcp dst_port 1-15 skip_hw action drop

$ tc -s filter show dev enp4s0 parent :
filter protocol ip pref 2 range chain 0
filter protocol ip pref 2 range chain 0 handle 0x1
  eth_type ipv4
  ip_proto tcp
  dst_port_min 1
  dst_port_max 15
  skip_hw
  not_in_hw
action order 1: gact action drop
 random type none pass val 0
 index 1 ref 1 bind 1 installed 34 sec used 2 sec
Action statistics:
Sent 1380 bytes 30 pkt (dropped 30, overlimits 0 requeues 0)
backlog 0b 0p requeues 0

2. Match on IP address and port range:
--
$ tc filter add dev enp4s0 protocol ip parent : prio 2 flower\
  dst_ip 192.168.1.1 skip_hw action goto chain 11

$ tc filter add dev enp4s0 protocol ip parent : prio 2 chain 11\
  range ip_proto tcp dst_port 1-15 action drop

$ tc -s filter show dev enp4s0 parent :
filter protocol ip pref 2 flower chain 0
filter protocol ip pref 2 flower chain 0 handle 0x1
  eth_type ipv4
  dst_ip 192.168.1.1
  skip_hw
  not_in_hw
action order 1: gact action goto chain 11
 random type none pass val 0
 index 1 ref 1 bind 1 installed 1426 sec used 2 sec
Action statistics:
Sent 460 bytes 10 pkt (dropped 0, overlimits 0 requeues 0)
backlog 0b 0p requeues 0

filter protocol ip pref 2 range chain 11
filter protocol ip pref 2 range chain 11 handle 0x1
  eth_type ipv4
  ip_proto tcp
  dst_port_min 1
  dst_port_max 15
  not_in_hw
action order 1: gact action drop
 random type none pass val 0
 index 2 ref 1 bind 1 installed 1310 sec used 2 sec
Action statistics:
Sent 460 bytes 10 pkt (dropped 10, overlimits 0 requeues 0)
backlog 0b 0p requeues 0

Signed-off-by: Amritha Nambiar 
---
 include/uapi/linux/pkt_cls.h |   19 +
 net/sched/Kconfig|   10 +
 net/sched/Makefile   |1 
 net/sched/cls_range.c|  725 ++
 4 files changed, 755 insertions(+)
 create mode 100644 net/sched/cls_range.c

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 401d0c1..b2b68e6 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -379,6 +379,25 @@ enum {
 
 #define TCA_BPF_MAX (__TCA_BPF_MAX - 1)
 
+/* RANGE classifier */
+
+enum {
+   TCA_RANGE_UNSPEC,
+   TCA_RANGE_CLASSID,  /* u32 */
+   TCA_RANGE_INDEV,
+   TCA_RANGE_ACT,
+   TCA_RANGE_KEY_ETH_TYPE, /* be16 */
+   TCA_RANGE_KEY_IP_PROTO, /* u8 */
+   TCA_RANGE_KEY_PORT_SRC_MIN, /* be16 */
+   TCA_RANGE_KEY_PORT_SRC_MAX, /* be16 */
+   TCA_RANGE_KEY_PORT_DST_MIN, /* be16 */
+   TCA_RANGE_KEY_PORT_DST_MAX, /* be16 */
+   TCA_RANGE_FLAGS,/* u32 */
+   __TCA_RANGE_MAX,
+};
+
+#define TCA_RANGE_MAX (__TCA_RANGE_MAX - 1)
+
 /* Flower classifier */
 
 enum {
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index e957413..f68770d 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -585,6 +585,16 @@ config NET_CLS_FLOWER
  To compile this code as a module, choose M here: the module will
  be called cls_flower.
 
+config NET_CLS_RANGE
+   tristate "Range classifier"
+   select NET_CLS
+   help
+ If you say Y here, you will be able to classify packets based on
+ ranges with minimum and maximum values.
+
+ To compile this code as a module, choose M here: the module will
+ be called cls_range.
+
 config NET_CLS_MATCHALL
tristate "Match-all classifier"
select NET_CLS
diff --git a/net/sched/Makefile b/net/sched/Makefile
index f0403f4..d1f57a8 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -69,6 +69,7 @@ obj-$(CONFIG_NET_CLS_FLOW)+= cls_flow.o
 obj-$(CONFIG_NET_CLS_CGROUP)   += cls_cgroup.o
 obj-$(CONFIG_NET_CLS_BPF)  += cls_bpf.o
 obj-$(CONFIG_NET_CLS_FLOWER)   += cls_flower.o
+obj-$(CONFIG_NET_CLS_RANGE)+= cls_range.o
 obj-$(CONFIG_NET_CLS_MATCHALL) += cls_matchall.o
 obj-$(CONFIG_NET_EMATCH)   += ematch.o
 obj-$(CONFIG_NET_EMATCH_CMP)   += em_cmp.o
diff --git a/net/sched/cls_range.c b/net/sched/cls_range.c
new file mode 100644
index 000..2ed53c7
--- /dev/null
+++ b/net/sched/cls_range.c
@@ -0,0 +1,725 @@
+// SPDX-License-Identifier: GPL-2.0
+/* net/sched/cls_range.c   Range classifier
+ *
+ * Copyright (c) 2018 Amritha Nambiar 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under 

[net-next,RFC PATCH] Introduce TC Range classifier

2018-09-13 Thread Amritha Nambiar
This patch introduces a TC range classifier to support filtering based
on ranges. Only port-range filters are supported currently. This can
be combined with flower classifier to support filters that are a
combination of port-ranges and other parameters based on existing
fields supported by cls_flower. The 'goto chain' action can be used to
combine the flower and range filter.
The filter precedence is decided based on the 'prio' value.

Example:
1. Match on a port range:
---
$ tc filter add dev enp4s0 protocol ip parent : prio 2 range\
ip_proto tcp dst_port 1-15 skip_hw action drop

$ tc -s filter show dev enp4s0 parent :
filter protocol ip pref 2 range chain 0
filter protocol ip pref 2 range chain 0 handle 0x1
  eth_type ipv4
  ip_proto tcp
  dst_port_min 1
  dst_port_max 15
  skip_hw
  not_in_hw
action order 1: gact action drop
 random type none pass val 0
 index 1 ref 1 bind 1 installed 34 sec used 2 sec
Action statistics:
Sent 1380 bytes 30 pkt (dropped 30, overlimits 0 requeues 0)
backlog 0b 0p requeues 0

2. Match on IP address and port range:
--
$ tc filter add dev enp4s0 protocol ip parent : prio 2 flower\
  dst_ip 192.168.1.1 skip_hw action goto chain 11

$ tc filter add dev enp4s0 protocol ip parent : prio 2 chain 11\
  range ip_proto tcp dst_port 1-15 action drop

$ tc -s filter show dev enp4s0 parent :
filter protocol ip pref 2 flower chain 0
filter protocol ip pref 2 flower chain 0 handle 0x1
  eth_type ipv4
  dst_ip 192.168.1.1
  skip_hw
  not_in_hw
action order 1: gact action goto chain 11
 random type none pass val 0
 index 1 ref 1 bind 1 installed 1426 sec used 2 sec
Action statistics:
Sent 460 bytes 10 pkt (dropped 0, overlimits 0 requeues 0)
backlog 0b 0p requeues 0

filter protocol ip pref 2 range chain 11
filter protocol ip pref 2 range chain 11 handle 0x1
  eth_type ipv4
  ip_proto tcp
  dst_port_min 1
  dst_port_max 15
  not_in_hw
action order 1: gact action drop
 random type none pass val 0
 index 2 ref 1 bind 1 installed 1310 sec used 2 sec
Action statistics:
Sent 460 bytes 10 pkt (dropped 10, overlimits 0 requeues 0)
backlog 0b 0p requeues 0
---

Amritha Nambiar (1):
  net: sched: cls_range: Introduce Range classifier


 include/uapi/linux/pkt_cls.h |   19 +
 net/sched/Kconfig|   10 +
 net/sched/Makefile   |1 
 net/sched/cls_range.c|  725 ++
 4 files changed, 755 insertions(+)
 create mode 100644 net/sched/cls_range.c

--


[net-next PATCH v6 6/7] net-sysfs: Add interface for Rx queue(s) map per Tx queue

2018-06-30 Thread Amritha Nambiar
Extend transmit queue sysfs attribute to configure Rx queue(s) map
per Tx queue. By default no receive queues are configured for the
Tx queue.

- /sys/class/net/eth0/queues/tx-*/xps_rxqs

Signed-off-by: Amritha Nambiar 
---
 net/core/net-sysfs.c |   83 ++
 1 file changed, 83 insertions(+)

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index b39987c..f25ac5f 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1283,6 +1283,88 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue,
 
 static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init
= __ATTR_RW(xps_cpus);
+
+static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf)
+{
+   struct net_device *dev = queue->dev;
+   struct xps_dev_maps *dev_maps;
+   unsigned long *mask, index;
+   int j, len, num_tc = 1, tc = 0;
+
+   index = get_netdev_queue_index(queue);
+
+   if (dev->num_tc) {
+   num_tc = dev->num_tc;
+   tc = netdev_txq_to_tc(dev, index);
+   if (tc < 0)
+   return -EINVAL;
+   }
+   mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long),
+  GFP_KERNEL);
+   if (!mask)
+   return -ENOMEM;
+
+   rcu_read_lock();
+   dev_maps = rcu_dereference(dev->xps_rxqs_map);
+   if (!dev_maps)
+   goto out_no_maps;
+
+   for (j = -1; j = netif_attrmask_next(j, NULL, dev->num_rx_queues),
+j < dev->num_rx_queues;) {
+   int i, tci = j * num_tc + tc;
+   struct xps_map *map;
+
+   map = rcu_dereference(dev_maps->attr_map[tci]);
+   if (!map)
+   continue;
+
+   for (i = map->len; i--;) {
+   if (map->queues[i] == index) {
+   set_bit(j, mask);
+   break;
+   }
+   }
+   }
+out_no_maps:
+   rcu_read_unlock();
+
+   len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues);
+   kfree(mask);
+
+   return len < PAGE_SIZE ? len : -EINVAL;
+}
+
+static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
+ size_t len)
+{
+   struct net_device *dev = queue->dev;
+   struct net *net = dev_net(dev);
+   unsigned long *mask, index;
+   int err;
+
+   if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+   return -EPERM;
+
+   mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long),
+  GFP_KERNEL);
+   if (!mask)
+   return -ENOMEM;
+
+   index = get_netdev_queue_index(queue);
+
+   err = bitmap_parse(buf, len, mask, dev->num_rx_queues);
+   if (err) {
+   kfree(mask);
+   return err;
+   }
+
+   err = __netif_set_xps_queue(dev, mask, index, true);
+   kfree(mask);
+   return err ? : len;
+}
+
+static struct netdev_queue_attribute xps_rxqs_attribute __ro_after_init
+   = __ATTR_RW(xps_rxqs);
 #endif /* CONFIG_XPS */
 
 static struct attribute *netdev_queue_default_attrs[] __ro_after_init = {
@@ -1290,6 +1372,7 @@ static struct attribute *netdev_queue_default_attrs[] 
__ro_after_init = {
&queue_traffic_class.attr,
 #ifdef CONFIG_XPS
&xps_cpus_attribute.attr,
+   &xps_rxqs_attribute.attr,
&queue_tx_maxrate.attr,
 #endif
NULL



[net-next PATCH v6 7/7] Documentation: Add explanation for XPS using Rx-queue(s) map

2018-06-30 Thread Amritha Nambiar
Signed-off-by: Amritha Nambiar 
---
 Documentation/ABI/testing/sysfs-class-net-queues |   11 
 Documentation/networking/scaling.txt |   61 ++
 2 files changed, 61 insertions(+), 11 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-class-net-queues 
b/Documentation/ABI/testing/sysfs-class-net-queues
index 0c0df91..978b763 100644
--- a/Documentation/ABI/testing/sysfs-class-net-queues
+++ b/Documentation/ABI/testing/sysfs-class-net-queues
@@ -42,6 +42,17 @@ Description:
network device transmit queue. Possible vaules depend on the
number of available CPU(s) in the system.
 
+What:  /sys/class//queues/tx-/xps_rxqs
+Date:  June 2018
+KernelVersion: 4.18.0
+Contact:   netdev@vger.kernel.org
+Description:
+   Mask of the receive queue(s) currently enabled to participate
+   into the Transmit Packet Steering packet processing flow for 
this
+   network device transmit queue. Possible values depend on the
+   number of available receive queue(s) in the network device.
+   Default is disabled.
+
 What:  /sys/class//queues/tx-/byte_queue_limits/hold_time
 Date:  November 2011
 KernelVersion: 3.3
diff --git a/Documentation/networking/scaling.txt 
b/Documentation/networking/scaling.txt
index f55639d..b7056a8 100644
--- a/Documentation/networking/scaling.txt
+++ b/Documentation/networking/scaling.txt
@@ -366,8 +366,13 @@ XPS: Transmit Packet Steering
 
 Transmit Packet Steering is a mechanism for intelligently selecting
 which transmit queue to use when transmitting a packet on a multi-queue
-device. To accomplish this, a mapping from CPU to hardware queue(s) is
-recorded. The goal of this mapping is usually to assign queues
+device. This can be accomplished by recording two kinds of maps, either
+a mapping of CPU to hardware queue(s) or a mapping of receive queue(s)
+to hardware transmit queue(s).
+
+1. XPS using CPUs map
+
+The goal of this mapping is usually to assign queues
 exclusively to a subset of CPUs, where the transmit completions for
 these queues are processed on a CPU within this set. This choice
 provides two benefits. First, contention on the device queue lock is
@@ -377,15 +382,40 @@ transmit queue). Secondly, cache miss rate on transmit 
completion is
 reduced, in particular for data cache lines that hold the sk_buff
 structures.
 
-XPS is configured per transmit queue by setting a bitmap of CPUs that
-may use that queue to transmit. The reverse mapping, from CPUs to
-transmit queues, is computed and maintained for each network device.
-When transmitting the first packet in a flow, the function
-get_xps_queue() is called to select a queue. This function uses the ID
-of the running CPU as a key into the CPU-to-queue lookup table. If the
+2. XPS using receive queues map
+
+This mapping is used to pick transmit queue based on the receive
+queue(s) map configuration set by the administrator. A set of receive
+queues can be mapped to a set of transmit queues (many:many), although
+the common use case is a 1:1 mapping. This will enable sending packets
+on the same queue associations for transmit and receive. This is useful for
+busy polling multi-threaded workloads where there are challenges in
+associating a given CPU to a given application thread. The application
+threads are not pinned to CPUs and each thread handles packets
+received on a single queue. The receive queue number is cached in the
+socket for the connection. In this model, sending the packets on the same
+transmit queue corresponding to the associated receive queue has benefits
+in keeping the CPU overhead low. Transmit completion work is locked into
+the same queue-association that a given application is polling on. This
+avoids the overhead of triggering an interrupt on another CPU. When the
+application cleans up the packets during the busy poll, transmit completion
+may be processed along with it in the same thread context and so result in
+reduced latency.
+
+XPS is configured per transmit queue by setting a bitmap of
+CPUs/receive-queues that may use that queue to transmit. The reverse
+mapping, from CPUs to transmit queues or from receive-queues to transmit
+queues, is computed and maintained for each network device. When
+transmitting the first packet in a flow, the function get_xps_queue() is
+called to select a queue. This function uses the ID of the receive queue
+for the socket connection for a match in the receive queue-to-transmit queue
+lookup table. Alternatively, this function can also use the ID of the
+running CPU as a key into the CPU-to-queue lookup table. If the
 ID matches a single queue, that is used for transmission. If multiple
 queues match, one is selected by using the flow hash to compute an index
-into the set.
+into the set. When selecting the transmit queue based on receive queue(s)
+map, the transmit device is not validated against

[net-next PATCH v6 5/7] net: Enable Tx queue selection based on Rx queues

2018-06-30 Thread Amritha Nambiar
This patch adds support to pick Tx queue based on the Rx queue(s) map
configuration set by the admin through the sysfs attribute
for each Tx queue. If the user configuration for receive queue(s) map
does not apply, then the Tx queue selection falls back to CPU(s) map
based selection and finally to hashing.

Signed-off-by: Amritha Nambiar 
---
 include/net/sock.h |   10 
 net/core/dev.c |   62 ++--
 2 files changed, 55 insertions(+), 17 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 2b097cc..2ed99bf 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1730,6 +1730,16 @@ static inline void sk_rx_queue_clear(struct sock *sk)
 #endif
 }
 
+#ifdef CONFIG_XPS
+static inline int sk_rx_queue_get(const struct sock *sk)
+{
+   if (sk && sk->sk_rx_queue_mapping != NO_QUEUE_MAPPING)
+   return sk->sk_rx_queue_mapping;
+
+   return -1;
+}
+#endif
+
 static inline void sk_set_socket(struct sock *sk, struct socket *sock)
 {
sk_tx_queue_clear(sk);
diff --git a/net/core/dev.c b/net/core/dev.c
index 43b5575..08d58e0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3459,35 +3459,63 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct 
net_device *dev)
 }
 #endif /* CONFIG_NET_EGRESS */
 
-static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
+#ifdef CONFIG_XPS
+static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
+  struct xps_dev_maps *dev_maps, unsigned int tci)
+{
+   struct xps_map *map;
+   int queue_index = -1;
+
+   if (dev->num_tc) {
+   tci *= dev->num_tc;
+   tci += netdev_get_prio_tc_map(dev, skb->priority);
+   }
+
+   map = rcu_dereference(dev_maps->attr_map[tci]);
+   if (map) {
+   if (map->len == 1)
+   queue_index = map->queues[0];
+   else
+   queue_index = map->queues[reciprocal_scale(
+   skb_get_hash(skb), map->len)];
+   if (unlikely(queue_index >= dev->real_num_tx_queues))
+   queue_index = -1;
+   }
+   return queue_index;
+}
+#endif
+
+static int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
 {
 #ifdef CONFIG_XPS
struct xps_dev_maps *dev_maps;
-   struct xps_map *map;
+   struct sock *sk = skb->sk;
int queue_index = -1;
 
if (!static_key_false(_needed))
return -1;
 
rcu_read_lock();
-   dev_maps = rcu_dereference(dev->xps_cpus_map);
+   if (!static_key_false(_rxqs_needed))
+   goto get_cpus_map;
+
+   dev_maps = rcu_dereference(dev->xps_rxqs_map);
if (dev_maps) {
-   unsigned int tci = skb->sender_cpu - 1;
+   int tci = sk_rx_queue_get(sk);
 
-   if (dev->num_tc) {
-   tci *= dev->num_tc;
-   tci += netdev_get_prio_tc_map(dev, skb->priority);
-   }
+   if (tci >= 0 && tci < dev->num_rx_queues)
+   queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
+ tci);
+   }
 
-   map = rcu_dereference(dev_maps->attr_map[tci]);
-   if (map) {
-   if (map->len == 1)
-   queue_index = map->queues[0];
-   else
-   queue_index = 
map->queues[reciprocal_scale(skb_get_hash(skb),
-  
map->len)];
-   if (unlikely(queue_index >= dev->real_num_tx_queues))
-   queue_index = -1;
+get_cpus_map:
+   if (queue_index < 0) {
+   dev_maps = rcu_dereference(dev->xps_cpus_map);
+   if (dev_maps) {
+   unsigned int tci = skb->sender_cpu - 1;
+
+   queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
+ tci);
}
}
rcu_read_unlock();



[net-next PATCH v6 0/7] Symmetric queue selection using XPS for Rx queues

2018-06-30 Thread Amritha Nambiar
   4.668
(% of all cache refs)

L1-dcache-load- 6.556.29
-misses
(% of all L1-dcache hits)

LLC-load-misses 13.91   10.44
(% of all LL-cache hits)

---

v6:
- Changed the names of some functions to begin with net_if.
- Cleaned up sk_tx_queue_set/sk_rx_queue_set functions.
- Added sk_rx_queue_clear to make it consistent with tx_queue_mapping
  initialization.

---

Amritha Nambiar (7):
  net: Refactor XPS for CPUs and Rx queues
  net: Use static_key for XPS maps
  net: sock: Change tx_queue_mapping in sock_common to unsigned short
  net: Record receive queue number for a connection
  net: Enable Tx queue selection based on Rx queues
  net-sysfs: Add interface for Rx queue(s) map per Tx queue
  Documentation: Add explanation for XPS using Rx-queue(s) map


 Documentation/ABI/testing/sysfs-class-net-queues |   11 +
 Documentation/networking/scaling.txt |   61 -
 include/linux/cpumask.h  |   11 +
 include/linux/netdevice.h|   98 +++
 include/net/busy_poll.h  |1 
 include/net/sock.h   |   52 
 net/core/dev.c   |  288 +++---
 net/core/net-sysfs.c |   87 ++-
 net/core/sock.c  |2 
 net/ipv4/tcp_input.c |3 
 10 files changed, 505 insertions(+), 109 deletions(-)

--


[net-next PATCH v6 1/7] net: Refactor XPS for CPUs and Rx queues

2018-06-30 Thread Amritha Nambiar
Refactor XPS code to support Tx queue selection based on
CPU(s) map or Rx queue(s) map.

Signed-off-by: Amritha Nambiar 
---
 include/linux/cpumask.h   |   11 ++
 include/linux/netdevice.h |   98 -
 net/core/dev.c|  211 ++---
 net/core/net-sysfs.c  |4 -
 4 files changed, 244 insertions(+), 80 deletions(-)

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index bf53d89..57f20a0 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -115,12 +115,17 @@ extern struct cpumask __cpu_active_mask;
 #define cpu_active(cpu)((cpu) == 0)
 #endif
 
-/* verify cpu argument to cpumask_* operators */
-static inline unsigned int cpumask_check(unsigned int cpu)
+static inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits)
 {
 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
-   WARN_ON_ONCE(cpu >= nr_cpumask_bits);
+   WARN_ON_ONCE(cpu >= bits);
 #endif /* CONFIG_DEBUG_PER_CPU_MAPS */
+}
+
+/* verify cpu argument to cpumask_* operators */
+static inline unsigned int cpumask_check(unsigned int cpu)
+{
+   cpu_max_bits_warn(cpu, nr_cpumask_bits);
return cpu;
 }
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c6b377a..8bf8d61 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -731,10 +731,15 @@ struct xps_map {
  */
 struct xps_dev_maps {
struct rcu_head rcu;
-   struct xps_map __rcu *cpu_map[0];
+   struct xps_map __rcu *attr_map[0]; /* Either CPUs map or RXQs map */
 };
-#define XPS_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \
+
+#define XPS_CPU_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \
(nr_cpu_ids * (_tcs) * sizeof(struct xps_map *)))
+
+#define XPS_RXQ_DEV_MAPS_SIZE(_tcs, _rxqs) (sizeof(struct xps_dev_maps) +\
+   (_rxqs * (_tcs) * sizeof(struct xps_map *)))
+
 #endif /* CONFIG_XPS */
 
 #define TC_MAX_QUEUE   16
@@ -1910,7 +1915,8 @@ struct net_device {
int watchdog_timeo;
 
 #ifdef CONFIG_XPS
-   struct xps_dev_maps __rcu *xps_maps;
+   struct xps_dev_maps __rcu *xps_cpus_map;
+   struct xps_dev_maps __rcu *xps_rxqs_map;
 #endif
 #ifdef CONFIG_NET_CLS_ACT
struct mini_Qdisc __rcu *miniq_egress;
@@ -3259,6 +3265,92 @@ static inline void netif_wake_subqueue(struct net_device 
*dev, u16 queue_index)
 #ifdef CONFIG_XPS
 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
u16 index);
+int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
+ u16 index, bool is_rxqs_map);
+
+/**
+ * netif_attr_test_mask - Test a CPU or Rx queue set in a mask
+ * @j: CPU/Rx queue index
+ * @mask: bitmask of all cpus/rx queues
+ * @nr_bits: number of bits in the bitmask
+ *
+ * Test if a CPU or Rx queue index is set in a mask of all CPU/Rx queues.
+ */
+static inline bool netif_attr_test_mask(unsigned long j,
+   const unsigned long *mask,
+   unsigned int nr_bits)
+{
+   cpu_max_bits_warn(j, nr_bits);
+   return test_bit(j, mask);
+}
+
+/**
+ * netif_attr_test_online - Test for online CPU/Rx queue
+ * @j: CPU/Rx queue index
+ * @online_mask: bitmask for CPUs/Rx queues that are online
+ * @nr_bits: number of bits in the bitmask
+ *
+ * Returns true if a CPU/Rx queue is online.
+ */
+static inline bool netif_attr_test_online(unsigned long j,
+ const unsigned long *online_mask,
+ unsigned int nr_bits)
+{
+   cpu_max_bits_warn(j, nr_bits);
+
+   if (online_mask)
+   return test_bit(j, online_mask);
+
+   return (j < nr_bits);
+}
+
+/**
+ * netif_attrmask_next - get the next CPU/Rx queue in a cpu/Rx queues mask
+ * @n: CPU/Rx queue index
+ * @srcp: the cpumask/Rx queue mask pointer
+ * @nr_bits: number of bits in the bitmask
+ *
+ * Returns >= nr_bits if no further CPUs/Rx queues set.
+ */
+static inline unsigned int netif_attrmask_next(int n, const unsigned long 
*srcp,
+  unsigned int nr_bits)
+{
+   /* -1 is a legal arg here. */
+   if (n != -1)
+   cpu_max_bits_warn(n, nr_bits);
+
+   if (srcp)
+   return find_next_bit(srcp, nr_bits, n + 1);
+
+   return n + 1;
+}
+
+/**
+ * netif_attrmask_next_and - get the next CPU/Rx queue in *src1p & *src2p
+ * @n: CPU/Rx queue index
+ * @src1p: the first CPUs/Rx queues mask pointer
+ * @src2p: the second CPUs/Rx queues mask pointer
+ * @nr_bits: number of bits in the bitmask
+ *
+ * Returns >= nr_bits if no further CPUs/Rx queues set in both.
+ */
+static inline int netif_attrmask_next_and(int n, const unsigned long *src1p,
+ const u

[net-next PATCH v6 3/7] net: sock: Change tx_queue_mapping in sock_common to unsigned short

2018-06-30 Thread Amritha Nambiar
Change 'skc_tx_queue_mapping' field in sock_common structure from
'int' to 'unsigned short' type with ~0 indicating unset and
other positive queue values being set. This will accommodate adding
a new 'unsigned short' field in sock_common in the next patch for
rx_queue_mapping.

Signed-off-by: Amritha Nambiar 
---
 include/net/sock.h |   14 +++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index b3b7541..37b09c8 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -214,7 +214,7 @@ struct sock_common {
struct hlist_node   skc_node;
struct hlist_nulls_node skc_nulls_node;
};
-   int skc_tx_queue_mapping;
+   unsigned short  skc_tx_queue_mapping;
union {
int skc_incoming_cpu;
u32 skc_rcv_wnd;
@@ -1681,17 +1681,25 @@ static inline int sk_receive_skb(struct sock *sk, 
struct sk_buff *skb,
 
 static inline void sk_tx_queue_set(struct sock *sk, int tx_queue)
 {
+   /* sk_tx_queue_mapping accepts only up to a 16-bit value */
+   if (WARN_ON_ONCE((unsigned short)tx_queue >= USHRT_MAX))
+   return;
sk->sk_tx_queue_mapping = tx_queue;
 }
 
+#define NO_QUEUE_MAPPING   USHRT_MAX
+
 static inline void sk_tx_queue_clear(struct sock *sk)
 {
-   sk->sk_tx_queue_mapping = -1;
+   sk->sk_tx_queue_mapping = NO_QUEUE_MAPPING;
 }
 
 static inline int sk_tx_queue_get(const struct sock *sk)
 {
-   return sk ? sk->sk_tx_queue_mapping : -1;
+   if (sk && sk->sk_tx_queue_mapping != NO_QUEUE_MAPPING)
+   return sk->sk_tx_queue_mapping;
+
+   return -1;
 }
 
 static inline void sk_set_socket(struct sock *sk, struct socket *sock)



[net-next PATCH v6 4/7] net: Record receive queue number for a connection

2018-06-30 Thread Amritha Nambiar
This patch adds a new field to sock_common 'skc_rx_queue_mapping'
which holds the receive queue number for the connection. The Rx queue
is marked in tcp_finish_connect() to allow a client app to do
SO_INCOMING_NAPI_ID after a connect() call to get the right queue
association for a socket. Rx queue is also marked in tcp_conn_request()
to allow syn-ack to go on the right tx-queue associated with
the queue on which syn is received.

Signed-off-by: Amritha Nambiar 
Signed-off-by: Sridhar Samudrala 
---
 include/net/busy_poll.h |1 +
 include/net/sock.h  |   28 
 net/core/sock.c |2 ++
 net/ipv4/tcp_input.c|3 +++
 4 files changed, 34 insertions(+)

diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index c518743..9e36fda6 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -151,6 +151,7 @@ static inline void sk_mark_napi_id(struct sock *sk, const 
struct sk_buff *skb)
 #ifdef CONFIG_NET_RX_BUSY_POLL
sk->sk_napi_id = skb->napi_id;
 #endif
+   sk_rx_queue_set(sk, skb);
 }
 
 /* variant used for unconnected sockets */
diff --git a/include/net/sock.h b/include/net/sock.h
index 37b09c8..2b097cc 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -139,6 +139,7 @@ typedef __u64 __bitwise __addrpair;
  * @skc_node: main hash linkage for various protocol lookup tables
  * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
  * @skc_tx_queue_mapping: tx queue number for this connection
+ * @skc_rx_queue_mapping: rx queue number for this connection
  * @skc_flags: place holder for sk_flags
  * %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
  * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
@@ -215,6 +216,9 @@ struct sock_common {
struct hlist_nulls_node skc_nulls_node;
};
unsigned short  skc_tx_queue_mapping;
+#ifdef CONFIG_XPS
+   unsigned short  skc_rx_queue_mapping;
+#endif
union {
int skc_incoming_cpu;
u32 skc_rcv_wnd;
@@ -326,6 +330,9 @@ struct sock {
 #define sk_nulls_node  __sk_common.skc_nulls_node
 #define sk_refcnt  __sk_common.skc_refcnt
 #define sk_tx_queue_mapping__sk_common.skc_tx_queue_mapping
+#ifdef CONFIG_XPS
+#define sk_rx_queue_mapping__sk_common.skc_rx_queue_mapping
+#endif
 
 #define sk_dontcopy_begin  __sk_common.skc_dontcopy_begin
 #define sk_dontcopy_end__sk_common.skc_dontcopy_end
@@ -1702,6 +1709,27 @@ static inline int sk_tx_queue_get(const struct sock *sk)
return -1;
 }
 
+static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb)
+{
+#ifdef CONFIG_XPS
+   if (skb_rx_queue_recorded(skb)) {
+   u16 rx_queue = skb_get_rx_queue(skb);
+
+   if (WARN_ON_ONCE(rx_queue == NO_QUEUE_MAPPING))
+   return;
+
+   sk->sk_rx_queue_mapping = rx_queue;
+   }
+#endif
+}
+
+static inline void sk_rx_queue_clear(struct sock *sk)
+{
+#ifdef CONFIG_XPS
+   sk->sk_rx_queue_mapping = NO_QUEUE_MAPPING;
+#endif
+}
+
 static inline void sk_set_socket(struct sock *sk, struct socket *sock)
 {
sk_tx_queue_clear(sk);
diff --git a/net/core/sock.c b/net/core/sock.c
index bcc4182..dac6d78 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2818,6 +2818,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_pacing_rate = ~0U;
sk->sk_pacing_shift = 10;
sk->sk_incoming_cpu = -1;
+
+   sk_rx_queue_clear(sk);
/*
 * Before updating sk_refcnt, we must commit prior changes to memory
 * (Documentation/RCU/rculist_nulls.txt for details)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9c5b341..b3b5aef 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -78,6 +78,7 @@
 #include 
 #include 
 #include 
+#include 
 
 int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
 
@@ -5588,6 +5589,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff 
*skb)
if (skb) {
icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
security_inet_conn_established(sk, skb);
+   sk_mark_napi_id(sk, skb);
}
 
tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
@@ -6416,6 +6418,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_rsk(req)->snt_isn = isn;
tcp_rsk(req)->txhash = net_tx_rndhash();
tcp_openreq_init_rwin(req, sk, dst);
+   sk_rx_queue_set(req_to_sk(req), skb);
if (!want_cookie) {
tcp_reqsk_record_syn(sk, req, skb);
fastopen_sk = tcp_try_fastopen(sk, skb, req, , dst);



[net-next PATCH v6 2/7] net: Use static_key for XPS maps

2018-06-30 Thread Amritha Nambiar
Use static_key for XPS maps to reduce the cost of extra map checks,
similar to how it is used for RPS and RFS. This includes static_key
'xps_needed' for XPS and another for 'xps_rxqs_needed' for XPS using
Rx queues map.

Signed-off-by: Amritha Nambiar 
---
 net/core/dev.c |   31 +--
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 7105955..43b5575 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2081,6 +2081,10 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned 
int txq)
 EXPORT_SYMBOL(netdev_txq_to_tc);
 
 #ifdef CONFIG_XPS
+struct static_key xps_needed __read_mostly;
+EXPORT_SYMBOL(xps_needed);
+struct static_key xps_rxqs_needed __read_mostly;
+EXPORT_SYMBOL(xps_rxqs_needed);
 static DEFINE_MUTEX(xps_map_mutex);
 #define xmap_dereference(P)\
rcu_dereference_protected((P), lockdep_is_held(_map_mutex))
@@ -2168,14 +2172,18 @@ static void netif_reset_xps_queues(struct net_device 
*dev, u16 offset,
struct xps_dev_maps *dev_maps;
unsigned int nr_ids;
 
-   mutex_lock(_map_mutex);
+   if (!static_key_false(_needed))
+   return;
 
-   dev_maps = xmap_dereference(dev->xps_rxqs_map);
-   if (dev_maps) {
-   nr_ids = dev->num_rx_queues;
-   clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset,
-  count, true);
+   mutex_lock(_map_mutex);
 
+   if (static_key_false(_rxqs_needed)) {
+   dev_maps = xmap_dereference(dev->xps_rxqs_map);
+   if (dev_maps) {
+   nr_ids = dev->num_rx_queues;
+   clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
+  offset, count, true);
+   }
}
 
dev_maps = xmap_dereference(dev->xps_cpus_map);
@@ -2189,6 +2197,10 @@ static void netif_reset_xps_queues(struct net_device 
*dev, u16 offset,
   false);
 
 out_no_maps:
+   if (static_key_enabled(_rxqs_needed))
+   static_key_slow_dec(_rxqs_needed);
+
+   static_key_slow_dec(_needed);
mutex_unlock(_map_mutex);
 }
 
@@ -2297,6 +2309,10 @@ int __netif_set_xps_queue(struct net_device *dev, const 
unsigned long *mask,
if (!new_dev_maps)
goto out_no_new_maps;
 
+   static_key_slow_inc(_needed);
+   if (is_rxqs_map)
+   static_key_slow_inc(_rxqs_needed);
+
for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
 j < nr_ids;) {
/* copy maps belonging to foreign traffic classes */
@@ -3450,6 +3466,9 @@ static inline int get_xps_queue(struct net_device *dev, 
struct sk_buff *skb)
struct xps_map *map;
int queue_index = -1;
 
+   if (!static_key_false(_needed))
+   return -1;
+
rcu_read_lock();
dev_maps = rcu_dereference(dev->xps_cpus_map);
if (dev_maps) {



[net-next PATCH v5 7/7] Documentation: Add explanation for XPS using Rx-queue(s) map

2018-06-27 Thread Amritha Nambiar
Signed-off-by: Amritha Nambiar 
---
 Documentation/ABI/testing/sysfs-class-net-queues |   11 
 Documentation/networking/scaling.txt |   61 ++
 2 files changed, 61 insertions(+), 11 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-class-net-queues 
b/Documentation/ABI/testing/sysfs-class-net-queues
index 0c0df91..978b763 100644
--- a/Documentation/ABI/testing/sysfs-class-net-queues
+++ b/Documentation/ABI/testing/sysfs-class-net-queues
@@ -42,6 +42,17 @@ Description:
network device transmit queue. Possible values depend on the
number of available CPU(s) in the system.
 
+What:  /sys/class//queues/tx-/xps_rxqs
+Date:  June 2018
+KernelVersion: 4.18.0
+Contact:   netdev@vger.kernel.org
+Description:
+   Mask of the receive queue(s) currently enabled to participate
+   into the Transmit Packet Steering packet processing flow for 
this
+   network device transmit queue. Possible values depend on the
+   number of available receive queue(s) in the network device.
+   Default is disabled.
+
 What:  /sys/class//queues/tx-/byte_queue_limits/hold_time
 Date:  November 2011
 KernelVersion: 3.3
diff --git a/Documentation/networking/scaling.txt 
b/Documentation/networking/scaling.txt
index f55639d..b7056a8 100644
--- a/Documentation/networking/scaling.txt
+++ b/Documentation/networking/scaling.txt
@@ -366,8 +366,13 @@ XPS: Transmit Packet Steering
 
 Transmit Packet Steering is a mechanism for intelligently selecting
 which transmit queue to use when transmitting a packet on a multi-queue
-device. To accomplish this, a mapping from CPU to hardware queue(s) is
-recorded. The goal of this mapping is usually to assign queues
+device. This can be accomplished by recording two kinds of maps, either
+a mapping of CPU to hardware queue(s) or a mapping of receive queue(s)
+to hardware transmit queue(s).
+
+1. XPS using CPUs map
+
+The goal of this mapping is usually to assign queues
 exclusively to a subset of CPUs, where the transmit completions for
 these queues are processed on a CPU within this set. This choice
 provides two benefits. First, contention on the device queue lock is
@@ -377,15 +382,40 @@ transmit queue). Secondly, cache miss rate on transmit 
completion is
 reduced, in particular for data cache lines that hold the sk_buff
 structures.
 
-XPS is configured per transmit queue by setting a bitmap of CPUs that
-may use that queue to transmit. The reverse mapping, from CPUs to
-transmit queues, is computed and maintained for each network device.
-When transmitting the first packet in a flow, the function
-get_xps_queue() is called to select a queue. This function uses the ID
-of the running CPU as a key into the CPU-to-queue lookup table. If the
+2. XPS using receive queues map
+
+This mapping is used to pick transmit queue based on the receive
+queue(s) map configuration set by the administrator. A set of receive
+queues can be mapped to a set of transmit queues (many:many), although
+the common use case is a 1:1 mapping. This will enable sending packets
+on the same queue associations for transmit and receive. This is useful for
+busy polling multi-threaded workloads where there are challenges in
+associating a given CPU to a given application thread. The application
+threads are not pinned to CPUs and each thread handles packets
+received on a single queue. The receive queue number is cached in the
+socket for the connection. In this model, sending the packets on the same
+transmit queue corresponding to the associated receive queue has benefits
+in keeping the CPU overhead low. Transmit completion work is locked into
+the same queue-association that a given application is polling on. This
+avoids the overhead of triggering an interrupt on another CPU. When the
+application cleans up the packets during the busy poll, transmit completion
+may be processed along with it in the same thread context and so result in
+reduced latency.
+
+XPS is configured per transmit queue by setting a bitmap of
+CPUs/receive-queues that may use that queue to transmit. The reverse
+mapping, from CPUs to transmit queues or from receive-queues to transmit
+queues, is computed and maintained for each network device. When
+transmitting the first packet in a flow, the function get_xps_queue() is
+called to select a queue. This function uses the ID of the receive queue
+for the socket connection for a match in the receive queue-to-transmit queue
+lookup table. Alternatively, this function can also use the ID of the
+running CPU as a key into the CPU-to-queue lookup table. If the
 ID matches a single queue, that is used for transmission. If multiple
 queues match, one is selected by using the flow hash to compute an index
-into the set.
+into the set. When selecting the transmit queue based on receive queue(s)
+map, the transmit device is not validated against

[net-next PATCH v5 4/7] net: Record receive queue number for a connection

2018-06-27 Thread Amritha Nambiar
This patch adds a new field to sock_common 'skc_rx_queue_mapping'
which holds the receive queue number for the connection. The Rx queue
is marked in tcp_finish_connect() to allow a client app to do
SO_INCOMING_NAPI_ID after a connect() call to get the right queue
association for a socket. Rx queue is also marked in tcp_conn_request()
to allow syn-ack to go on the right tx-queue associated with
the queue on which syn is received.

Signed-off-by: Amritha Nambiar 
Signed-off-by: Sridhar Samudrala 
---
 include/net/busy_poll.h |1 +
 include/net/sock.h  |   14 ++
 net/core/sock.c |4 
 net/ipv4/tcp_input.c|3 +++
 4 files changed, 22 insertions(+)

diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index c518743..9e36fda6 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -151,6 +151,7 @@ static inline void sk_mark_napi_id(struct sock *sk, const 
struct sk_buff *skb)
 #ifdef CONFIG_NET_RX_BUSY_POLL
sk->sk_napi_id = skb->napi_id;
 #endif
+   sk_rx_queue_set(sk, skb);
 }
 
 /* variant used for unconnected sockets */
diff --git a/include/net/sock.h b/include/net/sock.h
index 0a7d57b..f73dbca 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -139,6 +139,7 @@ typedef __u64 __bitwise __addrpair;
  * @skc_node: main hash linkage for various protocol lookup tables
  * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
  * @skc_tx_queue_mapping: tx queue number for this connection
+ * @skc_rx_queue_mapping: rx queue number for this connection
  * @skc_flags: place holder for sk_flags
  * %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
  * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
@@ -215,6 +216,9 @@ struct sock_common {
struct hlist_nulls_node skc_nulls_node;
};
unsigned short  skc_tx_queue_mapping;
+#ifdef CONFIG_XPS
+   unsigned short  skc_rx_queue_mapping;
+#endif
union {
int skc_incoming_cpu;
u32 skc_rcv_wnd;
@@ -326,6 +330,9 @@ struct sock {
 #define sk_nulls_node  __sk_common.skc_nulls_node
 #define sk_refcnt  __sk_common.skc_refcnt
 #define sk_tx_queue_mapping__sk_common.skc_tx_queue_mapping
+#ifdef CONFIG_XPS
+#define sk_rx_queue_mapping__sk_common.skc_rx_queue_mapping
+#endif
 
 #define sk_dontcopy_begin  __sk_common.skc_dontcopy_begin
 #define sk_dontcopy_end__sk_common.skc_dontcopy_end
@@ -1702,6 +1709,13 @@ static inline int sk_tx_queue_get(const struct sock *sk)
return -1;
 }
 
+static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb)
+{
+#ifdef CONFIG_XPS
+   sk->sk_rx_queue_mapping = skb_get_rx_queue(skb);
+#endif
+}
+
 static inline void sk_set_socket(struct sock *sk, struct socket *sock)
 {
sk_tx_queue_clear(sk);
diff --git a/net/core/sock.c b/net/core/sock.c
index bcc4182..fe8cb25c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2818,6 +2818,10 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_pacing_rate = ~0U;
sk->sk_pacing_shift = 10;
sk->sk_incoming_cpu = -1;
+
+#ifdef CONFIG_XPS
+   sk->sk_rx_queue_mapping = NO_QUEUE_MAPPING;
+#endif
/*
 * Before updating sk_refcnt, we must commit prior changes to memory
 * (Documentation/RCU/rculist_nulls.txt for details)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9c5b341..b3b5aef 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -78,6 +78,7 @@
 #include 
 #include 
 #include 
+#include 
 
 int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
 
@@ -5588,6 +5589,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff 
*skb)
if (skb) {
icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
security_inet_conn_established(sk, skb);
+   sk_mark_napi_id(sk, skb);
}
 
tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
@@ -6416,6 +6418,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_rsk(req)->snt_isn = isn;
tcp_rsk(req)->txhash = net_tx_rndhash();
tcp_openreq_init_rwin(req, sk, dst);
+   sk_rx_queue_set(req_to_sk(req), skb);
if (!want_cookie) {
tcp_reqsk_record_syn(sk, req, skb);
fastopen_sk = tcp_try_fastopen(sk, skb, req, , dst);



[net-next PATCH v5 6/7] net-sysfs: Add interface for Rx queue(s) map per Tx queue

2018-06-27 Thread Amritha Nambiar
Extend transmit queue sysfs attribute to configure Rx queue(s) map
per Tx queue. By default no receive queues are configured for the
Tx queue.

- /sys/class/net/eth0/queues/tx-*/xps_rxqs

Signed-off-by: Amritha Nambiar 
---
 net/core/net-sysfs.c |   81 ++
 1 file changed, 81 insertions(+)

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index b39987c..f4800c5 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1283,6 +1283,86 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue,
 
 static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init
= __ATTR_RW(xps_cpus);
+
+static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf)
+{
+   struct net_device *dev = queue->dev;
+   struct xps_dev_maps *dev_maps;
+   unsigned long *mask, index;
+   int j, len, num_tc = 1, tc = 0;
+
+   index = get_netdev_queue_index(queue);
+
+   if (dev->num_tc) {
+   num_tc = dev->num_tc;
+   tc = netdev_txq_to_tc(dev, index);
+   if (tc < 0)
+   return -EINVAL;
+   }
+   mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long),
+  GFP_KERNEL);
+   if (!mask)
+   return -ENOMEM;
+
+   rcu_read_lock();
+   dev_maps = rcu_dereference(dev->xps_rxqs_map);
+   if (dev_maps) {
+   for (j = -1; j = attrmask_next(j, NULL, dev->num_rx_queues),
+j < dev->num_rx_queues;) {
+   int i, tci = j * num_tc + tc;
+   struct xps_map *map;
+
+   map = rcu_dereference(dev_maps->attr_map[tci]);
+   if (!map)
+   continue;
+
+   for (i = map->len; i--;) {
+   if (map->queues[i] == index) {
+   set_bit(j, mask);
+   break;
+   }
+   }
+   }
+   }
+   rcu_read_unlock();
+
+   len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues);
+   kfree(mask);
+
+   return len < PAGE_SIZE ? len : -EINVAL;
+}
+
+static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
+ size_t len)
+{
+   struct net_device *dev = queue->dev;
+   struct net *net = dev_net(dev);
+   unsigned long *mask, index;
+   int err;
+
+   if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+   return -EPERM;
+
+   mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long),
+  GFP_KERNEL);
+   if (!mask)
+   return -ENOMEM;
+
+   index = get_netdev_queue_index(queue);
+
+   err = bitmap_parse(buf, len, mask, dev->num_rx_queues);
+   if (err) {
+   kfree(mask);
+   return err;
+   }
+
+   err = __netif_set_xps_queue(dev, mask, index, true);
+   kfree(mask);
+   return err ? : len;
+}
+
+static struct netdev_queue_attribute xps_rxqs_attribute __ro_after_init
+   = __ATTR_RW(xps_rxqs);
 #endif /* CONFIG_XPS */
 
 static struct attribute *netdev_queue_default_attrs[] __ro_after_init = {
@@ -1290,6 +1370,7 @@ static struct attribute *netdev_queue_default_attrs[] 
__ro_after_init = {
_traffic_class.attr,
 #ifdef CONFIG_XPS
_cpus_attribute.attr,
+   _rxqs_attribute.attr,
_tx_maxrate.attr,
 #endif
NULL



[net-next PATCH v5 3/7] net: sock: Change tx_queue_mapping in sock_common to unsigned short

2018-06-27 Thread Amritha Nambiar
Change 'skc_tx_queue_mapping' field in sock_common structure from
'int' to 'unsigned short' type, with ~0 (USHRT_MAX) indicating unset and
any other value indicating a set queue. This will accommodate adding
a new 'unsigned short' field in sock_common in the next patch for
rx_queue_mapping.

Signed-off-by: Amritha Nambiar 
---
 include/net/sock.h |   14 +++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index b3b7541..0a7d57b 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -214,7 +214,7 @@ struct sock_common {
struct hlist_node   skc_node;
struct hlist_nulls_node skc_nulls_node;
};
-   int skc_tx_queue_mapping;
+   unsigned short  skc_tx_queue_mapping;
union {
int skc_incoming_cpu;
u32 skc_rcv_wnd;
@@ -1681,17 +1681,25 @@ static inline int sk_receive_skb(struct sock *sk, 
struct sk_buff *skb,
 
 static inline void sk_tx_queue_set(struct sock *sk, int tx_queue)
 {
+   /* sk_tx_queue_mapping accepts only up to a 16-bit value */
+   if (WARN_ON_ONCE((unsigned short)tx_queue > USHRT_MAX))
+   return;
sk->sk_tx_queue_mapping = tx_queue;
 }
 
+#define NO_QUEUE_MAPPING   USHRT_MAX
+
 static inline void sk_tx_queue_clear(struct sock *sk)
 {
-   sk->sk_tx_queue_mapping = -1;
+   sk->sk_tx_queue_mapping = NO_QUEUE_MAPPING;
 }
 
 static inline int sk_tx_queue_get(const struct sock *sk)
 {
-   return sk ? sk->sk_tx_queue_mapping : -1;
+   if (sk && sk->sk_tx_queue_mapping != NO_QUEUE_MAPPING)
+   return sk->sk_tx_queue_mapping;
+
+   return -1;
 }
 
 static inline void sk_set_socket(struct sock *sk, struct socket *sock)



[net-next PATCH v5 2/7] net: Use static_key for XPS maps

2018-06-27 Thread Amritha Nambiar
Use static_key for XPS maps to reduce the cost of extra map checks,
similar to how it is used for RPS and RFS. This includes static_key
'xps_needed' for XPS and another for 'xps_rxqs_needed' for XPS using
Rx queues map.

Signed-off-by: Amritha Nambiar 
---
 net/core/dev.c |   26 --
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 6ca62df..09cba23 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2081,6 +2081,10 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned 
int txq)
 EXPORT_SYMBOL(netdev_txq_to_tc);
 
 #ifdef CONFIG_XPS
+struct static_key xps_needed __read_mostly;
+EXPORT_SYMBOL(xps_needed);
+struct static_key xps_rxqs_needed __read_mostly;
+EXPORT_SYMBOL(xps_rxqs_needed);
 static DEFINE_MUTEX(xps_map_mutex);
 #define xmap_dereference(P)\
rcu_dereference_protected((P), lockdep_is_held(_map_mutex))
@@ -2170,12 +2174,14 @@ static void netif_reset_xps_queues(struct net_device 
*dev, u16 offset,
 
mutex_lock(_map_mutex);
 
-   dev_maps = xmap_dereference(dev->xps_rxqs_map);
-   if (dev_maps) {
-   nr_ids = dev->num_rx_queues;
-   clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset,
-  count, true);
-
+   if (static_key_false(_rxqs_needed)) {
+   dev_maps = xmap_dereference(dev->xps_rxqs_map);
+   if (dev_maps) {
+   nr_ids = dev->num_rx_queues;
+   clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
+  offset, count, true);
+   }
+   static_key_slow_dec(_rxqs_needed);
}
 
dev_maps = xmap_dereference(dev->xps_cpus_map);
@@ -2189,6 +2195,7 @@ static void netif_reset_xps_queues(struct net_device 
*dev, u16 offset,
   false);
 
 out_no_maps:
+   static_key_slow_dec(_needed);
mutex_unlock(_map_mutex);
 }
 
@@ -2297,6 +2304,10 @@ int __netif_set_xps_queue(struct net_device *dev, const 
unsigned long *mask,
if (!new_dev_maps)
goto out_no_new_maps;
 
+   static_key_slow_inc(_needed);
+   if (is_rxqs_map)
+   static_key_slow_inc(_rxqs_needed);
+
for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
 j < nr_ids;) {
/* copy maps belonging to foreign traffic classes */
@@ -3450,6 +3461,9 @@ static inline int get_xps_queue(struct net_device *dev, 
struct sk_buff *skb)
struct xps_map *map;
int queue_index = -1;
 
+   if (!static_key_false(_needed))
+   return -1;
+
rcu_read_lock();
dev_maps = rcu_dereference(dev->xps_cpus_map);
if (dev_maps) {



[net-next PATCH v5 5/7] net: Enable Tx queue selection based on Rx queues

2018-06-27 Thread Amritha Nambiar
This patch adds support to pick Tx queue based on the Rx queue(s) map
configuration set by the admin through the sysfs attribute
for each Tx queue. If the user configuration for receive queue(s) map
does not apply, then the Tx queue selection falls back to CPU(s) map
based selection and finally to hashing.

Signed-off-by: Amritha Nambiar 
---
 include/net/sock.h |   10 
 net/core/dev.c |   62 ++--
 2 files changed, 55 insertions(+), 17 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index f73dbca..3b22782 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1716,6 +1716,16 @@ static inline void sk_rx_queue_set(struct sock *sk, 
const struct sk_buff *skb)
 #endif
 }
 
+#ifdef CONFIG_XPS
+static inline int sk_rx_queue_get(const struct sock *sk)
+{
+   if (sk && sk->sk_rx_queue_mapping != NO_QUEUE_MAPPING)
+   return sk->sk_rx_queue_mapping;
+
+   return -1;
+}
+#endif
+
 static inline void sk_set_socket(struct sock *sk, struct socket *sock)
 {
sk_tx_queue_clear(sk);
diff --git a/net/core/dev.c b/net/core/dev.c
index 09cba23..1122f68 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3454,35 +3454,63 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct 
net_device *dev)
 }
 #endif /* CONFIG_NET_EGRESS */
 
-static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
+#ifdef CONFIG_XPS
+static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
+  struct xps_dev_maps *dev_maps, unsigned int tci)
+{
+   struct xps_map *map;
+   int queue_index = -1;
+
+   if (dev->num_tc) {
+   tci *= dev->num_tc;
+   tci += netdev_get_prio_tc_map(dev, skb->priority);
+   }
+
+   map = rcu_dereference(dev_maps->attr_map[tci]);
+   if (map) {
+   if (map->len == 1)
+   queue_index = map->queues[0];
+   else
+   queue_index = map->queues[reciprocal_scale(
+   skb_get_hash(skb), map->len)];
+   if (unlikely(queue_index >= dev->real_num_tx_queues))
+   queue_index = -1;
+   }
+   return queue_index;
+}
+#endif
+
+static int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
 {
 #ifdef CONFIG_XPS
struct xps_dev_maps *dev_maps;
-   struct xps_map *map;
+   struct sock *sk = skb->sk;
int queue_index = -1;
 
if (!static_key_false(_needed))
return -1;
 
rcu_read_lock();
-   dev_maps = rcu_dereference(dev->xps_cpus_map);
+   if (!static_key_false(_rxqs_needed))
+   goto get_cpus_map;
+
+   dev_maps = rcu_dereference(dev->xps_rxqs_map);
if (dev_maps) {
-   unsigned int tci = skb->sender_cpu - 1;
+   int tci = sk_rx_queue_get(sk);
 
-   if (dev->num_tc) {
-   tci *= dev->num_tc;
-   tci += netdev_get_prio_tc_map(dev, skb->priority);
-   }
+   if (tci >= 0 && tci < dev->num_rx_queues)
+   queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
+ tci);
+   }
 
-   map = rcu_dereference(dev_maps->attr_map[tci]);
-   if (map) {
-   if (map->len == 1)
-   queue_index = map->queues[0];
-   else
-   queue_index = 
map->queues[reciprocal_scale(skb_get_hash(skb),
-  
map->len)];
-   if (unlikely(queue_index >= dev->real_num_tx_queues))
-   queue_index = -1;
+get_cpus_map:
+   if (queue_index < 0) {
+   dev_maps = rcu_dereference(dev->xps_cpus_map);
+   if (dev_maps) {
+   unsigned int tci = skb->sender_cpu - 1;
+
+   queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
+ tci);
}
}
rcu_read_unlock();



[net-next PATCH v5 1/7] net: Refactor XPS for CPUs and Rx queues

2018-06-27 Thread Amritha Nambiar
Refactor XPS code to support Tx queue selection based on
CPU(s) map or Rx queue(s) map.

Signed-off-by: Amritha Nambiar 
---
 include/linux/cpumask.h   |   11 ++
 include/linux/netdevice.h |   97 -
 net/core/dev.c|  211 ++---
 net/core/net-sysfs.c  |4 -
 4 files changed, 243 insertions(+), 80 deletions(-)

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index bf53d89..57f20a0 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -115,12 +115,17 @@ extern struct cpumask __cpu_active_mask;
 #define cpu_active(cpu)((cpu) == 0)
 #endif
 
-/* verify cpu argument to cpumask_* operators */
-static inline unsigned int cpumask_check(unsigned int cpu)
+static inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits)
 {
 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
-   WARN_ON_ONCE(cpu >= nr_cpumask_bits);
+   WARN_ON_ONCE(cpu >= bits);
 #endif /* CONFIG_DEBUG_PER_CPU_MAPS */
+}
+
+/* verify cpu argument to cpumask_* operators */
+static inline unsigned int cpumask_check(unsigned int cpu)
+{
+   cpu_max_bits_warn(cpu, nr_cpumask_bits);
return cpu;
 }
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c6b377a..3790ac9 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -731,10 +731,15 @@ struct xps_map {
  */
 struct xps_dev_maps {
struct rcu_head rcu;
-   struct xps_map __rcu *cpu_map[0];
+   struct xps_map __rcu *attr_map[0]; /* Either CPUs map or RXQs map */
 };
-#define XPS_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \
+
+#define XPS_CPU_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \
(nr_cpu_ids * (_tcs) * sizeof(struct xps_map *)))
+
+#define XPS_RXQ_DEV_MAPS_SIZE(_tcs, _rxqs) (sizeof(struct xps_dev_maps) +\
+   (_rxqs * (_tcs) * sizeof(struct xps_map *)))
+
 #endif /* CONFIG_XPS */
 
 #define TC_MAX_QUEUE   16
@@ -1910,7 +1915,8 @@ struct net_device {
int watchdog_timeo;
 
 #ifdef CONFIG_XPS
-   struct xps_dev_maps __rcu *xps_maps;
+   struct xps_dev_maps __rcu *xps_cpus_map;
+   struct xps_dev_maps __rcu *xps_rxqs_map;
 #endif
 #ifdef CONFIG_NET_CLS_ACT
struct mini_Qdisc __rcu *miniq_egress;
@@ -3259,6 +3265,91 @@ static inline void netif_wake_subqueue(struct net_device 
*dev, u16 queue_index)
 #ifdef CONFIG_XPS
 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
u16 index);
+int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
+ u16 index, bool is_rxqs_map);
+
+/**
+ * attr_test_mask - Test a CPU or Rx queue set in a cpumask/rx queues mask
+ * @j: CPU/Rx queue index
+ * @mask: bitmask of all cpus/rx queues
+ * @nr_bits: number of bits in the bitmask
+ *
+ * Test if a CPU or Rx queue index is set in a mask of all CPU/Rx queues.
+ */
+static inline bool attr_test_mask(unsigned long j, const unsigned long *mask,
+ unsigned int nr_bits)
+{
+   cpu_max_bits_warn(j, nr_bits);
+   return test_bit(j, mask);
+}
+
+/**
+ * attr_test_online - Test for online CPU/Rx queue
+ * @j: CPU/Rx queue index
+ * @online_mask: bitmask for CPUs/Rx queues that are online
+ * @nr_bits: number of bits in the bitmask
+ *
+ * Returns true if a CPU/Rx queue is online.
+ */
+static inline bool attr_test_online(unsigned long j,
+   const unsigned long *online_mask,
+   unsigned int nr_bits)
+{
+   cpu_max_bits_warn(j, nr_bits);
+
+   if (online_mask)
+   return test_bit(j, online_mask);
+
+   return (j < nr_bits);
+}
+
+/**
+ * attrmask_next - get the next CPU/Rx queue in a cpumask/Rx queues mask
+ * @n: CPU/Rx queue index
+ * @srcp: the cpumask/Rx queue mask pointer
+ * @nr_bits: number of bits in the bitmask
+ *
+ * Returns >= nr_bits if no further CPUs/Rx queues set.
+ */
+static inline unsigned int attrmask_next(int n, const unsigned long *srcp,
+unsigned int nr_bits)
+{
+   /* -1 is a legal arg here. */
+   if (n != -1)
+   cpu_max_bits_warn(n, nr_bits);
+
+   if (srcp)
+   return find_next_bit(srcp, nr_bits, n + 1);
+
+   return n + 1;
+}
+
+/**
+ * attrmask_next_and - get the next CPU/Rx queue in *src1p & *src2p
+ * @n: CPU/Rx queue index
+ * @src1p: the first CPUs/Rx queues mask pointer
+ * @src2p: the second CPUs/Rx queues mask pointer
+ * @nr_bits: number of bits in the bitmask
+ *
+ * Returns >= nr_bits if no further CPUs/Rx queues set in both.
+ */
+static inline int attrmask_next_and(int n, const unsigned long *src1p,
+   const unsigned long *src2p,
+   unsigned int nr_bits)
+{
+   /* -1 

[net-next PATCH v5 0/7] Symmetric queue selection using XPS for Rx queues

2018-06-27 Thread Amritha Nambiar
   4.668
(% of all cache refs)

L1-dcache-load- 6.556.29
-misses
(% of all L1-dcache hits)

LLC-load-misses 13.91   10.44
(% of all LL-cache hits)

---

v5:
- Clean sk_tx_queue_mapping set and get functions, initialize it to USHRT_MAX.
- Similarly clean sk_rx_queue_mapping set and get functions.
- Use ns_capable in place of capable(), reorganize/properly free pointer
  in xps_rxqs_show.
- Add a note in documentation not validating transmit device against
  receive device to avoid expensive lookup in datapath.

---

Amritha Nambiar (7):
  net: Refactor XPS for CPUs and Rx queues
  net: Use static_key for XPS maps
  net: sock: Change tx_queue_mapping in sock_common to unsigned short
  net: Record receive queue number for a connection
  net: Enable Tx queue selection based on Rx queues
  net-sysfs: Add interface for Rx queue(s) map per Tx queue
  Documentation: Add explanation for XPS using Rx-queue(s) map


 Documentation/ABI/testing/sysfs-class-net-queues |   11 +
 Documentation/networking/scaling.txt |   61 -
 include/linux/cpumask.h  |   11 +
 include/linux/netdevice.h|   97 +++-
 include/net/busy_poll.h  |1 
 include/net/sock.h   |   38 +++
 net/core/dev.c   |  283 +++---
 net/core/net-sysfs.c |   85 ++-
 net/core/sock.c  |4 
 net/ipv4/tcp_input.c |3 
 10 files changed, 485 insertions(+), 109 deletions(-)

--


[net-next PATCH v4 7/7] Documentation: Add explanation for XPS using Rx-queue(s) map

2018-06-25 Thread Amritha Nambiar
Signed-off-by: Amritha Nambiar 
---
 Documentation/ABI/testing/sysfs-class-net-queues |   11 
 Documentation/networking/scaling.txt |   57 ++
 2 files changed, 58 insertions(+), 10 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-class-net-queues 
b/Documentation/ABI/testing/sysfs-class-net-queues
index 0c0df91..978b763 100644
--- a/Documentation/ABI/testing/sysfs-class-net-queues
+++ b/Documentation/ABI/testing/sysfs-class-net-queues
@@ -42,6 +42,17 @@ Description:
network device transmit queue. Possible values depend on the
number of available CPU(s) in the system.
 
+What:  /sys/class//queues/tx-/xps_rxqs
+Date:  June 2018
+KernelVersion: 4.18.0
+Contact:   netdev@vger.kernel.org
+Description:
+   Mask of the receive queue(s) currently enabled to participate
+   into the Transmit Packet Steering packet processing flow for 
this
+   network device transmit queue. Possible values depend on the
+   number of available receive queue(s) in the network device.
+   Default is disabled.
+
 What:  /sys/class//queues/tx-/byte_queue_limits/hold_time
 Date:  November 2011
 KernelVersion: 3.3
diff --git a/Documentation/networking/scaling.txt 
b/Documentation/networking/scaling.txt
index f55639d..8336116 100644
--- a/Documentation/networking/scaling.txt
+++ b/Documentation/networking/scaling.txt
@@ -366,8 +366,13 @@ XPS: Transmit Packet Steering
 
 Transmit Packet Steering is a mechanism for intelligently selecting
 which transmit queue to use when transmitting a packet on a multi-queue
-device. To accomplish this, a mapping from CPU to hardware queue(s) is
-recorded. The goal of this mapping is usually to assign queues
+device. This can be accomplished by recording two kinds of maps, either
+a mapping of CPU to hardware queue(s) or a mapping of receive queue(s)
+to hardware transmit queue(s).
+
+1. XPS using CPUs map
+
+The goal of this mapping is usually to assign queues
 exclusively to a subset of CPUs, where the transmit completions for
 these queues are processed on a CPU within this set. This choice
 provides two benefits. First, contention on the device queue lock is
@@ -377,12 +382,35 @@ transmit queue). Secondly, cache miss rate on transmit 
completion is
 reduced, in particular for data cache lines that hold the sk_buff
 structures.
 
-XPS is configured per transmit queue by setting a bitmap of CPUs that
-may use that queue to transmit. The reverse mapping, from CPUs to
-transmit queues, is computed and maintained for each network device.
-When transmitting the first packet in a flow, the function
-get_xps_queue() is called to select a queue. This function uses the ID
-of the running CPU as a key into the CPU-to-queue lookup table. If the
+2. XPS using receive queues map
+
+This mapping is used to pick transmit queue based on the receive
+queue(s) map configuration set by the administrator. A set of receive
+queues can be mapped to a set of transmit queues (many:many), although
+the common use case is a 1:1 mapping. This will enable sending packets
+on the same queue associations for transmit and receive. This is useful for
+busy polling multi-threaded workloads where there are challenges in
+associating a given CPU to a given application thread. The application
+threads are not pinned to CPUs and each thread handles packets
+received on a single queue. The receive queue number is cached in the
+socket for the connection. In this model, sending the packets on the same
+transmit queue corresponding to the associated receive queue has benefits
+in keeping the CPU overhead low. Transmit completion work is locked into
+the same queue-association that a given application is polling on. This
+avoids the overhead of triggering an interrupt on another CPU. When the
+application cleans up the packets during the busy poll, transmit completion
+may be processed along with it in the same thread context and so result in
+reduced latency.
+
+XPS is configured per transmit queue by setting a bitmap of
+CPUs/receive-queues that may use that queue to transmit. The reverse
+mapping, from CPUs to transmit queues or from receive-queues to transmit
+queues, is computed and maintained for each network device. When
+transmitting the first packet in a flow, the function get_xps_queue() is
+called to select a queue. This function uses the ID of the receive queue
+for the socket connection for a match in the receive queue-to-transmit queue
+lookup table. Alternatively, this function can also use the ID of the
+running CPU as a key into the CPU-to-queue lookup table. If the
 ID matches a single queue, that is used for transmission. If multiple
 queues match, one is selected by using the flow hash to compute an index
 into the set.
@@ -404,11 +432,15 @@ acknowledged.
 
 XPS is only available if the kconfig symbol CONFIG_XPS is enabled (on by
 default for SMP

[net-next PATCH v4 6/7] net-sysfs: Add interface for Rx queue(s) map per Tx queue

2018-06-25 Thread Amritha Nambiar
Extend transmit queue sysfs attribute to configure Rx queue(s) map
per Tx queue. By default no receive queues are configured for the
Tx queue.

- /sys/class/net/eth0/queues/tx-*/xps_rxqs

Signed-off-by: Amritha Nambiar 
---
 net/core/net-sysfs.c |   81 ++
 1 file changed, 81 insertions(+)

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index b39987c..5d2ed02 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1283,6 +1283,86 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue,
 
 static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init
= __ATTR_RW(xps_cpus);
+
+static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf)
+{
+   struct net_device *dev = queue->dev;
+   struct xps_dev_maps *dev_maps;
+   unsigned long *mask, index;
+   int j, len, num_tc = 1, tc = 0;
+
+   mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long),
+  GFP_KERNEL);
+   if (!mask)
+   return -ENOMEM;
+
+   index = get_netdev_queue_index(queue);
+
+   if (dev->num_tc) {
+   num_tc = dev->num_tc;
+   tc = netdev_txq_to_tc(dev, index);
+   if (tc < 0)
+   return -EINVAL;
+   }
+
+   rcu_read_lock();
+   dev_maps = rcu_dereference(dev->xps_rxqs_map);
+   if (dev_maps) {
+   for (j = -1; j = attrmask_next(j, NULL, dev->num_rx_queues),
+j < dev->num_rx_queues;) {
+   int i, tci = j * num_tc + tc;
+   struct xps_map *map;
+
+   map = rcu_dereference(dev_maps->attr_map[tci]);
+   if (!map)
+   continue;
+
+   for (i = map->len; i--;) {
+   if (map->queues[i] == index) {
+   set_bit(j, mask);
+   break;
+   }
+   }
+   }
+   }
+
+   len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues);
+   rcu_read_unlock();
+   kfree(mask);
+
+   return len < PAGE_SIZE ? len : -EINVAL;
+}
+
+static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
+ size_t len)
+{
+   struct net_device *dev = queue->dev;
+   unsigned long *mask, index;
+   int err;
+
+   if (!capable(CAP_NET_ADMIN))
+   return -EPERM;
+
+   mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long),
+  GFP_KERNEL);
+   if (!mask)
+   return -ENOMEM;
+
+   index = get_netdev_queue_index(queue);
+
+   err = bitmap_parse(buf, len, mask, dev->num_rx_queues);
+   if (err) {
+   kfree(mask);
+   return err;
+   }
+
+   err = __netif_set_xps_queue(dev, mask, index, true);
+   kfree(mask);
+   return err ? : len;
+}
+
+static struct netdev_queue_attribute xps_rxqs_attribute __ro_after_init
+   = __ATTR_RW(xps_rxqs);
 #endif /* CONFIG_XPS */
 
 static struct attribute *netdev_queue_default_attrs[] __ro_after_init = {
@@ -1290,6 +1370,7 @@ static struct attribute *netdev_queue_default_attrs[] 
__ro_after_init = {
_traffic_class.attr,
 #ifdef CONFIG_XPS
_cpus_attribute.attr,
+   _rxqs_attribute.attr,
_tx_maxrate.attr,
 #endif
NULL



[net-next PATCH v4 4/7] net: Record receive queue number for a connection

2018-06-25 Thread Amritha Nambiar
This patch adds a new field to sock_common 'skc_rx_queue_mapping'
which holds the receive queue number for the connection. The Rx queue
is marked in tcp_finish_connect() to allow a client app to do
SO_INCOMING_NAPI_ID after a connect() call to get the right queue
association for a socket. Rx queue is also marked in tcp_conn_request()
to allow syn-ack to go on the right tx-queue associated with
the queue on which syn is received.

Signed-off-by: Amritha Nambiar 
Signed-off-by: Sridhar Samudrala 
---
 include/net/busy_poll.h |1 +
 include/net/sock.h  |   14 ++
 net/core/sock.c |4 
 net/ipv4/tcp_input.c|3 +++
 4 files changed, 22 insertions(+)

diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index c518743..9e36fda6 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -151,6 +151,7 @@ static inline void sk_mark_napi_id(struct sock *sk, const 
struct sk_buff *skb)
 #ifdef CONFIG_NET_RX_BUSY_POLL
sk->sk_napi_id = skb->napi_id;
 #endif
+   sk_rx_queue_set(sk, skb);
 }
 
 /* variant used for unconnected sockets */
diff --git a/include/net/sock.h b/include/net/sock.h
index 009fd30..0ff4416 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -139,6 +139,7 @@ typedef __u64 __bitwise __addrpair;
  * @skc_node: main hash linkage for various protocol lookup tables
  * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
  * @skc_tx_queue_mapping: tx queue number for this connection
+ * @skc_rx_queue_mapping: rx queue number for this connection
  * @skc_flags: place holder for sk_flags
  * %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
  * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
@@ -215,6 +216,9 @@ struct sock_common {
struct hlist_nulls_node skc_nulls_node;
};
unsigned short  skc_tx_queue_mapping;
+#ifdef CONFIG_XPS
+   unsigned short  skc_rx_queue_mapping;
+#endif
union {
int skc_incoming_cpu;
u32 skc_rcv_wnd;
@@ -326,6 +330,9 @@ struct sock {
 #define sk_nulls_node  __sk_common.skc_nulls_node
 #define sk_refcnt  __sk_common.skc_refcnt
 #define sk_tx_queue_mapping__sk_common.skc_tx_queue_mapping
+#ifdef CONFIG_XPS
+#define sk_rx_queue_mapping__sk_common.skc_rx_queue_mapping
+#endif
 
 #define sk_dontcopy_begin  __sk_common.skc_dontcopy_begin
 #define sk_dontcopy_end__sk_common.skc_dontcopy_end
@@ -1696,6 +1703,13 @@ static inline int sk_tx_queue_get(const struct sock *sk)
return sk ? sk->sk_tx_queue_mapping - 1 : -1;
 }
 
+static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb)
+{
+#ifdef CONFIG_XPS
+   sk->sk_rx_queue_mapping = skb_get_rx_queue(skb) + 1;
+#endif
+}
+
 static inline void sk_set_socket(struct sock *sk, struct socket *sock)
 {
sk_tx_queue_clear(sk);
diff --git a/net/core/sock.c b/net/core/sock.c
index bcc4182..5e4715b 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2818,6 +2818,10 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_pacing_rate = ~0U;
sk->sk_pacing_shift = 10;
sk->sk_incoming_cpu = -1;
+
+#ifdef CONFIG_XPS
+   sk->sk_rx_queue_mapping = ~0;
+#endif
/*
 * Before updating sk_refcnt, we must commit prior changes to memory
 * (Documentation/RCU/rculist_nulls.txt for details)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 76ca88f..c404c53 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -78,6 +78,7 @@
 #include 
 #include 
 #include 
+#include 
 
 int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
 
@@ -5584,6 +5585,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff 
*skb)
if (skb) {
icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
security_inet_conn_established(sk, skb);
+   sk_mark_napi_id(sk, skb);
}
 
tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
@@ -6412,6 +6414,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_rsk(req)->snt_isn = isn;
tcp_rsk(req)->txhash = net_tx_rndhash();
tcp_openreq_init_rwin(req, sk, dst);
+   sk_rx_queue_set(req_to_sk(req), skb);
if (!want_cookie) {
tcp_reqsk_record_syn(sk, req, skb);
fastopen_sk = tcp_try_fastopen(sk, skb, req, , dst);



[net-next PATCH v4 5/7] net: Enable Tx queue selection based on Rx queues

2018-06-25 Thread Amritha Nambiar
This patch adds support to pick Tx queue based on the Rx queue(s) map
configuration set by the admin through the sysfs attribute
for each Tx queue. If the user configuration for receive queue(s) map
does not apply, then the Tx queue selection falls back to CPU(s) map
based selection and finally to hashing.

Signed-off-by: Amritha Nambiar 
---
 include/net/sock.h |4 +++
 net/core/dev.c |   62 ++--
 2 files changed, 49 insertions(+), 17 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 0ff4416..cb18139 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1710,6 +1710,10 @@ static inline void sk_rx_queue_set(struct sock *sk, 
const struct sk_buff *skb)
 #endif
 }
 
+static inline int sk_rx_queue_get(const struct sock *sk)
+{
+   return sk ? sk->sk_rx_queue_mapping - 1 : -1;
+}
 static inline void sk_set_socket(struct sock *sk, struct socket *sock)
 {
sk_tx_queue_clear(sk);
diff --git a/net/core/dev.c b/net/core/dev.c
index df2a78d..2450c5e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3454,35 +3454,63 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct 
net_device *dev)
 }
 #endif /* CONFIG_NET_EGRESS */
 
-static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
+#ifdef CONFIG_XPS
+static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
+  struct xps_dev_maps *dev_maps, unsigned int tci)
+{
+   struct xps_map *map;
+   int queue_index = -1;
+
+   if (dev->num_tc) {
+   tci *= dev->num_tc;
+   tci += netdev_get_prio_tc_map(dev, skb->priority);
+   }
+
+   map = rcu_dereference(dev_maps->attr_map[tci]);
+   if (map) {
+   if (map->len == 1)
+   queue_index = map->queues[0];
+   else
+   queue_index = map->queues[reciprocal_scale(
+   skb_get_hash(skb), map->len)];
+   if (unlikely(queue_index >= dev->real_num_tx_queues))
+   queue_index = -1;
+   }
+   return queue_index;
+}
+#endif
+
+static int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
 {
 #ifdef CONFIG_XPS
struct xps_dev_maps *dev_maps;
-   struct xps_map *map;
+   struct sock *sk = skb->sk;
int queue_index = -1;
 
if (!static_key_false(_needed))
return -1;
 
rcu_read_lock();
-   dev_maps = rcu_dereference(dev->xps_cpus_map);
+   if (!static_key_false(_rxqs_needed))
+   goto get_cpus_map;
+
+   dev_maps = rcu_dereference(dev->xps_rxqs_map);
if (dev_maps) {
-   unsigned int tci = skb->sender_cpu - 1;
+   int tci = sk_rx_queue_get(sk);
 
-   if (dev->num_tc) {
-   tci *= dev->num_tc;
-   tci += netdev_get_prio_tc_map(dev, skb->priority);
-   }
+   if (tci >= 0 && tci < dev->num_rx_queues)
+   queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
+ tci);
+   }
 
-   map = rcu_dereference(dev_maps->attr_map[tci]);
-   if (map) {
-   if (map->len == 1)
-   queue_index = map->queues[0];
-   else
-   queue_index = 
map->queues[reciprocal_scale(skb_get_hash(skb),
-  
map->len)];
-   if (unlikely(queue_index >= dev->real_num_tx_queues))
-   queue_index = -1;
+get_cpus_map:
+   if (queue_index < 0) {
+   dev_maps = rcu_dereference(dev->xps_cpus_map);
+   if (dev_maps) {
+   unsigned int tci = skb->sender_cpu - 1;
+
+   queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
+ tci);
}
}
rcu_read_unlock();



[net-next PATCH v4 0/7] Symmetric queue selection using XPS for Rx queues

2018-06-25 Thread Amritha Nambiar
   4.668
(% of all cache refs)

L1-dcache-load- 6.556.29
-misses
(% of all L1-dcache hits)

LLC-load-misses 13.91   10.44
(% of all LL-cache hits)

---

v4:
- Removed enum for map types and used boolean to identify rxqs_map vs cpus_map.
- Added comments for helper functions.
- Added another static_key for rxqs_map (xps_rxqs_needed).
- New patch to change tx_queue_mapping in sock_common to unsigned short.
- Separated marking receive queue number into a standalone patch.
- Changed wording in documentation (queue-pair to queue-association)

---

Amritha Nambiar (7):
  net: Refactor XPS for CPUs and Rx queues
  net: Use static_key for XPS maps
  net: sock: Change tx_queue_mapping in sock_common to unsigned short
  net: Record receive queue number for a connection
  net: Enable Tx queue selection based on Rx queues
  net-sysfs: Add interface for Rx queue(s) map per Tx queue
  Documentation: Add explanation for XPS using Rx-queue(s) map


 Documentation/ABI/testing/sysfs-class-net-queues |   11 +
 Documentation/networking/scaling.txt |   57 
 include/linux/cpumask.h  |   11 +
 include/linux/netdevice.h|  100 
 include/net/busy_poll.h  |1 
 include/net/sock.h   |   28 ++
 net/core/dev.c   |  283 +++---
 net/core/net-sysfs.c |   85 ++-
 net/core/sock.c  |4 
 net/ipv4/tcp_input.c |3 
 10 files changed, 474 insertions(+), 109 deletions(-)

--


[net-next PATCH v4 3/7] net: sock: Change tx_queue_mapping in sock_common to unsigned short

2018-06-25 Thread Amritha Nambiar
Change 'skc_tx_queue_mapping' field in sock_common structure from
'int' to 'unsigned short' type with 0 indicating unset and
a positive queue value being set. This way it is consistent with
the queue_mapping field in the sk_buff. This will also accommodate
adding a new 'unsigned short' field in sock_common in the next
patch for rx_queue_mapping.

Signed-off-by: Amritha Nambiar 
---
 include/net/sock.h |   10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index b3b7541..009fd30 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -214,7 +214,7 @@ struct sock_common {
struct hlist_node   skc_node;
struct hlist_nulls_node skc_nulls_node;
};
-   int skc_tx_queue_mapping;
+   unsigned short  skc_tx_queue_mapping;
union {
int skc_incoming_cpu;
u32 skc_rcv_wnd;
@@ -1681,17 +1681,19 @@ static inline int sk_receive_skb(struct sock *sk, 
struct sk_buff *skb,
 
 static inline void sk_tx_queue_set(struct sock *sk, int tx_queue)
 {
-   sk->sk_tx_queue_mapping = tx_queue;
+   /* sk_tx_queue_mapping accepts only up to a 16-bit value */
+   WARN_ON((unsigned short)tx_queue > USHRT_MAX);
+   sk->sk_tx_queue_mapping = tx_queue + 1;
 }
 
 static inline void sk_tx_queue_clear(struct sock *sk)
 {
-   sk->sk_tx_queue_mapping = -1;
+   sk->sk_tx_queue_mapping = 0;
 }
 
 static inline int sk_tx_queue_get(const struct sock *sk)
 {
-   return sk ? sk->sk_tx_queue_mapping : -1;
+   return sk ? sk->sk_tx_queue_mapping - 1 : -1;
 }
 
 static inline void sk_set_socket(struct sock *sk, struct socket *sock)



[net-next PATCH v4 1/7] net: Refactor XPS for CPUs and Rx queues

2018-06-25 Thread Amritha Nambiar
Refactor XPS code to support Tx queue selection based on
CPU(s) map or Rx queue(s) map.

Signed-off-by: Amritha Nambiar 
---
 include/linux/cpumask.h   |   11 ++
 include/linux/netdevice.h |  100 +
 net/core/dev.c|  211 ++---
 net/core/net-sysfs.c  |4 -
 4 files changed, 246 insertions(+), 80 deletions(-)

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index bf53d89..57f20a0 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -115,12 +115,17 @@ extern struct cpumask __cpu_active_mask;
 #define cpu_active(cpu)((cpu) == 0)
 #endif
 
-/* verify cpu argument to cpumask_* operators */
-static inline unsigned int cpumask_check(unsigned int cpu)
+static inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits)
 {
 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
-   WARN_ON_ONCE(cpu >= nr_cpumask_bits);
+   WARN_ON_ONCE(cpu >= bits);
 #endif /* CONFIG_DEBUG_PER_CPU_MAPS */
+}
+
+/* verify cpu argument to cpumask_* operators */
+static inline unsigned int cpumask_check(unsigned int cpu)
+{
+   cpu_max_bits_warn(cpu, nr_cpumask_bits);
return cpu;
 }
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3ec9850..c534f03 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -730,10 +730,15 @@ struct xps_map {
  */
 struct xps_dev_maps {
struct rcu_head rcu;
-   struct xps_map __rcu *cpu_map[0];
+   struct xps_map __rcu *attr_map[0]; /* Either CPUs map or RXQs map */
 };
-#define XPS_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \
+
+#define XPS_CPU_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \
(nr_cpu_ids * (_tcs) * sizeof(struct xps_map *)))
+
+#define XPS_RXQ_DEV_MAPS_SIZE(_tcs, _rxqs) (sizeof(struct xps_dev_maps) +\
+   (_rxqs * (_tcs) * sizeof(struct xps_map *)))
+
 #endif /* CONFIG_XPS */
 
 #define TC_MAX_QUEUE   16
@@ -1909,7 +1914,8 @@ struct net_device {
int watchdog_timeo;
 
 #ifdef CONFIG_XPS
-   struct xps_dev_maps __rcu *xps_maps;
+   struct xps_dev_maps __rcu *xps_cpus_map;
+   struct xps_dev_maps __rcu *xps_rxqs_map;
 #endif
 #ifdef CONFIG_NET_CLS_ACT
struct mini_Qdisc __rcu *miniq_egress;
@@ -3258,6 +3264,94 @@ static inline void netif_wake_subqueue(struct net_device 
*dev, u16 queue_index)
 #ifdef CONFIG_XPS
 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
u16 index);
+int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
+ u16 index, bool is_rxqs_map);
+
+/**
+ * attr_test_mask - Test a CPU or Rx queue set in a cpumask/rx queues mask
+ * @j: CPU/Rx queue index
+ * @mask: bitmask of all cpus/rx queues
+ * @nr_bits: number of bits in the bitmask
+ *
+ * Test if a CPU or Rx queue index is set in a mask of all CPU/Rx queues.
+ */
+static inline bool attr_test_mask(unsigned long j, const unsigned long *mask,
+ unsigned int nr_bits)
+{
+   cpu_max_bits_warn(j, nr_bits);
+   return test_bit(j, mask);
+}
+
+/**
+ * attr_test_online - Test for online CPU/Rx queue
+ * @j: CPU/Rx queue index
+ * @online_mask: bitmask for CPUs/Rx queues that are online
+ * @nr_bits: number of bits in the bitmask
+ *
+ * Returns true if a CPU/Rx queue is online.
+ */
+static inline bool attr_test_online(unsigned long j,
+   const unsigned long *online_mask,
+   unsigned int nr_bits)
+{
+   cpu_max_bits_warn(j, nr_bits);
+
+   if (online_mask)
+   return test_bit(j, online_mask);
+
+   if (j >= 0 && j < nr_bits)
+   return true;
+
+   return false;
+}
+
+/**
+ * attrmask_next - get the next CPU/Rx queue in a cpumask/Rx queues mask
+ * @n: CPU/Rx queue index
+ * @srcp: the cpumask/Rx queue mask pointer
+ * @nr_bits: number of bits in the bitmask
+ *
+ * Returns >= nr_bits if no further CPUs/Rx queues set.
+ */
+static inline unsigned int attrmask_next(int n, const unsigned long *srcp,
+unsigned int nr_bits)
+{
+   /* -1 is a legal arg here. */
+   if (n != -1)
+   cpu_max_bits_warn(n, nr_bits);
+
+   if (srcp)
+   return find_next_bit(srcp, nr_bits, n + 1);
+
+   return n + 1;
+}
+
+/**
+ * attrmask_next_and - get the next CPU/Rx queue in *src1p & *src2p
+ * @n: CPU/Rx queue index
+ * @src1p: the first CPUs/Rx queues mask pointer
+ * @src2p: the second CPUs/Rx queues mask pointer
+ * @nr_bits: number of bits in the bitmask
+ *
+ * Returns >= nr_bits if no further CPUs/Rx queues set in both.
+ */
+static inline int attrmask_next_and(int n, const unsigned long *src1p,
+   

[net-next PATCH v4 2/7] net: Use static_key for XPS maps

2018-06-25 Thread Amritha Nambiar
Use static_key for XPS maps to reduce the cost of extra map checks,
similar to how it is used for RPS and RFS. This includes static_key
'xps_needed' for XPS and another for 'xps_rxqs_needed' for XPS using
Rx queues map.

Signed-off-by: Amritha Nambiar 
---
 net/core/dev.c |   26 --
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 2552556..df2a78d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2081,6 +2081,10 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned 
int txq)
 EXPORT_SYMBOL(netdev_txq_to_tc);
 
 #ifdef CONFIG_XPS
+struct static_key xps_needed __read_mostly;
+EXPORT_SYMBOL(xps_needed);
+struct static_key xps_rxqs_needed __read_mostly;
+EXPORT_SYMBOL(xps_rxqs_needed);
 static DEFINE_MUTEX(xps_map_mutex);
 #define xmap_dereference(P)\
rcu_dereference_protected((P), lockdep_is_held(_map_mutex))
@@ -2170,12 +2174,14 @@ static void netif_reset_xps_queues(struct net_device 
*dev, u16 offset,
 
mutex_lock(_map_mutex);
 
-   dev_maps = xmap_dereference(dev->xps_rxqs_map);
-   if (dev_maps) {
-   nr_ids = dev->num_rx_queues;
-   clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset,
-  count, true);
-
+   if (static_key_false(_rxqs_needed)) {
+   dev_maps = xmap_dereference(dev->xps_rxqs_map);
+   if (dev_maps) {
+   nr_ids = dev->num_rx_queues;
+   clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
+  offset, count, true);
+   }
+   static_key_slow_dec(_rxqs_needed);
}
 
dev_maps = xmap_dereference(dev->xps_cpus_map);
@@ -2189,6 +2195,7 @@ static void netif_reset_xps_queues(struct net_device 
*dev, u16 offset,
   false);
 
 out_no_maps:
+   static_key_slow_dec(_needed);
mutex_unlock(_map_mutex);
 }
 
@@ -2297,6 +2304,10 @@ int __netif_set_xps_queue(struct net_device *dev, const 
unsigned long *mask,
if (!new_dev_maps)
goto out_no_new_maps;
 
+   static_key_slow_inc(_needed);
+   if (is_rxqs_map)
+   static_key_slow_inc(_rxqs_needed);
+
for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
 j < nr_ids;) {
/* copy maps belonging to foreign traffic classes */
@@ -3450,6 +3461,9 @@ static inline int get_xps_queue(struct net_device *dev, 
struct sk_buff *skb)
struct xps_map *map;
int queue_index = -1;
 
+   if (!static_key_false(_needed))
+   return -1;
+
rcu_read_lock();
dev_maps = rcu_dereference(dev->xps_cpus_map);
if (dev_maps) {



[net-next PATCH v3 5/5] Documentation: Add explanation for XPS using Rx-queue(s) map

2018-06-05 Thread Amritha Nambiar
Signed-off-by: Amritha Nambiar 
---
 Documentation/ABI/testing/sysfs-class-net-queues |   11 
 Documentation/networking/scaling.txt |   58 ++
 2 files changed, 59 insertions(+), 10 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-class-net-queues 
b/Documentation/ABI/testing/sysfs-class-net-queues
index 0c0df91..1b4cc21 100644
--- a/Documentation/ABI/testing/sysfs-class-net-queues
+++ b/Documentation/ABI/testing/sysfs-class-net-queues
@@ -42,6 +42,17 @@ Description:
network device transmit queue. Possible values depend on the
number of available CPU(s) in the system.
 
+What:  /sys/class//queues/tx-/xps_rxqs
+Date:  June 2018
+KernelVersion: 4.17.0
+Contact:   netdev@vger.kernel.org
+Description:
+   Mask of the receive queue(s) currently enabled to participate
+   into the Transmit Packet Steering packet processing flow for 
this
+   network device transmit queue. Possible values depend on the
+   number of available receive queue(s) in the network device.
+   Default is disabled.
+
 What:  /sys/class//queues/tx-/byte_queue_limits/hold_time
 Date:  November 2011
 KernelVersion: 3.3
diff --git a/Documentation/networking/scaling.txt 
b/Documentation/networking/scaling.txt
index f55639d..834147c 100644
--- a/Documentation/networking/scaling.txt
+++ b/Documentation/networking/scaling.txt
@@ -366,8 +366,13 @@ XPS: Transmit Packet Steering
 
 Transmit Packet Steering is a mechanism for intelligently selecting
 which transmit queue to use when transmitting a packet on a multi-queue
-device. To accomplish this, a mapping from CPU to hardware queue(s) is
-recorded. The goal of this mapping is usually to assign queues
+device. This can be accomplished by recording two kinds of maps, either
+a mapping of CPU to hardware queue(s) or a mapping of receive queue(s)
+to hardware transmit queue(s).
+
+1. XPS using CPUs map
+
+The goal of this mapping is usually to assign queues
 exclusively to a subset of CPUs, where the transmit completions for
 these queues are processed on a CPU within this set. This choice
 provides two benefits. First, contention on the device queue lock is
@@ -377,12 +382,36 @@ transmit queue). Secondly, cache miss rate on transmit 
completion is
 reduced, in particular for data cache lines that hold the sk_buff
 structures.
 
-XPS is configured per transmit queue by setting a bitmap of CPUs that
-may use that queue to transmit. The reverse mapping, from CPUs to
-transmit queues, is computed and maintained for each network device.
-When transmitting the first packet in a flow, the function
-get_xps_queue() is called to select a queue. This function uses the ID
-of the running CPU as a key into the CPU-to-queue lookup table. If the
+2. XPS using receive queues map
+
+This mapping is used to pick transmit queue based on the receive
+queue(s) map configuration set by the administrator. A set of receive
+queues can be mapped to a set of transmit queues (many:many), although
+the common use case is a 1:1 mapping. This will enable sending packets
+on the same queue pair for transmit and receive. This is useful for
+busy polling multi-threaded workloads where there are challenges in
+associating a given CPU to a given application thread. The application
+threads are not pinned to CPUs and each thread handles packets
+received on a single queue. The receive queue number is cached in the
+socket for the connection and there is no need for adding flow entries
+as in the case of aRFS or flow director. In this model, sending the
+packets on the same transmit queue corresponding to the queue-pair
+associated with the receive queue has benefits in keeping the CPU overhead
+low. Transmit completion work is locked into the same queue pair that
+a given application is polling on. This avoids the overhead of triggering
+an interrupt on another CPU. When the application cleans up the packets
+during the busy poll, transmit completion may be processed along with it
+in the same thread context and so result in reduced latency.
+
+XPS is configured per transmit queue by setting a bitmap of
+CPUs/receive-queues that may use that queue to transmit. The reverse
+mapping, from CPUs to transmit queues or from receive-queues to transmit
+queues, is computed and maintained for each network device. When
+transmitting the first packet in a flow, the function get_xps_queue() is
+called to select a queue. This function uses the ID of the receive queue
+for the socket connection for a match in the receive queue-to-transmit queue
+lookup table. Alternatively, this function can also use the ID of the
+running CPU as a key into the CPU-to-queue lookup table. If the
 ID matches a single queue, that is used for transmission. If multiple
 queues match, one is selected by using the flow hash to compute an index
 into the set.
@@ -404,11 +433,15 @@ acknowledged.
 
 XPS

[net-next PATCH v3 0/5] Symmetric queue selection using XPS for Rx queues

2018-06-05 Thread Amritha Nambiar
   4.668
(% of all cache refs)

L1-dcache-load-misses   6.55    6.29
(% of all L1-dcache hits)

LLC-load-misses 13.91   10.44
(% of all LL-cache hits)

---

v3:
- Removed xps_maps array and used separate pointers for rxqs_map and cpus_map
- Used static_key for XPS maps
- Removed skc_rx_ifindex from sock_common
- Record rx_queue_mapping as part of sk_mark_napi_id, sk_mark_rx_queue is
  retained to be used on request sock which has only struct sock_common and not
  a full sock.
- Added documentation in ABI/testing/sysfs-class-net-queues

---

Amritha Nambiar (5):
  net: Refactor XPS for CPUs and Rx queues
  net: Use static_key for XPS maps
  net: Enable Tx queue selection based on Rx queues
  net-sysfs: Add interface for Rx queue(s) map per Tx queue
  Documentation: Add explanation for XPS using Rx-queue(s) map


 Documentation/ABI/testing/sysfs-class-net-queues |   11 +
 Documentation/networking/scaling.txt |   58 
 include/linux/cpumask.h  |   11 +
 include/linux/netdevice.h|   73 +-
 include/net/busy_poll.h  |3 
 include/net/sock.h   |   14 +
 net/core/dev.c   |  292 +++---
 net/core/net-sysfs.c |   85 ++
 net/core/sock.c  |4 
 net/ipv4/tcp_input.c |3 
 10 files changed, 445 insertions(+), 109 deletions(-)

--


[net-next PATCH v3 3/5] net: Enable Tx queue selection based on Rx queues

2018-06-05 Thread Amritha Nambiar
This patch adds support to pick Tx queue based on the Rx queue(s) map
configuration set by the admin through the sysfs attribute
for each Tx queue. If the user configuration for receive queue(s) map
does not apply, then the Tx queue selection falls back to CPU(s) map
based selection and finally to hashing.

Signed-off-by: Amritha Nambiar 
Signed-off-by: Sridhar Samudrala 
---
 include/net/busy_poll.h |3 ++
 include/net/sock.h  |   14 +++
 net/core/dev.c  |   60 ---
 net/core/sock.c |4 +++
 net/ipv4/tcp_input.c|3 ++
 5 files changed, 65 insertions(+), 19 deletions(-)

diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index 71c72a9..fc4fb68 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -136,6 +136,9 @@ static inline void sk_mark_napi_id(struct sock *sk, const 
struct sk_buff *skb)
 #ifdef CONFIG_NET_RX_BUSY_POLL
sk->sk_napi_id = skb->napi_id;
 #endif
+#ifdef CONFIG_XPS
+   sk->sk_rx_queue_mapping = skb_get_rx_queue(skb);
+#endif
 }
 
 /* variant used for unconnected sockets */
diff --git a/include/net/sock.h b/include/net/sock.h
index 4f7c584..12313653 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -139,6 +139,7 @@ typedef __u64 __bitwise __addrpair;
  * @skc_node: main hash linkage for various protocol lookup tables
  * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
  * @skc_tx_queue_mapping: tx queue number for this connection
+ * @skc_rx_queue_mapping: rx queue number for this connection
  * @skc_flags: place holder for sk_flags
  * %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
  * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
@@ -215,6 +216,9 @@ struct sock_common {
struct hlist_nulls_node skc_nulls_node;
};
int skc_tx_queue_mapping;
+#ifdef CONFIG_XPS
+   int skc_rx_queue_mapping;
+#endif
union {
int skc_incoming_cpu;
u32 skc_rcv_wnd;
@@ -326,6 +330,9 @@ struct sock {
 #define sk_nulls_node  __sk_common.skc_nulls_node
 #define sk_refcnt  __sk_common.skc_refcnt
 #define sk_tx_queue_mapping__sk_common.skc_tx_queue_mapping
+#ifdef CONFIG_XPS
+#define sk_rx_queue_mapping__sk_common.skc_rx_queue_mapping
+#endif
 
 #define sk_dontcopy_begin  __sk_common.skc_dontcopy_begin
 #define sk_dontcopy_end__sk_common.skc_dontcopy_end
@@ -1696,6 +1703,13 @@ static inline int sk_tx_queue_get(const struct sock *sk)
return sk ? sk->sk_tx_queue_mapping : -1;
 }
 
+static inline void sk_mark_rx_queue(struct sock *sk, struct sk_buff *skb)
+{
+#ifdef CONFIG_XPS
+   sk->sk_rx_queue_mapping = skb_get_rx_queue(skb);
+#endif
+}
+
 static inline void sk_set_socket(struct sock *sk, struct socket *sock)
 {
sk_tx_queue_clear(sk);
diff --git a/net/core/dev.c b/net/core/dev.c
index bba755f..1880e6c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3479,36 +3479,58 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct 
net_device *dev)
 }
 #endif /* CONFIG_NET_EGRESS */
 
-static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
+#ifdef CONFIG_XPS
+static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
+  struct xps_dev_maps *dev_maps, unsigned int tci)
+{
+   struct xps_map *map;
+   int queue_index = -1;
+
+   if (dev->num_tc) {
+   tci *= dev->num_tc;
+   tci += netdev_get_prio_tc_map(dev, skb->priority);
+   }
+
+   map = rcu_dereference(dev_maps->attr_map[tci]);
+   if (map) {
+   if (map->len == 1)
+   queue_index = map->queues[0];
+   else
+   queue_index = map->queues[reciprocal_scale(
+   skb_get_hash(skb), map->len)];
+   if (unlikely(queue_index >= dev->real_num_tx_queues))
+   queue_index = -1;
+   }
+   return queue_index;
+}
+#endif
+
+static int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
 {
 #ifdef CONFIG_XPS
struct xps_dev_maps *dev_maps;
-   struct xps_map *map;
+   struct sock *sk = skb->sk;
int queue_index = -1;
+   unsigned int tci = 0;
 
if (!static_key_false(_needed))
return -1;
 
+   if (sk && sk->sk_rx_queue_mapping <= dev->num_rx_queues)
+   tci = sk->sk_rx_queue_mapping;
+
rcu_read_lock();
-   dev_maps = rcu_dereference(dev->xps_cpus_map);
-   if (dev_maps) {
-   unsigned int tci = skb->sender_cpu - 1;
+   dev_maps = rcu_dereference(dev->xps_rxqs_map);
+   if (dev_maps)
+   queue_index = __

[net-next PATCH v3 1/5] net: Refactor XPS for CPUs and Rx queues

2018-06-05 Thread Amritha Nambiar
Refactor XPS code to support Tx queue selection based on
CPU(s) map or Rx queue(s) map.

Signed-off-by: Amritha Nambiar 
---
 include/linux/cpumask.h   |   11 ++
 include/linux/netdevice.h |   73 ++
 net/core/dev.c|  228 ++---
 net/core/net-sysfs.c  |4 -
 4 files changed, 234 insertions(+), 82 deletions(-)

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index bf53d89..57f20a0 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -115,12 +115,17 @@ extern struct cpumask __cpu_active_mask;
 #define cpu_active(cpu)((cpu) == 0)
 #endif
 
-/* verify cpu argument to cpumask_* operators */
-static inline unsigned int cpumask_check(unsigned int cpu)
+static inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits)
 {
 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
-   WARN_ON_ONCE(cpu >= nr_cpumask_bits);
+   WARN_ON_ONCE(cpu >= bits);
 #endif /* CONFIG_DEBUG_PER_CPU_MAPS */
+}
+
+/* verify cpu argument to cpumask_* operators */
+static inline unsigned int cpumask_check(unsigned int cpu)
+{
+   cpu_max_bits_warn(cpu, nr_cpumask_bits);
return cpu;
 }
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 6b863ed..354c866 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -730,10 +730,21 @@ struct xps_map {
  */
 struct xps_dev_maps {
struct rcu_head rcu;
-   struct xps_map __rcu *cpu_map[0];
+   struct xps_map __rcu *attr_map[0];
 };
-#define XPS_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \
+
+#define XPS_CPU_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \
(nr_cpu_ids * (_tcs) * sizeof(struct xps_map *)))
+
+#define XPS_RXQ_DEV_MAPS_SIZE(_tcs, _rxqs) (sizeof(struct xps_dev_maps) +\
+   (_rxqs * (_tcs) * sizeof(struct xps_map *)))
+
+enum xps_map_type {
+   XPS_MAP_RXQS,
+   XPS_MAP_CPUS,
+   __XPS_MAP_MAX
+};
+
 #endif /* CONFIG_XPS */
 
 #define TC_MAX_QUEUE   16
@@ -1902,7 +1913,8 @@ struct net_device {
int watchdog_timeo;
 
 #ifdef CONFIG_XPS
-   struct xps_dev_maps __rcu *xps_maps;
+   struct xps_dev_maps __rcu *xps_cpus_map;
+   struct xps_dev_maps __rcu *xps_rxqs_map;
 #endif
 #ifdef CONFIG_NET_CLS_ACT
struct mini_Qdisc __rcu *miniq_egress;
@@ -3251,6 +3263,61 @@ static inline void netif_wake_subqueue(struct net_device 
*dev, u16 queue_index)
 #ifdef CONFIG_XPS
 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
u16 index);
+int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
+ u16 index, enum xps_map_type type);
+
+static inline bool attr_test_mask(unsigned long j, const unsigned long *mask,
+ unsigned int nr_bits)
+{
+   cpu_max_bits_warn(j, nr_bits);
+   return test_bit(j, mask);
+}
+
+static inline bool attr_test_online(unsigned long j,
+   const unsigned long *online_mask,
+   unsigned int nr_bits)
+{
+   cpu_max_bits_warn(j, nr_bits);
+
+   if (online_mask)
+   return test_bit(j, online_mask);
+
+   if (j >= 0 && j < nr_bits)
+   return true;
+
+   return false;
+}
+
+static inline unsigned int attrmask_next(int n, const unsigned long *srcp,
+unsigned int nr_bits)
+{
+   /* -1 is a legal arg here. */
+   if (n != -1)
+   cpu_max_bits_warn(n, nr_bits);
+
+   if (srcp)
+   return find_next_bit(srcp, nr_bits, n + 1);
+
+   return n + 1;
+}
+
+static inline int attrmask_next_and(int n, const unsigned long *src1p,
+   const unsigned long *src2p,
+   unsigned int nr_bits)
+{
+   /* -1 is a legal arg here. */
+   if (n != -1)
+   cpu_max_bits_warn(n, nr_bits);
+
+   if (src1p && src2p)
+   return find_next_and_bit(src1p, src2p, nr_bits, n + 1);
+   else if (src1p)
+   return find_next_bit(src1p, nr_bits, n + 1);
+   else if (src2p)
+   return find_next_bit(src2p, nr_bits, n + 1);
+
+   return n + 1;
+}
 #else
 static inline int netif_set_xps_queue(struct net_device *dev,
  const struct cpumask *mask,
diff --git a/net/core/dev.c b/net/core/dev.c
index 1844d9b..156acbe 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2092,7 +2092,7 @@ static bool remove_xps_queue(struct xps_dev_maps 
*dev_maps,
int pos;
 
if (dev_maps)
-   map = xmap_dereference(dev_maps->cpu_map[tci]);
+   map = xmap_dereference(dev_maps->attr_map[tci]);
if (!map)
return false;
 
@@ -2105,7 +2105,7 @@ static bool remove_xps_queue(struct xps_dev_maps 
*dev_m

[net-next PATCH v3 4/5] net-sysfs: Add interface for Rx queue(s) map per Tx queue

2018-06-05 Thread Amritha Nambiar
Extend transmit queue sysfs attribute to configure Rx queue(s) map
per Tx queue. By default no receive queues are configured for the
Tx queue.

- /sys/class/net/eth0/queues/tx-*/xps_rxqs

Signed-off-by: Amritha Nambiar 
---
 net/core/net-sysfs.c |   81 ++
 1 file changed, 81 insertions(+)

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index b39987c..2ed4317 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1283,6 +1283,86 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue,
 
 static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init
= __ATTR_RW(xps_cpus);
+
+static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf)
+{
+   struct net_device *dev = queue->dev;
+   struct xps_dev_maps *dev_maps;
+   unsigned long *mask, index;
+   int j, len, num_tc = 1, tc = 0;
+
+   mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long),
+  GFP_KERNEL);
+   if (!mask)
+   return -ENOMEM;
+
+   index = get_netdev_queue_index(queue);
+
+   if (dev->num_tc) {
+   num_tc = dev->num_tc;
+   tc = netdev_txq_to_tc(dev, index);
+   if (tc < 0)
+   return -EINVAL;
+   }
+
+   rcu_read_lock();
+   dev_maps = rcu_dereference(dev->xps_rxqs_map);
+   if (dev_maps) {
+   for (j = -1; j = attrmask_next(j, NULL, dev->num_rx_queues),
+j < dev->num_rx_queues;) {
+   int i, tci = j * num_tc + tc;
+   struct xps_map *map;
+
+   map = rcu_dereference(dev_maps->attr_map[tci]);
+   if (!map)
+   continue;
+
+   for (i = map->len; i--;) {
+   if (map->queues[i] == index) {
+   set_bit(j, mask);
+   break;
+   }
+   }
+   }
+   }
+
+   len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues);
+   rcu_read_unlock();
+   kfree(mask);
+
+   return len < PAGE_SIZE ? len : -EINVAL;
+}
+
+static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
+ size_t len)
+{
+   struct net_device *dev = queue->dev;
+   unsigned long *mask, index;
+   int err;
+
+   if (!capable(CAP_NET_ADMIN))
+   return -EPERM;
+
+   mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long),
+  GFP_KERNEL);
+   if (!mask)
+   return -ENOMEM;
+
+   index = get_netdev_queue_index(queue);
+
+   err = bitmap_parse(buf, len, mask, dev->num_rx_queues);
+   if (err) {
+   kfree(mask);
+   return err;
+   }
+
+   err = __netif_set_xps_queue(dev, mask, index, XPS_MAP_RXQS);
+   kfree(mask);
+   return err ? : len;
+}
+
+static struct netdev_queue_attribute xps_rxqs_attribute __ro_after_init
+   = __ATTR_RW(xps_rxqs);
 #endif /* CONFIG_XPS */
 
 static struct attribute *netdev_queue_default_attrs[] __ro_after_init = {
@@ -1290,6 +1370,7 @@ static struct attribute *netdev_queue_default_attrs[] 
__ro_after_init = {
_traffic_class.attr,
 #ifdef CONFIG_XPS
_cpus_attribute.attr,
+   _rxqs_attribute.attr,
_tx_maxrate.attr,
 #endif
NULL



[net-next PATCH v3 2/5] net: Use static_key for XPS maps

2018-06-05 Thread Amritha Nambiar
Use static_key for XPS maps to reduce the cost of extra map checks,
similar to how it is used for RPS and RFS.

Signed-off-by: Amritha Nambiar 
---
 net/core/dev.c |8 
 1 file changed, 8 insertions(+)

diff --git a/net/core/dev.c b/net/core/dev.c
index 156acbe..bba755f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2081,6 +2081,8 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned int 
txq)
 EXPORT_SYMBOL(netdev_txq_to_tc);
 
 #ifdef CONFIG_XPS
+struct static_key xps_needed __read_mostly;
+EXPORT_SYMBOL(xps_needed);
 static DEFINE_MUTEX(xps_map_mutex);
 #define xmap_dereference(P)\
rcu_dereference_protected((P), lockdep_is_held(_map_mutex))
@@ -2189,6 +2191,7 @@ static void netif_reset_xps_queues(struct net_device 
*dev, u16 offset,
 out_no_maps:
type++;
}
+   static_key_slow_dec(_needed);
mutex_unlock(_map_mutex);
 }
 
@@ -2309,6 +2312,8 @@ int __netif_set_xps_queue(struct net_device *dev, const 
unsigned long *mask,
if (!new_dev_maps)
goto out_no_new_maps;
 
+   static_key_slow_inc(_needed);
+
for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
 j < nr_ids;) {
/* copy maps belonging to foreign traffic classes */
@@ -3481,6 +3486,9 @@ static inline int get_xps_queue(struct net_device *dev, 
struct sk_buff *skb)
struct xps_map *map;
int queue_index = -1;
 
+   if (!static_key_false(_needed))
+   return -1;
+
rcu_read_lock();
dev_maps = rcu_dereference(dev->xps_cpus_map);
if (dev_maps) {



[net PATCH] net: Fix a bug in removing queues from XPS map

2018-05-17 Thread Amritha Nambiar
While removing queues from the XPS map, the individual CPU ID
alone was used to index the CPUs map, this should be changed to also
factor in the traffic class mapping for the CPU-to-queue lookup.

Fixes: 184c449f91fe ("net: Add support for XPS with QoS via traffic classes")
Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
Acked-by: Alexander Duyck <alexander.h.du...@intel.com>
---
 net/core/dev.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 9f43901..9397577 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2125,7 +2125,7 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
int i, j;
 
for (i = count, j = offset; i--; j++) {
-   if (!remove_xps_queue(dev_maps, cpu, j))
+   if (!remove_xps_queue(dev_maps, tci, j))
break;
}
 



[net-next PATCH v2 4/4] Documentation: Add explanation for XPS using Rx-queue map

2018-05-15 Thread Amritha Nambiar
Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
---
 Documentation/networking/scaling.txt |   58 --
 1 file changed, 48 insertions(+), 10 deletions(-)

diff --git a/Documentation/networking/scaling.txt 
b/Documentation/networking/scaling.txt
index f55639d..834147c 100644
--- a/Documentation/networking/scaling.txt
+++ b/Documentation/networking/scaling.txt
@@ -366,8 +366,13 @@ XPS: Transmit Packet Steering
 
 Transmit Packet Steering is a mechanism for intelligently selecting
 which transmit queue to use when transmitting a packet on a multi-queue
-device. To accomplish this, a mapping from CPU to hardware queue(s) is
-recorded. The goal of this mapping is usually to assign queues
+device. This can be accomplished by recording two kinds of maps, either
+a mapping of CPU to hardware queue(s) or a mapping of receive queue(s)
+to hardware transmit queue(s).
+
+1. XPS using CPUs map
+
+The goal of this mapping is usually to assign queues
 exclusively to a subset of CPUs, where the transmit completions for
 these queues are processed on a CPU within this set. This choice
 provides two benefits. First, contention on the device queue lock is
@@ -377,12 +382,36 @@ transmit queue). Secondly, cache miss rate on transmit 
completion is
 reduced, in particular for data cache lines that hold the sk_buff
 structures.
 
-XPS is configured per transmit queue by setting a bitmap of CPUs that
-may use that queue to transmit. The reverse mapping, from CPUs to
-transmit queues, is computed and maintained for each network device.
-When transmitting the first packet in a flow, the function
-get_xps_queue() is called to select a queue. This function uses the ID
-of the running CPU as a key into the CPU-to-queue lookup table. If the
+2. XPS using receive queues map
+
+This mapping is used to pick transmit queue based on the receive
+queue(s) map configuration set by the administrator. A set of receive
+queues can be mapped to a set of transmit queues (many:many), although
+the common use case is a 1:1 mapping. This will enable sending packets
+on the same queue pair for transmit and receive. This is useful for
+busy polling multi-threaded workloads where there are challenges in
+associating a given CPU to a given application thread. The application
+threads are not pinned to CPUs and each thread handles packets
+received on a single queue. The receive queue number is cached in the
+socket for the connection and there is no need for adding flow entries
+as in the case of aRFS or flow director. In this model, sending the
+packets on the same transmit queue corresponding to the queue-pair
+associated with the receive queue has benefits in keeping the CPU overhead
+low. Transmit completion work is locked into the same queue pair that
+a given application is polling on. This avoids the overhead of triggering
+an interrupt on another CPU. When the application cleans up the packets
+during the busy poll, transmit completion may be processed along with it
+in the same thread context and so result in reduced latency.
+
+XPS is configured per transmit queue by setting a bitmap of
+CPUs/receive-queues that may use that queue to transmit. The reverse
+mapping, from CPUs to transmit queues or from receive-queues to transmit
+queues, is computed and maintained for each network device. When
+transmitting the first packet in a flow, the function get_xps_queue() is
+called to select a queue. This function uses the ID of the receive queue
+for the socket connection for a match in the receive queue-to-transmit queue
+lookup table. Alternatively, this function can also use the ID of the
+running CPU as a key into the CPU-to-queue lookup table. If the
 ID matches a single queue, that is used for transmission. If multiple
 queues match, one is selected by using the flow hash to compute an index
 into the set.
@@ -404,11 +433,15 @@ acknowledged.
 
 XPS is only available if the kconfig symbol CONFIG_XPS is enabled (on by
 default for SMP). The functionality remains disabled until explicitly
-configured. To enable XPS, the bitmap of CPUs that may use a transmit
-queue is configured using the sysfs file entry:
+configured. To enable XPS, the bitmap of CPUs/receive-queues that may
+use a transmit queue is configured using the sysfs file entry:
 
+For selection based on CPUs map:
/sys/class/net/<dev>/queues/tx-<n>/xps_cpus

+For selection based on receive-queues map:
+/sys/class/net/<dev>/queues/tx-<n>/xps_rxqs
+
 == Suggested Configuration
 
 For a network device with a single transmission queue, XPS configuration
@@ -421,6 +454,11 @@ best CPUs to share a given queue are probably those that 
share the cache
 with the CPU that processes transmit completions for that queue
 (transmit interrupts).
 
+For transmit queue selection based on receive queue(s), XPS has to be
+explicitly configured mapping receive-queue(s) to transmit queue(s). If the
+user configuration for receive-queue map does not apply, then the transmit
+queue is selected based on the CPUs map.

[net-next PATCH v2 0/4] Symmetric queue selection using XPS for Rx queues

2018-05-15 Thread Amritha Nambiar
This patch series implements support for Tx queue selection based on
Rx queue(s) map. This is done by configuring Rx queue(s) map per Tx-queue
using sysfs attribute. If the user configuration for Rx queues does
not apply, then the Tx queue selection falls back to XPS using CPUs and
finally to hashing.

XPS is refactored to support Tx queue selection based on either the
CPUs map or the Rx-queues map. The config option CONFIG_XPS needs to be
enabled. By default no receive queues are configured for the Tx queue.

- /sys/class/net//queues/tx-*/xps_rxqs

This is to enable sending packets on the same Tx-Rx queue pair as this
is useful for busy polling multi-threaded workloads where it is not
possible to pin the threads to a CPU. This is a rework of Sridhar's
patch for symmetric queueing via socket option:
https://www.spinics.net/lists/netdev/msg453106.html

v2:
- Added documentation in networking/scaling.txt
- Added a simple routine to replace multiple ifdef blocks.

---

Amritha Nambiar (4):
  net: Refactor XPS for CPUs and Rx queues
  net: Enable Tx queue selection based on Rx queues
  net-sysfs: Add interface for Rx queue map per Tx queue
  Documentation: Add explanation for XPS using Rx-queue map


 Documentation/networking/scaling.txt |   58 +++-
 include/linux/cpumask.h  |   11 +-
 include/linux/netdevice.h|   72 ++
 include/net/sock.h   |   18 +++
 net/core/dev.c   |  242 +++---
 net/core/net-sysfs.c |   85 
 net/core/sock.c  |5 +
 net/ipv4/tcp_input.c |7 +
 net/ipv4/tcp_ipv4.c  |1 
 net/ipv4/tcp_minisocks.c |1 
 10 files changed, 404 insertions(+), 96 deletions(-)

--


[net-next PATCH v2 1/4] net: Refactor XPS for CPUs and Rx queues

2018-05-15 Thread Amritha Nambiar
Refactor XPS code to support Tx queue selection based on
CPU map or Rx queue map.

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
---
 include/linux/cpumask.h   |   11 ++
 include/linux/netdevice.h |   72 +++-
 net/core/dev.c|  208 +
 net/core/net-sysfs.c  |4 -
 4 files changed, 215 insertions(+), 80 deletions(-)

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index bf53d89..57f20a0 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -115,12 +115,17 @@ extern struct cpumask __cpu_active_mask;
 #define cpu_active(cpu)((cpu) == 0)
 #endif
 
-/* verify cpu argument to cpumask_* operators */
-static inline unsigned int cpumask_check(unsigned int cpu)
+static inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits)
 {
 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
-   WARN_ON_ONCE(cpu >= nr_cpumask_bits);
+   WARN_ON_ONCE(cpu >= bits);
 #endif /* CONFIG_DEBUG_PER_CPU_MAPS */
+}
+
+/* verify cpu argument to cpumask_* operators */
+static inline unsigned int cpumask_check(unsigned int cpu)
+{
+   cpu_max_bits_warn(cpu, nr_cpumask_bits);
return cpu;
 }
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 03ed492..c2eeb36 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -730,10 +730,21 @@ struct xps_map {
  */
 struct xps_dev_maps {
struct rcu_head rcu;
-   struct xps_map __rcu *cpu_map[0];
+   struct xps_map __rcu *attr_map[0];
 };
-#define XPS_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \
+
+#define XPS_CPU_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \
(nr_cpu_ids * (_tcs) * sizeof(struct xps_map *)))
+
+#define XPS_RXQ_DEV_MAPS_SIZE(_tcs, _rxqs) (sizeof(struct xps_dev_maps) +\
+   (_rxqs * (_tcs) * sizeof(struct xps_map *)))
+
+enum xps_map_type {
+   XPS_MAP_RXQS,
+   XPS_MAP_CPUS,
+   __XPS_MAP_MAX
+};
+
 #endif /* CONFIG_XPS */
 
 #define TC_MAX_QUEUE   16
@@ -1891,7 +1902,7 @@ struct net_device {
int watchdog_timeo;
 
 #ifdef CONFIG_XPS
-   struct xps_dev_maps __rcu *xps_maps;
+   struct xps_dev_maps __rcu *xps_maps[__XPS_MAP_MAX];
 #endif
 #ifdef CONFIG_NET_CLS_ACT
struct mini_Qdisc __rcu *miniq_egress;
@@ -3229,6 +3240,61 @@ static inline void netif_wake_subqueue(struct net_device 
*dev, u16 queue_index)
 #ifdef CONFIG_XPS
 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
u16 index);
+int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
+ u16 index, enum xps_map_type type);
+
+static inline bool attr_test_mask(unsigned long j, const unsigned long *mask,
+ unsigned int nr_bits)
+{
+   cpu_max_bits_warn(j, nr_bits);
+   return test_bit(j, mask);
+}
+
+static inline bool attr_test_online(unsigned long j,
+   const unsigned long *online_mask,
+   unsigned int nr_bits)
+{
+   cpu_max_bits_warn(j, nr_bits);
+
+   if (online_mask)
+   return test_bit(j, online_mask);
+
+   if (j >= 0 && j < nr_bits)
+   return true;
+
+   return false;
+}
+
+static inline unsigned int attrmask_next(int n, const unsigned long *srcp,
+unsigned int nr_bits)
+{
+   /* -1 is a legal arg here. */
+   if (n != -1)
+   cpu_max_bits_warn(n, nr_bits);
+
+   if (srcp)
+   return find_next_bit(srcp, nr_bits, n + 1);
+
+   return n + 1;
+}
+
+static inline int attrmask_next_and(int n, const unsigned long *src1p,
+   const unsigned long *src2p,
+   unsigned int nr_bits)
+{
+   /* -1 is a legal arg here. */
+   if (n != -1)
+   cpu_max_bits_warn(n, nr_bits);
+
+   if (src1p && src2p)
+   return find_next_and_bit(src1p, src2p, nr_bits, n + 1);
+   else if (src1p)
+   return find_next_bit(src1p, nr_bits, n + 1);
+   else if (src2p)
+   return find_next_bit(src2p, nr_bits, n + 1);
+
+   return n + 1;
+}
 #else
 static inline int netif_set_xps_queue(struct net_device *dev,
  const struct cpumask *mask,
diff --git a/net/core/dev.c b/net/core/dev.c
index 9f43901..7e5dfdb 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2092,7 +2092,7 @@ static bool remove_xps_queue(struct xps_dev_maps 
*dev_maps,
int pos;
 
if (dev_maps)
-   map = xmap_dereference(dev_maps->cpu_map[tci]);
+   map = xmap_dereference(dev_maps->attr_map[tci]);
if (!map)
return false;
 
@@ -2105,7 +2105,7 @@ static bool remove_xps_queue(struct xps_dev_maps 

[net-next PATCH v2 3/4] net-sysfs: Add interface for Rx queue map per Tx queue

2018-05-15 Thread Amritha Nambiar
Extend transmit queue sysfs attribute to configure Rx queue map
per Tx queue. By default no receive queues are configured for the
Tx queue.

- /sys/class/net/eth0/queues/tx-*/xps_rxqs

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
---
 net/core/net-sysfs.c |   81 ++
 1 file changed, 81 insertions(+)

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index d7abd33..0654243 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1283,6 +1283,86 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue,
 
 static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init
= __ATTR_RW(xps_cpus);
+
+static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf)
+{
+   struct net_device *dev = queue->dev;
+   struct xps_dev_maps *dev_maps;
+   unsigned long *mask, index;
+   int j, len, num_tc = 1, tc = 0;
+
+   mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long),
+  GFP_KERNEL);
+   if (!mask)
+   return -ENOMEM;
+
+   index = get_netdev_queue_index(queue);
+
+   if (dev->num_tc) {
+   num_tc = dev->num_tc;
+   tc = netdev_txq_to_tc(dev, index);
+   if (tc < 0)
+   return -EINVAL;
+   }
+
+   rcu_read_lock();
+   dev_maps = rcu_dereference(dev->xps_maps[XPS_MAP_RXQS]);
+   if (dev_maps) {
+   for (j = -1; j = attrmask_next(j, NULL, dev->num_rx_queues),
+j < dev->num_rx_queues;) {
+   int i, tci = j * num_tc + tc;
+   struct xps_map *map;
+
+   map = rcu_dereference(dev_maps->attr_map[tci]);
+   if (!map)
+   continue;
+
+   for (i = map->len; i--;) {
+   if (map->queues[i] == index) {
+   set_bit(j, mask);
+   break;
+   }
+   }
+   }
+   }
+
+   len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues);
+   rcu_read_unlock();
+   kfree(mask);
+
+   return len < PAGE_SIZE ? len : -EINVAL;
+}
+
+static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
+ size_t len)
+{
+   struct net_device *dev = queue->dev;
+   unsigned long *mask, index;
+   int err;
+
+   if (!capable(CAP_NET_ADMIN))
+   return -EPERM;
+
+   mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long),
+  GFP_KERNEL);
+   if (!mask)
+   return -ENOMEM;
+
+   index = get_netdev_queue_index(queue);
+
+   err = bitmap_parse(buf, len, mask, dev->num_rx_queues);
+   if (err) {
+   kfree(mask);
+   return err;
+   }
+
+   err = __netif_set_xps_queue(dev, mask, index, XPS_MAP_RXQS);
+   kfree(mask);
+   return err ? : len;
+}
+
+static struct netdev_queue_attribute xps_rxqs_attribute __ro_after_init
+   = __ATTR_RW(xps_rxqs);
 #endif /* CONFIG_XPS */
 
 static struct attribute *netdev_queue_default_attrs[] __ro_after_init = {
@@ -1290,6 +1370,7 @@ static struct attribute *netdev_queue_default_attrs[] 
__ro_after_init = {
	&queue_traffic_class.attr,
 #ifdef CONFIG_XPS
	&xps_cpus_attribute.attr,
+	&xps_rxqs_attribute.attr,
	&queue_tx_maxrate.attr,
 #endif
NULL



[net-next PATCH v2 2/4] net: Enable Tx queue selection based on Rx queues

2018-05-15 Thread Amritha Nambiar
This patch adds support to pick Tx queue based on the Rx queue map
configuration set by the admin through the sysfs attribute
for each Tx queue. If the user configuration for receive
queue map does not apply, then the Tx queue selection falls back
to CPU map based selection and finally to hashing.

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
Signed-off-by: Sridhar Samudrala <sridhar.samudr...@intel.com>
---
 include/net/sock.h   |   18 ++
 net/core/dev.c   |   36 +---
 net/core/sock.c  |5 +
 net/ipv4/tcp_input.c |7 +++
 net/ipv4/tcp_ipv4.c  |1 +
 net/ipv4/tcp_minisocks.c |1 +
 6 files changed, 61 insertions(+), 7 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 4f7c584..0613f63 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -139,6 +139,8 @@ typedef __u64 __bitwise __addrpair;
  * @skc_node: main hash linkage for various protocol lookup tables
  * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
  * @skc_tx_queue_mapping: tx queue number for this connection
+ * @skc_rx_queue_mapping: rx queue number for this connection
+ * @skc_rx_ifindex: rx ifindex for this connection
  * @skc_flags: place holder for sk_flags
  * %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
  * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
@@ -215,6 +217,10 @@ struct sock_common {
struct hlist_nulls_node skc_nulls_node;
};
int skc_tx_queue_mapping;
+#ifdef CONFIG_XPS
+   int skc_rx_queue_mapping;
+   int skc_rx_ifindex;
+#endif
union {
int skc_incoming_cpu;
u32 skc_rcv_wnd;
@@ -326,6 +332,10 @@ struct sock {
 #define sk_nulls_node  __sk_common.skc_nulls_node
 #define sk_refcnt  __sk_common.skc_refcnt
 #define sk_tx_queue_mapping__sk_common.skc_tx_queue_mapping
+#ifdef CONFIG_XPS
+#define sk_rx_queue_mapping__sk_common.skc_rx_queue_mapping
+#define sk_rx_ifindex  __sk_common.skc_rx_ifindex
+#endif
 
 #define sk_dontcopy_begin  __sk_common.skc_dontcopy_begin
 #define sk_dontcopy_end__sk_common.skc_dontcopy_end
@@ -1696,6 +1706,14 @@ static inline int sk_tx_queue_get(const struct sock *sk)
return sk ? sk->sk_tx_queue_mapping : -1;
 }
 
+static inline void sk_mark_rx_queue(struct sock *sk, struct sk_buff *skb)
+{
+#ifdef CONFIG_XPS
+   sk->sk_rx_ifindex = skb->skb_iif;
+   sk->sk_rx_queue_mapping = skb_get_rx_queue(skb);
+#endif
+}
+
 static inline void sk_set_socket(struct sock *sk, struct socket *sock)
 {
sk_tx_queue_clear(sk);
diff --git a/net/core/dev.c b/net/core/dev.c
index 7e5dfdb..4030368 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3458,18 +3458,14 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct 
net_device *dev)
 }
 #endif /* CONFIG_NET_EGRESS */
 
-static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
-{
 #ifdef CONFIG_XPS
-   struct xps_dev_maps *dev_maps;
+static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
+  struct xps_dev_maps *dev_maps, unsigned int tci)
+{
struct xps_map *map;
int queue_index = -1;
 
-   rcu_read_lock();
-   dev_maps = rcu_dereference(dev->xps_maps[XPS_MAP_CPUS]);
if (dev_maps) {
-   unsigned int tci = skb->sender_cpu - 1;
-
if (dev->num_tc) {
tci *= dev->num_tc;
tci += netdev_get_prio_tc_map(dev, skb->priority);
@@ -3486,6 +3482,32 @@ static inline int get_xps_queue(struct net_device *dev, 
struct sk_buff *skb)
queue_index = -1;
}
}
+   return queue_index;
+}
+#endif
+
+static int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
+{
+#ifdef CONFIG_XPS
+   enum xps_map_type i = XPS_MAP_RXQS;
+   struct xps_dev_maps *dev_maps;
+   struct sock *sk = skb->sk;
+   int queue_index = -1;
+   unsigned int tci = 0;
+
+   if (sk && sk->sk_rx_queue_mapping <= dev->real_num_rx_queues &&
+   dev->ifindex == sk->sk_rx_ifindex)
+   tci = sk->sk_rx_queue_mapping;
+
+   rcu_read_lock();
+   while (queue_index < 0 && i < __XPS_MAP_MAX) {
+   if (i == XPS_MAP_CPUS)
+   tci = skb->sender_cpu - 1;
+   dev_maps = rcu_dereference(dev->xps_maps[i]);
+   queue_index = __get_xps_queue_idx(dev, skb, dev_maps, tci);
+   i++;
+   }
+
rcu_read_unlock();
 
return queue_index;
diff --git a/net/core/sock.c b/net/core/sock.c
index 042cfc6..73d7fa8 100644
--- a/net/core/sock.c

[net-next PATCH 1/3] net: Refactor XPS for CPUs and Rx queues

2018-04-19 Thread Amritha Nambiar
Refactor XPS code to support Tx queue selection based on
CPU map or Rx queue map.

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
---
 include/linux/netdevice.h |   82 +-
 net/core/dev.c|  206 +
 net/core/net-sysfs.c  |4 -
 3 files changed, 216 insertions(+), 76 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 14e0777..40a9171 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -730,10 +730,21 @@ struct xps_map {
  */
 struct xps_dev_maps {
struct rcu_head rcu;
-   struct xps_map __rcu *cpu_map[0];
+   struct xps_map __rcu *attr_map[0];
 };
-#define XPS_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \
+
+#define XPS_CPU_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \
(nr_cpu_ids * (_tcs) * sizeof(struct xps_map *)))
+
+#define XPS_RXQ_DEV_MAPS_SIZE(_tcs, _rxqs) (sizeof(struct xps_dev_maps) +\
+   (_rxqs * (_tcs) * sizeof(struct xps_map *)))
+
+enum xps_map_type {
+   XPS_MAP_RXQS,
+   XPS_MAP_CPUS,
+   __XPS_MAP_MAX
+};
+
 #endif /* CONFIG_XPS */
 
 #define TC_MAX_QUEUE   16
@@ -1867,7 +1878,7 @@ struct net_device {
int watchdog_timeo;
 
 #ifdef CONFIG_XPS
-   struct xps_dev_maps __rcu *xps_maps;
+   struct xps_dev_maps __rcu *xps_maps[__XPS_MAP_MAX];
 #endif
 #ifdef CONFIG_NET_CLS_ACT
struct mini_Qdisc __rcu *miniq_egress;
@@ -3204,6 +3215,71 @@ static inline void netif_wake_subqueue(struct net_device 
*dev, u16 queue_index)
 #ifdef CONFIG_XPS
 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
u16 index);
+int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
+ u16 index, enum xps_map_type type);
+
+static inline bool attr_test_mask(unsigned long j, const unsigned long *mask,
+ unsigned int nr_bits)
+{
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+   WARN_ON_ONCE(j >= nr_bits);
+#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
+   return test_bit(j, mask);
+}
+
+static inline bool attr_test_online(unsigned long j,
+   const unsigned long *online_mask,
+   unsigned int nr_bits)
+{
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+   WARN_ON_ONCE(j >= nr_bits);
+#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
+
+   if (online_mask)
+   return test_bit(j, online_mask);
+
+   if (j >= 0 && j < nr_bits)
+   return true;
+
+   return false;
+}
+
+static inline unsigned int attrmask_next(int n, const unsigned long *srcp,
+unsigned int nr_bits)
+{
+   /* -1 is a legal arg here. */
+   if (n != -1) {
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+   WARN_ON_ONCE(n >= nr_bits);
+#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
+   }
+
+   if (srcp)
+   return find_next_bit(srcp, nr_bits, n + 1);
+
+   return n + 1;
+}
+
+static inline int attrmask_next_and(int n, const unsigned long *src1p,
+   const unsigned long *src2p,
+   unsigned int nr_bits)
+{
+   /* -1 is a legal arg here. */
+   if (n != -1) {
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+   WARN_ON_ONCE(n >= nr_bits);
+#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
+   }
+
+   if (src1p && src2p)
+   return find_next_and_bit(src1p, src2p, nr_bits, n + 1);
+   else if (src1p)
+   return find_next_bit(src1p, nr_bits, n + 1);
+   else if (src2p)
+   return find_next_bit(src2p, nr_bits, n + 1);
+
+   return n + 1;
+}
 #else
 static inline int netif_set_xps_queue(struct net_device *dev,
  const struct cpumask *mask,
diff --git a/net/core/dev.c b/net/core/dev.c
index a490ef6..17c4883 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2092,7 +2092,7 @@ static bool remove_xps_queue(struct xps_dev_maps 
*dev_maps,
int pos;
 
if (dev_maps)
-   map = xmap_dereference(dev_maps->cpu_map[tci]);
+   map = xmap_dereference(dev_maps->attr_map[tci]);
if (!map)
return false;
 
@@ -2105,7 +2105,7 @@ static bool remove_xps_queue(struct xps_dev_maps 
*dev_maps,
break;
}
 
-   RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
+   RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
kfree_rcu(map, rcu);
return false;
}
@@ -2138,30 +2138,47 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
 static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
   u16 count)
 {
+   const unsigned long *possible_mask = NULL;
+   enum xps_map_type type =

[net-next PATCH 3/3] net-sysfs: Add interface for Rx queue map per Tx queue

2018-04-19 Thread Amritha Nambiar
Extend transmit queue sysfs attribute to configure Rx queue map
per Tx queue. By default no receive queues are configured for the
Tx queue.

- /sys/class/net/eth0/queues/tx-*/xps_rxqs

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
---
 net/core/net-sysfs.c |   81 ++
 1 file changed, 81 insertions(+)

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index d7abd33..0654243 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1283,6 +1283,86 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue,
 
 static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init
= __ATTR_RW(xps_cpus);
+
+static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf)
+{
+   struct net_device *dev = queue->dev;
+   struct xps_dev_maps *dev_maps;
+   unsigned long *mask, index;
+   int j, len, num_tc = 1, tc = 0;
+
+   mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long),
+  GFP_KERNEL);
+   if (!mask)
+   return -ENOMEM;
+
+   index = get_netdev_queue_index(queue);
+
+   if (dev->num_tc) {
+   num_tc = dev->num_tc;
+   tc = netdev_txq_to_tc(dev, index);
+   if (tc < 0)
+   return -EINVAL;
+   }
+
+   rcu_read_lock();
+   dev_maps = rcu_dereference(dev->xps_maps[XPS_MAP_RXQS]);
+   if (dev_maps) {
+   for (j = -1; j = attrmask_next(j, NULL, dev->num_rx_queues),
+j < dev->num_rx_queues;) {
+   int i, tci = j * num_tc + tc;
+   struct xps_map *map;
+
+   map = rcu_dereference(dev_maps->attr_map[tci]);
+   if (!map)
+   continue;
+
+   for (i = map->len; i--;) {
+   if (map->queues[i] == index) {
+   set_bit(j, mask);
+   break;
+   }
+   }
+   }
+   }
+
+   len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues);
+   rcu_read_unlock();
+   kfree(mask);
+
+   return len < PAGE_SIZE ? len : -EINVAL;
+}
+
+static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
+ size_t len)
+{
+   struct net_device *dev = queue->dev;
+   unsigned long *mask, index;
+   int err;
+
+   if (!capable(CAP_NET_ADMIN))
+   return -EPERM;
+
+   mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long),
+  GFP_KERNEL);
+   if (!mask)
+   return -ENOMEM;
+
+   index = get_netdev_queue_index(queue);
+
+   err = bitmap_parse(buf, len, mask, dev->num_rx_queues);
+   if (err) {
+   kfree(mask);
+   return err;
+   }
+
+   err = __netif_set_xps_queue(dev, mask, index, XPS_MAP_RXQS);
+   kfree(mask);
+   return err ? : len;
+}
+
+static struct netdev_queue_attribute xps_rxqs_attribute __ro_after_init
+   = __ATTR_RW(xps_rxqs);
 #endif /* CONFIG_XPS */
 
 static struct attribute *netdev_queue_default_attrs[] __ro_after_init = {
@@ -1290,6 +1370,7 @@ static struct attribute *netdev_queue_default_attrs[] 
__ro_after_init = {
	&queue_traffic_class.attr,
 #ifdef CONFIG_XPS
	&xps_cpus_attribute.attr,
+	&xps_rxqs_attribute.attr,
	&queue_tx_maxrate.attr,
 #endif
NULL



[net-next PATCH 0/3] Symmetric queue selection using XPS for Rx queues

2018-04-19 Thread Amritha Nambiar
This patch series implements support for Tx queue selection based on
Rx queue map. This is done by configuring Rx queue map per Tx-queue
using sysfs attribute. If the user configuration for Rx queues does
not apply, then the Tx queue selection falls back to XPS using CPUs and
finally to hashing.

XPS is refactored to support Tx queue selection based on either the
CPU map or the Rx-queue map. The config option CONFIG_XPS needs to be
enabled. By default no receive queues are configured for the Tx queue.

- /sys/class/net/eth0/queues/tx-*/xps_rxqs

This is to enable sending packets on the same Tx-Rx queue pair as this
is useful for busy polling multi-threaded workloads where it is not
possible to pin the threads to a CPU. This is a rework of Sridhar's
patch for symmetric queueing via socket option:
https://www.spinics.net/lists/netdev/msg453106.html

---

Amritha Nambiar (3):
  net: Refactor XPS for CPUs and Rx queues
  net: Enable Tx queue selection based on Rx queues
  net-sysfs: Add interface for Rx queue map per Tx queue


 include/linux/netdevice.h |   82 +++
 include/net/sock.h|   18 +++
 net/core/dev.c|  240 +++--
 net/core/net-sysfs.c  |   85 
 net/core/sock.c   |5 +
 net/ipv4/tcp_input.c  |7 +
 net/ipv4/tcp_ipv4.c   |1 
 net/ipv4/tcp_minisocks.c  |1 
 8 files changed, 357 insertions(+), 82 deletions(-)

--


[net-next PATCH 2/3] net: Enable Tx queue selection based on Rx queues

2018-04-19 Thread Amritha Nambiar
This patch adds support to pick Tx queue based on the Rx queue map
configuration set by the admin through the sysfs attribute
for each Tx queue. If the user configuration for receive
queue map does not apply, then the Tx queue selection falls back
to CPU map based selection and finally to hashing.

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
Signed-off-by: Sridhar Samudrala <sridhar.samudr...@intel.com>
---
 include/net/sock.h   |   18 ++
 net/core/dev.c   |   36 +---
 net/core/sock.c  |5 +
 net/ipv4/tcp_input.c |7 +++
 net/ipv4/tcp_ipv4.c  |1 +
 net/ipv4/tcp_minisocks.c |1 +
 6 files changed, 61 insertions(+), 7 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 74d725f..f10b2a2 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -139,6 +139,8 @@ typedef __u64 __bitwise __addrpair;
  * @skc_node: main hash linkage for various protocol lookup tables
  * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
  * @skc_tx_queue_mapping: tx queue number for this connection
+ * @skc_rx_queue_mapping: rx queue number for this connection
+ * @skc_rx_ifindex: rx ifindex for this connection
  * @skc_flags: place holder for sk_flags
  * %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
  * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
@@ -215,6 +217,10 @@ struct sock_common {
struct hlist_nulls_node skc_nulls_node;
};
int skc_tx_queue_mapping;
+#ifdef CONFIG_XPS
+   int skc_rx_queue_mapping;
+   int skc_rx_ifindex;
+#endif
union {
int skc_incoming_cpu;
u32 skc_rcv_wnd;
@@ -326,6 +332,10 @@ struct sock {
 #define sk_nulls_node  __sk_common.skc_nulls_node
 #define sk_refcnt  __sk_common.skc_refcnt
 #define sk_tx_queue_mapping__sk_common.skc_tx_queue_mapping
+#ifdef CONFIG_XPS
+#define sk_rx_queue_mapping__sk_common.skc_rx_queue_mapping
+#define sk_rx_ifindex  __sk_common.skc_rx_ifindex
+#endif
 
 #define sk_dontcopy_begin  __sk_common.skc_dontcopy_begin
 #define sk_dontcopy_end__sk_common.skc_dontcopy_end
@@ -1691,6 +1701,14 @@ static inline int sk_tx_queue_get(const struct sock *sk)
return sk ? sk->sk_tx_queue_mapping : -1;
 }
 
+static inline void sk_mark_rx_queue(struct sock *sk, struct sk_buff *skb)
+{
+#ifdef CONFIG_XPS
+   sk->sk_rx_ifindex = skb->skb_iif;
+   sk->sk_rx_queue_mapping = skb_get_rx_queue(skb);
+#endif
+}
+
 static inline void sk_set_socket(struct sock *sk, struct socket *sock)
 {
sk_tx_queue_clear(sk);
diff --git a/net/core/dev.c b/net/core/dev.c
index 17c4883..cf24d47 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3456,18 +3456,14 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct 
net_device *dev)
 }
 #endif /* CONFIG_NET_EGRESS */
 
-static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
-{
 #ifdef CONFIG_XPS
-   struct xps_dev_maps *dev_maps;
+static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
+  struct xps_dev_maps *dev_maps, unsigned int tci)
+{
struct xps_map *map;
int queue_index = -1;
 
-   rcu_read_lock();
-   dev_maps = rcu_dereference(dev->xps_maps[XPS_MAP_CPUS]);
if (dev_maps) {
-   unsigned int tci = skb->sender_cpu - 1;
-
if (dev->num_tc) {
tci *= dev->num_tc;
tci += netdev_get_prio_tc_map(dev, skb->priority);
@@ -3484,6 +3480,32 @@ static inline int get_xps_queue(struct net_device *dev, 
struct sk_buff *skb)
queue_index = -1;
}
}
+   return queue_index;
+}
+#endif
+
+static int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
+{
+#ifdef CONFIG_XPS
+   enum xps_map_type i = XPS_MAP_RXQS;
+   struct xps_dev_maps *dev_maps;
+   struct sock *sk = skb->sk;
+   int queue_index = -1;
+   unsigned int tci = 0;
+
+   if (sk && sk->sk_rx_queue_mapping <= dev->real_num_rx_queues &&
+   dev->ifindex == sk->sk_rx_ifindex)
+   tci = sk->sk_rx_queue_mapping;
+
+   rcu_read_lock();
+   while (queue_index < 0 && i < __XPS_MAP_MAX) {
+   if (i == XPS_MAP_CPUS)
+   tci = skb->sender_cpu - 1;
+   dev_maps = rcu_dereference(dev->xps_maps[i]);
+   queue_index = __get_xps_queue_idx(dev, skb, dev_maps, tci);
+   i++;
+   }
+
rcu_read_unlock();
 
return queue_index;
diff --git a/net/core/sock.c b/net/core/sock.c
index b2c3db1..f7a4b46 100644
--- a/net/core/sock.c

[jkirsher/next-queue, RFC PATCH 3/3] net-sysfs: Add interface for Rx queue map per Tx queue

2018-04-04 Thread Amritha Nambiar
Extend transmit queue sysfs attribute to configure Rx queue map
per Tx queue. By default no receive queues are configured for the
Tx queue.

- /sys/class/net/eth0/queues/tx-*/xps_rxqs

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
---
 net/core/net-sysfs.c |   81 ++
 1 file changed, 81 insertions(+)

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index d7abd33..0654243 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1283,6 +1283,86 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue,
 
 static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init
= __ATTR_RW(xps_cpus);
+
+static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf)
+{
+   struct net_device *dev = queue->dev;
+   struct xps_dev_maps *dev_maps;
+   unsigned long *mask, index;
+   int j, len, num_tc = 1, tc = 0;
+
+   mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long),
+  GFP_KERNEL);
+   if (!mask)
+   return -ENOMEM;
+
+   index = get_netdev_queue_index(queue);
+
+   if (dev->num_tc) {
+   num_tc = dev->num_tc;
+   tc = netdev_txq_to_tc(dev, index);
+   if (tc < 0)
+   return -EINVAL;
+   }
+
+   rcu_read_lock();
+   dev_maps = rcu_dereference(dev->xps_maps[XPS_MAP_RXQS]);
+   if (dev_maps) {
+   for (j = -1; j = attrmask_next(j, NULL, dev->num_rx_queues),
+j < dev->num_rx_queues;) {
+   int i, tci = j * num_tc + tc;
+   struct xps_map *map;
+
+   map = rcu_dereference(dev_maps->attr_map[tci]);
+   if (!map)
+   continue;
+
+   for (i = map->len; i--;) {
+   if (map->queues[i] == index) {
+   set_bit(j, mask);
+   break;
+   }
+   }
+   }
+   }
+
+   len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues);
+   rcu_read_unlock();
+   kfree(mask);
+
+   return len < PAGE_SIZE ? len : -EINVAL;
+}
+
+static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
+ size_t len)
+{
+   struct net_device *dev = queue->dev;
+   unsigned long *mask, index;
+   int err;
+
+   if (!capable(CAP_NET_ADMIN))
+   return -EPERM;
+
+   mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long),
+  GFP_KERNEL);
+   if (!mask)
+   return -ENOMEM;
+
+   index = get_netdev_queue_index(queue);
+
+   err = bitmap_parse(buf, len, mask, dev->num_rx_queues);
+   if (err) {
+   kfree(mask);
+   return err;
+   }
+
+   err = __netif_set_xps_queue(dev, mask, index, XPS_MAP_RXQS);
+   kfree(mask);
+   return err ? : len;
+}
+
+static struct netdev_queue_attribute xps_rxqs_attribute __ro_after_init
+   = __ATTR_RW(xps_rxqs);
 #endif /* CONFIG_XPS */
 
 static struct attribute *netdev_queue_default_attrs[] __ro_after_init = {
@@ -1290,6 +1370,7 @@ static struct attribute *netdev_queue_default_attrs[] 
__ro_after_init = {
	&queue_traffic_class.attr,
 #ifdef CONFIG_XPS
	&xps_cpus_attribute.attr,
+	&xps_rxqs_attribute.attr,
	&queue_tx_maxrate.attr,
 #endif
NULL



[jkirsher/next-queue, RFC PATCH 0/3] Symmetric queue selection using XPS for Rx queues

2018-04-04 Thread Amritha Nambiar
This patch series implements support for Tx queue selection based on
Rx queue map. This is done by configuring Rx queue map per Tx-queue
using sysfs attribute. If the user configuration for Rx queues does
not apply, then the Tx queue selection falls back to XPS using CPUs and
finally to hashing.

XPS is refactored to support Tx queue selection based on either the
CPU map or the Rx-queue map. The config option CONFIG_XPS needs to be
enabled. By default no receive queues are configured for the Tx queue.

- /sys/class/net/eth0/queues/tx-*/xps_rxqs

This is to enable sending packets on the same Tx-Rx queue pair as this
is useful for busy polling multi-threaded workloads where it is not
possible to pin the threads to a CPU. This is a rework of Sridhar's
patch for symmetric queueing via socket option:
https://www.spinics.net/lists/netdev/msg453106.html

---

Amritha Nambiar (3):
  net: Refactor XPS for CPUs and Rx queues
  net: Enable Tx queue selection based on Rx queues
  net-sysfs: Add interface for Rx queue map per Tx queue


 include/linux/netdevice.h |   82 +++
 include/net/sock.h|   18 +++
 net/core/dev.c|  242 +++--
 net/core/net-sysfs.c  |   85 +++-
 net/core/sock.c   |5 +
 net/ipv4/tcp_input.c  |7 +
 net/ipv4/tcp_ipv4.c   |1 
 net/ipv4/tcp_minisocks.c  |1 
 8 files changed, 360 insertions(+), 81 deletions(-)

--


[jkirsher/next-queue, RFC PATCH 2/3] net: Enable Tx queue selection based on Rx queues

2018-04-04 Thread Amritha Nambiar
This patch adds support to pick Tx queue based on the Rx queue map
configuration set by the admin through the sysfs attribute
for each Tx queue. If the user configuration for receive
queue map does not apply, then the Tx queue selection falls back
to CPU map based selection and finally to hashing.

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
Signed-off-by: Sridhar Samudrala <sridhar.samudr...@intel.com>
---
 include/net/sock.h   |   18 ++
 net/core/dev.c   |   36 ++--
 net/core/sock.c  |5 +
 net/ipv4/tcp_input.c |7 +++
 net/ipv4/tcp_ipv4.c  |1 +
 net/ipv4/tcp_minisocks.c |1 +
 6 files changed, 62 insertions(+), 6 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 49bd2c1..53d58bc 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -139,6 +139,8 @@ typedef __u64 __bitwise __addrpair;
  * @skc_node: main hash linkage for various protocol lookup tables
  * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
  * @skc_tx_queue_mapping: tx queue number for this connection
+ * @skc_rx_queue_mapping: rx queue number for this connection
+ * @skc_rx_ifindex: rx ifindex for this connection
  * @skc_flags: place holder for sk_flags
  * %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
  * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
@@ -215,6 +217,10 @@ struct sock_common {
struct hlist_nulls_node skc_nulls_node;
};
int skc_tx_queue_mapping;
+#ifdef CONFIG_XPS
+   int skc_rx_queue_mapping;
+   int skc_rx_ifindex;
+#endif
union {
int skc_incoming_cpu;
u32 skc_rcv_wnd;
@@ -326,6 +332,10 @@ struct sock {
 #define sk_nulls_node  __sk_common.skc_nulls_node
 #define sk_refcnt  __sk_common.skc_refcnt
 #define sk_tx_queue_mapping__sk_common.skc_tx_queue_mapping
+#ifdef CONFIG_XPS
+#define sk_rx_queue_mapping__sk_common.skc_rx_queue_mapping
+#define sk_rx_ifindex  __sk_common.skc_rx_ifindex
+#endif
 
 #define sk_dontcopy_begin  __sk_common.skc_dontcopy_begin
 #define sk_dontcopy_end__sk_common.skc_dontcopy_end
@@ -1691,6 +1701,14 @@ static inline int sk_tx_queue_get(const struct sock *sk)
return sk ? sk->sk_tx_queue_mapping : -1;
 }
 
+static inline void sk_mark_rx_queue(struct sock *sk, struct sk_buff *skb)
+{
+#ifdef CONFIG_XPS
+   sk->sk_rx_ifindex = skb->skb_iif;
+   sk->sk_rx_queue_mapping = skb_get_rx_queue(skb);
+#endif
+}
+
 static inline void sk_set_socket(struct sock *sk, struct socket *sock)
 {
sk_tx_queue_clear(sk);
diff --git a/net/core/dev.c b/net/core/dev.c
index 4cfc179..d43f1c2 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3457,18 +3457,14 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct 
net_device *dev)
 }
 #endif /* CONFIG_NET_EGRESS */
 
-static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
+static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
+  struct xps_dev_maps *dev_maps, unsigned int tci)
 {
 #ifdef CONFIG_XPS
-   struct xps_dev_maps *dev_maps;
struct xps_map *map;
int queue_index = -1;
 
-   rcu_read_lock();
-   dev_maps = rcu_dereference(dev->xps_maps[XPS_MAP_CPUS]);
if (dev_maps) {
-   unsigned int tci = skb->sender_cpu - 1;
-
if (dev->num_tc) {
tci *= dev->num_tc;
tci += netdev_get_prio_tc_map(dev, skb->priority);
@@ -3485,6 +3481,34 @@ static inline int get_xps_queue(struct net_device *dev, 
struct sk_buff *skb)
queue_index = -1;
}
}
+   return queue_index;
+#else
+   return -1;
+#endif
+}
+
+static int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
+{
+#ifdef CONFIG_XPS
+   enum xps_map_type i = XPS_MAP_RXQS;
+   struct xps_dev_maps *dev_maps;
+   struct sock *sk = skb->sk;
+   int queue_index = -1;
+   unsigned int tci = 0;
+
+   if (sk && sk->sk_rx_queue_mapping <= dev->real_num_rx_queues &&
+   dev->ifindex == sk->sk_rx_ifindex)
+   tci = sk->sk_rx_queue_mapping;
+
+   rcu_read_lock();
+   while (queue_index < 0 && i < __XPS_MAP_MAX) {
+   if (i == XPS_MAP_CPUS)
+   tci = skb->sender_cpu - 1;
+   dev_maps = rcu_dereference(dev->xps_maps[i]);
+   queue_index = __get_xps_queue_idx(dev, skb, dev_maps, tci);
+   i++;
+   }
+
rcu_read_unlock();
 
return queue_index;
diff --git a/net/core/sock.c b/net/core/sock.c
index 6444525..bd053db 100644

[jkirsher/next-queue, RFC PATCH 1/3] net: Refactor XPS for CPUs and Rx queues

2018-04-04 Thread Amritha Nambiar
Refactor XPS code to support Tx queue selection based on
CPU map or Rx queue map.

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
---
 include/linux/netdevice.h |   82 +-
 net/core/dev.c|  208 ++---
 net/core/net-sysfs.c  |4 -
 3 files changed, 218 insertions(+), 76 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index cf44503..37dbffe 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -730,10 +730,21 @@ struct xps_map {
  */
 struct xps_dev_maps {
struct rcu_head rcu;
-   struct xps_map __rcu *cpu_map[0];
+   struct xps_map __rcu *attr_map[0];
 };
-#define XPS_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \
+
+#define XPS_CPU_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \
(nr_cpu_ids * (_tcs) * sizeof(struct xps_map *)))
+
+#define XPS_RXQ_DEV_MAPS_SIZE(_tcs, _rxqs) (sizeof(struct xps_dev_maps) +\
+   (_rxqs * (_tcs) * sizeof(struct xps_map *)))
+
+enum xps_map_type {
+   XPS_MAP_RXQS,
+   XPS_MAP_CPUS,
+   __XPS_MAP_MAX
+};
+
 #endif /* CONFIG_XPS */
 
 #define TC_MAX_QUEUE   16
@@ -1867,7 +1878,7 @@ struct net_device {
int watchdog_timeo;
 
 #ifdef CONFIG_XPS
-   struct xps_dev_maps __rcu *xps_maps;
+   struct xps_dev_maps __rcu *xps_maps[__XPS_MAP_MAX];
 #endif
 #ifdef CONFIG_NET_CLS_ACT
struct mini_Qdisc __rcu *miniq_egress;
@@ -3204,6 +3215,71 @@ static inline void netif_wake_subqueue(struct net_device 
*dev, u16 queue_index)
 #ifdef CONFIG_XPS
 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
u16 index);
+int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
+ u16 index, enum xps_map_type type);
+
+static inline bool attr_test_mask(unsigned long j, const unsigned long *mask,
+ unsigned int nr_bits)
+{
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+   WARN_ON_ONCE(j >= nr_bits);
+#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
+   return test_bit(j, mask);
+}
+
+static inline bool attr_test_online(unsigned long j,
+   const unsigned long *online_mask,
+   unsigned int nr_bits)
+{
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+   WARN_ON_ONCE(j >= nr_bits);
+#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
+
+   if (online_mask)
+   return test_bit(j, online_mask);
+
+   if (j >= 0 && j < nr_bits)
+   return true;
+
+   return false;
+}
+
+static inline unsigned int attrmask_next(int n, const unsigned long *srcp,
+unsigned int nr_bits)
+{
+   /* -1 is a legal arg here. */
+   if (n != -1) {
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+   WARN_ON_ONCE(n >= nr_bits);
+#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
+   }
+
+   if (srcp)
+   return find_next_bit(srcp, nr_bits, n + 1);
+
+   return n + 1;
+}
+
+static inline int attrmask_next_and(int n, const unsigned long *src1p,
+   const unsigned long *src2p,
+   unsigned int nr_bits)
+{
+   /* -1 is a legal arg here. */
+   if (n != -1) {
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+   WARN_ON_ONCE(n >= nr_bits);
+#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
+   }
+
+   if (src1p && src2p)
+   return find_next_and_bit(src1p, src2p, nr_bits, n + 1);
+   else if (src1p)
+   return find_next_bit(src1p, nr_bits, n + 1);
+   else if (src2p)
+   return find_next_bit(src2p, nr_bits, n + 1);
+
+   return n + 1;
+}
 #else
 static inline int netif_set_xps_queue(struct net_device *dev,
  const struct cpumask *mask,
diff --git a/net/core/dev.c b/net/core/dev.c
index 9b04a9f..4cfc179 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2091,7 +2091,7 @@ static bool remove_xps_queue(struct xps_dev_maps 
*dev_maps,
int pos;
 
if (dev_maps)
-   map = xmap_dereference(dev_maps->cpu_map[tci]);
+   map = xmap_dereference(dev_maps->attr_map[tci]);
if (!map)
return false;
 
@@ -2104,7 +2104,7 @@ static bool remove_xps_queue(struct xps_dev_maps 
*dev_maps,
break;
}
 
-   RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
+   RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
kfree_rcu(map, rcu);
return false;
}
@@ -2137,30 +2137,49 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
 static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
   u16 count)
 {
+   const unsigned long *possible_mask = NULL;
+   enum xps_map_type type =

[iproute2 PATCH] man: tc-flower: add explanation for hw_tc option

2017-11-17 Thread Amritha Nambiar
Add details explaining the hw_tc option.

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
---
 man/man8/tc-flower.8 |9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/man/man8/tc-flower.8 b/man/man8/tc-flower.8
index be46f02..fd9098e 100644
--- a/man/man8/tc-flower.8
+++ b/man/man8/tc-flower.8
@@ -10,7 +10,10 @@ flower \- flow based traffic control filter
 .B action
 .IR ACTION_SPEC " ] [ "
 .B classid
-.IR CLASSID " ]"
+.IR CLASSID " ] [ "
+.B hw_tc
+.IR TCID " ]"
+
 
 .ti -8
 .IR MATCH_LIST " := [ " MATCH_LIST " ] " MATCH
@@ -77,6 +80,10 @@ is in the form
 .BR X : Y ", while " X " and " Y
 are interpreted as numbers in hexadecimal format.
 .TP
+.BI hw_tc " TCID"
+Specify a hardware traffic class to pass matching packets on to. TCID is in the
+range 0 through 15.
+.TP
 .BI indev " ifname"
 Match on incoming interface name. Obviously this makes sense only for forwarded
 flows.



[iproute2 PATCH] man: tc-mqprio: add documentation for new offload options

2017-11-17 Thread Amritha Nambiar
This patch adds documentation for additional offload modes and
associated parameters in tc-mqprio.

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
---
 man/man8/tc-mqprio.8 |   60 +-
 1 file changed, 49 insertions(+), 11 deletions(-)

diff --git a/man/man8/tc-mqprio.8 b/man/man8/tc-mqprio.8
index 0e1d305..a1bedd3 100644
--- a/man/man8/tc-mqprio.8
+++ b/man/man8/tc-mqprio.8
@@ -16,7 +16,17 @@ P0 P1 P2...
 count1@offset1 count2@offset2 ...
 .B ] [ hw
 1|0
-.B ]
+.B ] [ mode
+dcb|channel]
+.B ] [ shaper
+dcb|
+.B [ bw_rlimit
+.B min_rate
+min_rate1 min_rate2 ...
+.B max_rate
+max_rate1 max_rate2 ...
+.B ]]
+
 
 .SH DESCRIPTION
 The MQPRIO qdisc is a simple queuing discipline that allows mapping
@@ -36,14 +46,16 @@ and
 By default these parameters are configured by the hardware
 driver to match the hardware QOS structures.
 
-Enabled hardware can provide hardware QOS with the ability to steer
-traffic flows to designated traffic classes provided by this qdisc.
-Configuring the hardware based QOS mechanism is outside the scope of
-this qdisc. Tools such as
-.B lldpad
-and
-.B ethtool
-exist to provide this functionality. Also further qdiscs may be added
+.B Channel
+mode supports full offload of the mqprio options, the traffic classes, the 
queue
+configurations and QOS attributes to the hardware. Enabled hardware can provide
+hardware QOS with the ability to steer traffic flows to designated traffic
+classes provided by this qdisc. Hardware based QOS is configured using the
+.B shaper
+parameter.
+.B bw_rlimit
+with minimum and maximum bandwidth rates can be used for setting
+transmission rates on each traffic class. Also further qdiscs may be added
 to the classes of MQPRIO to create more complex configurations.
 
 .SH ALGORITHM
@@ -104,9 +116,35 @@ contiguous range of queues.
 hw
 Set to
 .B 1
-to use hardware QOS defaults. Set to
+to support hardware offload. Set to
 .B 0
-to override hardware defaults with user specified values.
+to configure user specified values in software only.
+
+.TP
+mode
+Set to
+.B channel
+for full use of the mqprio options. Use
+.B dcb
+to offload only TC values and use hardware QOS defaults. Supported with 'hw'
+set to 1 only.
+
+.TP
+shaper
+Use
+.B bw_rlimit
+to set bandwidth rate limits for a traffic class. Use
+.B dcb
+for hardware QOS defaults. Supported with 'hw' set to 1 only.
+
+.TP
+min_rate
+Minimum value of bandwidth rate limit for a traffic class.
+
+.TP
+max_rate
+Maximum value of bandwidth rate limit for a traffic class.
+
 
 .SH AUTHORS
 John Fastabend, <john.r.fastab...@intel.com>



[iproute2 PATCH] flower: Represent HW traffic classes as classid values

2017-11-03 Thread Amritha Nambiar
This patch was previously submitted as RFC. Submitting this as
non-RFC now that the classid reservation scheme for hardware
traffic classes and offloads to route packets to a hardware
traffic class are accepted in net-next.

HW traffic classes 0 through 15 are represented using the
reserved classid values ffff:ffe0 - ffff:ffef.

Example:
Match Dst IPv4,Dst Port and route to TC1:
# tc filter add dev eth0 protocol ip parent ffff:\
  prio 1 flower dst_ip 192.168.1.1/32\
  ip_proto udp dst_port 12000 skip_sw\
  hw_tc 1

# tc filter show dev eth0 parent ffff:
filter pref 1 flower chain 0
filter pref 1 flower chain 0 handle 0x1 hw_tc 1
  eth_type ipv4
  ip_proto udp
  dst_ip 192.168.1.1
  dst_port 12000
  skip_sw
  in_hw

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
---
 include/uapi/linux/pkt_sched.h |1 +
 tc/f_flower.c  |   33 +
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index e95b5c9..e7cc3d3 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -74,6 +74,7 @@ struct tc_estimator {
 #define TC_H_INGRESS(0xFFF1U)
 #define TC_H_CLSACTTC_H_INGRESS
 
+#define TC_H_MIN_PRIORITY  0xFFE0U
 #define TC_H_MIN_INGRESS   0xFFF2U
 #define TC_H_MIN_EGRESS0xFFF3U
 
diff --git a/tc/f_flower.c b/tc/f_flower.c
index b180210..a72c512 100644
--- a/tc/f_flower.c
+++ b/tc/f_flower.c
@@ -614,6 +614,25 @@ static int flower_parse_opt(struct filter_util *qu, char 
*handle,
return -1;
}
addattr_l(n, MAX_MSG, TCA_FLOWER_CLASSID, &handle, 4);
+   } else if (matches(*argv, "hw_tc") == 0) {
+   unsigned int handle;
+   __u32 tc;
+   char *end;
+
+   NEXT_ARG();
tc = strtoul(*argv, &end, 0);
+   if (*end) {
+   fprintf(stderr, "Illegal TC index\n");
+   return -1;
+   }
+   if (tc >= TC_QOPT_MAX_QUEUE) {
+   fprintf(stderr, "TC index exceeds max range\n");
+   return -1;
+   }
+   handle = TC_H_MAKE(TC_H_MAJ(t->tcm_parent),
+  TC_H_MIN(tc + TC_H_MIN_PRIORITY));
+   addattr_l(n, MAX_MSG, TCA_FLOWER_CLASSID, &handle,
+ sizeof(handle));
} else if (matches(*argv, "ip_flags") == 0) {
NEXT_ARG();
ret = flower_parse_matching_flags(*argv,
@@ -1187,10 +1206,16 @@ static int flower_print_opt(struct filter_util *qu, 
FILE *f,
fprintf(f, "handle 0x%x ", handle);
 
if (tb[TCA_FLOWER_CLASSID]) {
-   SPRINT_BUF(b1);
-   fprintf(f, "classid %s ",
-   
sprint_tc_classid(rta_getattr_u32(tb[TCA_FLOWER_CLASSID]),
- b1));
+   __u32 h = rta_getattr_u32(tb[TCA_FLOWER_CLASSID]);
+
+   if (TC_H_MIN(h) < TC_H_MIN_PRIORITY ||
+   TC_H_MIN(h) > (TC_H_MIN_PRIORITY + TC_QOPT_MAX_QUEUE - 1)) {
+   SPRINT_BUF(b1);
+   fprintf(f, "classid %s ", sprint_tc_classid(h, b1));
+   } else {
+   fprintf(f, "hw_tc %u ",
+   TC_H_MIN(h) - TC_H_MIN_PRIORITY);
+   }
}
 
if (tb[TCA_FLOWER_INDEV]) {



[iproute2 PATCH v2] tc/mqprio: Offload mode and shaper options in mqprio

2017-11-01 Thread Amritha Nambiar
This patch was previously submitted as RFC. Submitting this as
non-RFC now that the tc/mqprio changes are accepted in net-next.

Adds new mqprio options for 'mode' and 'shaper'. The mode
option can take values for offload modes such as 'dcb' (default),
'channel' with the 'hw' option set to 1. The new 'channel' mode
supports offloading TCs and other queue configurations. The
'shaper' option is to support HW shapers ('dcb' default) and
takes the value 'bw_rlimit' for bandwidth rate limiting. The
parameters to the bw_rlimit shaper are minimum and maximum
bandwidth rates. New HW shapers in future can be supported
through the shaper attribute.

# tc qdisc add dev eth0 root mqprio num_tc 2  map 0 0 0 0 1 1 1 1\
  queues 4@0 4@4 hw 1 mode channel shaper bw_rlimit\
  min_rate 1Gbit 2Gbit max_rate 4Gbit 5Gbit

# tc qdisc show dev eth0

qdisc mqprio 804a: root  tc 2 map 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0
 queues:(0:3) (4:7)
 mode:channel
 shaper:bw_rlimit   min_rate:1Gbit 2Gbit   max_rate:4Gbit 5Gbit

v2: Avoid buffer overrun and minor cleanup.

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
---
 include/uapi/linux/pkt_sched.h |   32 +++
 tc/q_mqprio.c  |  178 
 2 files changed, 210 insertions(+)

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 099bf55..e95b5c9 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -625,6 +625,22 @@ enum {
 
 #define TC_MQPRIO_HW_OFFLOAD_MAX (__TC_MQPRIO_HW_OFFLOAD_MAX - 1)
 
+enum {
+   TC_MQPRIO_MODE_DCB,
+   TC_MQPRIO_MODE_CHANNEL,
+   __TC_MQPRIO_MODE_MAX
+};
+
+#define TC_MQPRIO_MODE_MAX (__TC_MQPRIO_MODE_MAX - 1)
+
+enum {
+   TC_MQPRIO_SHAPER_DCB,
+   TC_MQPRIO_SHAPER_BW_RATE,   /* Add new shapers below */
+   __TC_MQPRIO_SHAPER_MAX
+};
+
+#define TC_MQPRIO_SHAPER_MAX (__TC_MQPRIO_SHAPER_MAX - 1)
+
 struct tc_mqprio_qopt {
__u8num_tc;
__u8prio_tc_map[TC_QOPT_BITMASK + 1];
@@ -633,6 +649,22 @@ struct tc_mqprio_qopt {
__u16   offset[TC_QOPT_MAX_QUEUE];
 };
 
+#define TC_MQPRIO_F_MODE   0x1
+#define TC_MQPRIO_F_SHAPER 0x2
+#define TC_MQPRIO_F_MIN_RATE   0x4
+#define TC_MQPRIO_F_MAX_RATE   0x8
+
+enum {
+   TCA_MQPRIO_UNSPEC,
+   TCA_MQPRIO_MODE,
+   TCA_MQPRIO_SHAPER,
+   TCA_MQPRIO_MIN_RATE64,
+   TCA_MQPRIO_MAX_RATE64,
+   __TCA_MQPRIO_MAX,
+};
+
+#define TCA_MQPRIO_MAX (__TCA_MQPRIO_MAX - 1)
+
 /* SFB */
 
 enum {
diff --git a/tc/q_mqprio.c b/tc/q_mqprio.c
index d6718fb..b57351c 100644
--- a/tc/q_mqprio.c
+++ b/tc/q_mqprio.c
@@ -27,6 +27,10 @@ static void explain(void)
fprintf(stderr, "Usage: ... mqprio [num_tc NUMBER] [map P0 P1 ...]\n");
fprintf(stderr, "  [queues count1@offset1 
count2@offset2 ...] ");
fprintf(stderr, "[hw 1|0]\n");
+   fprintf(stderr, "  [mode dcb|channel]\n");
+   fprintf(stderr, "  [shaper bw_rlimit SHAPER_PARAMS]\n"
+   "Where: SHAPER_PARAMS := { min_rate MIN_RATE1 MIN_RATE2 ...|\n"
+   "  max_rate MAX_RATE1 MAX_RATE2 ... 
}\n");
 }
 
 static int mqprio_parse_opt(struct qdisc_util *qu, int argc,
@@ -40,6 +44,12 @@ static int mqprio_parse_opt(struct qdisc_util *qu, int argc,
.count = { },
.offset = { },
};
+   __u64 min_rate64[TC_QOPT_MAX_QUEUE] = {0};
+   __u64 max_rate64[TC_QOPT_MAX_QUEUE] = {0};
+   __u16 shaper = TC_MQPRIO_SHAPER_DCB;
+   __u16 mode = TC_MQPRIO_MODE_DCB;
+   struct rtattr *tail;
+   __u32 flags = 0;
 
while (argc > 0) {
idx = 0;
@@ -92,6 +102,68 @@ static int mqprio_parse_opt(struct qdisc_util *qu, int argc,
return -1;
}
idx++;
+   } else if (opt.hw && strcmp(*argv, "mode") == 0) {
+   NEXT_ARG();
+   if (matches(*argv, "dcb") == 0) {
+   mode = TC_MQPRIO_MODE_DCB;
+   } else if (matches(*argv, "channel") == 0) {
+   mode = TC_MQPRIO_MODE_CHANNEL;
+   }  else {
+   fprintf(stderr, "Illegal mode (%s)\n",
+   *argv);
+   return -1;
+   }
+   if (mode != TC_MQPRIO_MODE_DCB)
+   flags |= TC_MQPRIO_F_MODE;
+   idx++;
+   } else if (opt.hw && strcmp(*argv, "shaper") == 0) {
+   NEXT_ARG();
+   if (matches(*argv, 

[iproute2, RFC PATCH] flower: Represent HW traffic classes as classid values

2017-10-27 Thread Amritha Nambiar
Use the classid values reserved in the range ffff:ffe0 - ffff:ffef
to identify hardware traffic classes.

Example:
Match Dst IPv4,Dst Port and route to TC1:
# tc filter add dev eth0 protocol ip parent ffff:\
  prio 1 flower dst_ip 192.168.1.1/32\
  ip_proto udp dst_port 12000 skip_sw\
  hw_tc 1

# tc filter show dev eth0 parent ffff:
filter pref 1 flower chain 0
filter pref 1 flower chain 0 handle 0x1 hw_tc 1
  eth_type ipv4
  ip_proto udp
  dst_ip 192.168.1.1
  dst_port 12000
  skip_sw
  in_hw

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
---
 include/uapi/linux/pkt_sched.h |1 +
 tc/f_flower.c  |   35 +++
 2 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index e95b5c9..e7cc3d3 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -74,6 +74,7 @@ struct tc_estimator {
 #define TC_H_INGRESS(0xFFF1U)
 #define TC_H_CLSACTTC_H_INGRESS
 
+#define TC_H_MIN_PRIORITY  0xFFE0U
 #define TC_H_MIN_INGRESS   0xFFF2U
 #define TC_H_MIN_EGRESS0xFFF3U
 
diff --git a/tc/f_flower.c b/tc/f_flower.c
index b180210..6ea0fba 100644
--- a/tc/f_flower.c
+++ b/tc/f_flower.c
@@ -614,6 +614,25 @@ static int flower_parse_opt(struct filter_util *qu, char 
*handle,
return -1;
}
addattr_l(n, MAX_MSG, TCA_FLOWER_CLASSID, &handle, 4);
+   } else if (matches(*argv, "hw_tc") == 0) {
+   unsigned int handle;
+   __u32 tc;
+   char *end;
+
+   NEXT_ARG();
+   tc = strtoul(*argv, &end, 0);
+   if (*end) {
+   fprintf(stderr, "Illegal TC index\n");
+   return -1;
+   }
+   if (tc >= TC_QOPT_MAX_QUEUE) {
+   fprintf(stderr, "TC index exceeds max range\n");
+   return -1;
+   }
+   handle = TC_H_MAKE(TC_H_MAJ(t->tcm_parent),
+  TC_H_MIN(tc + TC_H_MIN_PRIORITY));
+   addattr_l(n, MAX_MSG, TCA_FLOWER_CLASSID, &handle,
+ sizeof(handle));
} else if (matches(*argv, "ip_flags") == 0) {
NEXT_ARG();
ret = flower_parse_matching_flags(*argv,
@@ -1187,10 +1206,18 @@ static int flower_print_opt(struct filter_util *qu, 
FILE *f,
fprintf(f, "handle 0x%x ", handle);
 
if (tb[TCA_FLOWER_CLASSID]) {
-   SPRINT_BUF(b1);
-   fprintf(f, "classid %s ",
-   
sprint_tc_classid(rta_getattr_u32(tb[TCA_FLOWER_CLASSID]),
- b1));
+   __u32 h = rta_getattr_u32(tb[TCA_FLOWER_CLASSID]);
+
+   if (TC_H_MIN(h) < TC_H_MIN_PRIORITY ||
+   TC_H_MIN(h) > (TC_H_MIN_PRIORITY + TC_QOPT_MAX_QUEUE - 1)) {
+   SPRINT_BUF(b1);
+   fprintf(f, "classid %s ",
+   
sprint_tc_classid(rta_getattr_u32(tb[TCA_FLOWER_CLASSID]),
+ b1));
+   } else {
+   fprintf(f, "hw_tc %u ",
+   TC_H_MIN(h) - TC_H_MIN_PRIORITY);
+   }
}
 
if (tb[TCA_FLOWER_INDEV]) {



[jkirsher/next-queue PATCH v5 5/6] i40e: Clean up of cloud filters

2017-10-27 Thread Amritha Nambiar
Introduce the cloud filter datastructure and cleanup of cloud
filters associated with the device.

v5: Fixed a typo.
v2: Moved field comments in struct i40e_cloud_filter to the right.
Removed hlist_empty check from i40e_cloud_filter_exit()

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e.h  |9 +
 drivers/net/ethernet/intel/i40e/i40e_main.c |   24 
 2 files changed, 33 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index f3c501e..b938bb4a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -253,6 +253,12 @@ struct i40e_fdir_filter {
u32 fd_id;
 };
 
+struct i40e_cloud_filter {
+   struct hlist_node cloud_node;
+   unsigned long cookie;
+   u16 seid;   /* filter control */
+};
+
 #define I40E_ETH_P_LLDP0x88cc
 
 #define I40E_DCB_PRIO_TYPE_STRICT  0
@@ -420,6 +426,9 @@ struct i40e_pf {
struct i40e_udp_port_config udp_ports[I40E_MAX_PF_UDP_OFFLOAD_PORTS];
u16 pending_udp_bitmap;
 
+   struct hlist_head cloud_filter_list;
+   u16 num_cloud_filters;
+
enum i40e_interrupt_policy int_policy;
u16 rx_itr_default;
u16 tx_itr_default;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 128f259..fbe3450 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -6937,6 +6937,26 @@ static void i40e_fdir_filter_exit(struct i40e_pf *pf)
 }
 
 /**
+ * i40e_cloud_filter_exit - Cleans up the cloud filters
+ * @pf: Pointer to PF
+ *
+ * This function destroys the hlist where all the cloud filters
+ * were saved.
+ **/
+static void i40e_cloud_filter_exit(struct i40e_pf *pf)
+{
+   struct i40e_cloud_filter *cfilter;
+   struct hlist_node *node;
+
+   hlist_for_each_entry_safe(cfilter, node,
+ &pf->cloud_filter_list, cloud_node) {
+   hlist_del(&cfilter->cloud_node);
+   kfree(cfilter);
+   }
+   pf->num_cloud_filters = 0;
+}
+
+/**
  * i40e_close - Disables a network interface
  * @netdev: network interface device structure
  *
@@ -12196,6 +12216,7 @@ static int i40e_setup_pf_switch(struct i40e_pf *pf, 
bool reinit)
vsi = i40e_vsi_reinit_setup(pf->vsi[pf->lan_vsi]);
if (!vsi) {
dev_info(&pf->pdev->dev, "setup of MAIN VSI failed\n");
+   i40e_cloud_filter_exit(pf);
i40e_fdir_teardown(pf);
return -EAGAIN;
}
@@ -13030,6 +13051,8 @@ static void i40e_remove(struct pci_dev *pdev)
if (pf->vsi[pf->lan_vsi])
i40e_vsi_release(pf->vsi[pf->lan_vsi]);
 
+   i40e_cloud_filter_exit(pf);
+
/* remove attached clients */
if (pf->flags & I40E_FLAG_IWARP_ENABLED) {
ret_code = i40e_lan_del_device(pf);
@@ -13261,6 +13284,7 @@ static void i40e_shutdown(struct pci_dev *pdev)
 
del_timer_sync(&pf->service_timer);
cancel_work_sync(&pf->service_task);
+   i40e_cloud_filter_exit(pf);
i40e_fdir_teardown(pf);
 
/* Client close must be called explicitly here because the timer



[jkirsher/next-queue PATCH v5 6/6] i40e: Enable cloud filters via tc-flower

2017-10-27 Thread Amritha Nambiar
This patch enables tc-flower based hardware offloads. tc flower
filter provided by the kernel is configured as driver specific
cloud filter. The patch implements functions and admin queue
commands needed to support cloud filters in the driver and
adds cloud filters to configure these tc-flower filters.

The classification function of the filter is to direct matched
packets to a traffic class. The hardware traffic class is set
based on the classid reserved in the range ffff:ffe0 - ffff:ffef.

# tc qdisc add dev eth0 ingress
# ethtool -K eth0 hw-tc-offload on

Match Dst MAC and route to TC0:
# tc filter add dev eth0 protocol ip parent ffff:\
  prio 1 flower dst_mac 3c:fd:fe:a0:d6:70 skip_sw\
  hw_tc 1

Match Dst IPv4,Dst Port and route to TC1:
# tc filter add dev eth0 protocol ip parent ffff:\
  prio 2 flower dst_ip 192.168.3.5/32\
  ip_proto udp dst_port 25 skip_sw\
  hw_tc 2

Match Dst IPv6,Dst Port and route to TC1:
# tc filter add dev eth0 protocol ipv6 parent ffff:\
  prio 3 flower dst_ip fe8::200:1\
  ip_proto udp dst_port 66 skip_sw\
  hw_tc 2

Delete tc flower filter:
Example:

# tc filter del dev eth0 parent ffff: prio 3 handle 0x1 flower
# tc filter del dev eth0 parent ffff:

Flow Director Sideband is disabled while configuring cloud filters
via tc-flower and until any cloud filter exists.

Unsupported matches when cloud filters are added using enhanced
big buffer cloud filter mode of underlying switch include:
1. source port and source IP
2. Combined MAC address and IP fields.
3. Not specifying L4 port

These filter matches can however be used to redirect traffic to
the main VSI (tc 0) which does not require the enhanced big buffer
cloud filter support.

v5: Changes to align with Jiri's filter-block series. Use the
helper function to get the HW traffic class value from classid.
Pack cloud filter struct and other code cleanup based on Shannon's
comments.
v4: Use classid to set traffic class for matched packets. Do not
allow disabling hw-tc-offloads when offloaded tc filters are active.
v3: Cleaned up some lengthy function names. Changed ipv6 address to
__be32 array instead of u8 array. Used macro for IP version. Minor
formatting changes.
v2:
1. Moved I40E_SWITCH_MODE_MASK definition to i40e_type.h
2. Moved dev_info for add/deleting cloud filters in else condition
3. Fixed some format specifier in dev_err logs
4. Refactored i40e_get_capabilities to take an additional
   list_type parameter and use it to query device and function
   level capabilities.
5. Fixed parsing tc redirect action to check for the is_tcf_mirred_tc()
   to verify if redirect to a traffic class is supported.
6. Added comments for Geneve fix in cloud filter big buffer AQ
   function definitions.
7. Cleaned up setup_tc interface to rebase and work with Jiri's
   updates, separate function to process tc cls flower offloads.
8. Changes to make Flow Director Sideband and Cloud filters mutually
   exclusive.

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
Signed-off-by: Kiran Patil <kiran.pa...@intel.com>
Signed-off-by: Anjali Singhai Jain <anjali.sing...@intel.com>
Signed-off-by: Jingjing Wu <jingjing...@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e.h |   54 +
 drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h  |3 
 drivers/net/ethernet/intel/i40e/i40e_common.c  |  189 
 drivers/net/ethernet/intel/i40e/i40e_main.c|  957 +++-
 drivers/net/ethernet/intel/i40e/i40e_prototype.h   |   16 
 drivers/net/ethernet/intel/i40e/i40e_type.h|1 
 .../net/ethernet/intel/i40evf/i40e_adminq_cmd.h|3 
 7 files changed, 1192 insertions(+), 31 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index b938bb4a..5829715 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -55,6 +55,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include "i40e_type.h"
 #include "i40e_prototype.h"
 #include "i40e_client.h"
@@ -253,10 +255,56 @@ struct i40e_fdir_filter {
u32 fd_id;
 };
 
+#define I40E_CLOUD_FIELD_OMAC  0x01
+#define I40E_CLOUD_FIELD_IMAC  0x02
+#define I40E_CLOUD_FIELD_IVLAN 0x04
+#define I40E_CLOUD_FIELD_TEN_ID0x08
+#define I40E_CLOUD_FIELD_IIP   0x10
+
+#define I40E_CLOUD_FILTER_FLAGS_OMAC   I40E_CLOUD_FIELD_OMAC
+#define I40E_CLOUD_FILTER_FLAGS_IMAC   I40E_CLOUD_FIELD_IMAC
+#define I40E_CLOUD_FILTER_FLAGS_IMAC_IVLAN (I40E_CLOUD_FIELD_IMAC | \
+I40E_CLOUD_FIELD_IVLAN)
+#define I40E_CLOUD_FILTER_FLAGS_IMAC_TEN_ID(I40E_CLOUD_FIELD_IMAC | \
+I40E_CLOUD_FIELD_TEN_ID)
+#define I40E_CLOUD_FILTER_FLAGS_OMAC_TEN_ID_IMAC (I40E_CLOUD_FIELD_OMAC | \
+ I40E_CLOUD_FIELD_IMAC | \
+ I40E_CLOUD_FIELD_TEN_

[jkirsher/next-queue PATCH v5 3/6] i40e: Cloud filter mode for set_switch_config command

2017-10-27 Thread Amritha Nambiar
Add definitions for L4 filters and switch modes based on cloud filters
modes and extend the set switch config command to include the
additional cloud filter mode.

v5: Addressed Shannon's comments to format ':'s and changed
names to I40E_CLOUD_FILTER_MODEx

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
Signed-off-by: Kiran Patil <kiran.pa...@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h |   30 -
 drivers/net/ethernet/intel/i40e/i40e_common.c |4 ++-
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c|2 +
 drivers/net/ethernet/intel/i40e/i40e_main.c   |2 +
 drivers/net/ethernet/intel/i40e/i40e_prototype.h  |2 +
 drivers/net/ethernet/intel/i40e/i40e_type.h   |9 ++
 6 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h 
b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
index 6a5db1b..47d 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
@@ -790,7 +790,35 @@ struct i40e_aqc_set_switch_config {
 */
__le16  first_tag;
__le16  second_tag;
-   u8  reserved[6];
+   /* Next byte is split into following:
+* Bit 7: 0 : No action, 1: Switch to mode defined by bits 6:0
+* Bit 6: 0 : Destination Port, 1: source port
+* Bit 5..4 : L4 type
+* 0: rsvd
+* 1: TCP
+* 2: UDP
+* 3: Both TCP and UDP
+* Bits 3:0 Mode
+* 0: default mode
+* 1: L4 port only mode
+* 2: non-tunneled mode
+* 3: tunneled mode
+*/
+#define I40E_AQ_SET_SWITCH_BIT7_VALID  0x80
+
+#define I40E_AQ_SET_SWITCH_L4_SRC_PORT 0x40
+
+#define I40E_AQ_SET_SWITCH_L4_TYPE_RSVD0x00
+#define I40E_AQ_SET_SWITCH_L4_TYPE_TCP 0x10
+#define I40E_AQ_SET_SWITCH_L4_TYPE_UDP 0x20
+#define I40E_AQ_SET_SWITCH_L4_TYPE_BOTH0x30
+
+#define I40E_AQ_SET_SWITCH_MODE_DEFAULT0x00
+#define I40E_AQ_SET_SWITCH_MODE_L4_PORT0x01
+#define I40E_AQ_SET_SWITCH_MODE_NON_TUNNEL 0x02
+#define I40E_AQ_SET_SWITCH_MODE_TUNNEL 0x03
+   u8  mode;
+   u8  rsvd5[5];
 };
 
 I40E_CHECK_CMD_LENGTH(i40e_aqc_set_switch_config);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c 
b/drivers/net/ethernet/intel/i40e/i40e_common.c
index 8d0ee00..a9460e0 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_common.c
@@ -2407,13 +2407,14 @@ i40e_status i40e_aq_get_switch_config(struct i40e_hw 
*hw,
  * @hw: pointer to the hardware structure
  * @flags: bit flag values to set
  * @valid_flags: which bit flags to set
+ * @mode: cloud filter mode
  * @cmd_details: pointer to command details structure or NULL
  *
  * Set switch configuration bits
  **/
 enum i40e_status_code i40e_aq_set_switch_config(struct i40e_hw *hw,
u16 flags,
-   u16 valid_flags,
+   u16 valid_flags, u8 mode,
struct i40e_asq_cmd_details *cmd_details)
 {
struct i40e_aq_desc desc;
@@ -2425,6 +2426,7 @@ enum i40e_status_code i40e_aq_set_switch_config(struct 
i40e_hw *hw,
  i40e_aqc_opc_set_switch_config);
scfg->flags = cpu_to_le16(flags);
scfg->valid_flags = cpu_to_le16(valid_flags);
+   scfg->mode = mode;
if (hw->flags & I40E_HW_FLAG_802_1AD_CAPABLE) {
scfg->switch_tag = cpu_to_le16(hw->switch_tag);
scfg->first_tag = cpu_to_le16(hw->first_tag);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 
b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 9eb6187..dc9b8dc 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -4343,7 +4343,7 @@ static int i40e_set_priv_flags(struct net_device *dev, 
u32 flags)
sw_flags = I40E_AQ_SET_SWITCH_CFG_PROMISC;
valid_flags = I40E_AQ_SET_SWITCH_CFG_PROMISC;
ret = i40e_aq_set_switch_config(&pf->hw, sw_flags, valid_flags,
-   NULL);
+   0, NULL);
if (ret && pf->hw.aq.asq_last_status != I40E_AQ_RC_ESRCH) {
dev_info(>pdev->dev,
 "couldn't set switch config bits, err %s 
aq_err %s\n",
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 2ff7384..128f259 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -12166,7 +12166,7 @@ static int i40e_setup_pf_sw

[jkirsher/next-queue PATCH v5 4/6] i40e: Admin queue definitions for cloud filters

2017-10-27 Thread Amritha Nambiar
Add new admin queue definitions and extended fields for cloud
filter support. Define big buffer for extended general fields
in Add/Remove Cloud filters command.

v5: Addressed Shannon's comment to move couple of description
comments to the right.
v3: Shortened some lengthy struct names.
v2: Added I40E_CHECK_STRUCT_LEN check to AQ command structs and
added AQ definitions to i40evf for consistency based on Shannon's
feedback.

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
Signed-off-by: Kiran Patil <kiran.pa...@intel.com>
Signed-off-by: Jingjing Wu <jingjing...@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h  |  107 
 .../net/ethernet/intel/i40evf/i40e_adminq_cmd.h|  107 
 2 files changed, 210 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h 
b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
index 47d..9f1f578 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
@@ -1371,14 +1371,16 @@ struct i40e_aqc_add_remove_cloud_filters {
 #define I40E_AQC_ADD_CLOUD_CMD_SEID_NUM_SHIFT  0
 #define I40E_AQC_ADD_CLOUD_CMD_SEID_NUM_MASK   (0x3FF << \
I40E_AQC_ADD_CLOUD_CMD_SEID_NUM_SHIFT)
-   u8  reserved2[4];
+   u8  big_buffer_flag;
+#define I40E_AQC_ADD_CLOUD_CMD_BB  1
+   u8  reserved2[3];
__le32  addr_high;
__le32  addr_low;
 };
 
 I40E_CHECK_CMD_LENGTH(i40e_aqc_add_remove_cloud_filters);
 
-struct i40e_aqc_add_remove_cloud_filters_element_data {
+struct i40e_aqc_cloud_filters_element_data {
u8  outer_mac[6];
u8  inner_mac[6];
__le16  inner_vlan;
@@ -1408,6 +1410,10 @@ struct i40e_aqc_add_remove_cloud_filters_element_data {
 #define I40E_AQC_ADD_CLOUD_FILTER_IMAC 0x000A
 #define I40E_AQC_ADD_CLOUD_FILTER_OMAC_TEN_ID_IMAC 0x000B
 #define I40E_AQC_ADD_CLOUD_FILTER_IIP  0x000C
+/* 0x0010 to 0x0017 is for custom filters */
+#define I40E_AQC_ADD_CLOUD_FILTER_IP_PORT  0x0010 /* Dest IP + L4 
Port */
+#define I40E_AQC_ADD_CLOUD_FILTER_MAC_PORT 0x0011 /* Dest MAC + L4 
Port */
+#define I40E_AQC_ADD_CLOUD_FILTER_MAC_VLAN_PORT0x0012 /* Dest 
MAC + VLAN + L4 Port */
 
 #define I40E_AQC_ADD_CLOUD_FLAGS_TO_QUEUE  0x0080
 #define I40E_AQC_ADD_CLOUD_VNK_SHIFT   6
@@ -1442,6 +1448,49 @@ struct i40e_aqc_add_remove_cloud_filters_element_data {
u8  response_reserved[7];
 };
 
+I40E_CHECK_STRUCT_LEN(0x40, i40e_aqc_cloud_filters_element_data);
+
+/* i40e_aqc_cloud_filters_element_bb is used when
+ * I40E_AQC_CLOUD_CMD_BB flag is set.
+ */
+struct i40e_aqc_cloud_filters_element_bb {
+   struct i40e_aqc_cloud_filters_element_data element;
+   u16 general_fields[32];
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X10_WORD0   0
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X10_WORD1   1
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X10_WORD2   2
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X11_WORD0   3
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X11_WORD1   4
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X11_WORD2   5
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X12_WORD0   6
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X12_WORD1   7
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X12_WORD2   8
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X13_WORD0   9
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X13_WORD1   10
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X13_WORD2   11
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X14_WORD0   12
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X14_WORD1   13
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X14_WORD2   14
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD0   15
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD1   16
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD2   17
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD3   18
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD4   19
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD5   20
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD6   21
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD7   22
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD0   23
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD1   24
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD2   25
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD3   26
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD4   27
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD5   28
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD6   29
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD7   30
+};
+
+I40E_CHECK_STRUCT_LEN(0x80, i40e_aqc_cloud_filters_element_bb);
+
 struct i40e_aqc_remove_cloud_filters_completion {
__le16 perfect_ovlan_used;
__le16 perfect_ovlan_free;
@@ -1453,6 +1502,60 @@ struct i40e_aqc_remove_cloud_filters_completion {
 
 I40E_CHECK_CMD_LENGTH(i40e_aqc_remove_cloud_filters_completion);
 
+/* Replace filter Command 0x025F
+ * uses the i40e_aqc_replace_cloud_filters,
+ * and the generic indirect completion structure
+ */
+struct i40e_

[jkirsher/next-queue PATCH v5 1/6] net: sched: Identify hardware traffic classes using classid

2017-10-27 Thread Amritha Nambiar
This patch offloads the classid to hardware and uses the classid
reserved in the range ffff:ffe0 - ffff:ffef to identify hardware traffic
classes reported via dev->num_tc.

tcf_result structure contains the class ID of the class to which
the packet belongs and is offloaded to hardware via flower filter.
A new helper function is introduced to represent HW traffic
classes 0 through 15 using the reserved classid values ffff:ffe0 - ffff:ffef.

v5: Added helper function to get HW TC values from classid.

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
---
 include/net/pkt_cls.h |1 +
 include/net/sch_generic.h |7 +++
 net/sched/cls_flower.c|2 ++
 3 files changed, 10 insertions(+)

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 04caa24..45c958f 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -664,6 +664,7 @@ struct tc_cls_flower_offload {
struct fl_flow_key *mask;
struct fl_flow_key *key;
struct tcf_exts *exts;
+   u32 classid;
 };
 
 enum tc_matchall_command {
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 031dffd..c79d9aa 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -409,6 +409,13 @@ qdisc_class_find(const struct Qdisc_class_hash *hash, u32 
id)
return NULL;
 }
 
+static inline int tc_classid_to_hwtc(struct net_device *dev, u32 classid)
+{
+   u32 hwtc = TC_H_MIN(classid) - TC_H_MIN_PRIORITY;
+
+   return (hwtc < netdev_get_num_tc(dev)) ? hwtc : -EINVAL;
+}
+
 int qdisc_class_hash_init(struct Qdisc_class_hash *);
 void qdisc_class_hash_insert(struct Qdisc_class_hash *,
 struct Qdisc_class_common *);
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 16f58ab..addbc31 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -227,6 +227,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
cls_flower.mask = mask;
cls_flower.key = &f->mkey;
cls_flower.exts = &f->exts;
+   cls_flower.classid = f->res.classid;
 
err = tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER,
   &cls_flower, skip_sw);
@@ -252,6 +253,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct 
cls_fl_filter *f)
cls_flower.command = TC_CLSFLOWER_STATS;
cls_flower.cookie = (unsigned long) f;
cls_flower.exts = &f->exts;
+   cls_flower.classid = f->res.classid;
 
tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER,
 &cls_flower, false);



[jkirsher/next-queue PATCH v5 2/6] i40e: Map TCs with the VSI seids

2017-10-27 Thread Amritha Nambiar
Add mapping of TCs with the seids of the channel VSIs. TC0
will be mapped to the main VSI seid and all other TCs are
mapped to the seid of the corresponding channel VSI.

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e.h  |1 +
 drivers/net/ethernet/intel/i40e/i40e_main.c |2 ++
 2 files changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index eb01776..f3c501e 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -739,6 +739,7 @@ struct i40e_vsi {
u16 next_base_queue;/* next queue to be used for channel setup */
 
struct list_head ch_list;
+   u16 tc_seid_map[I40E_MAX_TRAFFIC_CLASS];
 
void *priv; /* client driver data reference. */
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 1cf9ba2..2ff7384 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -6100,6 +6100,7 @@ static int i40e_configure_queue_channels(struct i40e_vsi 
*vsi)
int ret = 0, i;
 
/* Create app vsi with the TCs. Main VSI with TC0 is already set up */
+   vsi->tc_seid_map[0] = vsi->seid;
for (i = 1; i < I40E_MAX_TRAFFIC_CLASS; i++) {
if (vsi->tc_config.enabled_tc & BIT(i)) {
ch = kzalloc(sizeof(*ch), GFP_KERNEL);
@@ -6130,6 +6131,7 @@ static int i40e_configure_queue_channels(struct i40e_vsi 
*vsi)
i, ch->num_queue_pairs);
goto err_free;
}
+   vsi->tc_seid_map[i] = ch->seid;
}
}
return ret;



[jkirsher/next-queue PATCH v5 0/6] tc-flower based cloud filters in i40e

2017-10-27 Thread Amritha Nambiar
This patch series enables configuring cloud filters in i40e
using the tc-flower classifier. The classification function
of the filter is to match a packet to a traffic class. cls_flower is
extended to offload classid to hardware. Hardware traffic classes
are identified using classid values reserved in the range ffff:ffe0 - ffff:ffef.

The cloud filters are added for a VSI and are cleaned up when
the VSI is deleted. The filters that match on L4 ports needs
enhanced admin queue functions with big buffer support for
extended fields in cloud filter commands.

Example:
# tc qdisc add dev eth0 ingress
# ethtool -K eth0 hw-tc-offload on

Match Dst IPv4,Dst Port and route to TC1:
# tc filter add dev eth0 protocol ip parent ffff: prio 1 flower\
  dst_ip 192.168.1.1/32 ip_proto udp dst_port 12000\
  skip_sw hw_tc 1

# tc filter show dev eth0 parent ffff:
filter pref 1 flower chain 0
filter pref 1 flower chain 0 handle 0x1 hw_tc 1
  eth_type ipv4
  ip_proto udp
  dst_ip 192.168.1.1
  dst_port 12000
  skip_sw
  in_hw

v5: Hardware traffic class based on reserved classid values.
Changes to align with Jiri's filter-block series. i40e code
cleanup based on Shannon's comments.

Authors:
Amritha Nambiar <amritha.namb...@intel.com>
Kiran Patil <kiran.pa...@intel.com>
Anjali Singhai Jain <anjali.sing...@intel.com>
Jingjing Wu <jingjing...@intel.com>

---

Amritha Nambiar (6):
  net: sched: Identify hardware traffic classes using classid
  i40e: Map TCs with the VSI seids
  i40e: Cloud filter mode for set_switch_config command
  i40e: Admin queue definitions for cloud filters
  i40e: Clean up of cloud filters
  i40e: Enable cloud filters via tc-flower


 drivers/net/ethernet/intel/i40e/i40e.h |   62 +
 drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h  |  140 +++
 drivers/net/ethernet/intel/i40e/i40e_common.c  |  193 
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c |2 
 drivers/net/ethernet/intel/i40e/i40e_main.c|  985 +++-
 drivers/net/ethernet/intel/i40e/i40e_prototype.h   |   18 
 drivers/net/ethernet/intel/i40e/i40e_type.h|   10 
 .../net/ethernet/intel/i40evf/i40e_adminq_cmd.h|  110 ++
 include/net/pkt_cls.h  |1 
 include/net/sch_generic.h  |7 
 net/sched/cls_flower.c |2 
 11 files changed, 1491 insertions(+), 39 deletions(-)

--


[iproute2 PATCH] tc/mqprio: Offload mode and shaper options in mqprio

2017-10-26 Thread Amritha Nambiar
This patch was previously submitted as RFC. Submitting this as
non-RFC now that the tc/mqprio changes are accepted in net-next.

Adds new mqprio options for 'mode' and 'shaper'. The mode
option can take values for offload modes such as 'dcb' (default),
'channel' with the 'hw' option set to 1. The new 'channel' mode
supports offloading TCs and other queue configurations. The
'shaper' option is to support HW shapers ('dcb' default) and
takes the value 'bw_rlimit' for bandwidth rate limiting. The
parameters to the bw_rlimit shaper are minimum and maximum
bandwidth rates. New HW shapers in future can be supported
through the shaper attribute.

# tc qdisc add dev eth0 root mqprio num_tc 2  map 0 0 0 0 1 1 1 1\
  queues 4@0 4@4 hw 1 mode channel shaper bw_rlimit\
  min_rate 1Gbit 2Gbit max_rate 4Gbit 5Gbit

# tc qdisc show dev eth0

qdisc mqprio 804a: root  tc 2 map 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0
 queues:(0:3) (4:7)
 mode:channel
 shaper:bw_rlimit   min_rate:1Gbit 2Gbit   max_rate:4Gbit 5Gbit

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
---
 include/uapi/linux/pkt_sched.h |   32 +++
 tc/q_mqprio.c  |  192 +++-
 2 files changed, 217 insertions(+), 7 deletions(-)

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 099bf55..e95b5c9 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -625,6 +625,22 @@ enum {
 
 #define TC_MQPRIO_HW_OFFLOAD_MAX (__TC_MQPRIO_HW_OFFLOAD_MAX - 1)
 
+enum {
+   TC_MQPRIO_MODE_DCB,
+   TC_MQPRIO_MODE_CHANNEL,
+   __TC_MQPRIO_MODE_MAX
+};
+
+#define __TC_MQPRIO_MODE_MAX (__TC_MQPRIO_MODE_MAX - 1)
+
+enum {
+   TC_MQPRIO_SHAPER_DCB,
+   TC_MQPRIO_SHAPER_BW_RATE,   /* Add new shapers below */
+   __TC_MQPRIO_SHAPER_MAX
+};
+
+#define __TC_MQPRIO_SHAPER_MAX (__TC_MQPRIO_SHAPER_MAX - 1)
+
 struct tc_mqprio_qopt {
__u8num_tc;
__u8prio_tc_map[TC_QOPT_BITMASK + 1];
@@ -633,6 +649,22 @@ struct tc_mqprio_qopt {
__u16   offset[TC_QOPT_MAX_QUEUE];
 };
 
+#define TC_MQPRIO_F_MODE   0x1
+#define TC_MQPRIO_F_SHAPER 0x2
+#define TC_MQPRIO_F_MIN_RATE   0x4
+#define TC_MQPRIO_F_MAX_RATE   0x8
+
+enum {
+   TCA_MQPRIO_UNSPEC,
+   TCA_MQPRIO_MODE,
+   TCA_MQPRIO_SHAPER,
+   TCA_MQPRIO_MIN_RATE64,
+   TCA_MQPRIO_MAX_RATE64,
+   __TCA_MQPRIO_MAX,
+};
+
+#define TCA_MQPRIO_MAX (__TCA_MQPRIO_MAX - 1)
+
 /* SFB */
 
 enum {
diff --git a/tc/q_mqprio.c b/tc/q_mqprio.c
index d6718fb..cd305b7 100644
--- a/tc/q_mqprio.c
+++ b/tc/q_mqprio.c
@@ -27,6 +27,10 @@ static void explain(void)
fprintf(stderr, "Usage: ... mqprio [num_tc NUMBER] [map P0 P1 ...]\n");
fprintf(stderr, "  [queues count1@offset1 
count2@offset2 ...] ");
fprintf(stderr, "[hw 1|0]\n");
+   fprintf(stderr, "  [mode dcb|channel]\n");
+   fprintf(stderr, "  [shaper bw_rlimit SHAPER_PARAMS]\n"
+   "Where: SHAPER_PARAMS := { min_rate MIN_RATE1 MIN_RATE2 ...|\n"
+   "  max_rate MAX_RATE1 MAX_RATE2 ... 
}\n");
 }
 
 static int mqprio_parse_opt(struct qdisc_util *qu, int argc,
@@ -40,6 +44,12 @@ static int mqprio_parse_opt(struct qdisc_util *qu, int argc,
.count = { },
.offset = { },
};
+   __u64 min_rate64[TC_QOPT_MAX_QUEUE] = {0};
+   __u64 max_rate64[TC_QOPT_MAX_QUEUE] = {0};
+   __u16 shaper = TC_MQPRIO_SHAPER_DCB;
+   __u16 mode = TC_MQPRIO_MODE_DCB;
+   struct rtattr *tail;
+   __u32 flags = 0;
 
while (argc > 0) {
idx = 0;
@@ -92,6 +102,68 @@ static int mqprio_parse_opt(struct qdisc_util *qu, int argc,
return -1;
}
idx++;
+   } else if (opt.hw && strcmp(*argv, "mode") == 0) {
+   NEXT_ARG();
+   if (matches(*argv, "dcb") == 0) {
+   mode = TC_MQPRIO_MODE_DCB;
+   } else if (matches(*argv, "channel") == 0) {
+   mode = TC_MQPRIO_MODE_CHANNEL;
+   }  else {
+   fprintf(stderr, "Illegal mode (%s)\n",
+   *argv);
+   return -1;
+   }
+   if (mode != TC_MQPRIO_MODE_DCB)
+   flags |= TC_MQPRIO_F_MODE;
+   idx++;
+   } else if (opt.hw && strcmp(*argv, "shaper") == 0) {
+   NEXT_ARG();
+   if (matches(*argv, "dcb") == 0) {
+

[jkirsher/next-queue PATCH v4 4/6] i40e: Admin queue definitions for cloud filters

2017-10-10 Thread Amritha Nambiar
Add new admin queue definitions and extended fields for cloud
filter support. Define big buffer for extended general fields
in Add/Remove Cloud filters command.

v3: Shortened some lengthy struct names.
v2: Added I40E_CHECK_STRUCT_LEN check to AQ command structs and
added AQ definitions to i40evf for consistency based on Shannon's
feedback.

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
Signed-off-by: Kiran Patil <kiran.pa...@intel.com>
Signed-off-by: Jingjing Wu <jingjing...@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h  |  110 
 .../net/ethernet/intel/i40evf/i40e_adminq_cmd.h|  110 
 2 files changed, 216 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h 
b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
index 729976b..bcc7986 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
@@ -1371,14 +1371,16 @@ struct i40e_aqc_add_remove_cloud_filters {
 #define I40E_AQC_ADD_CLOUD_CMD_SEID_NUM_SHIFT  0
 #define I40E_AQC_ADD_CLOUD_CMD_SEID_NUM_MASK   (0x3FF << \
I40E_AQC_ADD_CLOUD_CMD_SEID_NUM_SHIFT)
-   u8  reserved2[4];
+   u8  big_buffer_flag;
+#define I40E_AQC_ADD_CLOUD_CMD_BB  1
+   u8  reserved2[3];
__le32  addr_high;
__le32  addr_low;
 };
 
 I40E_CHECK_CMD_LENGTH(i40e_aqc_add_remove_cloud_filters);
 
-struct i40e_aqc_add_remove_cloud_filters_element_data {
+struct i40e_aqc_cloud_filters_element_data {
u8  outer_mac[6];
u8  inner_mac[6];
__le16  inner_vlan;
@@ -1408,6 +1410,13 @@ struct i40e_aqc_add_remove_cloud_filters_element_data {
 #define I40E_AQC_ADD_CLOUD_FILTER_IMAC 0x000A
 #define I40E_AQC_ADD_CLOUD_FILTER_OMAC_TEN_ID_IMAC 0x000B
 #define I40E_AQC_ADD_CLOUD_FILTER_IIP  0x000C
+/* 0x0010 to 0x0017 is for custom filters */
+/* flag to be used when adding cloud filter: IP + L4 Port */
+#define I40E_AQC_ADD_CLOUD_FILTER_IP_PORT  0x0010
+/* flag to be used when adding cloud filter: Dest MAC + L4 Port */
+#define I40E_AQC_ADD_CLOUD_FILTER_MAC_PORT 0x0011
+/* flag to be used when adding cloud filter: Dest MAC + VLAN + L4 Port */
+#define I40E_AQC_ADD_CLOUD_FILTER_MAC_VLAN_PORT0x0012
 
 #define I40E_AQC_ADD_CLOUD_FLAGS_TO_QUEUE  0x0080
 #define I40E_AQC_ADD_CLOUD_VNK_SHIFT   6
@@ -1442,6 +1451,49 @@ struct i40e_aqc_add_remove_cloud_filters_element_data {
u8  response_reserved[7];
 };
 
+I40E_CHECK_STRUCT_LEN(0x40, i40e_aqc_cloud_filters_element_data);
+
+/* i40e_aqc_cloud_filters_element_bb is used when
+ * I40E_AQC_CLOUD_CMD_BB flag is set.
+ */
+struct i40e_aqc_cloud_filters_element_bb {
+   struct i40e_aqc_cloud_filters_element_data element;
+   u16 general_fields[32];
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X10_WORD0   0
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X10_WORD1   1
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X10_WORD2   2
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X11_WORD0   3
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X11_WORD1   4
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X11_WORD2   5
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X12_WORD0   6
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X12_WORD1   7
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X12_WORD2   8
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X13_WORD0   9
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X13_WORD1   10
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X13_WORD2   11
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X14_WORD0   12
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X14_WORD1   13
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X14_WORD2   14
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD0   15
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD1   16
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD2   17
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD3   18
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD4   19
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD5   20
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD6   21
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD7   22
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD0   23
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD1   24
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD2   25
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD3   26
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD4   27
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD5   28
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD6   29
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD7   30
+};
+
+I40E_CHECK_STRUCT_LEN(0x80, i40e_aqc_cloud_filters_element_bb);
+
 struct i40e_aqc_remove_cloud_filters_completion {
__le16 perfect_ovlan_used;
__le16 perfect_ovlan_free;
@@ -1453,6 +1505,60 @@ struct i40e_aqc_remove_cloud_filters_completion {
 
 I40E_CHECK_CMD_LENGTH(i40e_aqc_remove_cloud_filters_completion);
 
+/* Replace filter Command 0x025F
+ * uses the i40e_aqc_replace_cloud_filters,
+ * and the generic indirect completio

[jkirsher/next-queue PATCH v4 3/6] i40e: Cloud filter mode for set_switch_config command

2017-10-10 Thread Amritha Nambiar
Add definitions for L4 filters and switch modes based on cloud filters
modes and extend the set switch config command to include the
additional cloud filter mode.

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
Signed-off-by: Kiran Patil <kiran.pa...@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h |   30 -
 drivers/net/ethernet/intel/i40e/i40e_common.c |4 ++-
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c|2 +
 drivers/net/ethernet/intel/i40e/i40e_main.c   |2 +
 drivers/net/ethernet/intel/i40e/i40e_prototype.h  |2 +
 drivers/net/ethernet/intel/i40e/i40e_type.h   |9 ++
 6 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h 
b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
index 6a5db1b..729976b 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
@@ -790,7 +790,35 @@ struct i40e_aqc_set_switch_config {
 */
__le16  first_tag;
__le16  second_tag;
-   u8  reserved[6];
+   /* Next byte is split into following:
+* Bit 7 : 0: No action, 1: Switch to mode defined by bits 6:0
+* Bit 6: 0 : Destination Port, 1: source port
+* Bit 5..4: L4 type
+* 0: rsvd
+* 1: TCP
+* 2: UDP
+* 3: Both TCP and UDP
+* Bits 3:0 Mode
+* 0: default mode
+* 1: L4 port only mode
+* 2: non-tunneled mode
+* 3: tunneled mode
+*/
+#define I40E_AQ_SET_SWITCH_BIT7_VALID  0x80
+
+#define I40E_AQ_SET_SWITCH_L4_SRC_PORT 0x40
+
+#define I40E_AQ_SET_SWITCH_L4_TYPE_RSVD0x00
+#define I40E_AQ_SET_SWITCH_L4_TYPE_TCP 0x10
+#define I40E_AQ_SET_SWITCH_L4_TYPE_UDP 0x20
+#define I40E_AQ_SET_SWITCH_L4_TYPE_BOTH0x30
+
+#define I40E_AQ_SET_SWITCH_MODE_DEFAULT0x00
+#define I40E_AQ_SET_SWITCH_MODE_L4_PORT0x01
+#define I40E_AQ_SET_SWITCH_MODE_NON_TUNNEL 0x02
+#define I40E_AQ_SET_SWITCH_MODE_TUNNEL 0x03
+   u8  mode;
+   u8  rsvd5[5];
 };
 
 I40E_CHECK_CMD_LENGTH(i40e_aqc_set_switch_config);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c 
b/drivers/net/ethernet/intel/i40e/i40e_common.c
index 1b85eb3..0b3c5b7 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_common.c
@@ -2402,13 +2402,14 @@ i40e_status i40e_aq_get_switch_config(struct i40e_hw 
*hw,
  * @hw: pointer to the hardware structure
  * @flags: bit flag values to set
  * @valid_flags: which bit flags to set
+ * @mode: cloud filter mode
  * @cmd_details: pointer to command details structure or NULL
  *
  * Set switch configuration bits
  **/
 enum i40e_status_code i40e_aq_set_switch_config(struct i40e_hw *hw,
u16 flags,
-   u16 valid_flags,
+   u16 valid_flags, u8 mode,
struct i40e_asq_cmd_details *cmd_details)
 {
struct i40e_aq_desc desc;
@@ -2420,6 +2421,7 @@ enum i40e_status_code i40e_aq_set_switch_config(struct 
i40e_hw *hw,
  i40e_aqc_opc_set_switch_config);
scfg->flags = cpu_to_le16(flags);
scfg->valid_flags = cpu_to_le16(valid_flags);
+   scfg->mode = mode;
if (hw->flags & I40E_HW_FLAG_802_1AD_CAPABLE) {
scfg->switch_tag = cpu_to_le16(hw->switch_tag);
scfg->first_tag = cpu_to_le16(hw->first_tag);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 
b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index a760d75..37ca294 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -4341,7 +4341,7 @@ static int i40e_set_priv_flags(struct net_device *dev, 
u32 flags)
sw_flags = I40E_AQ_SET_SWITCH_CFG_PROMISC;
valid_flags = I40E_AQ_SET_SWITCH_CFG_PROMISC;
ret = i40e_aq_set_switch_config(&pf->hw, sw_flags, valid_flags,
-   NULL);
+   0, NULL);
if (ret && pf->hw.aq.asq_last_status != I40E_AQ_RC_ESRCH) {
dev_info(>pdev->dev,
 "couldn't set switch config bits, err %s 
aq_err %s\n",
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 33a8f429..0539d43 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -12165,7 +12165,7 @@ static int i40e_setup_pf_switch(struct i40e_pf *pf, 
bool reinit)
u16 valid_flags;
 
   

[jkirsher/next-queue PATCH v4 1/6] cls_flower: Offload classid to hardware

2017-10-10 Thread Amritha Nambiar
The classid on a filter is used to match a packet to a class.
tcf_result structure contains the class ID of the class to which
the packet belongs. This patch enables offloading the classid to
the hardware.

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
---
 include/net/pkt_cls.h  |1 +
 net/sched/cls_flower.c |2 ++
 2 files changed, 3 insertions(+)

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 456017a..c2f847f 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -515,6 +515,7 @@ struct tc_cls_flower_offload {
struct fl_flow_key *key;
struct tcf_exts *exts;
bool egress_dev;
+   u32 classid;
 };
 
 enum tc_matchall_command {
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index db831ac..50c8a52 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -241,6 +241,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
cls_flower.mask = mask;
cls_flower.key = &f->mkey;
cls_flower.exts = &f->exts;
+   cls_flower.classid = f->res.classid;
 
err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER,
&cls_flower);
@@ -264,6 +265,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct 
cls_fl_filter *f)
cls_flower.command = TC_CLSFLOWER_STATS;
cls_flower.cookie = (unsigned long) f;
cls_flower.exts = &f->exts;
+   cls_flower.classid = f->res.classid;
 
dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER,
  &cls_flower);



[jkirsher/next-queue PATCH v4 0/6] tc-flower based cloud filters in i40e

2017-10-10 Thread Amritha Nambiar
This patch series enables configuring cloud filters in i40e
using the tc-flower classifier. The classification function
of the filter is to match a packet to a class. cls_flower is
extended to offload classid to hardware. The offloaded classid
is used direct matched packets to a traffic class on the device. 
The approach here is similar to the tc 'prio' qdisc which uses
the classid for band selection. The ingress qdisc is called ffff:0,
so traffic classes are ffff:1 to ffff:8 (i40e has max of 8 TCs).
TC0 is minor number 1, TC1 is minor number 2 etc.

The cloud filters are added for a VSI and are cleaned up when
the VSI is deleted. The filters that match on L4 ports needs
enhanced admin queue functions with big buffer support for
extended fields in cloud filter commands.

Example:
# tc qdisc add dev eth0 ingress
# ethtool -K eth0 hw-tc-offload on

Match Dst IPv4,Dst Port and route to TC1:
# tc filter add dev eth0 protocol ip parent ffff: prio 1 flower\
  dst_ip 192.168.1.1/32 ip_proto udp dst_port 22\
  skip_sw classid ffff:2

# tc filter show dev eth0 parent ffff:
filter pref 1 flower chain 0
filter pref 1 flower chain 0 handle 0x1 classid ffff:2
  eth_type ipv4
  ip_proto udp
  dst_ip 192.168.1.1
  dst_port 22
  skip_sw
  in_hw

v4: classid based approach to set traffic class for matched packets.

Authors:
Amritha Nambiar <amritha.namb...@intel.com>
Kiran Patil <kiran.pa...@intel.com>
Anjali Singhai Jain <anjali.sing...@intel.com>
Jingjing Wu <jingjing...@intel.com>
---

Amritha Nambiar (6):
  cls_flower: Offload classid to hardware
  i40e: Map TCs with the VSI seids
  i40e: Cloud filter mode for set_switch_config command
  i40e: Admin queue definitions for cloud filters
  i40e: Clean up of cloud filters
  i40e: Enable cloud filters via tc-flower


 drivers/net/ethernet/intel/i40e/i40e.h |   55 +
 drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h  |  143 +++
 drivers/net/ethernet/intel/i40e/i40e_common.c  |  193 
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c |2 
 drivers/net/ethernet/intel/i40e/i40e_main.c|  941 +++-
 drivers/net/ethernet/intel/i40e/i40e_prototype.h   |   18 
 drivers/net/ethernet/intel/i40e/i40e_type.h|   10 
 .../net/ethernet/intel/i40evf/i40e_adminq_cmd.h|  113 ++
 include/net/pkt_cls.h  |1 
 net/sched/cls_flower.c |2 
 10 files changed, 1439 insertions(+), 39 deletions(-)

--


[jkirsher/next-queue PATCH v4 5/6] i40e: Clean up of cloud filters

2017-10-10 Thread Amritha Nambiar
Introduce the cloud filter datastructure and cleanup of cloud
filters associated with the device.

v2: Moved field comments in struct i40e_cloud_filter to the right.
Removed hlist_empty check from i40e_cloud_filter_exit()

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e.h  |9 +
 drivers/net/ethernet/intel/i40e/i40e_main.c |   24 
 2 files changed, 33 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index f3c501e..b938bb4a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -253,6 +253,12 @@ struct i40e_fdir_filter {
u32 fd_id;
 };
 
+struct i40e_cloud_filter {
+   struct hlist_node cloud_node;
+   unsigned long cookie;
+   u16 seid;   /* filter control */
+};
+
 #define I40E_ETH_P_LLDP0x88cc
 
 #define I40E_DCB_PRIO_TYPE_STRICT  0
@@ -420,6 +426,9 @@ struct i40e_pf {
struct i40e_udp_port_config udp_ports[I40E_MAX_PF_UDP_OFFLOAD_PORTS];
u16 pending_udp_bitmap;
 
+   struct hlist_head cloud_filter_list;
+   u16 num_cloud_filters;
+
enum i40e_interrupt_policy int_policy;
u16 rx_itr_default;
u16 tx_itr_default;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 0539d43..bcdb16a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -6937,6 +6937,26 @@ static void i40e_fdir_filter_exit(struct i40e_pf *pf)
 }
 
 /**
+ * i40e_cloud_filter_exit - Cleans up the Cloud Filters
+ * @pf: Pointer to PF
+ *
+ * This function destroys the hlist where all the Cloud Filters
+ * filters were saved.
+ **/
+static void i40e_cloud_filter_exit(struct i40e_pf *pf)
+{
+   struct i40e_cloud_filter *cfilter;
+   struct hlist_node *node;
+
+   hlist_for_each_entry_safe(cfilter, node,
+ &pf->cloud_filter_list, cloud_node) {
+   hlist_del(&cfilter->cloud_node);
+   kfree(cfilter);
+   }
+   pf->num_cloud_filters = 0;
+}
+
+/**
  * i40e_close - Disables a network interface
  * @netdev: network interface device structure
  *
@@ -12195,6 +12215,7 @@ static int i40e_setup_pf_switch(struct i40e_pf *pf, 
bool reinit)
vsi = i40e_vsi_reinit_setup(pf->vsi[pf->lan_vsi]);
if (!vsi) {
dev_info(>pdev->dev, "setup of MAIN VSI failed\n");
+   i40e_cloud_filter_exit(pf);
i40e_fdir_teardown(pf);
return -EAGAIN;
}
@@ -13029,6 +13050,8 @@ static void i40e_remove(struct pci_dev *pdev)
if (pf->vsi[pf->lan_vsi])
i40e_vsi_release(pf->vsi[pf->lan_vsi]);
 
+   i40e_cloud_filter_exit(pf);
+
/* remove attached clients */
if (pf->flags & I40E_FLAG_IWARP_ENABLED) {
ret_code = i40e_lan_del_device(pf);
@@ -13260,6 +13283,7 @@ static void i40e_shutdown(struct pci_dev *pdev)
 
del_timer_sync(>service_timer);
cancel_work_sync(>service_task);
+   i40e_cloud_filter_exit(pf);
i40e_fdir_teardown(pf);
 
/* Client close must be called explicitly here because the timer



[jkirsher/next-queue PATCH v4 6/6] i40e: Enable cloud filters via tc-flower

2017-10-10 Thread Amritha Nambiar
This patch enables tc-flower based hardware offloads. tc flower
filter provided by the kernel is configured as driver specific
cloud filter. The patch implements functions and admin queue
commands needed to support cloud filters in the driver and
adds cloud filters to configure these tc-flower filters.

The classification function of the filter is to direct matched
packets to a traffic class which is set based on the offloaded
tc-flower classid. The approach here is similar to the tc 'prio'
qdisc which uses the classid for band selection. The ingress qdisc
is called ffff:0, so traffic classes are ffff:1 to ffff:8 (i40e
has max of 8 TCs). TC0 is minor number 1, TC1 is minor number 2 etc.

# tc qdisc add dev eth0 ingress
# ethtool -K eth0 hw-tc-offload on

Match Dst MAC and route to TC0:
# tc filter add dev eth0 protocol ip parent ffff:\
  prio 1 flower dst_mac 3c:fd:fe:a0:d6:70 skip_sw\
  classid ffff:1

Match Dst IPv4,Dst Port and route to TC1:
# tc filter add dev eth0 protocol ip parent ffff:\
  prio 2 flower dst_ip 192.168.3.5/32\
  ip_proto udp dst_port 25 skip_sw\
  classid ffff:2

Match Dst IPv6,Dst Port and route to TC1:
# tc filter add dev eth0 protocol ipv6 parent :\
  prio 3 flower dst_ip fe8::200:1\
  ip_proto udp dst_port 66 skip_sw\
  classid :2

Delete tc flower filter:
Example:

# tc filter del dev eth0 parent ffff: prio 3 handle 0x1 flower
# tc filter del dev eth0 parent ffff:

Flow Director Sideband is disabled while configuring cloud filters
via tc-flower and until any cloud filter exists.

Unsupported matches when cloud filters are added using enhanced
big buffer cloud filter mode of underlying switch include:
1. source port and source IP
2. Combined MAC address and IP fields.
3. Not specifying L4 port

These filter matches can however be used to redirect traffic to
the main VSI (tc 0) which does not require the enhanced big buffer
cloud filter support.

v4: Use classid to set traffic class for matched packets. Do not
allow disabling hw-tc-offloads when offloaded tc filters are active.
v3: Cleaned up some lengthy function names. Changed ipv6 address to
__be32 array instead of u8 array. Used macro for IP version. Minor
formatting changes.
v2:
1. Moved I40E_SWITCH_MODE_MASK definition to i40e_type.h
2. Moved dev_info for add/deleting cloud filters in else condition
3. Fixed some format specifier in dev_err logs
4. Refactored i40e_get_capabilities to take an additional
   list_type parameter and use it to query device and function
   level capabilities.
5. Fixed parsing tc redirect action to check for the is_tcf_mirred_tc()
   to verify if redirect to a traffic class is supported.
6. Added comments for Geneve fix in cloud filter big buffer AQ
   function definitions.
7. Cleaned up setup_tc interface to rebase and work with Jiri's
   updates, separate function to process tc cls flower offloads.
8. Changes to make Flow Director Sideband and Cloud filters mutually
   exclusive.

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
Signed-off-by: Kiran Patil <kiran.pa...@intel.com>
Signed-off-by: Anjali Singhai Jain <anjali.sing...@intel.com>
Signed-off-by: Jingjing Wu <jingjing...@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e.h |   45 +
 drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h  |3 
 drivers/net/ethernet/intel/i40e/i40e_common.c  |  189 
 drivers/net/ethernet/intel/i40e/i40e_main.c|  913 +++-
 drivers/net/ethernet/intel/i40e/i40e_prototype.h   |   16 
 drivers/net/ethernet/intel/i40e/i40e_type.h|1 
 .../net/ethernet/intel/i40evf/i40e_adminq_cmd.h|3 
 7 files changed, 1140 insertions(+), 30 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index b938bb4a..c3f1312 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -55,6 +55,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include "i40e_type.h"
 #include "i40e_prototype.h"
 #include "i40e_client.h"
@@ -253,9 +255,48 @@ struct i40e_fdir_filter {
u32 fd_id;
 };
 
+#define IPV4_VERSION 4
+#define IPV6_VERSION 6
+
+#define I40E_CLOUD_FIELD_OMAC  0x01
+#define I40E_CLOUD_FIELD_IMAC  0x02
+#define I40E_CLOUD_FIELD_IVLAN 0x04
+#define I40E_CLOUD_FIELD_TEN_ID0x08
+#define I40E_CLOUD_FIELD_IIP   0x10
+
+#define I40E_CLOUD_FILTER_FLAGS_OMAC   I40E_CLOUD_FIELD_OMAC
+#define I40E_CLOUD_FILTER_FLAGS_IMAC   I40E_CLOUD_FIELD_IMAC
+#define I40E_CLOUD_FILTER_FLAGS_IMAC_IVLAN (I40E_CLOUD_FIELD_IMAC | \
+I40E_CLOUD_FIELD_IVLAN)
+#define I40E_CLOUD_FILTER_FLAGS_IMAC_TEN_ID(I40E_CLOUD_FIELD_IMAC | \
+I40E_CLOUD_FIELD_TEN_ID)
+#define I40E_CLOUD_FILTER_FLAGS_OMAC_TEN_ID_IMAC (I40E_CLOUD_FIELD_OMAC | \
+

[jkirsher/next-queue PATCH v4 2/6] i40e: Map TCs with the VSI seids

2017-10-10 Thread Amritha Nambiar
Add mapping of TCs with the seids of the channel VSIs. TC0
will be mapped to the main VSI seid and all other TCs are
mapped to the seid of the corresponding channel VSI.

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e.h  |1 +
 drivers/net/ethernet/intel/i40e/i40e_main.c |2 ++
 2 files changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index eb01776..f3c501e 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -739,6 +739,7 @@ struct i40e_vsi {
u16 next_base_queue;/* next queue to be used for channel setup */
 
struct list_head ch_list;
+   u16 tc_seid_map[I40E_MAX_TRAFFIC_CLASS];
 
void *priv; /* client driver data reference. */
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 75f944f..33a8f429 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -6100,6 +6100,7 @@ static int i40e_configure_queue_channels(struct i40e_vsi 
*vsi)
int ret = 0, i;
 
/* Create app vsi with the TCs. Main VSI with TC0 is already set up */
+   vsi->tc_seid_map[0] = vsi->seid;
for (i = 1; i < I40E_MAX_TRAFFIC_CLASS; i++) {
if (vsi->tc_config.enabled_tc & BIT(i)) {
ch = kzalloc(sizeof(*ch), GFP_KERNEL);
@@ -6130,6 +6131,7 @@ static int i40e_configure_queue_channels(struct i40e_vsi 
*vsi)
i, ch->num_queue_pairs);
goto err_free;
}
+   vsi->tc_seid_map[i] = ch->seid;
}
}
return ret;



[iproute2 PATCH] tc/mirred: Clean up white-space noise

2017-09-13 Thread Amritha Nambiar
Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
---
 include/linux/tc_act/tc_mirred.h |6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/linux/tc_act/tc_mirred.h b/include/linux/tc_act/tc_mirred.h
index 3d7a2b3..69038c2 100644
--- a/include/linux/tc_act/tc_mirred.h
+++ b/include/linux/tc_act/tc_mirred.h
@@ -9,13 +9,13 @@
 #define TCA_EGRESS_MIRROR 2 /* mirror packet to EGRESS */
 #define TCA_INGRESS_REDIR 3  /* packet redirect to INGRESS*/
 #define TCA_INGRESS_MIRROR 4 /* mirror packet to INGRESS */
-   
 
+
 struct tc_mirred {
tc_gen;
int eaction;   /* one of IN/EGRESS_MIRROR/REDIR */
__u32   ifindex;  /* ifindex of egress port */
 };
-   
 
+
 enum {
TCA_MIRRED_UNSPEC,
TCA_MIRRED_TM,
@@ -24,5 +24,5 @@ enum {
__TCA_MIRRED_MAX
 };
 #define TCA_MIRRED_MAX (__TCA_MIRRED_MAX - 1)
-   
 
+
 #endif



[RFC PATCH v3 3/7] i40e: Map TCs with the VSI seids

2017-09-13 Thread Amritha Nambiar
Add mapping of TCs with the seids of the channel VSIs. TC0
will be mapped to the main VSI seid and all other TCs are
mapped to the seid of the corresponding channel VSI.

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e.h  |1 +
 drivers/net/ethernet/intel/i40e/i40e_main.c |2 ++
 2 files changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index 266e1dc..d846da9 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -738,6 +738,7 @@ struct i40e_vsi {
u16 next_base_queue;/* next queue to be used for channel setup */
 
struct list_head ch_list;
+   u16 tc_seid_map[I40E_MAX_TRAFFIC_CLASS];
 
void *priv; /* client driver data reference. */
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 5ef3927..0455283 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -6093,6 +6093,7 @@ static int i40e_configure_queue_channels(struct i40e_vsi 
*vsi)
int ret = 0, i;
 
/* Create app vsi with the TCs. Main VSI with TC0 is already set up */
+   vsi->tc_seid_map[0] = vsi->seid;
for (i = 1; i < I40E_MAX_TRAFFIC_CLASS; i++) {
if (vsi->tc_config.enabled_tc & BIT(i)) {
ch = kzalloc(sizeof(*ch), GFP_KERNEL);
@@ -6122,6 +6123,7 @@ static int i40e_configure_queue_channels(struct i40e_vsi 
*vsi)
i, ch->num_queue_pairs);
goto err_free;
}
+   vsi->tc_seid_map[i] = ch->seid;
}
}
return ret;



[RFC PATCH v3 0/7] tc-flower based cloud filters in i40e

2017-09-13 Thread Amritha Nambiar
This patch series enables configuring cloud filters in i40e
using the tc-flower classifier. The only tc-filter action
supported is to redirect packets to a traffic class on the
same device. The mirror/redirect action is extended to
accept a traffic class to achieve this.

The cloud filters are added for a VSI and are cleaned up when
the VSI is deleted. The filters that match on L4 ports needs
enhanced admin queue functions with big buffer support for
extended fields in cloud filter commands.

Example:
# tc qdisc add dev eth0 ingress

# ethtool -K eth0 hw-tc-offload on

# tc filter add dev eth0 protocol ip parent ffff: prio 1 flower\
  dst_ip 192.168.1.1/32 ip_proto udp dst_port 22\
  skip_sw action mirred ingress redirect dev eth0 tclass 1

# tc filter show dev eth0 parent ffff:
filter protocol ip pref 1 flower chain 0
filter protocol ip pref 1 flower chain 0 handle 0x1
  eth_type ipv4
  ip_proto udp
  dst_ip 192.168.1.1
  dst_port 22
  skip_sw
  in_hw
action order 1: mirred (Ingress Redirect to device eth0) stolen tclass 1
index 7 ref 1 bind 1

v3: Added an extra patch to clean up white-space noise. Cleaned up
some lengthy function names. Used __be32 array for ipv6 address.
Used macro for IP version. Minor formatting changes.

---

Amritha Nambiar (7):
  tc_mirred: Clean up white-space noise
  sched: act_mirred: Traffic class option for mirror/redirect action
  i40e: Map TCs with the VSI seids
  i40e: Cloud filter mode for set_switch_config command
  i40e: Admin queue definitions for cloud filters
  i40e: Clean up of cloud filters
  i40e: Enable cloud filters via tc-flower


 drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c  |2 
 drivers/net/ethernet/intel/i40e/i40e.h |   59 +
 drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h  |  143 +++
 drivers/net/ethernet/intel/i40e/i40e_common.c  |  193 
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c |2 
 drivers/net/ethernet/intel/i40e/i40e_main.c|  999 +++-
 drivers/net/ethernet/intel/i40e/i40e_prototype.h   |   18 
 drivers/net/ethernet/intel/i40e/i40e_type.h|   10 
 .../net/ethernet/intel/i40evf/i40e_adminq_cmd.h|  113 ++
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c  |2 
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c|2 
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c |3 
 .../net/ethernet/mellanox/mlxsw/spectrum_flower.c  |3 
 drivers/net/ethernet/netronome/nfp/bpf/offload.c   |1 
 drivers/net/ethernet/netronome/nfp/flower/action.c |4 
 include/net/tc_act/tc_mirred.h |   16 
 include/uapi/linux/tc_act/tc_mirred.h  |9 
 net/dsa/slave.c|3 
 net/sched/act_mirred.c |   15 
 19 files changed, 1547 insertions(+), 50 deletions(-)

--


[RFC PATCH v3 7/7] i40e: Enable cloud filters via tc-flower

2017-09-13 Thread Amritha Nambiar
This patch enables tc-flower based hardware offloads. tc flower
filter provided by the kernel is configured as driver specific
cloud filter. The patch implements functions and admin queue
commands needed to support cloud filters in the driver and
adds cloud filters to configure these tc-flower filters.

The only action supported is to redirect packets to a traffic class
on the same device.

# tc qdisc add dev eth0 ingress
# ethtool -K eth0 hw-tc-offload on

# tc filter add dev eth0 protocol ip parent :\
  prio 1 flower dst_mac 3c:fd:fe:a0:d6:70 skip_sw\
  action mirred ingress redirect dev eth0 tclass 0

# tc filter add dev eth0 protocol ip parent :\
  prio 2 flower dst_ip 192.168.3.5/32\
  ip_proto udp dst_port 25 skip_sw\
  action mirred ingress redirect dev eth0 tclass 1

# tc filter add dev eth0 protocol ipv6 parent :\
  prio 3 flower dst_ip fe8::200:1\
  ip_proto udp dst_port 66 skip_sw\
  action mirred ingress redirect dev eth0 tclass 1

Delete tc flower filter:
Example:

# tc filter del dev eth0 parent ffff: prio 3 handle 0x1 flower
# tc filter del dev eth0 parent ffff:

Flow Director Sideband is disabled while configuring cloud filters
via tc-flower and until any cloud filter exists.

Unsupported matches when cloud filters are added using enhanced
big buffer cloud filter mode of underlying switch include:
1. source port and source IP
2. Combined MAC address and IP fields.
3. Not specifying L4 port

These filter matches can however be used to redirect traffic to
the main VSI (tc 0) which does not require the enhanced big buffer
cloud filter support.

v3: Cleaned up some lengthy function names. Changed ipv6 address to
__be32 array instead of u8 array. Used macro for IP version. Minor
formatting changes.
v2:
1. Moved I40E_SWITCH_MODE_MASK definition to i40e_type.h
2. Moved dev_info for add/deleting cloud filters in else condition
3. Fixed some format specifier in dev_err logs
4. Refactored i40e_get_capabilities to take an additional
   list_type parameter and use it to query device and function
   level capabilities.
5. Fixed parsing tc redirect action to check for the is_tcf_mirred_tc()
   to verify if redirect to a traffic class is supported.
6. Added comments for Geneve fix in cloud filter big buffer AQ
   function definitions.
7. Cleaned up setup_tc interface to rebase and work with Jiri's
   updates, separate function to process tc cls flower offloads.
8. Changes to make Flow Director Sideband and Cloud filters mutually
   exclusive.

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
Signed-off-by: Kiran Patil <kiran.pa...@intel.com>
Signed-off-by: Anjali Singhai Jain <anjali.sing...@intel.com>
Signed-off-by: Jingjing Wu <jingjing...@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e.h |   49 +
 drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h  |3 
 drivers/net/ethernet/intel/i40e/i40e_common.c  |  189 
 drivers/net/ethernet/intel/i40e/i40e_main.c|  971 +++-
 drivers/net/ethernet/intel/i40e/i40e_prototype.h   |   16 
 drivers/net/ethernet/intel/i40e/i40e_type.h|1 
 .../net/ethernet/intel/i40evf/i40e_adminq_cmd.h|3 
 7 files changed, 1202 insertions(+), 30 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index 6018fb6..b110519 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -55,6 +55,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include "i40e_type.h"
 #include "i40e_prototype.h"
 #include "i40e_client.h"
@@ -252,9 +254,52 @@ struct i40e_fdir_filter {
u32 fd_id;
 };
 
+#define IPV4_VERSION 4
+#define IPV6_VERSION 6
+
+#define I40E_CLOUD_FIELD_OMAC  0x01
+#define I40E_CLOUD_FIELD_IMAC  0x02
+#define I40E_CLOUD_FIELD_IVLAN 0x04
+#define I40E_CLOUD_FIELD_TEN_ID0x08
+#define I40E_CLOUD_FIELD_IIP   0x10
+
+#define I40E_CLOUD_FILTER_FLAGS_OMAC   I40E_CLOUD_FIELD_OMAC
+#define I40E_CLOUD_FILTER_FLAGS_IMAC   I40E_CLOUD_FIELD_IMAC
+#define I40E_CLOUD_FILTER_FLAGS_IMAC_IVLAN (I40E_CLOUD_FIELD_IMAC | \
+I40E_CLOUD_FIELD_IVLAN)
+#define I40E_CLOUD_FILTER_FLAGS_IMAC_TEN_ID(I40E_CLOUD_FIELD_IMAC | \
+I40E_CLOUD_FIELD_TEN_ID)
+#define I40E_CLOUD_FILTER_FLAGS_OMAC_TEN_ID_IMAC (I40E_CLOUD_FIELD_OMAC | \
+ I40E_CLOUD_FIELD_IMAC | \
+ I40E_CLOUD_FIELD_TEN_ID)
+#define I40E_CLOUD_FILTER_FLAGS_IMAC_IVLAN_TEN_ID (I40E_CLOUD_FIELD_IMAC | \
+  I40E_CLOUD_FIELD_IVLAN | \
+  I40E_CLOUD_FIELD_TEN_ID)
+#define I40E_CLOUD_FILTER_FLAGS_IIPI40E_CLOUD_FIELD_IIP
+
 struct i40e_cloud_filter {
struct hlist_node cloud_node;
unsigned 

[RFC PATCH v3 1/7] tc_mirred: Clean up white-space noise

2017-09-13 Thread Amritha Nambiar
Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
---
 include/uapi/linux/tc_act/tc_mirred.h |6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/tc_act/tc_mirred.h 
b/include/uapi/linux/tc_act/tc_mirred.h
index 3d7a2b3..69038c2 100644
--- a/include/uapi/linux/tc_act/tc_mirred.h
+++ b/include/uapi/linux/tc_act/tc_mirred.h
@@ -9,13 +9,13 @@
 #define TCA_EGRESS_MIRROR 2 /* mirror packet to EGRESS */
 #define TCA_INGRESS_REDIR 3  /* packet redirect to INGRESS*/
 #define TCA_INGRESS_MIRROR 4 /* mirror packet to INGRESS */
-   
 
+
 struct tc_mirred {
tc_gen;
int eaction;   /* one of IN/EGRESS_MIRROR/REDIR */
__u32   ifindex;  /* ifindex of egress port */
 };
-   
 
+
 enum {
TCA_MIRRED_UNSPEC,
TCA_MIRRED_TM,
@@ -24,5 +24,5 @@ enum {
__TCA_MIRRED_MAX
 };
 #define TCA_MIRRED_MAX (__TCA_MIRRED_MAX - 1)
-   
 
+
 #endif



[RFC PATCH v3 6/7] i40e: Clean up of cloud filters

2017-09-13 Thread Amritha Nambiar
Introduce the cloud filter datastructure and cleanup of cloud
filters associated with the device.

v2: Moved field comments in struct i40e_cloud_filter to the right.
Removed hlist_empty check from i40e_cloud_filter_exit()

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e.h  |9 +
 drivers/net/ethernet/intel/i40e/i40e_main.c |   24 
 2 files changed, 33 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index d846da9..6018fb6 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -252,6 +252,12 @@ struct i40e_fdir_filter {
u32 fd_id;
 };
 
+struct i40e_cloud_filter {
+   struct hlist_node cloud_node;
+   unsigned long cookie;
+   u16 seid;   /* filter control */
+};
+
 #define I40E_ETH_P_LLDP0x88cc
 
 #define I40E_DCB_PRIO_TYPE_STRICT  0
@@ -419,6 +425,9 @@ struct i40e_pf {
struct i40e_udp_port_config udp_ports[I40E_MAX_PF_UDP_OFFLOAD_PORTS];
u16 pending_udp_bitmap;
 
+   struct hlist_head cloud_filter_list;
+   u16 num_cloud_filters;
+
enum i40e_interrupt_policy int_policy;
u16 rx_itr_default;
u16 tx_itr_default;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 60c689a..afcf08a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -6922,6 +6922,26 @@ static void i40e_fdir_filter_exit(struct i40e_pf *pf)
 }
 
 /**
+ * i40e_cloud_filter_exit - Cleans up the Cloud Filters
+ * @pf: Pointer to PF
+ *
+ * This function destroys the hlist where all the Cloud Filters
+ * filters were saved.
+ **/
+static void i40e_cloud_filter_exit(struct i40e_pf *pf)
+{
+   struct i40e_cloud_filter *cfilter;
+   struct hlist_node *node;
+
+   hlist_for_each_entry_safe(cfilter, node,
+ >cloud_filter_list, cloud_node) {
+   hlist_del(>cloud_node);
+   kfree(cfilter);
+   }
+   pf->num_cloud_filters = 0;
+}
+
+/**
  * i40e_close - Disables a network interface
  * @netdev: network interface device structure
  *
@@ -12176,6 +12196,7 @@ static int i40e_setup_pf_switch(struct i40e_pf *pf, 
bool reinit)
vsi = i40e_vsi_reinit_setup(pf->vsi[pf->lan_vsi]);
if (!vsi) {
dev_info(>pdev->dev, "setup of MAIN VSI failed\n");
+   i40e_cloud_filter_exit(pf);
i40e_fdir_teardown(pf);
return -EAGAIN;
}
@@ -13010,6 +13031,8 @@ static void i40e_remove(struct pci_dev *pdev)
if (pf->vsi[pf->lan_vsi])
i40e_vsi_release(pf->vsi[pf->lan_vsi]);
 
+   i40e_cloud_filter_exit(pf);
+
/* remove attached clients */
if (pf->flags & I40E_FLAG_IWARP_ENABLED) {
ret_code = i40e_lan_del_device(pf);
@@ -13241,6 +13264,7 @@ static void i40e_shutdown(struct pci_dev *pdev)
 
del_timer_sync(>service_timer);
cancel_work_sync(>service_task);
+   i40e_cloud_filter_exit(pf);
i40e_fdir_teardown(pf);
 
/* Client close must be called explicitly here because the timer



[RFC PATCH v3 4/7] i40e: Cloud filter mode for set_switch_config command

2017-09-13 Thread Amritha Nambiar
Add definitions for L4 filters and switch modes based on cloud filters
modes and extend the set switch config command to include the
additional cloud filter mode.

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
Signed-off-by: Kiran Patil <kiran.pa...@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h |   30 -
 drivers/net/ethernet/intel/i40e/i40e_common.c |4 ++-
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c|2 +
 drivers/net/ethernet/intel/i40e/i40e_main.c   |2 +
 drivers/net/ethernet/intel/i40e/i40e_prototype.h  |2 +
 drivers/net/ethernet/intel/i40e/i40e_type.h   |9 ++
 6 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h 
b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
index a8f65ae..e41050a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
@@ -790,7 +790,35 @@ struct i40e_aqc_set_switch_config {
 */
__le16  first_tag;
__le16  second_tag;
-   u8  reserved[6];
+   /* Next byte is split into following:
+* Bit 7 : 0: No action, 1: Switch to mode defined by bits 6:0
+* Bit 6: 0 : Destination Port, 1: source port
+* Bit 5..4: L4 type
+* 0: rsvd
+* 1: TCP
+* 2: UDP
+* 3: Both TCP and UDP
+* Bits 3:0 Mode
+* 0: default mode
+* 1: L4 port only mode
+* 2: non-tunneled mode
+* 3: tunneled mode
+*/
+#define I40E_AQ_SET_SWITCH_BIT7_VALID  0x80
+
+#define I40E_AQ_SET_SWITCH_L4_SRC_PORT 0x40
+
+#define I40E_AQ_SET_SWITCH_L4_TYPE_RSVD0x00
+#define I40E_AQ_SET_SWITCH_L4_TYPE_TCP 0x10
+#define I40E_AQ_SET_SWITCH_L4_TYPE_UDP 0x20
+#define I40E_AQ_SET_SWITCH_L4_TYPE_BOTH0x30
+
+#define I40E_AQ_SET_SWITCH_MODE_DEFAULT0x00
+#define I40E_AQ_SET_SWITCH_MODE_L4_PORT0x01
+#define I40E_AQ_SET_SWITCH_MODE_NON_TUNNEL 0x02
+#define I40E_AQ_SET_SWITCH_MODE_TUNNEL 0x03
+   u8  mode;
+   u8  rsvd5[5];
 };
 
 I40E_CHECK_CMD_LENGTH(i40e_aqc_set_switch_config);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c 
b/drivers/net/ethernet/intel/i40e/i40e_common.c
index e7d8a01..9567702 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_common.c
@@ -2405,13 +2405,14 @@ i40e_status i40e_aq_get_switch_config(struct i40e_hw 
*hw,
  * @hw: pointer to the hardware structure
  * @flags: bit flag values to set
  * @valid_flags: which bit flags to set
+ * @mode: cloud filter mode
  * @cmd_details: pointer to command details structure or NULL
  *
  * Set switch configuration bits
  **/
 enum i40e_status_code i40e_aq_set_switch_config(struct i40e_hw *hw,
u16 flags,
-   u16 valid_flags,
+   u16 valid_flags, u8 mode,
struct i40e_asq_cmd_details *cmd_details)
 {
struct i40e_aq_desc desc;
@@ -2423,6 +2424,7 @@ enum i40e_status_code i40e_aq_set_switch_config(struct 
i40e_hw *hw,
  i40e_aqc_opc_set_switch_config);
scfg->flags = cpu_to_le16(flags);
scfg->valid_flags = cpu_to_le16(valid_flags);
+   scfg->mode = mode;
if (hw->flags & I40E_HW_FLAG_802_1AD_CAPABLE) {
scfg->switch_tag = cpu_to_le16(hw->switch_tag);
scfg->first_tag = cpu_to_le16(hw->first_tag);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 
b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 3fa90a6..7a0aa08 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -4186,7 +4186,7 @@ static int i40e_set_priv_flags(struct net_device *dev, 
u32 flags)
sw_flags = I40E_AQ_SET_SWITCH_CFG_PROMISC;
valid_flags = I40E_AQ_SET_SWITCH_CFG_PROMISC;
ret = i40e_aq_set_switch_config(>hw, sw_flags, valid_flags,
-   NULL);
+   0, NULL);
if (ret && pf->hw.aq.asq_last_status != I40E_AQ_RC_ESRCH) {
dev_info(>pdev->dev,
 "couldn't set switch config bits, err %s 
aq_err %s\n",
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 0455283..60c689a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -12146,7 +12146,7 @@ static int i40e_setup_pf_switch(struct i40e_pf *pf, 
bool reinit)
u16 valid_flags;
 
   

[RFC PATCH v3 5/7] i40e: Admin queue definitions for cloud filters

2017-09-13 Thread Amritha Nambiar
Add new admin queue definitions and extended fields for cloud
filter support. Define big buffer for extended general fields
in Add/Remove Cloud filters command.

v3: Shortened some lengthy struct names.
v2: Added I40E_CHECK_STRUCT_LEN check to AQ command structs and
added AQ definitions to i40evf for consistency based on Shannon's
feedback.

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
Signed-off-by: Kiran Patil <kiran.pa...@intel.com>
Signed-off-by: Jingjing Wu <jingjing...@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h  |  110 
 .../net/ethernet/intel/i40evf/i40e_adminq_cmd.h|  110 
 2 files changed, 216 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h 
b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
index e41050a..2e567c2 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
@@ -1371,14 +1371,16 @@ struct i40e_aqc_add_remove_cloud_filters {
 #define I40E_AQC_ADD_CLOUD_CMD_SEID_NUM_SHIFT  0
 #define I40E_AQC_ADD_CLOUD_CMD_SEID_NUM_MASK   (0x3FF << \
I40E_AQC_ADD_CLOUD_CMD_SEID_NUM_SHIFT)
-   u8  reserved2[4];
+   u8  big_buffer_flag;
+#define I40E_AQC_ADD_CLOUD_CMD_BB  1
+   u8  reserved2[3];
__le32  addr_high;
__le32  addr_low;
 };
 
 I40E_CHECK_CMD_LENGTH(i40e_aqc_add_remove_cloud_filters);
 
-struct i40e_aqc_add_remove_cloud_filters_element_data {
+struct i40e_aqc_cloud_filters_element_data {
u8  outer_mac[6];
u8  inner_mac[6];
__le16  inner_vlan;
@@ -1408,6 +1410,13 @@ struct i40e_aqc_add_remove_cloud_filters_element_data {
 #define I40E_AQC_ADD_CLOUD_FILTER_IMAC 0x000A
 #define I40E_AQC_ADD_CLOUD_FILTER_OMAC_TEN_ID_IMAC 0x000B
 #define I40E_AQC_ADD_CLOUD_FILTER_IIP  0x000C
+/* 0x0010 to 0x0017 is for custom filters */
+/* flag to be used when adding cloud filter: IP + L4 Port */
+#define I40E_AQC_ADD_CLOUD_FILTER_IP_PORT  0x0010
+/* flag to be used when adding cloud filter: Dest MAC + L4 Port */
+#define I40E_AQC_ADD_CLOUD_FILTER_MAC_PORT 0x0011
+/* flag to be used when adding cloud filter: Dest MAC + VLAN + L4 Port */
+#define I40E_AQC_ADD_CLOUD_FILTER_MAC_VLAN_PORT0x0012
 
 #define I40E_AQC_ADD_CLOUD_FLAGS_TO_QUEUE  0x0080
 #define I40E_AQC_ADD_CLOUD_VNK_SHIFT   6
@@ -1442,6 +1451,49 @@ struct i40e_aqc_add_remove_cloud_filters_element_data {
u8  response_reserved[7];
 };
 
+I40E_CHECK_STRUCT_LEN(0x40, i40e_aqc_cloud_filters_element_data);
+
+/* i40e_aqc_cloud_filters_element_bb is used when
+ * I40E_AQC_CLOUD_CMD_BB flag is set.
+ */
+struct i40e_aqc_cloud_filters_element_bb {
+   struct i40e_aqc_cloud_filters_element_data element;
+   u16 general_fields[32];
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X10_WORD0   0
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X10_WORD1   1
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X10_WORD2   2
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X11_WORD0   3
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X11_WORD1   4
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X11_WORD2   5
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X12_WORD0   6
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X12_WORD1   7
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X12_WORD2   8
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X13_WORD0   9
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X13_WORD1   10
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X13_WORD2   11
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X14_WORD0   12
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X14_WORD1   13
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X14_WORD2   14
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD0   15
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD1   16
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD2   17
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD3   18
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD4   19
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD5   20
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD6   21
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD7   22
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD0   23
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD1   24
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD2   25
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD3   26
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD4   27
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD5   28
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD6   29
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD7   30
+};
+
+I40E_CHECK_STRUCT_LEN(0x80, i40e_aqc_cloud_filters_element_bb);
+
 struct i40e_aqc_remove_cloud_filters_completion {
__le16 perfect_ovlan_used;
__le16 perfect_ovlan_free;
@@ -1453,6 +1505,60 @@ struct i40e_aqc_remove_cloud_filters_completion {
 
 I40E_CHECK_CMD_LENGTH(i40e_aqc_remove_cloud_filters_completion);
 
+/* Replace filter Command 0x025F
+ * uses the i40e_aqc_replace_cloud_filters,
+ * and the generic indirect completio

[RFC PATCH v3 2/7] sched: act_mirred: Traffic class option for mirror/redirect action

2017-09-13 Thread Amritha Nambiar
Adds optional traffic class parameter to the mirror/redirect action.
The mirror/redirect action is extended to forward to a traffic
class on the device if the traffic class index is provided in
addition to the device's ifindex.

Example:
# tc filter add dev eth0 protocol ip parent : prio 1 flower\
  dst_ip 192.168.1.1/32 ip_proto udp dst_port 22\
  skip_sw action mirred ingress redirect dev eth0 tclass 1

v2: Introduced is_tcf_mirred_tc() helper function to check if
the rule is supported in current offloaders. Removed the
additional definitions for max number of TCs and its bitmask
and replaced their usages with existing defines in linux/netdevice.h.

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c  |2 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c  |2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c|2 +-
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c |3 ++-
 .../net/ethernet/mellanox/mlxsw/spectrum_flower.c  |3 ++-
 drivers/net/ethernet/netronome/nfp/bpf/offload.c   |1 +
 drivers/net/ethernet/netronome/nfp/flower/action.c |4 ++--
 include/net/tc_act/tc_mirred.h |   16 
 include/uapi/linux/tc_act/tc_mirred.h  |3 +++
 net/dsa/slave.c|3 ++-
 net/sched/act_mirred.c |   15 +++
 11 files changed, 46 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c
index 48970ba..54a7004 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c
@@ -113,7 +113,7 @@ static int fill_action_fields(struct adapter *adap,
}
 
/* Re-direct to specified port in hardware. */
-   if (is_tcf_mirred_egress_redirect(a)) {
+   if (is_tcf_mirred_egress_redirect(a) && !is_tcf_mirred_tc(a)) {
struct net_device *n_dev;
unsigned int i, index;
bool found = false;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 3d3739f..b46d45d 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -8999,7 +8999,7 @@ static int parse_tc_actions(struct ixgbe_adapter *adapter,
}
 
/* Redirect to a VF or a offloaded macvlan */
-   if (is_tcf_mirred_egress_redirect(a)) {
+   if (is_tcf_mirred_egress_redirect(a) && !is_tcf_mirred_tc(a)) {
int ifindex = tcf_mirred_ifindex(a);
 
err = handle_redirect_action(adapter, ifindex, queue,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index da503e6..f2352a0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -1869,7 +1869,7 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, 
struct tcf_exts *exts,
return -EOPNOTSUPP;
}
 
-   if (is_tcf_mirred_egress_redirect(a)) {
+   if (is_tcf_mirred_egress_redirect(a) && !is_tcf_mirred_tc(a)) {
int ifindex = tcf_mirred_ifindex(a);
struct net_device *out_dev, *encap_dev = NULL;
struct mlx5e_priv *out_priv;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c 
b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index ed7cd6c..5ec56f4 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -1641,7 +1641,8 @@ static int mlxsw_sp_port_add_cls_matchall(struct 
mlxsw_sp_port *mlxsw_sp_port,
	tcf_exts_to_list(f->exts, &actions);
	a = list_first_entry(&actions, struct tc_action, list);
 
-   if (is_tcf_mirred_egress_mirror(a) && protocol == htons(ETH_P_ALL)) {
+   if (is_tcf_mirred_egress_mirror(a) && !is_tcf_mirred_tc(a) &&
+   protocol == htons(ETH_P_ALL)) {
struct mlxsw_sp_port_mall_mirror_tc_entry *mirror;
 
mall_tc_entry->type = MLXSW_SP_PORT_MALL_MIRROR;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c 
b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
index 8aace9a..88403a1 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
@@ -85,7 +85,8 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp 
*mlxsw_sp,
 
group_id = mlxsw_sp_acl_ruleset_group_id(ruleset);
mlxsw_sp_acl_rulei_act_jump(rulei, group_id);
-   } else if

[RFC, iproute2 PATCH v2] tc/mqprio: Offload mode and shaper options in mqprio

2017-09-07 Thread Amritha Nambiar
Adds new mqprio options for 'mode' and 'shaper'. The mode
option can take values for offload modes such as 'dcb' (default),
'channel' with the 'hw' option set to 1. The 'shaper' option is
to support HW shapers ('dcb' default) and takes the value
'bw_rlimit' for bandwidth rate limiting. The parameters to the
bw_rlimit shaper are minimum and maximum bandwidth rates.
New HW shapers in future can be supported through the shaper
attribute.

# tc qdisc add dev eth0 root mqprio num_tc 2  map 0 0 0 0 1 1 1 1\
  queues 4@0 4@4 hw 1 mode channel shaper bw_rlimit\
  min_rate 1Gbit 2Gbit max_rate 4Gbit 5Gbit

# tc qdisc show dev eth0

qdisc mqprio 804a: root  tc 2 map 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0
 queues:(0:3) (4:7)
 mode:channel
 shaper:bw_rlimit   min_rate:1Gbit 2Gbit   max_rate:4Gbit 5Gbit

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
---
 include/linux/pkt_sched.h |   32 
 tc/q_mqprio.c |  191 +++--
 2 files changed, 216 insertions(+), 7 deletions(-)

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index 099bf55..e95b5c9 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -625,6 +625,22 @@ enum {
 
 #define TC_MQPRIO_HW_OFFLOAD_MAX (__TC_MQPRIO_HW_OFFLOAD_MAX - 1)
 
+enum {
+   TC_MQPRIO_MODE_DCB,
+   TC_MQPRIO_MODE_CHANNEL,
+   __TC_MQPRIO_MODE_MAX
+};
+
+#define TC_MQPRIO_MODE_MAX (__TC_MQPRIO_MODE_MAX - 1)
+
+enum {
+   TC_MQPRIO_SHAPER_DCB,
+   TC_MQPRIO_SHAPER_BW_RATE,   /* Add new shapers below */
+   __TC_MQPRIO_SHAPER_MAX
+};
+
+#define TC_MQPRIO_SHAPER_MAX (__TC_MQPRIO_SHAPER_MAX - 1)
+
 struct tc_mqprio_qopt {
__u8num_tc;
__u8prio_tc_map[TC_QOPT_BITMASK + 1];
@@ -633,6 +649,22 @@ struct tc_mqprio_qopt {
__u16   offset[TC_QOPT_MAX_QUEUE];
 };
 
+#define TC_MQPRIO_F_MODE   0x1
+#define TC_MQPRIO_F_SHAPER 0x2
+#define TC_MQPRIO_F_MIN_RATE   0x4
+#define TC_MQPRIO_F_MAX_RATE   0x8
+
+enum {
+   TCA_MQPRIO_UNSPEC,
+   TCA_MQPRIO_MODE,
+   TCA_MQPRIO_SHAPER,
+   TCA_MQPRIO_MIN_RATE64,
+   TCA_MQPRIO_MAX_RATE64,
+   __TCA_MQPRIO_MAX,
+};
+
+#define TCA_MQPRIO_MAX (__TCA_MQPRIO_MAX - 1)
+
 /* SFB */
 
 enum {
diff --git a/tc/q_mqprio.c b/tc/q_mqprio.c
index d6718fb..5fec63d 100644
--- a/tc/q_mqprio.c
+++ b/tc/q_mqprio.c
@@ -27,6 +27,10 @@ static void explain(void)
fprintf(stderr, "Usage: ... mqprio [num_tc NUMBER] [map P0 P1 ...]\n");
fprintf(stderr, "  [queues count1@offset1 
count2@offset2 ...] ");
fprintf(stderr, "[hw 1|0]\n");
+   fprintf(stderr, "  [mode dcb|channel]\n");
+   fprintf(stderr, "  [shaper bw_rlimit SHAPER_PARAMS]\n"
+   "Where: SHAPER_PARAMS := { min_rate MIN_RATE1 MIN_RATE2 ...|\n"
+   "  max_rate MAX_RATE1 MAX_RATE2 ... 
}\n");
 }
 
 static int mqprio_parse_opt(struct qdisc_util *qu, int argc,
@@ -40,6 +44,12 @@ static int mqprio_parse_opt(struct qdisc_util *qu, int argc,
.count = { },
.offset = { },
};
+   __u64 min_rate64[TC_QOPT_MAX_QUEUE] = {0};
+   __u64 max_rate64[TC_QOPT_MAX_QUEUE] = {0};
+   __u16 shaper = TC_MQPRIO_SHAPER_DCB;
+   __u16 mode = TC_MQPRIO_MODE_DCB;
+   struct rtattr *tail;
+   __u32 flags = 0;
 
while (argc > 0) {
idx = 0;
@@ -92,6 +102,68 @@ static int mqprio_parse_opt(struct qdisc_util *qu, int argc,
return -1;
}
idx++;
+   } else if (opt.hw && strcmp(*argv, "mode") == 0) {
+   NEXT_ARG();
+   if (matches(*argv, "dcb") == 0) {
+   mode = TC_MQPRIO_MODE_DCB;
+   } else if (matches(*argv, "channel") == 0) {
+   mode = TC_MQPRIO_MODE_CHANNEL;
+   }  else {
+   fprintf(stderr, "Illegal mode (%s)\n",
+   *argv);
+   return -1;
+   }
+   if (mode != TC_MQPRIO_MODE_DCB)
+   flags |= TC_MQPRIO_F_MODE;
+   idx++;
+   } else if (opt.hw && strcmp(*argv, "shaper") == 0) {
+   NEXT_ARG();
+   if (matches(*argv, "dcb") == 0) {
+   shaper = TC_MQPRIO_SHAPER_DCB;
+   } else if (matches(*argv, "bw_rlimit") == 0) {
+   shaper = TC_MQPRIO_SHAPER_BW_RATE;
+

[RFC PATCH v3 6/6] i40e: Add support setting TC max bandwidth rates

2017-09-07 Thread Amritha Nambiar
This patch enables setting up maximum Tx rates for the traffic
classes in i40e. The maximum rate is offloaded to the hardware through
the mqprio framework by specifying the mode option as 'channel' and
shaper option as 'bw_rlimit' and is configured for the VSI. Configuring
minimum Tx rate limit is not supported in the device. The minimum
usable value for Tx rate is 50Mbps.

Example:
# tc qdisc add dev eth0 root mqprio num_tc 2  map 0 0 0 0 1 1 1 1\
  queues 4@0 4@4 hw 1 mode channel shaper bw_rlimit\
  max_rate 4Gbit 5Gbit

To dump the bandwidth rates:
# tc qdisc show dev eth0

qdisc mqprio 804a: root  tc 2 map 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0
 queues:(0:3) (4:7)
 mode:channel
 shaper:bw_rlimit   max_rate:4Gbit 5Gbit

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e.h  |2 +
 drivers/net/ethernet/intel/i40e/i40e_main.c |  101 +--
 2 files changed, 94 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index 2ee0197..02cae34 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -359,6 +359,8 @@ struct i40e_channel {
u8 enabled_tc;
struct i40e_aqc_vsi_properties_data info;
 
+   u64 max_tx_rate;
+
/* track this channel belongs to which VSI */
struct i40e_vsi *parent_vsi;
 };
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 7f13bc7..14a177b 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -5202,9 +5202,16 @@ static int i40e_vsi_configure_bw_alloc(struct i40e_vsi 
*vsi, u8 enabled_tc,
i40e_status ret;
int i;
 
-   if ((vsi->back->flags & I40E_FLAG_TC_MQPRIO) ||
-   !vsi->mqprio_qopt.qopt.hw)
+   if (vsi->back->flags & I40E_FLAG_TC_MQPRIO)
return 0;
+   if (!vsi->mqprio_qopt.qopt.hw) {
+   ret = i40e_set_bw_limit(vsi, vsi->seid, 0);
+   if (ret)
+   dev_info(&vsi->back->pdev->dev,
+"Failed to reset tx rate for vsi->seid %u\n",
+vsi->seid);
+   return ret;
+   }
bw_data.tc_valid_bits = enabled_tc;
for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++)
bw_data.tc_bw_credits[i] = bw_share[i];
@@ -5511,6 +5518,13 @@ static void i40e_remove_queue_channels(struct i40e_vsi 
*vsi)
rx_ring->ch = NULL;
}
 
+   /* Reset BW configured for this VSI via mqprio */
+   ret = i40e_set_bw_limit(vsi, ch->seid, 0);
+   if (ret)
+   dev_info(&vsi->back->pdev->dev,
+"Failed to reset tx rate for ch->seid %u\n",
+ch->seid);
+
/* delete VSI from FW */
ret = i40e_aq_delete_element(>back->hw, ch->seid,
 NULL);
@@ -6053,6 +6067,17 @@ int i40e_create_queue_channel(struct i40e_vsi *vsi,
 "Setup channel (id:%u) utilizing num_queues %d\n",
 ch->seid, ch->num_queue_pairs);
 
+   /* configure VSI for BW limit */
+   if (ch->max_tx_rate) {
+   if (i40e_set_bw_limit(vsi, ch->seid, ch->max_tx_rate))
+   return -EINVAL;
+
+   dev_dbg(>pdev->dev,
+   "Set tx rate of %llu Mbps (count of 50Mbps %llu) for 
vsi->seid %u\n",
+   ch->max_tx_rate,
+   ch->max_tx_rate / I40E_BW_CREDIT_DIVISOR, ch->seid);
+   }
+
/* in case of VF, this will be main SRIOV VSI */
ch->parent_vsi = vsi;
 
@@ -6088,6 +6113,12 @@ static int i40e_configure_queue_channels(struct i40e_vsi 
*vsi)
ch->base_queue =
vsi->tc_config.tc_info[i].qoffset;
 
+   /* Bandwidth limit through tc interface is in bytes/s,
+* change to Mbit/s
+*/
+   ch->max_tx_rate =
+   vsi->mqprio_qopt.max_rate[i] / (1000000 / 8);
+
list_add_tail(>list, >ch_list);
 
ret = i40e_create_queue_channel(vsi, ch);
@@ -6514,6 +6545,7 @@ void i40e_down(struct i40e_vsi *vsi)
 static int i40e_validate_mqprio_qopt(struct i40e_vsi *vsi,
 struct tc_mqprio_qopt_offload *mqprio_qopt)
 {
+   u64 sum_max_rate = 0;
int i;
 
if ((mqprio_qopt->qopt.offset[0] != 0) ||
@@ -6523,8 +6555,13 @@ static int i40e_validate_mqpr

[RFC PATCH v3 5/6] i40e: Refactor VF BW rate limiting

2017-09-07 Thread Amritha Nambiar
This patch refactors the BW rate limiting for Tx traffic
on the VF to be reused in the next patch for rate limiting Tx
traffic for the VSIs on the PF as well.

v3: Minor fixes, clean up log messages based on Shannon's comments.

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e.h |5 ++
 drivers/net/ethernet/intel/i40e/i40e_main.c|   64 
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c |   45 +-
 3 files changed, 71 insertions(+), 43 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index ddb7292..2ee0197 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -128,6 +128,10 @@
 /* default to trying for four seconds */
 #define I40E_TRY_LINK_TIMEOUT  (4 * HZ)
 
+/* BW rate limiting */
+#define I40E_BW_CREDIT_DIVISOR 50 /* 50Mbps per BW credit */
+#define I40E_MAX_BW_INACTIVE_ACCUM 4  /* accumulate 4 credits max */
+
 /* driver state flags */
 enum i40e_state_t {
__I40E_TESTING,
@@ -1042,4 +1046,5 @@ static inline bool i40e_enabled_xdp_vsi(struct i40e_vsi 
*vsi)
 }
 
 int i40e_create_queue_channel(struct i40e_vsi *vsi, struct i40e_channel *ch);
+int i40e_set_bw_limit(struct i40e_vsi *vsi, u16 seid, u64 max_tx_rate);
 #endif /* _I40E_H_ */
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index a953c2e..7f13bc7 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -5406,6 +5406,70 @@ static int i40e_vsi_config_tc(struct i40e_vsi *vsi, u8 
enabled_tc)
 }
 
 /**
+ * i40e_get_link_speed - Returns link speed for the interface
+ * @vsi: VSI to be configured
+ *
+ **/
+int i40e_get_link_speed(struct i40e_vsi *vsi)
+{
+   struct i40e_pf *pf = vsi->back;
+
+   switch (pf->hw.phy.link_info.link_speed) {
+   case I40E_LINK_SPEED_40GB:
+   return 40000;
+   case I40E_LINK_SPEED_25GB:
+   return 25000;
+   case I40E_LINK_SPEED_20GB:
+   return 20000;
+   case I40E_LINK_SPEED_10GB:
+   return 10000;
+   case I40E_LINK_SPEED_1GB:
+   return 1000;
+   default:
+   return -EINVAL;
+   }
+}
+
+/**
+ * i40e_set_bw_limit - setup BW limit for Tx traffic based on max_tx_rate
+ * @vsi: VSI to be configured
+ * @seid: seid of the channel/VSI
+ * @max_tx_rate: max TX rate to be configured as BW limit
+ *
+ * Helper function to set BW limit for a given VSI
+ **/
+int i40e_set_bw_limit(struct i40e_vsi *vsi, u16 seid, u64 max_tx_rate)
+{
+   struct i40e_pf *pf = vsi->back;
+   int speed = 0;
+   int ret = 0;
+
+   speed = i40e_get_link_speed(vsi);
+   if (max_tx_rate > speed) {
+   dev_err(&pf->pdev->dev,
+   "Invalid max tx rate %llu specified for VSI seid %d.",
+   max_tx_rate, seid);
+   return -EINVAL;
+   }
+   if (max_tx_rate && max_tx_rate < 50) {
+   dev_warn(&pf->pdev->dev,
+"Setting max tx rate to minimum usable value of 50Mbps.\n");
+   max_tx_rate = 50;
+   }
+
+   /* Tx rate credits are in values of 50Mbps, 0 is disabled */
+   ret = i40e_aq_config_vsi_bw_limit(&pf->hw, seid,
+ max_tx_rate / I40E_BW_CREDIT_DIVISOR,
+ I40E_MAX_BW_INACTIVE_ACCUM, NULL);
+   if (ret)
+   dev_err(&pf->pdev->dev,
+   "Failed set tx rate (%llu Mbps) for vsi->seid %u, err %s aq_err %s\n",
+   max_tx_rate, seid, i40e_stat_str(&pf->hw, ret),
+   i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status));
+   return ret;
+}
+
+/**
  * i40e_remove_queue_channels - Remove queue channels for the TCs
  * @vsi: VSI to be configured
  *
diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c 
b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
index ac88d96..06b2548 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -3115,8 +3115,6 @@ int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, 
int vf_id,
return ret;
 }
 
-#define I40E_BW_CREDIT_DIVISOR 50 /* 50Mbps per BW credit */
-#define I40E_MAX_BW_INACTIVE_ACCUM 4  /* device can accumulate 4 credits max */
 /**
  * i40e_ndo_set_vf_bw
  * @netdev: network interface device structure
@@ -3132,7 +3130,6 @@ int i40e_ndo_set_vf_bw(struct net_device *netdev, int 
vf_id, int min_tx_rate,
struct i40e_pf *pf = np->vsi->back;
struct i40e_vsi *vsi;
struct i40e_vf *vf;
-   int speed = 0;
int ret = 0;
 
/* validate the request */
@@ -3157,48 +3154,10 @@ int i40

[RFC PATCH v3 2/6] i40e: Add macro for PF reset bit

2017-09-07 Thread Amritha Nambiar
Introduce a macro for the bit setting the PF reset flag and
update its usages. This makes it easier to use this flag
in functions to be introduced in future without encountering
checkpatch issues related to alignment and line over 80
characters.

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e.h |2 ++
 drivers/net/ethernet/intel/i40e/i40e_debugfs.c |3 +--
 drivers/net/ethernet/intel/i40e/i40e_main.c|9 -
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c |5 ++---
 4 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index 18c453a..d414adc 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -157,6 +157,8 @@ enum i40e_state_t {
__I40E_STATE_SIZE__,
 };
 
+#define I40E_PF_RESET_FLAG BIT_ULL(__I40E_PF_RESET_REQUESTED)
+
 /* VSI state flags */
 enum i40e_vsi_state_t {
__I40E_VSI_DOWN,
diff --git a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c 
b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
index 6f2725f..2b8bbc8 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
@@ -798,8 +798,7 @@ static ssize_t i40e_dbg_command_write(struct file *filp,
 */
if (!(pf->flags & I40E_FLAG_VEB_MODE_ENABLED)) {
pf->flags |= I40E_FLAG_VEB_MODE_ENABLED;
-   i40e_do_reset_safe(pf,
-  BIT_ULL(__I40E_PF_RESET_REQUESTED));
+   i40e_do_reset_safe(pf, I40E_PF_RESET_FLAG);
}
 
vsi = i40e_vsi_setup(pf, I40E_VSI_VMDQ2, vsi_seid, 0);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 76b03f7..2fdb99f 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -5753,7 +5753,7 @@ int i40e_vsi_open(struct i40e_vsi *vsi)
 err_setup_tx:
i40e_vsi_free_tx_resources(vsi);
if (vsi == pf->vsi[pf->lan_vsi])
-   i40e_do_reset(pf, BIT_ULL(__I40E_PF_RESET_REQUESTED), true);
+   i40e_do_reset(pf, I40E_PF_RESET_FLAG, true);
 
return err;
 }
@@ -5881,7 +5881,7 @@ void i40e_do_reset(struct i40e_pf *pf, u32 reset_flags, 
bool lock_acquired)
wr32(>hw, I40E_GLGEN_RTRIG, val);
i40e_flush(>hw);
 
-   } else if (reset_flags & BIT_ULL(__I40E_PF_RESET_REQUESTED)) {
+   } else if (reset_flags & I40E_PF_RESET_FLAG) {
 
/* Request a PF Reset
 *
@@ -9229,7 +9229,7 @@ static int i40e_set_features(struct net_device *netdev,
need_reset = i40e_set_ntuple(pf, features);
 
if (need_reset)
-   i40e_do_reset(pf, BIT_ULL(__I40E_PF_RESET_REQUESTED), true);
+   i40e_do_reset(pf, I40E_PF_RESET_FLAG, true);
 
return 0;
 }
@@ -9481,8 +9481,7 @@ static int i40e_ndo_bridge_setlink(struct net_device *dev,
pf->flags |= I40E_FLAG_VEB_MODE_ENABLED;
else
pf->flags &= ~I40E_FLAG_VEB_MODE_ENABLED;
-   i40e_do_reset(pf, BIT_ULL(__I40E_PF_RESET_REQUESTED),
- true);
+   i40e_do_reset(pf, I40E_PF_RESET_FLAG, true);
break;
}
}
diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c 
b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
index 8bedc74c..ac88d96 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -1424,8 +1424,7 @@ int i40e_pci_sriov_configure(struct pci_dev *pdev, int 
num_vfs)
if (num_vfs) {
if (!(pf->flags & I40E_FLAG_VEB_MODE_ENABLED)) {
pf->flags |= I40E_FLAG_VEB_MODE_ENABLED;
-   i40e_do_reset_safe(pf,
-  BIT_ULL(__I40E_PF_RESET_REQUESTED));
+   i40e_do_reset_safe(pf, I40E_PF_RESET_FLAG);
}
return i40e_pci_sriov_enable(pdev, num_vfs);
}
@@ -1433,7 +1432,7 @@ int i40e_pci_sriov_configure(struct pci_dev *pdev, int 
num_vfs)
if (!pci_vfs_assigned(pf->pdev)) {
i40e_free_vfs(pf);
pf->flags &= ~I40E_FLAG_VEB_MODE_ENABLED;
-   i40e_do_reset_safe(pf, BIT_ULL(__I40E_PF_RESET_REQUESTED));
+   i40e_do_reset_safe(pf, I40E_PF_RESET_FLAG);
} else {
dev_warn(>dev, "Unable to free VFs because some are 
assigned to VMs.\n");
return -EINVAL;



[RFC PATCH v3 4/6] i40e: Enable 'channel' mode in mqprio for TC configs

2017-09-07 Thread Amritha Nambiar
The i40e driver is modified to enable the new mqprio hardware
offload mode and factor the TCs and queue configuration by
creating channel VSIs. In this mode, the priority to traffic
class mapping and the user specified queue ranges are used
to configure the traffic classes by setting the mode option to
'channel'.

Example:
# tc qdisc add dev eth0 root mqprio num_tc 4\
  map 0 0 0 0 1 2 2 3 queues 2@0 2@2 1@4 1@5\
  hw 1 mode channel

# tc qdisc show dev eth0

qdisc mqprio 8038: root  tc 4 map 0 0 0 0 1 2 2 3 0 0 0 0 0 0 0 0
 queues:(0:1) (2:3) (4:4) (5:5)
 mode:channel
 shaper:dcb

The HW channels created are removed and all the queue configuration
is set to default when the qdisc is detached from the root of the
device.

# tc qdisc del dev eth0 root

This patch also disables setting up channels via ethtool (ethtool -L)
when the TCs are configured using mqprio scheduler.

The patch also limits setting ethtool Rx flow hash indirection
(ethtool -X eth0 equal N) to max queues configured via mqprio.
The Rx flow hash indirection input through ethtool should be
validated so that it is within in the queue range configured via
tc/mqprio. The bound checking is achieved by reporting the current
rss size to the kernel when queues are configured via mqprio.

Example:
# tc qdisc add dev eth0 root mqprio num_tc 4\
  map 0 0 0 1 0 2 3 0 queues 2@0 4@2 8@6 11@14\
  hw 1 mode channel

# ethtool -X eth0 equal 12
Cannot set RX flow hash configuration: Invalid argument

v3: Changes to incorporate new mqprio mode option. Minor clean
up of setup_tc error handling based on Shannon's comments.
v2: Clean up __i40e_setup_tc() and i40e_setup_tc() to work
with Jiri's changes to the ndo_setup_tc interface which
now takes a type and the type_data for the offload.
No need to disable ATR in MQPRIO mode.

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e.h |3 
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c |8 
 drivers/net/ethernet/intel/i40e/i40e_main.c|  457 ++--
 3 files changed, 362 insertions(+), 106 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index 2401931..ddb7292 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -54,6 +54,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "i40e_type.h"
 #include "i40e_prototype.h"
 #include "i40e_client.h"
@@ -700,6 +701,7 @@ struct i40e_vsi {
enum i40e_vsi_type type;  /* VSI type, e.g., LAN, FCoE, etc */
s16 vf_id;  /* Virtual function ID for SRIOV VSIs */
 
+   struct tc_mqprio_qopt_offload mqprio_qopt; /* queue parameters */
struct i40e_tc_configuration tc_config;
struct i40e_aqc_vsi_properties_data info;
 
@@ -725,6 +727,7 @@ struct i40e_vsi {
u16 cnt_q_avail;/* num of queues available for channel usage */
u16 orig_rss_size;
u16 current_rss_size;
+   bool reconfig_rss;
 
u16 next_base_queue;/* next queue to be used for channel setup */
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 
b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index b531aa3..3fa90a6 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -2652,7 +2652,7 @@ static int i40e_get_rxnfc(struct net_device *netdev, 
struct ethtool_rxnfc *cmd,
 
switch (cmd->cmd) {
case ETHTOOL_GRXRINGS:
-   cmd->data = vsi->num_queue_pairs;
+   cmd->data = vsi->rss_size;
ret = 0;
break;
case ETHTOOL_GRXFH:
@@ -3897,6 +3897,12 @@ static int i40e_set_channels(struct net_device *dev,
if (vsi->type != I40E_VSI_MAIN)
return -EINVAL;
 
+   /* We do not support setting channels via ethtool when TCs are
+* configured through mqprio
+*/
+   if (pf->flags & I40E_FLAG_TC_MQPRIO)
+   return -EINVAL;
+
/* verify they are not requesting separate vectors */
if (!count || ch->rx_count || ch->tx_count)
return -EINVAL;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index a5b164d..a953c2e 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -1589,6 +1589,170 @@ static int i40e_set_mac(struct net_device *netdev, void 
*p)
 }
 
 /**
+ * i40e_config_rss_aq - Prepare for RSS using AQ commands
+ * @vsi: vsi structure
+ * @seed: RSS hash seed
+ **/
+static int i40e_config_rss_aq(struct i40e_vsi *vsi, const u8 *seed,
+ u8 *lut, u16 lut_size)
+{
+   struct i40e_pf *pf = vsi->back;
+   struct i40e_hw *hw = &pf->hw;
+   int ret = 0;
+
+   

[RFC PATCH v3 3/6] i40e: Add infrastructure for queue channel support

2017-09-07 Thread Amritha Nambiar
This patch sets up the infrastructure for offloading TCs and
queue configurations to the hardware by creating HW channels(VSI).
A new channel is created for each of the traffic class
configuration offloaded via mqprio framework except for the first TC
(TC0). TC0 for the main VSI is also reconfigured as per user provided
queue parameters. Queue counts that are not power-of-2 are handled by
reconfiguring RSS by reprogramming LUTs using the queue count value.
This patch also handles configuring the TX rings for the channels,
setting up the RX queue map for channel.

Also, the channels so created are removed and all the queue
configuration is set to default when the qdisc is detached from the
root of the device.

v3: Addressed Shannon's feedback removing unwanted code, variables,
atomic operations (since setup_tc is protected with rtnl_lock held
by stack) and other minor clean up.

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
Signed-off-by: Kiran Patil <kiran.pa...@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e.h  |   32 +
 drivers/net/ethernet/intel/i40e/i40e_main.c |  718 +++
 drivers/net/ethernet/intel/i40e/i40e_txrx.h |2 
 3 files changed, 743 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index d414adc..2401931 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -87,6 +87,7 @@
 #define I40E_AQ_LEN256
 #define I40E_AQ_WORK_LIMIT 66 /* max number of VFs + a little */
 #define I40E_MAX_USER_PRIORITY 8
+#define I40E_MAX_QUEUES_PER_CH 64
 #define I40E_DEFAULT_TRAFFIC_CLASS BIT(0)
 #define I40E_DEFAULT_MSG_ENABLE4
 #define I40E_QUEUE_WAIT_RETRY_LIMIT10
@@ -340,6 +341,23 @@ struct i40e_flex_pit {
u8 pit_index;
 };
 
+struct i40e_channel {
+   struct list_head list;
+   bool initialized;
+   u8 type;
+   u16 vsi_number; /* Assigned VSI number from AQ 'Add VSI' response */
+   u16 stat_counter_idx;
+   u16 base_queue;
+   u16 num_queue_pairs; /* Requested by user */
+   u16 seid;
+
+   u8 enabled_tc;
+   struct i40e_aqc_vsi_properties_data info;
+
+   /* track this channel belongs to which VSI */
+   struct i40e_vsi *parent_vsi;
+};
+
 /* struct that defines the Ethernet device */
 struct i40e_pf {
struct pci_dev *pdev;
@@ -456,6 +474,7 @@ struct i40e_pf {
 #define I40E_FLAG_CLIENT_RESET BIT(26)
 #define I40E_FLAG_LINK_DOWN_ON_CLOSE_ENABLED   BIT(27)
 #define I40E_FLAG_SOURCE_PRUNING_DISABLED  BIT(28)
+#define I40E_FLAG_TC_MQPRIOBIT(29)
 
struct i40e_client_instance *cinst;
bool stat_offsets_loaded;
@@ -536,6 +555,8 @@ struct i40e_pf {
u32 ioremap_len;
u32 fd_inv;
u16 phy_led_val;
+
+   u16 override_q_count;
 };
 
 /**
@@ -700,6 +721,15 @@ struct i40e_vsi {
bool current_isup;  /* Sync 'link up' logging */
enum i40e_aq_link_speed current_speed;  /* Sync link speed logging */
 
+   /* channel specific fields */
+   u16 cnt_q_avail;/* num of queues available for channel usage */
+   u16 orig_rss_size;
+   u16 current_rss_size;
+
+   u16 next_base_queue;/* next queue to be used for channel setup */
+
+   struct list_head ch_list;
+
void *priv; /* client driver data reference. */
 
/* VSI specific handlers */
@@ -1007,4 +1037,6 @@ static inline bool i40e_enabled_xdp_vsi(struct i40e_vsi 
*vsi)
 {
return !!vsi->xdp_prog;
 }
+
+int i40e_create_queue_channel(struct i40e_vsi *vsi, struct i40e_channel *ch);
 #endif /* _I40E_H_ */
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 2fdb99f..a5b164d 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -2882,7 +2882,7 @@ static void i40e_config_xps_tx_ring(struct i40e_ring 
*ring)
struct i40e_vsi *vsi = ring->vsi;
int cpu;
 
-   if (!ring->q_vector || !ring->netdev)
+   if (!ring->q_vector || !ring->netdev || ring->ch)
return;
 
if ((vsi->tc_config.numtc <= 1) &&
@@ -2949,7 +2949,14 @@ static int i40e_configure_tx_ring(struct i40e_ring *ring)
 * initialization. This has to be done regardless of
 * DCB as by default everything is mapped to TC0.
 */
-   tx_ctx.rdylist = le16_to_cpu(vsi->info.qs_handle[ring->dcb_tc]);
+
+   if (ring->ch)
+   tx_ctx.rdylist =
+   le16_to_cpu(ring->ch->info.qs_handle[ring->dcb_tc]);
+
+   else
+   tx_ctx.rdylist = le16_to_cpu(vsi->info.qs_handle[ring->dcb_tc]);
+
tx_ctx.rdylist_act = 0;
 
/* clear the context in the HMC */
@@ -2971,1

[RFC PATCH v3 0/6] Configuring traffic classes via new hardware offload mechanism in tc/mqprio

2017-09-07 Thread Amritha Nambiar
The following series introduces a new hardware offload mode in
tc/mqprio where the TCs, the queue configurations and
bandwidth rate limits are offloaded to the hardware. The existing
mqprio framework is extended to configure the queue counts and
layout and also added support for rate limiting. This is achieved
through new netlink attributes for the 'mode' option which takes
values such as 'dcb' (default) and 'channel' and a 'shaper' option
for QoS attributes such as bandwidth rate limits in hw mode 1.
Legacy devices can fall back to the existing setup supporting hw mode
1 without these additional options where only the TCs are offloaded
and then the 'mode' and 'shaper' options defaults to DCB support.
The i40e driver enables the new mqprio hardware offload mechanism
factoring the TCs, queue configuration and bandwidth rates by
creating HW channel VSIs.

In this new mode, the priority to traffic class mapping and the
user specified queue ranges are used to configure the traffic
class when the 'mode' option is set to 'channel'. This is achieved by
creating HW channels(VSI). A new channel is created for each of the
traffic class configuration offloaded via mqprio framework except for
the first TC (TC0) which is for the main VSI. TC0 for the main VSI is
also reconfigured as per user provided queue parameters. Finally,
bandwidth rate limits are set on these traffic classes through the
shaper attribute by sending these rates in addition to the number of
TCs and the queue configurations.

Example:
# tc qdisc add dev eth0 root mqprio num_tc 2 map 0 0 0 0 1 1 1 1\
  queues 4@0 4@4 hw 1 mode channel shaper bw_rlimit\
  min_rate 1Gbit 2Gbit max_rate 4Gbit 5Gbit

To dump the bandwidth rates:

# tc qdisc show dev eth0

qdisc mqprio 804a: root  tc 2 map 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0
 queues:(0:3) (4:7)
 mode:channel
 shaper:bw_rlimit   min_rate:1Gbit 2Gbit   max_rate:4Gbit 5Gbit

---

Amritha Nambiar (6):
  mqprio: Introduce new hardware offload mode and shaper in mqprio
  i40e: Add macro for PF reset bit
  i40e: Add infrastructure for queue channel support
  i40e: Enable 'channel' mode in mqprio for TC configs
  i40e: Refactor VF BW rate limiting
  i40e: Add support setting TC max bandwidth rates


 drivers/net/ethernet/intel/i40e/i40e.h |   44 +
 drivers/net/ethernet/intel/i40e/i40e_debugfs.c |3 
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c |8 
 drivers/net/ethernet/intel/i40e/i40e_main.c| 1463 +---
 drivers/net/ethernet/intel/i40e/i40e_txrx.h|2 
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c |   50 -
 include/net/pkt_cls.h  |9 
 include/uapi/linux/pkt_sched.h |   32 
 net/sched/sch_mqprio.c |  183 ++-
 9 files changed, 1551 insertions(+), 243 deletions(-)

--


[RFC PATCH v3 1/6] mqprio: Introduce new hardware offload mode and shaper in mqprio

2017-09-07 Thread Amritha Nambiar
The offload types currently supported in mqprio are 0 (no offload) and
1 (offload only TCs) by setting these values for the 'hw' option. If
offloads are supported by setting the 'hw' option to 1, the default
offload mode is 'dcb' where only the TC values are offloaded to the
device. This patch introduces a new hardware offload mode called
'channel' with 'hw' set to 1 in mqprio which makes full use of the
mqprio options, the TCs, the queue configurations and the QoS parameters
for the TCs. This is achieved through a new netlink attribute for the
'mode' option which takes values such as 'dcb' (default) and 'channel'.
The 'channel' mode also supports QoS attributes for traffic class such as
minimum and maximum values for bandwidth rate limits.

This patch enables configuring additional HW shaper attributes associated
with a traffic class. Currently the shaper for bandwidth rate limiting is
supported which takes options such as minimum and maximum bandwidth rates
and are offloaded to the hardware in the 'channel' mode. The min and max
limits for bandwidth rates are provided by the user along with the the TCs
and the queue configurations when creating the mqprio qdisc. The interface
can be extended to support new HW shapers in future through the 'shaper'
attribute.

Introduces a new datastructure 'tc_mqprio_qopt_offload' for offloading
mqprio queue options and use this to be shared between the kernel and
device driver. This contains a copy of the exisiting datastructure
for mqprio queue options. This new datastructure can be extended when
adding new attributes for traffic class such as mode, shaper, shaper
parameters (bandwidth rate limits). The existing datastructure for mqprio
queue options will be shared between the kernel and userspace.

Example:
# tc qdisc add dev eth0 root mqprio num_tc 2 map 0 0 0 0 1 1 1 1\
  queues 4@0 4@4 hw 1 mode channel shaper bw_rlimit\
  min_rate 1Gbit 2Gbit max_rate 4Gbit 5Gbit

To dump the bandwidth rates:

# tc qdisc show dev eth0

qdisc mqprio 804a: root  tc 2 map 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0
 queues:(0:3) (4:7)
 mode:channel
 shaper:bw_rlimit   min_rate:1Gbit 2Gbit   max_rate:4Gbit 5Gbit

v3 : Removed supporting new offloads through value 2 for 'hw' option,
introduced new netlink based options for offload mode and HW shaper.
v2 : Jiri's changes accepted upstream removes the struct
tc_to_netdev. Clean up the full offload related changes added to
mqprio_init() and mqprio_destroy() to rebase on these changes.

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
---
 include/net/pkt_cls.h  |9 ++
 include/uapi/linux/pkt_sched.h |   32 +++
 net/sched/sch_mqprio.c |  183 ++--
 3 files changed, 215 insertions(+), 9 deletions(-)

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index e80edd8..456017a 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -546,6 +546,15 @@ struct tc_cls_bpf_offload {
u32 gen_flags;
 };
 
+struct tc_mqprio_qopt_offload {
+   /* struct tc_mqprio_qopt must always be the first element */
+   struct tc_mqprio_qopt qopt;
+   u16 mode;
+   u16 shaper;
+   u32 flags;
+   u64 min_rate[TC_QOPT_MAX_QUEUE];
+   u64 max_rate[TC_QOPT_MAX_QUEUE];
+};
 
 /* This structure holds cookie structure that is passed from user
  * to the kernel for actions and classifiers
diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 099bf55..e95b5c9 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -625,6 +625,22 @@ enum {
 
 #define TC_MQPRIO_HW_OFFLOAD_MAX (__TC_MQPRIO_HW_OFFLOAD_MAX - 1)
 
+enum {
+   TC_MQPRIO_MODE_DCB,
+   TC_MQPRIO_MODE_CHANNEL,
+   __TC_MQPRIO_MODE_MAX
+};
+
+#define TC_MQPRIO_MODE_MAX (__TC_MQPRIO_MODE_MAX - 1)
+
+enum {
+   TC_MQPRIO_SHAPER_DCB,
+   TC_MQPRIO_SHAPER_BW_RATE,   /* Add new shapers below */
+   __TC_MQPRIO_SHAPER_MAX
+};
+
+#define TC_MQPRIO_SHAPER_MAX (__TC_MQPRIO_SHAPER_MAX - 1)
+
 struct tc_mqprio_qopt {
__u8num_tc;
__u8prio_tc_map[TC_QOPT_BITMASK + 1];
@@ -633,6 +649,22 @@ struct tc_mqprio_qopt {
__u16   offset[TC_QOPT_MAX_QUEUE];
 };
 
+#define TC_MQPRIO_F_MODE   0x1
+#define TC_MQPRIO_F_SHAPER 0x2
+#define TC_MQPRIO_F_MIN_RATE   0x4
+#define TC_MQPRIO_F_MAX_RATE   0x8
+
+enum {
+   TCA_MQPRIO_UNSPEC,
+   TCA_MQPRIO_MODE,
+   TCA_MQPRIO_SHAPER,
+   TCA_MQPRIO_MIN_RATE64,
+   TCA_MQPRIO_MAX_RATE64,
+   __TCA_MQPRIO_MAX,
+};
+
+#define TCA_MQPRIO_MAX (__TCA_MQPRIO_MAX - 1)
+
 /* SFB */
 
 enum {
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index 6bcdfe6..e5e7724 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -18,10 +18,16 @@
 #include 
 #include 
 #include 
+#include <net/pkt_cls.h>
 
 struct mqprio_sched {
struct Qdisc**qdiscs;
+   u1

[RFC, iproute2 PATCH v2] tc/mirred: Extend the mirred/redirect action to accept additional traffic class parameter

2017-08-17 Thread Amritha Nambiar
The Mirred/redirect action is extended to accept a traffic
class on the device in addition to the device's ifindex.

Usage: mirred

Example:
# tc qdisc add dev eth0 ingress

# tc filter add dev eth0 protocol ip parent ffff: prio 1 flower\
  dst_ip 192.168.1.1/32 ip_proto udp dst_port 22\
  skip_sw action mirred ingress redirect dev eth0 tclass 1

v2: Renamed the parameter 'tc' to 'tclass'. Replaced atoi with
strtoul and used NEXT_ARG() construct.

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
---
 include/linux/tc_act/tc_mirred.h |3 +++
 tc/m_mirred.c|   35 +++
 2 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/include/linux/tc_act/tc_mirred.h b/include/linux/tc_act/tc_mirred.h
index 3d7a2b3..ea06a47 100644
--- a/include/linux/tc_act/tc_mirred.h
+++ b/include/linux/tc_act/tc_mirred.h
@@ -9,6 +9,8 @@
 #define TCA_EGRESS_MIRROR 2 /* mirror packet to EGRESS */
 #define TCA_INGRESS_REDIR 3  /* packet redirect to INGRESS*/
 #define TCA_INGRESS_MIRROR 4 /* mirror packet to INGRESS */
+
+#define MIRRED_F_TCLASS 0x1

 
 struct tc_mirred {
tc_gen;
@@ -21,6 +23,7 @@ enum {
TCA_MIRRED_TM,
TCA_MIRRED_PARMS,
TCA_MIRRED_PAD,
+   TCA_MIRRED_TCLASS,
__TCA_MIRRED_MAX
 };
 #define TCA_MIRRED_MAX (__TCA_MIRRED_MAX - 1)
diff --git a/tc/m_mirred.c b/tc/m_mirred.c
index 2384bda..1cb477a 100644
--- a/tc/m_mirred.c
+++ b/tc/m_mirred.c
@@ -29,12 +29,13 @@
 static void
 explain(void)
 {
-   fprintf(stderr, "Usage: mirred <DIRECTION> <ACTION> [index INDEX] <dev DEVICENAME>\n");
+   fprintf(stderr, "Usage: mirred <DIRECTION> <ACTION> [index INDEX] <dev DEVICENAME> [tclass TCINDEX]\n");
	fprintf(stderr, "where:\n");
	fprintf(stderr, "\tDIRECTION := <ingress | egress>\n");
	fprintf(stderr, "\tACTION := <mirror | redirect>\n");
	fprintf(stderr, "\tINDEX  is the specific policy instance id\n");
	fprintf(stderr, "\tDEVICENAME is the devicename\n");
+   fprintf(stderr, "\tTCINDEX is the traffic class index\n");
 
 }
 
@@ -72,6 +73,9 @@ parse_direction(struct action_util *a, int *argc_p, char 
***argv_p,
struct tc_mirred p = {};
struct rtattr *tail;
char d[16] = {};
+   __u32 flags = 0;
+   char *end;
+   __u8 tc;
 
while (argc > 0) {
 
@@ -139,9 +143,23 @@ parse_direction(struct action_util *a, int *argc_p, char 
***argv_p,
duparg("dev", *argv);
 
strncpy(d, *argv, sizeof(d)-1);
-   argc--;
-   argv++;
-
+   NEXT_ARG_FWD();
+
+   if (argc > 0 && matches(*argv, "tclass") == 0) {
+   NEXT_ARG();
+   tc = strtoul(*argv, &end, 0);
+   if (*end) {
+   fprintf(stderr, "Illegal TC index\n");
+   return -1;
+   }
+   if (tc >= TC_QOPT_MAX_QUEUE) {
+   fprintf(stderr, "TC index exceeds max range\n");
+   return -1;
+   }
+   flags |= MIRRED_F_TCLASS;
+   ok++;
+   NEXT_ARG_FWD();
+   }
break;
 
}
@@ -193,6 +211,9 @@ parse_direction(struct action_util *a, int *argc_p, char 
***argv_p,
tail = NLMSG_TAIL(n);
addattr_l(n, MAX_MSG, tca_id, NULL, 0);
addattr_l(n, MAX_MSG, TCA_MIRRED_PARMS, &p, sizeof(p));
+   if (flags & MIRRED_F_TCLASS)
+   addattr_l(n, MAX_MSG, TCA_MIRRED_TCLASS,
+ &tc, sizeof(tc));
tail->rta_len = (void *) NLMSG_TAIL(n) - (void *) tail;
 
*argc_p = argc;
@@ -248,6 +269,7 @@ print_mirred(struct action_util *au, FILE * f, struct 
rtattr *arg)
struct tc_mirred *p;
struct rtattr *tb[TCA_MIRRED_MAX + 1];
const char *dev;
+   __u8 *tc;
 
if (arg == NULL)
return -1;
@@ -273,6 +295,11 @@ print_mirred(struct action_util *au, FILE * f, struct 
rtattr *arg)
fprintf(f, "mirred (%s to device %s)", mirred_n2a(p->eaction), dev);
print_action_control(f, " ", p->action, "");
 
+   if (tb[TCA_MIRRED_TCLASS]) {
+   tc = RTA_DATA(tb[TCA_MIRRED_TCLASS]);
+   fprintf(f, " tclass %u", *tc);
+   }
+
fprintf(f, "\n ");
fprintf(f, "\tindex %u ref %d bind %d", p->index, p->refcnt,
p->bindcnt);



[RFC PATCH v2 6/6] [net-next]net: i40e: Enable cloud filters in i40e via tc flower classifier

2017-08-17 Thread Amritha Nambiar
This patch enables tc-flower based hardware offloads. tc flower
filter provided by the kernel is configured as driver specific
cloud filter. The patch implements functions and admin queue
commands needed to support cloud filters in the driver and
adds cloud filters to configure these tc-flower filters.

The only action supported is to redirect packets to a traffic class
on the same device.

# tc qdisc add dev eth0 ingress
# ethtool -K eth0 hw-tc-offload on

# tc filter add dev eth0 protocol ip parent ffff:\
  prio 1 flower dst_mac 3c:fd:fe:a0:d6:70 skip_sw\
  action mirred ingress redirect dev eth0 tclass 0

# tc filter add dev eth0 protocol ip parent ffff:\
  prio 2 flower dst_ip 192.168.3.5/32\
  ip_proto udp dst_port 25 skip_sw\
  action mirred ingress redirect dev eth0 tclass 1

# tc filter add dev eth0 protocol ipv6 parent ffff:\
  prio 3 flower dst_ip fe8::200:1\
  ip_proto udp dst_port 66 skip_sw\
  action mirred ingress redirect dev eth0 tclass 2

Delete tc flower filter:
Example:

# tc filter del dev eth0 parent ffff: prio 3 handle 0x1 flower
# tc filter del dev eth0 parent ffff:

Flow Director Sideband is disabled while configuring cloud filters
via tc-flower and until any cloud filter exists.

Unsupported matches when cloud filters are added using enhanced
big buffer cloud filter mode of underlying switch include:
1. source port and source IP
2. Combined MAC address and IP fields.
3. Not specifying L4 port

These filter matches can however be used to redirect traffic to
the main VSI (tc 0) which does not require the enhanced big buffer
cloud filter support.

v2:
1. Moved I40E_SWITCH_MODE_MASK definition to i40e_type.h
2. Moved dev_info for add/deleting cloud filters in else condition
3. Fixed some format specifier in dev_err logs
4. Refactored i40e_get_capabilities to take an additional
   list_type parameter and use it to query device and function
   level capabilities.
5. Fixed parsing tc redirect action to check for the is_tcf_mirred_tc()
   to verify if redirect to a traffic class is supported.
6. Added comments for Geneve fix in cloud filter big buffer AQ
   function definitions.
7. Cleaned up setup_tc interface to rebase and work with Jiri's
   updates, separate function to process tc cls flower offloads.
8. Changes to make Flow Director Sideband and Cloud filters mutually
   exclusive.

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
Signed-off-by: Kiran Patil <kiran.pa...@intel.com>
Signed-off-by: Anjali Singhai Jain <anjali.sing...@intel.com>
Signed-off-by: Jingjing Wu <jingjing...@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e.h   |   46 +
 drivers/net/ethernet/intel/i40e/i40e_common.c|  190 
 drivers/net/ethernet/intel/i40e/i40e_main.c  |  975 +-
 drivers/net/ethernet/intel/i40e/i40e_prototype.h |   17 
 drivers/net/ethernet/intel/i40e/i40e_type.h  |1 
 5 files changed, 1202 insertions(+), 27 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index ac57ab0..dbf4b9d 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -55,6 +55,8 @@
 #include 
 #include 
 #include 
+#include <net/pkt_cls.h>
+#include <net/tc_act/tc_mirred.h>
 #include "i40e_type.h"
 #include "i40e_prototype.h"
 #include "i40e_client.h"
@@ -252,9 +254,49 @@ struct i40e_fdir_filter {
u32 fd_id;
 };
 
+#define I40E_CLOUD_FIELD_OMAC  0x01
+#define I40E_CLOUD_FIELD_IMAC  0x02
+#define I40E_CLOUD_FIELD_IVLAN 0x04
+#define I40E_CLOUD_FIELD_TEN_ID 0x08
+#define I40E_CLOUD_FIELD_IIP   0x10
+
+#define I40E_CLOUD_FILTER_FLAGS_OMAC   I40E_CLOUD_FIELD_OMAC
+#define I40E_CLOUD_FILTER_FLAGS_IMAC   I40E_CLOUD_FIELD_IMAC
+#define I40E_CLOUD_FILTER_FLAGS_IMAC_IVLAN (I40E_CLOUD_FIELD_IMAC | \
+I40E_CLOUD_FIELD_IVLAN)
+#define I40E_CLOUD_FILTER_FLAGS_IMAC_TEN_ID(I40E_CLOUD_FIELD_IMAC | \
+I40E_CLOUD_FIELD_TEN_ID)
+#define I40E_CLOUD_FILTER_FLAGS_OMAC_TEN_ID_IMAC (I40E_CLOUD_FIELD_OMAC | \
+ I40E_CLOUD_FIELD_IMAC | \
+ I40E_CLOUD_FIELD_TEN_ID)
+#define I40E_CLOUD_FILTER_FLAGS_IMAC_IVLAN_TEN_ID (I40E_CLOUD_FIELD_IMAC | \
+  I40E_CLOUD_FIELD_IVLAN | \
+  I40E_CLOUD_FIELD_TEN_ID)
+#define I40E_CLOUD_FILTER_FLAGS_IIP I40E_CLOUD_FIELD_IIP
+
 struct i40e_cloud_filter {
struct hlist_node cloud_node;
unsigned long cookie;
+   /* cloud filter input set follows */
+   u8 dst_mac[ETH_ALEN];
+   u8 src_mac[ETH_ALEN];
+   __be16 vlan_id;
+   __be32 dst_ip[4];
+   __be32 src_ip[4];
+   u8 dst_ipv6[16];
+   u8 src_ipv6[16];
+   __be16 dst_port;
+   __be16 src_port;
+   bool is_ipv6;   /* IPv6 based

[RFC PATCH v2 3/6] [net-next]net: i40e: Extend set switch config command to accept cloud filter mode

2017-08-17 Thread Amritha Nambiar
Add definitions for L4 filters and switch modes based on cloud filters
modes and extend the set switch config command to include the
additional cloud filter mode.

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
Signed-off-by: Kiran Patil <kiran.pa...@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h |   31 -
 drivers/net/ethernet/intel/i40e/i40e_common.c |4 ++-
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c|2 +
 drivers/net/ethernet/intel/i40e/i40e_main.c   |2 +
 drivers/net/ethernet/intel/i40e/i40e_prototype.h  |2 +
 drivers/net/ethernet/intel/i40e/i40e_type.h   |9 ++
 6 files changed, 45 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h 
b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
index e2a9ec8..eac5ff5 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
@@ -773,7 +773,36 @@ struct i40e_aqc_set_switch_config {
 #define I40E_AQ_SET_SWITCH_CFG_PROMISC 0x0001
 #define I40E_AQ_SET_SWITCH_CFG_L2_FILTER   0x0002
__le16  valid_flags;
-   u8  reserved[12];
+   u8  rsvd6[6];
+   /* Next byte is split into following:
+* Bit 7 : 0: No action, 1: Switch to mode defined by bits 6:0
+* Bit 6: 0 : Destination Port, 1: source port
+* Bit 5..4: L4 type
+* 0: rsvd
+* 1: TCP
+* 2: UDP
+* 3: Both TCP and UDP
+* Bits 3:0 Mode
+* 0: default mode
+* 1: L4 port only mode
+* 2: non-tunneled mode
+* 3: tunneled mode
+*/
+#define I40E_AQ_SET_SWITCH_BIT7_VALID  0x80
+
+#define I40E_AQ_SET_SWITCH_L4_SRC_PORT 0x40
+
+#define I40E_AQ_SET_SWITCH_L4_TYPE_RSVD    0x00
+#define I40E_AQ_SET_SWITCH_L4_TYPE_TCP 0x10
+#define I40E_AQ_SET_SWITCH_L4_TYPE_UDP 0x20
+#define I40E_AQ_SET_SWITCH_L4_TYPE_BOTH    0x30
+
+#define I40E_AQ_SET_SWITCH_MODE_DEFAULT    0x00
+#define I40E_AQ_SET_SWITCH_MODE_L4_PORT    0x01
+#define I40E_AQ_SET_SWITCH_MODE_NON_TUNNEL 0x02
+#define I40E_AQ_SET_SWITCH_MODE_TUNNEL     0x03
+   u8  mode;
+   u8  rsvd5[5];
 };
 
 I40E_CHECK_CMD_LENGTH(i40e_aqc_set_switch_config);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c 
b/drivers/net/ethernet/intel/i40e/i40e_common.c
index e4e86e0..d0e8138 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_common.c
@@ -2380,13 +2380,14 @@ i40e_status i40e_aq_get_switch_config(struct i40e_hw 
*hw,
  * @hw: pointer to the hardware structure
  * @flags: bit flag values to set
  * @valid_flags: which bit flags to set
+ * @mode: cloud filter mode
  * @cmd_details: pointer to command details structure or NULL
  *
  * Set switch configuration bits
  **/
 enum i40e_status_code i40e_aq_set_switch_config(struct i40e_hw *hw,
u16 flags,
-   u16 valid_flags,
+   u16 valid_flags, u8 mode,
struct i40e_asq_cmd_details *cmd_details)
 {
struct i40e_aq_desc desc;
@@ -2398,6 +2399,7 @@ enum i40e_status_code i40e_aq_set_switch_config(struct 
i40e_hw *hw,
  i40e_aqc_opc_set_switch_config);
scfg->flags = cpu_to_le16(flags);
scfg->valid_flags = cpu_to_le16(valid_flags);
+   scfg->mode = mode;
 
status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 
b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 326fc18..232e066e 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -4181,7 +4181,7 @@ static int i40e_set_priv_flags(struct net_device *dev, 
u32 flags)
sw_flags = I40E_AQ_SET_SWITCH_CFG_PROMISC;
valid_flags = I40E_AQ_SET_SWITCH_CFG_PROMISC;
ret = i40e_aq_set_switch_config(&pf->hw, sw_flags, valid_flags,
-   NULL);
+   0, NULL);
if (ret && pf->hw.aq.asq_last_status != I40E_AQ_RC_ESRCH) {
dev_info(>pdev->dev,
 "couldn't set switch config bits, err %s 
aq_err %s\n",
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index b02da99..e53d1be 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -12146,7 +12146,7 @@ static int i40e_setup_pf_switch(struct i40e_pf *pf, 
bool reinit)
u16 valid_flags;
 
valid_flags = I40E_AQ_SET_SWITCH_CFG_PROMI

[RFC PATCH v2 4/6] [net-next]net: i40e: Admin queue definitions for cloud filters

2017-08-17 Thread Amritha Nambiar
Add new admin queue definitions and extended fields for cloud
filter support. Define big buffer for extended general fields
in Add/Remove Cloud filters command.

v2: Added I40E_CHECK_STRUCT_LEN check to AQ command structs and
added AQ definitions to i40evf for consistency based on Shannon's
feedback.

Signed-off-by: Amritha Nambiar <amritha.namb...@intel.com>
Signed-off-by: Kiran Patil <kiran.pa...@intel.com>
Signed-off-by: Jingjing Wu <jingjing...@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h  |  108 
 .../net/ethernet/intel/i40evf/i40e_adminq_cmd.h|  108 
 2 files changed, 214 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h 
b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
index eac5ff5..1f5 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
@@ -1355,7 +1355,9 @@ struct i40e_aqc_add_remove_cloud_filters {
 #define I40E_AQC_ADD_CLOUD_CMD_SEID_NUM_SHIFT  0
 #define I40E_AQC_ADD_CLOUD_CMD_SEID_NUM_MASK   (0x3FF << \
I40E_AQC_ADD_CLOUD_CMD_SEID_NUM_SHIFT)
-   u8  reserved2[4];
+   u8  big_buffer_flag;
+#define I40E_AQC_ADD_REM_CLOUD_CMD_BIG_BUFFER  1
+   u8  reserved2[3];
__le32  addr_high;
__le32  addr_low;
 };
@@ -1392,6 +1394,13 @@ struct i40e_aqc_add_remove_cloud_filters_element_data {
 #define I40E_AQC_ADD_CLOUD_FILTER_IMAC 0x000A
 #define I40E_AQC_ADD_CLOUD_FILTER_OMAC_TEN_ID_IMAC 0x000B
 #define I40E_AQC_ADD_CLOUD_FILTER_IIP  0x000C
+/* 0x0010 to 0x0017 is for custom filters */
+/* flag to be used when adding cloud filter: IP + L4 Port */
+#define I40E_AQC_ADD_CLOUD_FILTER_IP_PORT  0x0010
+/* flag to be used when adding cloud filter: Dest MAC + L4 Port */
+#define I40E_AQC_ADD_CLOUD_FILTER_MAC_PORT 0x0011
+/* flag to be used when adding cloud filter: Dest MAC + VLAN + L4 Port */
+#define I40E_AQC_ADD_CLOUD_FILTER_MAC_VLAN_PORT0x0012
 
 #define I40E_AQC_ADD_CLOUD_FLAGS_TO_QUEUE  0x0080
 #define I40E_AQC_ADD_CLOUD_VNK_SHIFT   6
@@ -1426,6 +1435,49 @@ struct i40e_aqc_add_remove_cloud_filters_element_data {
u8  response_reserved[7];
 };
 
+I40E_CHECK_STRUCT_LEN(0x40, i40e_aqc_add_remove_cloud_filters_element_data);
+
+/* i40e_aqc_add_remove_cloud_filters_element_big_data is used when
+ * I40E_AQC_ADD_REM_CLOUD_CMD_BIG_BUFFER flag is set.
+ */
+struct i40e_aqc_add_remove_cloud_filters_element_big_data {
+   struct i40e_aqc_add_remove_cloud_filters_element_data element;
+   u16 general_fields[32];
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X10_WORD0   0
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X10_WORD1   1
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X10_WORD2   2
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X11_WORD0   3
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X11_WORD1   4
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X11_WORD2   5
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X12_WORD0   6
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X12_WORD1   7
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X12_WORD2   8
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X13_WORD0   9
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X13_WORD1   10
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X13_WORD2   11
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X14_WORD0   12
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X14_WORD1   13
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X14_WORD2   14
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD0   15
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD1   16
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD2   17
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD3   18
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD4   19
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD5   20
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD6   21
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD7   22
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD0   23
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD1   24
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD2   25
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD3   26
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD4   27
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD5   28
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD6   29
+#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD7   30
+};
+
+I40E_CHECK_STRUCT_LEN(0x80, 
i40e_aqc_add_remove_cloud_filters_element_big_data);
+
 struct i40e_aqc_remove_cloud_filters_completion {
__le16 perfect_ovlan_used;
__le16 perfect_ovlan_free;
@@ -1437,6 +1489,60 @@ struct i40e_aqc_remove_cloud_filters_completion {
 
 I40E_CHECK_CMD_LENGTH(i40e_aqc_remove_cloud_filters_completion);
 
+/* Replace filter Command 0x025F
+ * uses the i40e_aqc_replace_cloud_filters,
+ * and the generic indirect completion structure
+ */
+struct i40e_filter_data {
+   u8 filter_type;
+   u8 input[3];
+};
+
+I40E_CHECK_STRUCT_LEN(4, i40e_filter_data);
+
+struct i40e_aqc_replace_cloud_filters_cmd {
+  

  1   2   >