Re: Fwd: Re: Kernel panic with 4.16-rc1 (and 4.16-rc2) running selftest

2018-02-26 Thread Chris Mi

Hi Matthew,

Sorry for the late response. I'll add the idr test cases for the new 
APIs ASAP.


Thanks,
Chris

On 2/24/2018 10:46 AM, Matthew Wilcox wrote:

On Sat, Feb 24, 2018 at 01:49:35AM +0000, Chris Mi wrote:

To verify this patch, the following is a sanity test case:

# tc qdisc delete dev $link ingress > /dev/null 2>&1;
# tc qdisc add dev $link ingress;
# tc filter add dev $link prio 1 protocol ip handle 0x80000001 parent ffff: 
flower skip_hw src_mac e4:11:0:0:0:2 dst_mac e4:12:0:0:0:2 action drop;
# tc filter show dev $link parent ffff:

filter pref 1 flower chain 0
filter pref 1 flower chain 0 handle 0x80000001

I added these tests to my local tree for now.

diff --git a/tools/testing/radix-tree/idr-test.c 
b/tools/testing/radix-tree/idr-test.c
index 44ef9eba5a7a..28d99325a32d 100644
--- a/tools/testing/radix-tree/idr-test.c
+++ b/tools/testing/radix-tree/idr-test.c
@@ -178,6 +178,29 @@ void idr_get_next_test(int base)
idr_destroy();
  }
  
+void idr_u32_test(struct idr *idr, int base)

+{
+   assert(idr_is_empty(idr));
+   idr_init_base(idr, base);
+   u32 handle = 10;
+   idr_alloc_u32(idr, NULL, , handle, GFP_KERNEL);
+   BUG_ON(handle != 10);
+   idr_remove(idr, handle);
+   assert(idr_is_empty(idr));
+
+   handle = 0x8001;
+   idr_alloc_u32(idr, NULL, , handle, GFP_KERNEL);
+   BUG_ON(handle != 0x8001);
+   idr_remove(idr, handle);
+   assert(idr_is_empty(idr));
+
+   handle = 0xffe0;
+   idr_alloc_u32(idr, NULL, , handle, GFP_KERNEL);
+   BUG_ON(handle != 0xffe0);
+   idr_remove(idr, handle);
+   assert(idr_is_empty(idr));
+}
+
  void idr_checks(void)
  {
unsigned long i;
@@ -248,6 +271,9 @@ void idr_checks(void)
idr_get_next_test(0);
idr_get_next_test(1);
idr_get_next_test(4);
+   idr_u32_test(, 0);
+   idr_u32_test(, 1);
+   idr_u32_test(, 4);
  }
  
  /*




RE: Fwd: Re: Kernel panic with 4.16-rc1 (and 4.16-rc2) running selftest

2018-02-23 Thread Chris Mi
> -----Original Message-----
> From: Matthew Wilcox [mailto:wi...@infradead.org]
> Sent: Saturday, February 24, 2018 9:15 AM
> To: Cong Wang <xiyou.wangc...@gmail.com>; Khalid Aziz
> <khalid.a...@oracle.com>; linux-ker...@vger.kernel.org;
> netdev@vger.kernel.org
> Cc: Chris Mi <chr...@mellanox.com>
> Subject: Re: Fwd: Re: Kernel panic with 4.16-rc1 (and 4.16-rc2) running
> selftest
> 
> On Fri, Feb 23, 2018 Randy Dunlap wrote:
> > [add Matthew Wilcox; hopefully he can look/see]
> 
> Thanks, Randy.  I don't understand why nobody else thought to cc the author
> of the patch that it was bisected to ...
> 
> > On 02/23/2018 04:13 PM, Cong Wang wrote:
> > > On Fri, Feb 23, 2018 at 3:27 PM, Cong Wang
> > > <xiyou.wangc...@gmail.com>
> > wrote:
> > >> On Fri, Feb 23, 2018 at 11:00 AM, Randy Dunlap
> > >> <rdun...@infradead.org>
> > wrote:
> > >>> On 02/23/2018 08:05 AM, Khalid Aziz wrote:
> > >>>> Same selftest does not cause panic on 4.15. git bisect pointed to
> > commit 6ce711f2750031d12cec91384ac5cfa0a485b60a ("idr: Make 1-based
> > IDRs more efficient").
> > >>>> Kernel config is attached.
> > >>
> > >> Looks like something horribly wrong with u32 key id idr...
> > >
> > > Adding a few printk's, I got:
> > >
> > > [   31.231560] requested handle = ffe00000
> > > [   31.232426] allocated handle = 0
> > > ...
> > > [   31.246475] requested handle = ffd00000
> > > [   31.247555] allocated handle = 1
> > >
> > >
> > > So the bug is here where we can't allocate a specific handle:
> > >
> > > err = idr_alloc_u32(&tp_c->handle_idr, ht,
> > &handle,
> > > handle, GFP_KERNEL);
> > > if (err) {
> > > kfree(ht);
> > > return err;
> > > }
> 
> Please try this patch.  It fixes ffe00000, but there may be more things tested
> that it may not work for.
> 
> Chris Mi, what happened to that set of testcases you promised to write for
> me?
I promised to write it after the API is stabilized since you were going to 
change it.
I will inform the management about this new task and get back to you later.
> 
> diff --git a/lib/idr.c b/lib/idr.c
> index c98d77fcf393..10d9b8d47c33 100644
> --- a/lib/idr.c
> +++ b/lib/idr.c
> @@ -36,8 +36,8 @@ int idr_alloc_u32(struct idr *idr, void *ptr, u32 *nextid,  
> {
>   struct radix_tree_iter iter;
>   void __rcu **slot;
> - int base = idr->idr_base;
> - int id = *nextid;
> + unsigned int base = idr->idr_base;
> + unsigned int id = *nextid;
> 
>   if (WARN_ON_ONCE(radix_tree_is_internal_node(ptr)))
>   return -EINVAL;
To verify this patch, the following is a sanity test case:

# tc qdisc delete dev $link ingress > /dev/null 2>&1;
# tc qdisc add dev $link ingress;
# tc filter add dev $link prio 1 protocol ip handle 0x80000001 parent ffff: 
flower skip_hw src_mac e4:11:0:0:0:2 dst_mac e4:12:0:0:0:2 action drop;
# tc filter show dev $link parent ffff:

filter pref 1 flower chain 0
filter pref 1 flower chain 0 handle 0x80000001
  dst_mac e4:12:00:00:00:02
  src_mac e4:11:00:00:00:02
  eth_type ipv4
  skip_hw
  not_in_hw
action order 1: gact action drop
 random type none pass val 0
 index 1 ref 1 bind 1

Please make sure the handle is the same as the user specifies.


RE: [patch iproute2 v10 0/2] tc: Add batchsize feature to batch mode

2018-01-15 Thread Chris Mi
> -----Original Message-----
> From: David Ahern [mailto:dsah...@gmail.com]
> Sent: Tuesday, January 16, 2018 12:41 AM
> To: Chris Mi <chr...@mellanox.com>; netdev@vger.kernel.org
> Cc: gerlitz...@gmail.com; step...@networkplumber.org;
> marcelo.leit...@gmail.com; p...@nwl.cc
> Subject: Re: [patch iproute2 v10 0/2] tc: Add batchsize feature to batch mode
> 
> On 1/11/18 10:13 PM, Chris Mi wrote:
> > Currently in tc batch mode, only one command is read from the batch
> > file and sent to kernel to process. With this patchset, at most 128
> > commands can be accumulated before sending to kernel.
> >
> > We introduced a new function in patch 1 to support sending
> > multiple messages. In patch 2, we add this support for filter
> > add/delete/change/replace and actions add/change/replace commands.
> >
> > But please note that kernel still processes the requests one by one.
> > To process the requests in parallel in kernel is another effort.
> > The time we're saving in this patchset is the user mode and kernel
> > mode context switch. So this patchset works on top of the current kernel.
> >
> > Using the following script in kernel, we can generate 1,000,000 rules.
> > tools/testing/selftests/tc-testing/tdc_batch.py
> >
> > Without this patchset, 'tc -b $file' execution time is:
> >
> > real    0m15.555s
> > user    0m7.211s
> > sys     0m8.284s
> >
> > With this patchset, 'tc -b $file' execution time is:
> >
> > real    0m12.360s
> > user    0m6.082s
> > sys     0m6.213s
> >
> > The insertion rate is improved more than 10%.
> 
> LGTM. Applied to iproute2-next.
Thank you, David.  And thanks for your careful review, which improved the
code quality and design very much.

-Chris


[patch iproute2 v10 1/2] lib/libnetlink: Add a new function rtnl_talk_iov

2018-01-11 Thread Chris Mi
rtnl_talk can only send a single message to kernel. Add a new function
rtnl_talk_iov that can send multiple messages to kernel.
rtnl_talk_iov takes struct iovec * and iovlen as arguments.

Signed-off-by: Chris Mi <chr...@mellanox.com>
Signed-off-by: David Ahern <dsah...@gmail.com>
---
 include/libnetlink.h |  3 +++
 lib/libnetlink.c | 65 +---
 2 files changed, 49 insertions(+), 19 deletions(-)

diff --git a/include/libnetlink.h b/include/libnetlink.h
index a4d83b9e..d6322190 100644
--- a/include/libnetlink.h
+++ b/include/libnetlink.h
@@ -96,6 +96,9 @@ int rtnl_dump_filter_nc(struct rtnl_handle *rth,
 int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
  struct nlmsghdr **answer)
__attribute__((warn_unused_result));
+int rtnl_talk_iov(struct rtnl_handle *rtnl, struct iovec *iovec, size_t iovlen,
+ struct nlmsghdr **answer)
+   __attribute__((warn_unused_result));
 int rtnl_talk_extack(struct rtnl_handle *rtnl, struct nlmsghdr *n,
  struct nlmsghdr **answer, nl_ext_ack_fn_t errfn)
__attribute__((warn_unused_result));
diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index 00e6ce0c..7ca47b22 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -581,30 +581,30 @@ static void rtnl_talk_error(struct nlmsghdr *h, struct 
nlmsgerr *err,
strerror(-err->error));
 }
 
-static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
-  struct nlmsghdr **answer,
-  bool show_rtnl_err, nl_ext_ack_fn_t errfn)
+
+static int __rtnl_talk_iov(struct rtnl_handle *rtnl, struct iovec *iov,
+  size_t iovlen, struct nlmsghdr **answer,
+  bool show_rtnl_err, nl_ext_ack_fn_t errfn)
 {
-   int status;
-   unsigned int seq;
-   struct nlmsghdr *h;
struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
-   struct iovec iov = {
-   .iov_base = n,
-   .iov_len = n->nlmsg_len
-   };
+   struct iovec riov;
struct msghdr msg = {
.msg_name = ,
.msg_namelen = sizeof(nladdr),
-   .msg_iov = ,
-   .msg_iovlen = 1,
+   .msg_iov = iov,
+   .msg_iovlen = iovlen,
};
+   unsigned int seq = 0;
+   struct nlmsghdr *h;
+   int i, status;
char *buf;
 
-   n->nlmsg_seq = seq = ++rtnl->seq;
-
-   if (answer == NULL)
-   n->nlmsg_flags |= NLM_F_ACK;
+   for (i = 0; i < iovlen; i++) {
+   h = iov[i].iov_base;
+   h->nlmsg_seq = seq = ++rtnl->seq;
+   if (answer == NULL)
+   h->nlmsg_flags |= NLM_F_ACK;
+   }
 
status = sendmsg(rtnl->fd, , 0);
if (status < 0) {
@@ -612,8 +612,14 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct 
nlmsghdr *n,
return -1;
}
 
+   /* change msg to use the response iov */
+   msg.msg_iov = 
+   msg.msg_iovlen = 1;
+   i = 0;
while (1) {
+next:
status = rtnl_recvmsg(rtnl->fd, , );
+   ++i;
 
if (status < 0)
return status;
@@ -642,7 +648,7 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct 
nlmsghdr *n,
 
if (nladdr.nl_pid != 0 ||
h->nlmsg_pid != rtnl->local.nl_pid ||
-   h->nlmsg_seq != seq) {
+   h->nlmsg_seq > seq || h->nlmsg_seq < seq - iovlen) {
/* Don't forget to skip that message. */
status -= NLMSG_ALIGN(len);
h = (struct nlmsghdr *)((char *)h + 
NLMSG_ALIGN(len));
@@ -662,7 +668,10 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct 
nlmsghdr *n,
*answer = (struct nlmsghdr 
*)buf;
else
free(buf);
-   return 0;
+   if (h->nlmsg_seq == seq)
+   return 0;
+   else
+   goto next;
}
 
if (rtnl->proto != NETLINK_SOCK_DIAG &&
@@ -671,7 +680,7 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct 
nlmsghdr *n,
 
errno = -err->error;
free(buf);
-   return -1;
+   return -i;
}
 
if (answer) {
@@ -698,12 +707,30 @@ static int __rtnl_talk(struct rtnl_handle *r

[patch iproute2 v10 2/2] tc: Add batchsize feature for filter and actions

2018-01-11 Thread Chris Mi
Currently in tc batch mode, only one command is read from the batch
file and sent to kernel to process. With this support, at most 128
commands can be accumulated before sending to kernel.

Now it only works for the following successive commands:
1. filter add/delete/change/replace
2. actions add/change/replace

Signed-off-by: Chris Mi <chr...@mellanox.com>
---
 tc/m_action.c  |  65 ---
 tc/tc.c| 199 -
 tc/tc_common.h |   5 +-
 tc/tc_filter.c | 104 ++
 4 files changed, 294 insertions(+), 79 deletions(-)

diff --git a/tc/m_action.c b/tc/m_action.c
index fc422364..611f6cc2 100644
--- a/tc/m_action.c
+++ b/tc/m_action.c
@@ -546,40 +546,61 @@ bad_val:
return ret;
 }
 
+struct tc_action_req {
+   struct nlmsghdr n;
+   struct tcamsg   t;
+   charbuf[MAX_MSG];
+};
+
 static int tc_action_modify(int cmd, unsigned int flags,
-   int *argc_p, char ***argv_p)
+   int *argc_p, char ***argv_p,
+   void *buf, size_t buflen)
 {
-   int argc = *argc_p;
+   struct tc_action_req *req, action_req;
char **argv = *argv_p;
+   struct rtattr *tail;
+   int argc = *argc_p;
+   struct iovec iov;
int ret = 0;
-   struct {
-   struct nlmsghdr n;
-   struct tcamsg   t;
-   charbuf[MAX_MSG];
-   } req = {
-   .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)),
-   .n.nlmsg_flags = NLM_F_REQUEST | flags,
-   .n.nlmsg_type = cmd,
-   .t.tca_family = AF_UNSPEC,
-   };
-   struct rtattr *tail = NLMSG_TAIL();
+
+   if (buf) {
+   req = buf;
+   if (buflen < sizeof (struct tc_action_req)) {
+   fprintf(stderr, "buffer is too small: %zu\n", buflen);
+   return -1;
+   }
+   } else {
+   memset(_req, 0, sizeof (struct tc_action_req));
+   req = _req;
+   }
+
+   req->n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg));
+   req->n.nlmsg_flags = NLM_F_REQUEST | flags;
+   req->n.nlmsg_type = cmd;
+   req->t.tca_family = AF_UNSPEC;
+   tail = NLMSG_TAIL(>n);
 
argc -= 1;
argv += 1;
-   if (parse_action(, , TCA_ACT_TAB, )) {
+   if (parse_action(, , TCA_ACT_TAB, >n)) {
fprintf(stderr, "Illegal \"action\"\n");
return -1;
}
-   tail->rta_len = (void *) NLMSG_TAIL() - (void *) tail;
+   tail->rta_len = (void *) NLMSG_TAIL(>n) - (void *) tail;
+
+   *argc_p = argc;
+   *argv_p = argv;
+
+   if (buf)
+   return 0;
 
-   if (rtnl_talk(, , NULL) < 0) {
+   iov.iov_base = >n;
+   iov.iov_len = req->n.nlmsg_len;
+   if (rtnl_talk_iov(, , 1, NULL) < 0) {
fprintf(stderr, "We have an error talking to the kernel\n");
ret = -1;
}
 
-   *argc_p = argc;
-   *argv_p = argv;
-
return ret;
 }
 
@@ -679,7 +700,7 @@ bad_val:
return ret;
 }
 
-int do_action(int argc, char **argv)
+int do_action(int argc, char **argv, void *buf, size_t buflen)
 {
 
int ret = 0;
@@ -689,12 +710,12 @@ int do_action(int argc, char **argv)
if (matches(*argv, "add") == 0) {
ret =  tc_action_modify(RTM_NEWACTION,
NLM_F_EXCL | NLM_F_CREATE,
-   , );
+   , , buf, buflen);
} else if (matches(*argv, "change") == 0 ||
  matches(*argv, "replace") == 0) {
ret = tc_action_modify(RTM_NEWACTION,
   NLM_F_CREATE | NLM_F_REPLACE,
-  , );
+  , , buf, buflen);
} else if (matches(*argv, "delete") == 0) {
argc -= 1;
argv += 1;
diff --git a/tc/tc.c b/tc/tc.c
index ad9f07e9..63e64fec 100644
--- a/tc/tc.c
+++ b/tc/tc.c
@@ -193,16 +193,16 @@ static void usage(void)
"-nm | -nam[es] | { -cf | -conf } 
path } | -j[son]\n");
 }
 
-static int do_cmd(int argc, char **argv)
+static int do_cmd(int argc, char **argv, void *buf, size_t buflen)
 {
if (matches(*argv, "qdisc") == 0)
return do_qdisc(argc-1, argv+1);
if (matches(*argv, "class") == 0)
return do_class(argc-1, argv+1);
if (matches(*argv, "filter&

[patch iproute2 v10 0/2] tc: Add batchsize feature to batch mode

2018-01-11 Thread Chris Mi
Currently in tc batch mode, only one command is read from the batch
file and sent to kernel to process. With this patchset, at most 128
commands can be accumulated before sending to kernel.

We introduced a new function in patch 1 to support sending
multiple messages. In patch 2, we add this support for filter
add/delete/change/replace and actions add/change/replace commands.

But please note that kernel still processes the requests one by one.
To process the requests in parallel in kernel is another effort.
The time we're saving in this patchset is the user mode and kernel mode
context switch. So this patchset works on top of the current kernel.

Using the following script in kernel, we can generate 1,000,000 rules.
tools/testing/selftests/tc-testing/tdc_batch.py

Without this patchset, 'tc -b $file' execution time is:

real    0m15.555s
user    0m7.211s
sys     0m8.284s

With this patchset, 'tc -b $file' execution time is:

real    0m12.360s
user    0m6.082s
sys     0m6.213s

The insertion rate is improved more than 10%.

v3
==
1. Instead of hacking function rtnl_talk directly, add a new function
   rtnl_talk_msg.
2. remove most of global variables to use parameter passing
3. divide the previous patch into 4 patches.

v4
==
1. Remove function setcmdlinetotal. Now in function batch, we read one
   more line to determine if we are reaching the end of file.
2. Remove function __rtnl_check_ack. Now __rtnl_talk calls __rtnl_talk_msg
   directly.
3. if (batch_size < 1)
batch_size = 1;

v5
==
1. Fix a bug where a batch file containing a blank line was not handled.
2. Describe the limitation in man page.

v6
==
1. Add support for mixed commands.
2. Fix a bug where not all messages were acked if batch size > 1.

v7
==
1. We can tell exactly which command fails.
2. Add a new function rtnl_talk_iov
3. Allocate the memory in function batch() instead of each client.
4. Remove option -bs.

v8
==
1. Replace strcmp with matches.
2. Recycle buffers.

v9
==
1. remove rtnl_talk_msg
2. use a table to determine if supporting batchsize feature or not

v10
===
1. Improve function batchsize_enabled.


Chris Mi (2):
  lib/libnetlink: Add a new function rtnl_talk_iov
  tc: Add batchsize feature for filter and actions

 include/libnetlink.h |   3 +
 lib/libnetlink.c |  65 -
 tc/m_action.c|  65 +++--
 tc/tc.c  | 199 +++
 tc/tc_common.h   |   5 +-
 tc/tc_filter.c   | 104 ---
 6 files changed, 343 insertions(+), 98 deletions(-)

-- 
2.14.2



[patch iproute2 v9 1/2] lib/libnetlink: Add a new function rtnl_talk_iov

2018-01-10 Thread Chris Mi
rtnl_talk can only send a single message to kernel. Add a new function
rtnl_talk_iov that can send multiple messages to kernel.
rtnl_talk_iov takes struct iovec * and iovlen as arguments.

Signed-off-by: Chris Mi <chr...@mellanox.com>
Signed-off-by: David Ahern <dsah...@gmail.com>
---
 include/libnetlink.h |  3 +++
 lib/libnetlink.c | 65 +---
 2 files changed, 49 insertions(+), 19 deletions(-)

diff --git a/include/libnetlink.h b/include/libnetlink.h
index a4d83b9e..d6322190 100644
--- a/include/libnetlink.h
+++ b/include/libnetlink.h
@@ -96,6 +96,9 @@ int rtnl_dump_filter_nc(struct rtnl_handle *rth,
 int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
  struct nlmsghdr **answer)
__attribute__((warn_unused_result));
+int rtnl_talk_iov(struct rtnl_handle *rtnl, struct iovec *iovec, size_t iovlen,
+ struct nlmsghdr **answer)
+   __attribute__((warn_unused_result));
 int rtnl_talk_extack(struct rtnl_handle *rtnl, struct nlmsghdr *n,
  struct nlmsghdr **answer, nl_ext_ack_fn_t errfn)
__attribute__((warn_unused_result));
diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index 00e6ce0c..7ca47b22 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -581,30 +581,30 @@ static void rtnl_talk_error(struct nlmsghdr *h, struct 
nlmsgerr *err,
strerror(-err->error));
 }
 
-static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
-  struct nlmsghdr **answer,
-  bool show_rtnl_err, nl_ext_ack_fn_t errfn)
+
+static int __rtnl_talk_iov(struct rtnl_handle *rtnl, struct iovec *iov,
+  size_t iovlen, struct nlmsghdr **answer,
+  bool show_rtnl_err, nl_ext_ack_fn_t errfn)
 {
-   int status;
-   unsigned int seq;
-   struct nlmsghdr *h;
struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
-   struct iovec iov = {
-   .iov_base = n,
-   .iov_len = n->nlmsg_len
-   };
+   struct iovec riov;
struct msghdr msg = {
.msg_name = ,
.msg_namelen = sizeof(nladdr),
-   .msg_iov = ,
-   .msg_iovlen = 1,
+   .msg_iov = iov,
+   .msg_iovlen = iovlen,
};
+   unsigned int seq = 0;
+   struct nlmsghdr *h;
+   int i, status;
char *buf;
 
-   n->nlmsg_seq = seq = ++rtnl->seq;
-
-   if (answer == NULL)
-   n->nlmsg_flags |= NLM_F_ACK;
+   for (i = 0; i < iovlen; i++) {
+   h = iov[i].iov_base;
+   h->nlmsg_seq = seq = ++rtnl->seq;
+   if (answer == NULL)
+   h->nlmsg_flags |= NLM_F_ACK;
+   }
 
status = sendmsg(rtnl->fd, , 0);
if (status < 0) {
@@ -612,8 +612,14 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct 
nlmsghdr *n,
return -1;
}
 
+   /* change msg to use the response iov */
+   msg.msg_iov = 
+   msg.msg_iovlen = 1;
+   i = 0;
while (1) {
+next:
status = rtnl_recvmsg(rtnl->fd, , );
+   ++i;
 
if (status < 0)
return status;
@@ -642,7 +648,7 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct 
nlmsghdr *n,
 
if (nladdr.nl_pid != 0 ||
h->nlmsg_pid != rtnl->local.nl_pid ||
-   h->nlmsg_seq != seq) {
+   h->nlmsg_seq > seq || h->nlmsg_seq < seq - iovlen) {
/* Don't forget to skip that message. */
status -= NLMSG_ALIGN(len);
h = (struct nlmsghdr *)((char *)h + 
NLMSG_ALIGN(len));
@@ -662,7 +668,10 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct 
nlmsghdr *n,
*answer = (struct nlmsghdr 
*)buf;
else
free(buf);
-   return 0;
+   if (h->nlmsg_seq == seq)
+   return 0;
+   else
+   goto next;
}
 
if (rtnl->proto != NETLINK_SOCK_DIAG &&
@@ -671,7 +680,7 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct 
nlmsghdr *n,
 
errno = -err->error;
free(buf);
-   return -1;
+   return -i;
}
 
if (answer) {
@@ -698,12 +707,30 @@ static int __rtnl_talk(struct rtnl_handle *r

[patch iproute2 v9 0/2] tc: Add batchsize feature to batch mode

2018-01-10 Thread Chris Mi
Currently in tc batch mode, only one command is read from the batch
file and sent to kernel to process. With this patchset, at most 128
commands can be accumulated before sending to kernel.

We introduced two new functions in patch 1 to support sending
multiple messages. In patch 2, we add this support for filter and
actions add/delete/change/replace commands.

But please note that kernel still processes the requests one by one.
To process the requests in parallel in kernel is another effort.
The time we're saving in this patchset is the user mode and kernel mode
context switch. So this patchset works on top of the current kernel.

Using the following script in kernel, we can generate 1,000,000 rules.
tools/testing/selftests/tc-testing/tdc_batch.py

Without this patchset, 'tc -b $file' execution time is:

real    0m15.555s
user    0m7.211s
sys     0m8.284s

With this patchset, 'tc -b $file' execution time is:

real    0m12.360s
user    0m6.082s
sys     0m6.213s

The insertion rate is improved more than 10%.

v3
==
1. Instead of hacking function rtnl_talk directly, add a new function
   rtnl_talk_msg.
2. remove most of global variables to use parameter passing
3. divide the previous patch into 4 patches.

v4
==
1. Remove function setcmdlinetotal. Now in function batch, we read one
   more line to determine if we are reaching the end of file.
2. Remove function __rtnl_check_ack. Now __rtnl_talk calls __rtnl_talk_msg
   directly.
3. if (batch_size < 1)
batch_size = 1;

v5
==
1. Fix a bug where a batch file containing a blank line was not handled.
2. Describe the limitation in man page.

v6
==
1. Add support for mixed commands.
2. Fix a bug where not all messages were acked if batch size > 1.

v7
==
1. We can tell exactly which command fails.
2. Add a new function rtnl_talk_iov
3. Allocate the memory in function batch() instead of each client.
4. Remove option -bs.

v8
==
1. Replace strcmp with matches.
2. Recycle buffers.

v9
==
1. remove rtnl_talk_msg
2. use a table to determine if supporting batchsize feature or not


Chris Mi (2):
  lib/libnetlink: Add a new function rtnl_talk_iov
  tc: Add batchsize feature for filter and actions

 include/libnetlink.h |   3 +
 lib/libnetlink.c |  65 +++-
 tc/m_action.c|  65 ++--
 tc/tc.c  | 208 +++
 tc/tc_common.h   |   5 +-
 tc/tc_filter.c   | 104 --
 6 files changed, 352 insertions(+), 98 deletions(-)

-- 
2.14.2



[patch iproute2 v9 2/2] tc: Add batchsize feature for filter and actions

2018-01-10 Thread Chris Mi
Currently in tc batch mode, only one command is read from the batch
file and sent to kernel to process. With this support, at most 128
commands can be accumulated before sending to kernel.

Now it only works for the following successive commands:
1. filter add/delete/change/replace
2. actions add/change/replace

Signed-off-by: Chris Mi <chr...@mellanox.com>
---
 tc/m_action.c  |  65 --
 tc/tc.c| 208 -
 tc/tc_common.h |   5 +-
 tc/tc_filter.c | 104 ++---
 4 files changed, 303 insertions(+), 79 deletions(-)

diff --git a/tc/m_action.c b/tc/m_action.c
index fc422364..611f6cc2 100644
--- a/tc/m_action.c
+++ b/tc/m_action.c
@@ -546,40 +546,61 @@ bad_val:
return ret;
 }
 
+struct tc_action_req {
+   struct nlmsghdr n;
+   struct tcamsg   t;
+   charbuf[MAX_MSG];
+};
+
 static int tc_action_modify(int cmd, unsigned int flags,
-   int *argc_p, char ***argv_p)
+   int *argc_p, char ***argv_p,
+   void *buf, size_t buflen)
 {
-   int argc = *argc_p;
+   struct tc_action_req *req, action_req;
char **argv = *argv_p;
+   struct rtattr *tail;
+   int argc = *argc_p;
+   struct iovec iov;
int ret = 0;
-   struct {
-   struct nlmsghdr n;
-   struct tcamsg   t;
-   charbuf[MAX_MSG];
-   } req = {
-   .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)),
-   .n.nlmsg_flags = NLM_F_REQUEST | flags,
-   .n.nlmsg_type = cmd,
-   .t.tca_family = AF_UNSPEC,
-   };
-   struct rtattr *tail = NLMSG_TAIL();
+
+   if (buf) {
+   req = buf;
+   if (buflen < sizeof (struct tc_action_req)) {
+   fprintf(stderr, "buffer is too small: %zu\n", buflen);
+   return -1;
+   }
+   } else {
+   memset(_req, 0, sizeof (struct tc_action_req));
+   req = _req;
+   }
+
+   req->n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg));
+   req->n.nlmsg_flags = NLM_F_REQUEST | flags;
+   req->n.nlmsg_type = cmd;
+   req->t.tca_family = AF_UNSPEC;
+   tail = NLMSG_TAIL(>n);
 
argc -= 1;
argv += 1;
-   if (parse_action(, , TCA_ACT_TAB, )) {
+   if (parse_action(, , TCA_ACT_TAB, >n)) {
fprintf(stderr, "Illegal \"action\"\n");
return -1;
}
-   tail->rta_len = (void *) NLMSG_TAIL() - (void *) tail;
+   tail->rta_len = (void *) NLMSG_TAIL(>n) - (void *) tail;
+
+   *argc_p = argc;
+   *argv_p = argv;
+
+   if (buf)
+   return 0;
 
-   if (rtnl_talk(, , NULL) < 0) {
+   iov.iov_base = >n;
+   iov.iov_len = req->n.nlmsg_len;
+   if (rtnl_talk_iov(, , 1, NULL) < 0) {
fprintf(stderr, "We have an error talking to the kernel\n");
ret = -1;
}
 
-   *argc_p = argc;
-   *argv_p = argv;
-
return ret;
 }
 
@@ -679,7 +700,7 @@ bad_val:
return ret;
 }
 
-int do_action(int argc, char **argv)
+int do_action(int argc, char **argv, void *buf, size_t buflen)
 {
 
int ret = 0;
@@ -689,12 +710,12 @@ int do_action(int argc, char **argv)
if (matches(*argv, "add") == 0) {
ret =  tc_action_modify(RTM_NEWACTION,
NLM_F_EXCL | NLM_F_CREATE,
-   , );
+   , , buf, buflen);
} else if (matches(*argv, "change") == 0 ||
  matches(*argv, "replace") == 0) {
ret = tc_action_modify(RTM_NEWACTION,
   NLM_F_CREATE | NLM_F_REPLACE,
-  , );
+  , , buf, buflen);
} else if (matches(*argv, "delete") == 0) {
argc -= 1;
argv += 1;
diff --git a/tc/tc.c b/tc/tc.c
index ad9f07e9..8ffea11c 100644
--- a/tc/tc.c
+++ b/tc/tc.c
@@ -193,16 +193,16 @@ static void usage(void)
"-nm | -nam[es] | { -cf | -conf } 
path } | -j[son]\n");
 }
 
-static int do_cmd(int argc, char **argv)
+static int do_cmd(int argc, char **argv, void *buf, size_t buflen)
 {
if (matches(*argv, "qdisc") == 0)
return do_qdisc(argc-1, argv+1);
if (matches(*argv, "class") == 0)
return do_class(argc-1, argv+1);
if (matches(*argv, "filter&

RE: [patch iproute2 v8 2/2] tc: Add batchsize feature for filter and actions

2018-01-10 Thread Chris Mi
> -Original Message-
> From: David Ahern [mailto:dsah...@gmail.com]
> Sent: Thursday, January 11, 2018 3:41 AM
> To: Chris Mi <chr...@mellanox.com>; netdev@vger.kernel.org
> Cc: gerlitz...@gmail.com; step...@networkplumber.org;
> marcelo.leit...@gmail.com; p...@nwl.cc
> Subject: Re: [patch iproute2 v8 2/2] tc: Add batchsize feature for filter and
> actions
> 
> On 1/9/18 8:27 PM, Chris Mi wrote:
> > Currently in tc batch mode, only one command is read from the batch
> > file and sent to kernel to process. With this support, at most 128
> > commands can be accumulated before sending to kernel.
> >
> > Now it only works for the following successive commands:
> > filter and actions add/delete/change/replace.
> >
> > Signed-off-by: Chris Mi <chr...@mellanox.com>
> > ---
> >  tc/m_action.c  |  60 +
> >  tc/tc.c| 165
> -
> >  tc/tc_common.h |   5 +-
> >  tc/tc_filter.c |  97 +++--
> >  4 files changed, 249 insertions(+), 78 deletions(-)
> >
> > diff --git a/tc/m_action.c b/tc/m_action.c index fc422364..e5c53a80
> > 100644
> > --- a/tc/m_action.c
> > +++ b/tc/m_action.c
> > @@ -546,40 +546,56 @@ bad_val:
> > return ret;
> >  }
> >
> > +struct tc_action_req {
> > +   struct nlmsghdr n;
> > +   struct tcamsg   t;
> > +   charbuf[MAX_MSG];
> > +};
> > +
> >  static int tc_action_modify(int cmd, unsigned int flags,
> > -   int *argc_p, char ***argv_p)
> > +   int *argc_p, char ***argv_p,
> > +   void *buf)
> 
> you really need a buflen; you should not make assumptions about the length
> of buffer passed to these functions.
Done.
> 
> >  {
> > -   int argc = *argc_p;
> > +   struct tc_action_req *req, action_req;
> > char **argv = *argv_p;
> > +   struct rtattr *tail;
> > +   int argc = *argc_p;
> > +   struct iovec iov;
> > int ret = 0;
> > -   struct {
> > -   struct nlmsghdr n;
> > -   struct tcamsg   t;
> > -   charbuf[MAX_MSG];
> > -   } req = {
> > -   .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)),
> > -   .n.nlmsg_flags = NLM_F_REQUEST | flags,
> > -   .n.nlmsg_type = cmd,
> > -   .t.tca_family = AF_UNSPEC,
> > -   };
> > -   struct rtattr *tail = NLMSG_TAIL();
> > +
> > +   if (buf)
> > +   req = buf;
> > +   else
> > +   req = _req;
> > +
> 
> And a memset is needed for the !buf path since action_req is not initialized.
Done.
> 
> > +   req->n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg));
> > +   req->n.nlmsg_flags = NLM_F_REQUEST | flags;
> > +   req->n.nlmsg_type = cmd;
> > +   req->t.tca_family = AF_UNSPEC;
> > +   tail = NLMSG_TAIL(>n);
> >
> > argc -= 1;
> > argv += 1;
> > -   if (parse_action(, , TCA_ACT_TAB, )) {
> > +   if (parse_action(, , TCA_ACT_TAB, >n)) {
> > fprintf(stderr, "Illegal \"action\"\n");
> > return -1;
> > }
> > -   tail->rta_len = (void *) NLMSG_TAIL() - (void *) tail;
> > +   tail->rta_len = (void *) NLMSG_TAIL(>n) - (void *) tail;
> > +
> > +   *argc_p = argc;
> > +   *argv_p = argv;
> >
> > -   if (rtnl_talk(, , NULL) < 0) {
> > +   iov.iov_base = >n;
> > +   iov.iov_len = req->n.nlmsg_len;
> 
> you can leave that as rtnl_talk; no need to change the !buf case to
> rtnl_talk_iov.
Done.
> 
> > +
> > +   if (buf)
> > +   return 0;
> > +
> > +   if (rtnl_talk_iov(, , 1, NULL) < 0) {
> > fprintf(stderr, "We have an error talking to the kernel\n");
> > ret = -1;
> > }
> >
> > -   *argc_p = argc;
> > -   *argv_p = argv;
> > -
> > return ret;
> >  }
> >
> > @@ -679,7 +695,7 @@ bad_val:
> > return ret;
> >  }
> >
> > -int do_action(int argc, char **argv)
> > +int do_action(int argc, char **argv, void *buf)
> >  {
> >
> > int ret = 0;
> > @@ -689,12 +705,12 @@ int do_action(int argc, char **argv)
> > if (matches(*argv, "add") == 0) {
> > ret =  tc_action_modify(RTM_NEWACTION,
> >   

RE: [patch iproute2 v8 1/2] lib/libnetlink: Add functions rtnl_talk_msg and rtnl_talk_iov

2018-01-10 Thread Chris Mi
> -Original Message-
> From: David Ahern [mailto:dsah...@gmail.com]
> Sent: Thursday, January 11, 2018 3:21 AM
> To: Chris Mi <chr...@mellanox.com>; netdev@vger.kernel.org
> Cc: gerlitz...@gmail.com; step...@networkplumber.org;
> marcelo.leit...@gmail.com; p...@nwl.cc
> Subject: Re: [patch iproute2 v8 1/2] lib/libnetlink: Add functions
> rtnl_talk_msg and rtnl_talk_iov
> 
> On 1/9/18 8:27 PM, Chris Mi wrote:
> > rtnl_talk can only send a single message to kernel. Add two functions
> > rtnl_talk_msg and rtnl_talk_iov that can send multiple messages to kernel.
> > rtnl_talk_msg takes struct msghdr * as argument.
> > rtnl_talk_iov takes struct iovec * and iovlen as arguments.
> >
> > Signed-off-by: Chris Mi <chr...@mellanox.com>
> > ---
> >  include/libnetlink.h |  6 
> >  lib/libnetlink.c | 82 -
> ---
> >  2 files changed, 70 insertions(+), 18 deletions(-)
> >
> > diff --git a/include/libnetlink.h b/include/libnetlink.h index
> > a4d83b9e..e9a63dbc 100644
> > --- a/include/libnetlink.h
> > +++ b/include/libnetlink.h
> > @@ -96,6 +96,12 @@ int rtnl_dump_filter_nc(struct rtnl_handle *rth,
> > int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
> >   struct nlmsghdr **answer)
> > __attribute__((warn_unused_result));
> > +int rtnl_talk_msg(struct rtnl_handle *rtnl, struct msghdr *m,
> > + struct nlmsghdr **answer)
> > +   __attribute__((warn_unused_result));
> 
> As mentioned before rtnl_talk_msg is not needed; you only need to add
> rtnl_talk_iov. The attached fixup on top of your patch removes it and adjusts
> __rtnl_talk_iov. Please roll that change into your patch.
Done. I misunderstood your previous comment.
Thanks for your patch, David.
> 
> 
> While testing this I noticed 2 other oddities:
> 
> $ perf trace -s tc -b tc.batch
> (stddev column removed to shorten line width)
> 
>  Summary of events:
> 
>  tc (780), 1857 events, 97.9%
> 
>syscallcallstotal   min   avg   max
>(msec)(msec)(msec)(msec)
>---  - - - -
>recvmsg  530 6.532 0.008 0.012 0.218
>open 269 5.429 0.012 0.020 0.117
>sendmsg4 3.518 0.092 0.879 1.647
> 
> 
> 
> 1. recvmsg is called twice - once to peek at message size, allocate a buffer
> and then really receive the message. That is overkill for ACKs.
> 
> 2. I am using a batch file with drop filters:
> 
> filter add dev eth2 ingress protocol ip pref 273 flower dst_ip
> 192.168.253.0/16 action drop
> 
> and for each command tc is trying to dlopen m_drop.so:
> 
> open("/usr/lib/tc//m_drop.so", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No
> such file or directory)
> 
> 
> With a patch to use a stack buffer for ACKs, the above perf summary
> becomes:
> 
> $ perf trace -s tc -b tc.batch
> 
>  Summary of events:
> 
>  tc (777), 1345 events, 97.1%
> 
>syscallcallstotal   min   avg   max
>(msec)(msec)(msec)(msec)
>---  - - - -
>open 269 5.510 0.013 0.020 0.160
>recvmsg  274 3.758 0.009 0.014 0.396
>sendmsg4 3.531 0.098 0.883 1.672
> 
> 
> Making the open errors now the dominant overhead affecting performance.
> If tc had some smarts that it already tried that file it would avoid the
> subsequent open calls. The end result is a significant speed up compared to
> the current tc:
> 
>  Summary of events:
> 
>  tc (785), 2333 events, 98.3%
> 
>syscallcallstotal   min   avg   max
>(msec)(msec)(msec)(msec)
>---  - - - -
>sendmsg  256 9.832 0.029 0.038 0.181
>open 269 5.819 0.013 0.022 0.353
>recvmsg  530 5.592 0.009 0.011 0.285
> 
> 
> Can you look at a follow on patch (not part of this set) to cache status of
> dlopen attempts?
Sure, I will investigate this issue.


RE: [patch iproute2 v8 2/2] tc: Add batchsize feature for filter and actions

2018-01-10 Thread Chris Mi
> -Original Message-
> From: Marcelo Ricardo Leitner [mailto:marcelo.leit...@gmail.com]
> Sent: Wednesday, January 10, 2018 7:42 PM
> To: Chris Mi <chr...@mellanox.com>
> Cc: netdev@vger.kernel.org; gerlitz...@gmail.com;
> step...@networkplumber.org; dsah...@gmail.com; p...@nwl.cc
> Subject: Re: [patch iproute2 v8 2/2] tc: Add batchsize feature for filter and
> actions
> 
> On Wed, Jan 10, 2018 at 12:27:42PM +0900, Chris Mi wrote:
> > Currently in tc batch mode, only one command is read from the batch
> > file and sent to kernel to process. With this support, at most 128
> > commands can be accumulated before sending to kernel.
> >
> > Now it only works for the following successive commands:
> > filter and actions add/delete/change/replace.
> >
> > Signed-off-by: Chris Mi <chr...@mellanox.com>
> > ---
> >  tc/m_action.c  |  60 +
> >  tc/tc.c| 165
> -
> >  tc/tc_common.h |   5 +-
> >  tc/tc_filter.c |  97 +++--
> >  4 files changed, 249 insertions(+), 78 deletions(-)
> >
> > diff --git a/tc/m_action.c b/tc/m_action.c index fc422364..e5c53a80
> > 100644
> > --- a/tc/m_action.c
> > +++ b/tc/m_action.c
> > @@ -546,40 +546,56 @@ bad_val:
> > return ret;
> >  }
> >
> > +struct tc_action_req {
> > +   struct nlmsghdr n;
> > +   struct tcamsg   t;
> > +   charbuf[MAX_MSG];
> > +};
> > +
> >  static int tc_action_modify(int cmd, unsigned int flags,
> > -   int *argc_p, char ***argv_p)
> > +   int *argc_p, char ***argv_p,
> > +   void *buf)
> >  {
> > -   int argc = *argc_p;
> > +   struct tc_action_req *req, action_req;
> > char **argv = *argv_p;
> > +   struct rtattr *tail;
> > +   int argc = *argc_p;
> > +   struct iovec iov;
> > int ret = 0;
> > -   struct {
> > -   struct nlmsghdr n;
> > -   struct tcamsg   t;
> > -   charbuf[MAX_MSG];
> > -   } req = {
> > -   .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)),
> > -   .n.nlmsg_flags = NLM_F_REQUEST | flags,
> > -   .n.nlmsg_type = cmd,
> > -   .t.tca_family = AF_UNSPEC,
> > -   };
> > -   struct rtattr *tail = NLMSG_TAIL();
> > +
> > +   if (buf)
> > +   req = buf;
> > +   else
> > +   req = _req;
> > +
> > +   req->n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg));
> > +   req->n.nlmsg_flags = NLM_F_REQUEST | flags;
> > +   req->n.nlmsg_type = cmd;
> > +   req->t.tca_family = AF_UNSPEC;
> > +   tail = NLMSG_TAIL(>n);
> >
> > argc -= 1;
> > argv += 1;
> > -   if (parse_action(, , TCA_ACT_TAB, )) {
> > +   if (parse_action(, , TCA_ACT_TAB, >n)) {
> > fprintf(stderr, "Illegal \"action\"\n");
> > return -1;
> > }
> > -   tail->rta_len = (void *) NLMSG_TAIL() - (void *) tail;
> > +   tail->rta_len = (void *) NLMSG_TAIL(>n) - (void *) tail;
> > +
> > +   *argc_p = argc;
> > +   *argv_p = argv;
> >
> > -   if (rtnl_talk(, , NULL) < 0) {
> > +   iov.iov_base = >n;
> > +   iov.iov_len = req->n.nlmsg_len;
> > +
> > +   if (buf)
> > +   return 0;
> > +
> > +   if (rtnl_talk_iov(, , 1, NULL) < 0) {
> > fprintf(stderr, "We have an error talking to the kernel\n");
> > ret = -1;
> > }
> >
> > -   *argc_p = argc;
> > -   *argv_p = argv;
> > -
> > return ret;
> >  }
> >
> > @@ -679,7 +695,7 @@ bad_val:
> > return ret;
> >  }
> >
> > -int do_action(int argc, char **argv)
> > +int do_action(int argc, char **argv, void *buf)
> >  {
> >
> > int ret = 0;
> > @@ -689,12 +705,12 @@ int do_action(int argc, char **argv)
> > if (matches(*argv, "add") == 0) {
> > ret =  tc_action_modify(RTM_NEWACTION,
> > NLM_F_EXCL |
> NLM_F_CREATE,
> > -   , );
> > +   , , buf);
> > } else if (matches(*argv, "change") == 0 ||
> >   matches(*argv, "replace") == 0) {
> > ret =

[patch iproute2 v8 2/2] tc: Add batchsize feature for filter and actions

2018-01-09 Thread Chris Mi
Currently in tc batch mode, only one command is read from the batch
file and sent to kernel to process. With this support, at most 128
commands can be accumulated before sending to kernel.

Now it only works for the following successive commands:
filter and actions add/delete/change/replace.

Signed-off-by: Chris Mi <chr...@mellanox.com>
---
 tc/m_action.c  |  60 +
 tc/tc.c| 165 -
 tc/tc_common.h |   5 +-
 tc/tc_filter.c |  97 +++--
 4 files changed, 249 insertions(+), 78 deletions(-)

diff --git a/tc/m_action.c b/tc/m_action.c
index fc422364..e5c53a80 100644
--- a/tc/m_action.c
+++ b/tc/m_action.c
@@ -546,40 +546,56 @@ bad_val:
return ret;
 }
 
+struct tc_action_req {
+   struct nlmsghdr n;
+   struct tcamsg   t;
+   charbuf[MAX_MSG];
+};
+
 static int tc_action_modify(int cmd, unsigned int flags,
-   int *argc_p, char ***argv_p)
+   int *argc_p, char ***argv_p,
+   void *buf)
 {
-   int argc = *argc_p;
+   struct tc_action_req *req, action_req;
char **argv = *argv_p;
+   struct rtattr *tail;
+   int argc = *argc_p;
+   struct iovec iov;
int ret = 0;
-   struct {
-   struct nlmsghdr n;
-   struct tcamsg   t;
-   charbuf[MAX_MSG];
-   } req = {
-   .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)),
-   .n.nlmsg_flags = NLM_F_REQUEST | flags,
-   .n.nlmsg_type = cmd,
-   .t.tca_family = AF_UNSPEC,
-   };
-   struct rtattr *tail = NLMSG_TAIL();
+
+   if (buf)
+   req = buf;
+   else
+   req = _req;
+
+   req->n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg));
+   req->n.nlmsg_flags = NLM_F_REQUEST | flags;
+   req->n.nlmsg_type = cmd;
+   req->t.tca_family = AF_UNSPEC;
+   tail = NLMSG_TAIL(>n);
 
argc -= 1;
argv += 1;
-   if (parse_action(, , TCA_ACT_TAB, )) {
+   if (parse_action(, , TCA_ACT_TAB, >n)) {
fprintf(stderr, "Illegal \"action\"\n");
return -1;
}
-   tail->rta_len = (void *) NLMSG_TAIL() - (void *) tail;
+   tail->rta_len = (void *) NLMSG_TAIL(>n) - (void *) tail;
+
+   *argc_p = argc;
+   *argv_p = argv;
 
-   if (rtnl_talk(, , NULL) < 0) {
+   iov.iov_base = >n;
+   iov.iov_len = req->n.nlmsg_len;
+
+   if (buf)
+   return 0;
+
+   if (rtnl_talk_iov(, , 1, NULL) < 0) {
fprintf(stderr, "We have an error talking to the kernel\n");
ret = -1;
}
 
-   *argc_p = argc;
-   *argv_p = argv;
-
return ret;
 }
 
@@ -679,7 +695,7 @@ bad_val:
return ret;
 }
 
-int do_action(int argc, char **argv)
+int do_action(int argc, char **argv, void *buf)
 {
 
int ret = 0;
@@ -689,12 +705,12 @@ int do_action(int argc, char **argv)
if (matches(*argv, "add") == 0) {
ret =  tc_action_modify(RTM_NEWACTION,
NLM_F_EXCL | NLM_F_CREATE,
-   , );
+   , , buf);
} else if (matches(*argv, "change") == 0 ||
  matches(*argv, "replace") == 0) {
ret = tc_action_modify(RTM_NEWACTION,
   NLM_F_CREATE | NLM_F_REPLACE,
-  , );
+  , , buf);
} else if (matches(*argv, "delete") == 0) {
argc -= 1;
argv += 1;
diff --git a/tc/tc.c b/tc/tc.c
index ad9f07e9..44277405 100644
--- a/tc/tc.c
+++ b/tc/tc.c
@@ -193,16 +193,16 @@ static void usage(void)
"-nm | -nam[es] | { -cf | -conf } 
path } | -j[son]\n");
 }
 
-static int do_cmd(int argc, char **argv)
+static int do_cmd(int argc, char **argv, void *buf)
 {
if (matches(*argv, "qdisc") == 0)
return do_qdisc(argc-1, argv+1);
if (matches(*argv, "class") == 0)
return do_class(argc-1, argv+1);
if (matches(*argv, "filter") == 0)
-   return do_filter(argc-1, argv+1);
+   return do_filter(argc-1, argv+1, buf);
if (matches(*argv, "actions") == 0)
-   return do_action(argc-1, argv+1);
+   return do_action(argc-1, argv+1, buf);
if (matches(*argv, "monitor") == 0)
return do_tcmonitor(argc-1, argv+1)

[patch iproute2 v8 0/2] tc: Add batchsize feature to batch mode

2018-01-09 Thread Chris Mi
Currently in tc batch mode, only one command is read from the batch
file and sent to kernel to process. With this patchset, at most 128
commands can be accumulated before sending to kernel.

We introduced two new functions in patch 1 to support sending
multiple messages. In patch 2, we add this support for filter and
actions add/delete/change/replace commands.

But please note that the kernel still processes the requests one by one.
Processing the requests in parallel in the kernel is a separate effort.
The time saved by this patchset comes from fewer context switches between
user mode and kernel mode, so this patchset works on top of the current kernel.

Using the following script in kernel, we can generate 1,000,000 rules.
tools/testing/selftests/tc-testing/tdc_batch.py

Without this patchset, 'tc -b $file' execution time is:

real    0m15.555s
user    0m7.211s
sys     0m8.284s

With this patchset, 'tc -b $file' execution time is:

real    0m12.360s
user    0m6.082s
sys     0m6.213s

The insertion rate is improved by more than 10%.

v3
==
1. Instead of hacking function rtnl_talk directly, add a new function
   rtnl_talk_msg.
2. remove most of global variables to use parameter passing
3. divide the previous patch into 4 patches.

v4
==
1. Remove function setcmdlinetotal. Now in function batch, we read one
   more line to determine if we are reaching the end of file.
2. Remove function __rtnl_check_ack. Now __rtnl_talk calls __rtnl_talk_msg
   directly.
3. if (batch_size < 1)
batch_size = 1;

v5
==
1. Fix a bug that can't deal with batch file with blank line.
2. Describe the limitation in man page.

v6
==
1. Add support for mixed commands.
2. Fix a bug that not all messages are acked if batch size > 1.

v7
==
1. We can tell exactly which command fails.
2. Add a new function rtnl_talk_iov
3. Allocate the memory in function batch() instead of each client.
4. Remove option -bs.

v8
==
1. Replace strcmp with matches.
2. Recycle buffers.


Chris Mi (2):
  lib/libnetlink: Add functions rtnl_talk_msg and rtnl_talk_iov
  tc: Add batchsize feature for filter and actions

 include/libnetlink.h |   6 ++
 lib/libnetlink.c |  82 +++--
 tc/m_action.c|  60 ---
 tc/tc.c  | 165 ++-
 tc/tc_common.h   |   5 +-
 tc/tc_filter.c   |  97 +-
 6 files changed, 319 insertions(+), 96 deletions(-)

-- 
2.14.2



[patch iproute2 v8 1/2] lib/libnetlink: Add functions rtnl_talk_msg and rtnl_talk_iov

2018-01-09 Thread Chris Mi
rtnl_talk can only send a single message to kernel. Add two functions
rtnl_talk_msg and rtnl_talk_iov that can send multiple messages to kernel.
rtnl_talk_msg takes struct msghdr * as argument.
rtnl_talk_iov takes struct iovec * and iovlen as arguments.

Signed-off-by: Chris Mi <chr...@mellanox.com>
---
 include/libnetlink.h |  6 
 lib/libnetlink.c | 82 
 2 files changed, 70 insertions(+), 18 deletions(-)

diff --git a/include/libnetlink.h b/include/libnetlink.h
index a4d83b9e..e9a63dbc 100644
--- a/include/libnetlink.h
+++ b/include/libnetlink.h
@@ -96,6 +96,12 @@ int rtnl_dump_filter_nc(struct rtnl_handle *rth,
 int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
  struct nlmsghdr **answer)
__attribute__((warn_unused_result));
+int rtnl_talk_msg(struct rtnl_handle *rtnl, struct msghdr *m,
+ struct nlmsghdr **answer)
+   __attribute__((warn_unused_result));
+int rtnl_talk_iov(struct rtnl_handle *rtnl, struct iovec *iovec, size_t iovlen,
+ struct nlmsghdr **answer)
+   __attribute__((warn_unused_result));
 int rtnl_talk_extack(struct rtnl_handle *rtnl, struct nlmsghdr *n,
  struct nlmsghdr **answer, nl_ext_ack_fn_t errfn)
__attribute__((warn_unused_result));
diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index 00e6ce0c..183825c7 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -581,39 +581,43 @@ static void rtnl_talk_error(struct nlmsghdr *h, struct 
nlmsgerr *err,
strerror(-err->error));
 }
 
-static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
-  struct nlmsghdr **answer,
-  bool show_rtnl_err, nl_ext_ack_fn_t errfn)
+static int __rtnl_talk_msg(struct rtnl_handle *rtnl, struct msghdr *m,
+  struct nlmsghdr **answer,
+  bool show_rtnl_err, nl_ext_ack_fn_t errfn)
 {
-   int status;
-   unsigned int seq;
-   struct nlmsghdr *h;
struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
-   struct iovec iov = {
-   .iov_base = n,
-   .iov_len = n->nlmsg_len
-   };
+   int i, status, iovlen = m->msg_iovlen;
+   struct iovec iov;
struct msghdr msg = {
.msg_name = ,
.msg_namelen = sizeof(nladdr),
.msg_iov = ,
.msg_iovlen = 1,
};
+   unsigned int seq = 0;
+   struct nlmsghdr *h;
char *buf;
 
-   n->nlmsg_seq = seq = ++rtnl->seq;
-
-   if (answer == NULL)
-   n->nlmsg_flags |= NLM_F_ACK;
+   for (i = 0; i < iovlen; i++) {
+   struct iovec *v;
+   v = >msg_iov[i];
+   h = v->iov_base;
+   h->nlmsg_seq = seq = ++rtnl->seq;
+   if (answer == NULL)
+   h->nlmsg_flags |= NLM_F_ACK;
+   }
 
-   status = sendmsg(rtnl->fd, , 0);
+   status = sendmsg(rtnl->fd, m, 0);
if (status < 0) {
perror("Cannot talk to rtnetlink");
return -1;
}
 
+   i = 0;
while (1) {
+next:
status = rtnl_recvmsg(rtnl->fd, , );
+   ++i;
 
if (status < 0)
return status;
@@ -642,7 +646,7 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct 
nlmsghdr *n,
 
if (nladdr.nl_pid != 0 ||
h->nlmsg_pid != rtnl->local.nl_pid ||
-   h->nlmsg_seq != seq) {
+   h->nlmsg_seq > seq || h->nlmsg_seq < seq - iovlen) {
/* Don't forget to skip that message. */
status -= NLMSG_ALIGN(len);
h = (struct nlmsghdr *)((char *)h + 
NLMSG_ALIGN(len));
@@ -662,7 +666,10 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct 
nlmsghdr *n,
*answer = (struct nlmsghdr 
*)buf;
else
free(buf);
-   return 0;
+   if (h->nlmsg_seq == seq)
+   return 0;
+   else
+   goto next;
}
 
if (rtnl->proto != NETLINK_SOCK_DIAG &&
@@ -671,7 +678,7 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct 
nlmsghdr *n,
 
errno = -err->error;
free(buf);
-   return -1;
+   return -i;
}
 
  

RE: [patch iproute2 v7 2/2] tc: Add batchsize feature for filter and actions

2018-01-09 Thread Chris Mi
> -Original Message-
> From: Stephen Hemminger [mailto:step...@networkplumber.org]
> Sent: Wednesday, January 10, 2018 12:01 AM
> To: Chris Mi <chr...@mellanox.com>
> Cc: netdev@vger.kernel.org; gerlitz...@gmail.com; dsah...@gmail.com;
> marcelo.leit...@gmail.com; p...@nwl.cc
> Subject: Re: [patch iproute2 v7 2/2] tc: Add batchsize feature for filter and
> actions
> 
> On Tue,  9 Jan 2018 15:59:08 +0900
> Chris Mi <chr...@mellanox.com> wrote:
> 
> > +static bool batchsize_enabled(int argc, char *argv[]) {
> > +   if (argc < 2)
> > +   return false;
> > +   if ((strcmp(argv[0], "filter") && strcmp(argv[0], "action"))
> > +   || (strcmp(argv[1], "add") && strcmp(argv[1], "delete")
> > +   && strcmp(argv[1], "change") && strcmp(argv[1], "replace")))
> > +   return false;
> > +
> > +   return true;
> > +}
> 
> Maybe this should be a table, also the action can be abbreviated as in:
>   tc qd a dev eth0 ...
Thanks for your notification. I've changed strcmp to matches.
Since the list is not very big, I didn't change it to use a table in this
patchset.
> 
> Actually, I have been wondering if all of IP command parsing needs to be
> more table driven.



RE: [patch iproute2 v7 1/2] lib/libnetlink: Add functions rtnl_talk_msg and rtnl_talk_iov

2018-01-09 Thread Chris Mi
> -Original Message-
> From: n...@orbyte.nwl.cc [mailto:n...@orbyte.nwl.cc] On Behalf Of Phil
> Sutter
> Sent: Wednesday, January 10, 2018 3:24 AM
> To: Chris Mi <chr...@mellanox.com>
> Cc: netdev@vger.kernel.org; gerlitz...@gmail.com;
> step...@networkplumber.org; dsah...@gmail.com;
> marcelo.leit...@gmail.com
> Subject: Re: [patch iproute2 v7 1/2] lib/libnetlink: Add functions
> rtnl_talk_msg and rtnl_talk_iov
> 
> Hi,
> 
> On Tue, Jan 09, 2018 at 03:59:07PM +0900, Chris Mi wrote:
> [...]
> > diff --git a/lib/libnetlink.c b/lib/libnetlink.c index
> > 00e6ce0c..ae0059f9 100644
> > --- a/lib/libnetlink.c
> > +++ b/lib/libnetlink.c
> > @@ -581,39 +581,43 @@ static void rtnl_talk_error(struct nlmsghdr *h,
> struct nlmsgerr *err,
> > strerror(-err->error));
> >  }
> >
> > -static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
> > -  struct nlmsghdr **answer,
> > -  bool show_rtnl_err, nl_ext_ack_fn_t errfn)
> > +static int __rtnl_talk_msg(struct rtnl_handle *rtnl, struct msghdr *m,
> > +  struct nlmsghdr **answer,
> > +  bool show_rtnl_err, nl_ext_ack_fn_t errfn)
> >  {
> > -   int status;
> > -   unsigned int seq;
> > -   struct nlmsghdr *h;
> > struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
> > -   struct iovec iov = {
> > -   .iov_base = n,
> > -   .iov_len = n->nlmsg_len
> > -   };
> > +   int i, status, iovlen = m->msg_iovlen;
> > +   struct iovec iov;
> > struct msghdr msg = {
> > .msg_name = ,
> > .msg_namelen = sizeof(nladdr),
> > .msg_iov = ,
> > .msg_iovlen = 1,
> > };
> > -   char *buf;
> > -
> > -   n->nlmsg_seq = seq = ++rtnl->seq;
> > +   unsigned int seq = 0;
> > +   struct nlmsghdr *h;
> >
> > -   if (answer == NULL)
> > -   n->nlmsg_flags |= NLM_F_ACK;
> > +   for (i = 0; i < iovlen; i++) {
> > +   struct iovec *v;
> > +   v = >msg_iov[i];
> > +   h = v->iov_base;
> > +   h->nlmsg_seq = seq = ++rtnl->seq;
> > +   if (answer == NULL)
> > +   h->nlmsg_flags |= NLM_F_ACK;
> > +   }
> >
> > -   status = sendmsg(rtnl->fd, , 0);
> > +   status = sendmsg(rtnl->fd, m, 0);
> > if (status < 0) {
> > perror("Cannot talk to rtnetlink");
> > return -1;
> > }
> >
> > +   i = 0;
> > while (1) {
> 
> for (i = 1; ; i++) ?
> 
> > +   char *buf;
> 
> Why did you move this declaration?
> 
> > +next:
> 
> Drop this and use 'continue' instead of 'goto next' below?
Actually there are two loops; I need to go to the outer while loop instead of the
inner for loop.
> 
> > status = rtnl_recvmsg(rtnl->fd, , );
> > +   ++i;
> >
> > if (status < 0)
> > return status;
> > @@ -642,7 +646,7 @@ static int __rtnl_talk(struct rtnl_handle *rtnl,
> > struct nlmsghdr *n,
> >
> > if (nladdr.nl_pid != 0 ||
> > h->nlmsg_pid != rtnl->local.nl_pid ||
> > -   h->nlmsg_seq != seq) {
> > +   h->nlmsg_seq > seq || h->nlmsg_seq < seq - iovlen)
> {
> > /* Don't forget to skip that message. */
> > status -= NLMSG_ALIGN(len);
> > h = (struct nlmsghdr *)((char *)h +
> NLMSG_ALIGN(len)); @@ -662,7
> > +666,10 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr
> *n,
> > *answer = (struct nlmsghdr
> *)buf;
> > else
> > free(buf);
> > -   return 0;
> > +   if (h->nlmsg_seq == seq)
> > +   return 0;
> > +   else
> > +   goto next;
> > }
> >
> > if (rtnl->proto != NETLINK_SOCK_DIAG &&
> 
> Cheers, Phil


RE: [patch iproute2 v7 2/2] tc: Add batchsize feature for filter and actions

2018-01-09 Thread Chris Mi
> -Original Message-
> From: Marcelo Ricardo Leitner [mailto:marcelo.leit...@gmail.com]
> Sent: Wednesday, January 10, 2018 3:14 AM
> To: Chris Mi <chr...@mellanox.com>
> Cc: netdev@vger.kernel.org; gerlitz...@gmail.com;
> step...@networkplumber.org; dsah...@gmail.com; p...@nwl.cc
> Subject: Re: [patch iproute2 v7 2/2] tc: Add batchsize feature for filter and
> actions
> 
> On Tue, Jan 09, 2018 at 03:59:08PM +0900, Chris Mi wrote:
> > Currently in tc batch mode, only one command is read from the batch
> > file and sent to kernel to process. With this support, at most 128
> > commands can be accumulated before sending to kernel.
> >
> > Now it only works for the following successive commands:
> > filter and actions add/delete/change/replace.
> >
> > Signed-off-by: Chris Mi <chr...@mellanox.com>
> > ---
> >  tc/m_action.c  |  60 +--
> >  tc/tc.c| 127
> ++---
> >  tc/tc_common.h |   5 ++-
> >  tc/tc_filter.c |  97 +--
> >  4 files changed, 210 insertions(+), 79 deletions(-)
> >
> > diff --git a/tc/m_action.c b/tc/m_action.c index fc422364..e5c53a80
> > 100644
> > --- a/tc/m_action.c
> > +++ b/tc/m_action.c
> > @@ -546,40 +546,56 @@ bad_val:
> > return ret;
> >  }
> >
> > +struct tc_action_req {
> > +   struct nlmsghdr n;
> > +   struct tcamsg   t;
> > +   charbuf[MAX_MSG];
> > +};
> > +
> >  static int tc_action_modify(int cmd, unsigned int flags,
> > -   int *argc_p, char ***argv_p)
> > +   int *argc_p, char ***argv_p,
> > +   void *buf)
> >  {
> > -   int argc = *argc_p;
> > +   struct tc_action_req *req, action_req;
> > char **argv = *argv_p;
> > +   struct rtattr *tail;
> > +   int argc = *argc_p;
> > +   struct iovec iov;
> > int ret = 0;
> > -   struct {
> > -   struct nlmsghdr n;
> > -   struct tcamsg   t;
> > -   charbuf[MAX_MSG];
> > -   } req = {
> > -   .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)),
> > -   .n.nlmsg_flags = NLM_F_REQUEST | flags,
> > -   .n.nlmsg_type = cmd,
> > -   .t.tca_family = AF_UNSPEC,
> > -   };
> > -   struct rtattr *tail = NLMSG_TAIL();
> > +
> > +   if (buf)
> > +   req = buf;
> > +   else
> > +   req = _req;
> > +
> > +   req->n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg));
> > +   req->n.nlmsg_flags = NLM_F_REQUEST | flags;
> > +   req->n.nlmsg_type = cmd;
> > +   req->t.tca_family = AF_UNSPEC;
> > +   tail = NLMSG_TAIL(>n);
> >
> > argc -= 1;
> > argv += 1;
> > -   if (parse_action(, , TCA_ACT_TAB, )) {
> > +   if (parse_action(, , TCA_ACT_TAB, >n)) {
> > fprintf(stderr, "Illegal \"action\"\n");
> > return -1;
> > }
> > -   tail->rta_len = (void *) NLMSG_TAIL() - (void *) tail;
> > +   tail->rta_len = (void *) NLMSG_TAIL(>n) - (void *) tail;
> > +
> > +   *argc_p = argc;
> > +   *argv_p = argv;
> >
> > -   if (rtnl_talk(, , NULL) < 0) {
> > +   iov.iov_base = >n;
> > +   iov.iov_len = req->n.nlmsg_len;
> > +
> > +   if (buf)
> > +   return 0;
> > +
> > +   if (rtnl_talk_iov(, , 1, NULL) < 0) {
> > fprintf(stderr, "We have an error talking to the kernel\n");
> > ret = -1;
> > }
> >
> > -   *argc_p = argc;
> > -   *argv_p = argv;
> > -
> > return ret;
> >  }
> >
> > @@ -679,7 +695,7 @@ bad_val:
> > return ret;
> >  }
> >
> > -int do_action(int argc, char **argv)
> > +int do_action(int argc, char **argv, void *buf)
> >  {
> >
> > int ret = 0;
> > @@ -689,12 +705,12 @@ int do_action(int argc, char **argv)
> > if (matches(*argv, "add") == 0) {
> > ret =  tc_action_modify(RTM_NEWACTION,
> > NLM_F_EXCL |
> NLM_F_CREATE,
> > -   , );
> > +   , , buf);
> > } else if (matches(*argv, "change") == 0 ||
> >   matches(*argv, "replace") == 0) {
> >  

[patch iproute2 v7 2/2] tc: Add batchsize feature for filter and actions

2018-01-08 Thread Chris Mi
Currently in tc batch mode, only one command is read from the batch
file and sent to kernel to process. With this support, at most 128
commands can be accumulated before sending to kernel.

Now it only works for the following successive commands:
filter and actions add/delete/change/replace.

Signed-off-by: Chris Mi <chr...@mellanox.com>
---
 tc/m_action.c  |  60 +--
 tc/tc.c| 127 ++---
 tc/tc_common.h |   5 ++-
 tc/tc_filter.c |  97 +--
 4 files changed, 210 insertions(+), 79 deletions(-)

diff --git a/tc/m_action.c b/tc/m_action.c
index fc422364..e5c53a80 100644
--- a/tc/m_action.c
+++ b/tc/m_action.c
@@ -546,40 +546,56 @@ bad_val:
return ret;
 }
 
+struct tc_action_req {
+   struct nlmsghdr n;
+   struct tcamsg   t;
+   charbuf[MAX_MSG];
+};
+
 static int tc_action_modify(int cmd, unsigned int flags,
-   int *argc_p, char ***argv_p)
+   int *argc_p, char ***argv_p,
+   void *buf)
 {
-   int argc = *argc_p;
+   struct tc_action_req *req, action_req;
char **argv = *argv_p;
+   struct rtattr *tail;
+   int argc = *argc_p;
+   struct iovec iov;
int ret = 0;
-   struct {
-   struct nlmsghdr n;
-   struct tcamsg   t;
-   charbuf[MAX_MSG];
-   } req = {
-   .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)),
-   .n.nlmsg_flags = NLM_F_REQUEST | flags,
-   .n.nlmsg_type = cmd,
-   .t.tca_family = AF_UNSPEC,
-   };
-   struct rtattr *tail = NLMSG_TAIL();
+
+   if (buf)
+   req = buf;
+   else
+   req = _req;
+
+   req->n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg));
+   req->n.nlmsg_flags = NLM_F_REQUEST | flags;
+   req->n.nlmsg_type = cmd;
+   req->t.tca_family = AF_UNSPEC;
+   tail = NLMSG_TAIL(>n);
 
argc -= 1;
argv += 1;
-   if (parse_action(, , TCA_ACT_TAB, )) {
+   if (parse_action(, , TCA_ACT_TAB, >n)) {
fprintf(stderr, "Illegal \"action\"\n");
return -1;
}
-   tail->rta_len = (void *) NLMSG_TAIL() - (void *) tail;
+   tail->rta_len = (void *) NLMSG_TAIL(>n) - (void *) tail;
+
+   *argc_p = argc;
+   *argv_p = argv;
 
-   if (rtnl_talk(, , NULL) < 0) {
+   iov.iov_base = >n;
+   iov.iov_len = req->n.nlmsg_len;
+
+   if (buf)
+   return 0;
+
+   if (rtnl_talk_iov(, , 1, NULL) < 0) {
fprintf(stderr, "We have an error talking to the kernel\n");
ret = -1;
}
 
-   *argc_p = argc;
-   *argv_p = argv;
-
return ret;
 }
 
@@ -679,7 +695,7 @@ bad_val:
return ret;
 }
 
-int do_action(int argc, char **argv)
+int do_action(int argc, char **argv, void *buf)
 {
 
int ret = 0;
@@ -689,12 +705,12 @@ int do_action(int argc, char **argv)
if (matches(*argv, "add") == 0) {
ret =  tc_action_modify(RTM_NEWACTION,
NLM_F_EXCL | NLM_F_CREATE,
-   , );
+   , , buf);
} else if (matches(*argv, "change") == 0 ||
  matches(*argv, "replace") == 0) {
ret = tc_action_modify(RTM_NEWACTION,
   NLM_F_CREATE | NLM_F_REPLACE,
-  , );
+  , , buf);
} else if (matches(*argv, "delete") == 0) {
argc -= 1;
argv += 1;
diff --git a/tc/tc.c b/tc/tc.c
index ad9f07e9..f32e4978 100644
--- a/tc/tc.c
+++ b/tc/tc.c
@@ -193,16 +193,16 @@ static void usage(void)
"-nm | -nam[es] | { -cf | -conf } 
path } | -j[son]\n");
 }
 
-static int do_cmd(int argc, char **argv)
+static int do_cmd(int argc, char **argv, void *buf)
 {
if (matches(*argv, "qdisc") == 0)
return do_qdisc(argc-1, argv+1);
if (matches(*argv, "class") == 0)
return do_class(argc-1, argv+1);
if (matches(*argv, "filter") == 0)
-   return do_filter(argc-1, argv+1);
+   return do_filter(argc-1, argv+1, buf);
if (matches(*argv, "actions") == 0)
-   return do_action(argc-1, argv+1);
+   return do_action(argc-1, argv+1, buf);
if (matches(*argv, "monitor") == 0)
re

[patch iproute2 v7 1/2] lib/libnetlink: Add functions rtnl_talk_msg and rtnl_talk_iov

2018-01-08 Thread Chris Mi
rtnl_talk can only send a single message to kernel. Add two functions
rtnl_talk_msg and rtnl_talk_iov that can send multiple messages to kernel.
rtnl_talk_msg takes struct msghdr * as argument.
rtnl_talk_iov takes struct iovec * and iovlen as arguments.

Signed-off-by: Chris Mi <chr...@mellanox.com>
---
 include/libnetlink.h |  6 
 lib/libnetlink.c | 84 
 2 files changed, 71 insertions(+), 19 deletions(-)

diff --git a/include/libnetlink.h b/include/libnetlink.h
index a4d83b9e..e9a63dbc 100644
--- a/include/libnetlink.h
+++ b/include/libnetlink.h
@@ -96,6 +96,12 @@ int rtnl_dump_filter_nc(struct rtnl_handle *rth,
 int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
  struct nlmsghdr **answer)
__attribute__((warn_unused_result));
+int rtnl_talk_msg(struct rtnl_handle *rtnl, struct msghdr *m,
+ struct nlmsghdr **answer)
+   __attribute__((warn_unused_result));
+int rtnl_talk_iov(struct rtnl_handle *rtnl, struct iovec *iovec, size_t iovlen,
+ struct nlmsghdr **answer)
+   __attribute__((warn_unused_result));
 int rtnl_talk_extack(struct rtnl_handle *rtnl, struct nlmsghdr *n,
  struct nlmsghdr **answer, nl_ext_ack_fn_t errfn)
__attribute__((warn_unused_result));
diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index 00e6ce0c..ae0059f9 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -581,39 +581,43 @@ static void rtnl_talk_error(struct nlmsghdr *h, struct 
nlmsgerr *err,
strerror(-err->error));
 }
 
-static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
-  struct nlmsghdr **answer,
-  bool show_rtnl_err, nl_ext_ack_fn_t errfn)
+static int __rtnl_talk_msg(struct rtnl_handle *rtnl, struct msghdr *m,
+  struct nlmsghdr **answer,
+  bool show_rtnl_err, nl_ext_ack_fn_t errfn)
 {
-   int status;
-   unsigned int seq;
-   struct nlmsghdr *h;
struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
-   struct iovec iov = {
-   .iov_base = n,
-   .iov_len = n->nlmsg_len
-   };
+   int i, status, iovlen = m->msg_iovlen;
+   struct iovec iov;
struct msghdr msg = {
.msg_name = ,
.msg_namelen = sizeof(nladdr),
.msg_iov = ,
.msg_iovlen = 1,
};
-   char *buf;
-
-   n->nlmsg_seq = seq = ++rtnl->seq;
+   unsigned int seq = 0;
+   struct nlmsghdr *h;
 
-   if (answer == NULL)
-   n->nlmsg_flags |= NLM_F_ACK;
+   for (i = 0; i < iovlen; i++) {
+   struct iovec *v;
+   v = >msg_iov[i];
+   h = v->iov_base;
+   h->nlmsg_seq = seq = ++rtnl->seq;
+   if (answer == NULL)
+   h->nlmsg_flags |= NLM_F_ACK;
+   }
 
-   status = sendmsg(rtnl->fd, , 0);
+   status = sendmsg(rtnl->fd, m, 0);
if (status < 0) {
perror("Cannot talk to rtnetlink");
return -1;
}
 
+   i = 0;
while (1) {
+   char *buf;
+next:
status = rtnl_recvmsg(rtnl->fd, , );
+   ++i;
 
if (status < 0)
return status;
@@ -642,7 +646,7 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct 
nlmsghdr *n,
 
if (nladdr.nl_pid != 0 ||
h->nlmsg_pid != rtnl->local.nl_pid ||
-   h->nlmsg_seq != seq) {
+   h->nlmsg_seq > seq || h->nlmsg_seq < seq - iovlen) {
/* Don't forget to skip that message. */
status -= NLMSG_ALIGN(len);
h = (struct nlmsghdr *)((char *)h + 
NLMSG_ALIGN(len));
@@ -662,7 +666,10 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct 
nlmsghdr *n,
*answer = (struct nlmsghdr 
*)buf;
else
free(buf);
-   return 0;
+   if (h->nlmsg_seq == seq)
+   return 0;
+   else
+   goto next;
}
 
if (rtnl->proto != NETLINK_SOCK_DIAG &&
@@ -671,7 +678,7 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct 
nlmsghdr *n,
 
errno = -err->error;
free(buf);
-   return -1;
+   return -i;

[patch iproute2 v7 0/2] tc: Add batchsize feature to batch mode

2018-01-08 Thread Chris Mi
Currently in tc batch mode, only one command is read from the batch
file and sent to kernel to process. With this patchset, at most 128
commands can be accumulated before sending to kernel.

We introduced two new functions in patch 1 to support sending
multiple messages. In patch 2, we add this support for filter and
actions add/delete/change/replace commands.

But please note that kernel still processes the requests one by one.
To process the requests in parallel in kernel is another effort.
The time we're saving in this patchset is the user mode and kernel mode
context switch. So this patchset works on top of the current kernel.

Using the following script in kernel, we can generate 1,000,000 rules.
tools/testing/selftests/tc-testing/tdc_batch.py

Without this patchset, 'tc -b $file' execution time is:

real    0m15.555s
user    0m7.211s
sys     0m8.284s

With this patchset, 'tc -b $file' execution time is:

real    0m13.562s
user    0m6.463s
sys     0m7.031s

The insertion rate is improved more than 10%.

v3
==
1. Instead of hacking function rtnl_talk directly, add a new function
   rtnl_talk_msg.
2. remove most of global variables to use parameter passing
3. divide the previous patch into 4 patches.

v4
==
1. Remove function setcmdlinetotal. Now in function batch, we read one
   more line to determine if we are reaching the end of file.
2. Remove function __rtnl_check_ack. Now __rtnl_talk calls __rtnl_talk_msg
   directly.
3. if (batch_size < 1)
batch_size = 1;

v5
==
1. Fix a bug where a batch file containing a blank line could not be handled.
2. Describe the limitation in man page.

v6
==
1. Add support for mixed commands.
2. Fix a bug where not all messages are acked if batch size > 1.

v7
==
1. We can tell exactly which command fails.
2. Add a new function rtnl_talk_iov
3. Allocate the memory in function batch() instead of each client.
4. Remove option -bs.


Chris Mi (2):
  lib/libnetlink: Add functions rtnl_talk_msg and rtnl_talk_iov
  tc: Add batchsize feature to batch mode

 include/libnetlink.h |   6 +++
 lib/libnetlink.c |  84 ++
 tc/m_action.c|  60 +++-
 tc/tc.c  | 127 +--
 tc/tc_common.h   |   5 +-
 tc/tc_filter.c   |  97 +++
 6 files changed, 281 insertions(+), 98 deletions(-)

-- 
2.14.3



RE: [patch iproute2 v6 2/3] tc: Add -bs option to batch mode

2018-01-08 Thread Chris Mi
> -Original Message-
> From: Marcelo Ricardo Leitner [mailto:marcelo.leit...@gmail.com]
> Sent: Saturday, January 6, 2018 3:15 AM
> To: David Ahern <dsah...@gmail.com>
> Cc: Chris Mi <chr...@mellanox.com>; netdev@vger.kernel.org;
> gerlitz...@gmail.com; step...@networkplumber.org
> Subject: Re: [patch iproute2 v6 2/3] tc: Add -bs option to batch mode
> 
> On Fri, Jan 05, 2018 at 11:15:59AM -0700, David Ahern wrote:
> > On 1/4/18 12:34 AM, Chris Mi wrote:
> > > Currently in tc batch mode, only one command is read from the batch
> > > file and sent to kernel to process. With this support, we can
> > > accumulate several commands before sending to kernel.
> > >
> > > Now it only works for the following successive rules, 1. filter add
> > > 2. filter delete 3. actions add 4. actions delete
> > >
> > > Otherwise, the batch size is still 1.
> > >
> > > Signed-off-by: Chris Mi <chr...@mellanox.com>
> > > ---
> > >  tc/m_action.c  |  93 ++--
> > >  tc/tc.c|  96 +++--
> > >  tc/tc_common.h |   8 +++-
> > >  tc/tc_filter.c | 132
> > > -
> > >  4 files changed, 252 insertions(+), 77 deletions(-)
> > >
> > > diff --git a/tc/m_action.c b/tc/m_action.c index fc422364..cf5cc95d
> > > 100644
> > > --- a/tc/m_action.c
> > > +++ b/tc/m_action.c
> > > @@ -23,6 +23,7 @@
> > >  #include 
> > >  #include 
> > >  #include 
> > > +#include 
> > >
> > >  #include "utils.h"
> > >  #include "tc_common.h"
> > > @@ -546,40 +547,86 @@ bad_val:
> > >   return ret;
> > >  }
> > >
> > > +typedef struct {
> > > + struct nlmsghdr n;
> > > + struct tcamsg   t;
> > > + charbuf[MAX_MSG];
> > > +} tc_action_req;
> > > +
> > > +static tc_action_req *action_reqs;
> > > +static struct iovec msg_iov[MSG_IOV_MAX];
> > > +
> > > +void free_action_reqs(void)
> > > +{
> > > + free(action_reqs);
> > > +}
> > > +
> > > +static tc_action_req *get_action_req(int batch_size, int index) {
> > > + tc_action_req *req;
> > > +
> > > + if (action_reqs == NULL) {
> > > + action_reqs = malloc(batch_size * sizeof (tc_action_req));
> > > + if (action_reqs == NULL)
> > > + return NULL;
> > > + }
> > > + req = _reqs[index];
> > > + memset(req, 0, sizeof (*req));
> > > +
> > > + return req;
> > > +}
> > > +
> > >  static int tc_action_modify(int cmd, unsigned int flags,
> > > - int *argc_p, char ***argv_p)
> > > + int *argc_p, char ***argv_p,
> > > + int batch_size, int index, bool send)
> > >  {
> > > - int argc = *argc_p;
> > > + struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
> > > + struct iovec *iov = _iov[index];
> > >   char **argv = *argv_p;
> > > - int ret = 0;
> > > - struct {
> > > - struct nlmsghdr n;
> > > - struct tcamsg   t;
> > > - charbuf[MAX_MSG];
> > > - } req = {
> > > - .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)),
> > > - .n.nlmsg_flags = NLM_F_REQUEST | flags,
> > > - .n.nlmsg_type = cmd,
> > > - .t.tca_family = AF_UNSPEC,
> > > + struct msghdr msg = {
> > > + .msg_name = ,
> > > + .msg_namelen = sizeof(nladdr),
> > > + .msg_iov = msg_iov,
> > > + .msg_iovlen = index + 1,
> > >   };
> > > - struct rtattr *tail = NLMSG_TAIL();
> > > + struct rtattr *tail;
> > > + tc_action_req *req;
> > > + int argc = *argc_p;
> > > + int ret = 0;
> > > +
> > > + req = get_action_req(batch_size, index);
> > > + if (req == NULL) {
> > > + fprintf(stderr, "get_action_req error: not enough buffer\n");
> > > + return -ENOMEM;
> > > + }
> > > + req->n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg));
> > > + req->n.nlmsg_flags = NLM_F_REQUEST | flags;
> > > + req->n.nlmsg_type = cmd;
> > > + req->t.tca_family = AF_UNSPEC;
> > > + tail = NLM

RE: [patch iproute2 v6 1/3] lib/libnetlink: Add a function rtnl_talk_msg

2018-01-08 Thread Chris Mi
> -Original Message-
> From: David Ahern [mailto:dsah...@gmail.com]
> Sent: Saturday, January 6, 2018 1:51 AM
> To: Chris Mi <chr...@mellanox.com>; netdev@vger.kernel.org
> Cc: gerlitz...@gmail.com; step...@networkplumber.org;
> marcelo.leit...@gmail.com
> Subject: Re: [patch iproute2 v6 1/3] lib/libnetlink: Add a function
> rtnl_talk_msg
> 
> On 1/4/18 12:34 AM, Chris Mi wrote:
> > rtnl_talk can only send a single message to kernel. Add a new function
> > rtnl_talk_msg that can send multiple messages to kernel.
> >
> > Signed-off-by: Chris Mi <chr...@mellanox.com>
> > ---
> >  include/libnetlink.h |  3 +++
> >  lib/libnetlink.c | 66 ++
> --
> >  2 files changed, 51 insertions(+), 18 deletions(-)
> >
> 
> I think you should add an argument to rtnl_talk_msg to return the number of
> messages processed. That can be used to refine which line failed. As batch
> size increases the current design puts the burden on the user to scan a lot of
> lines to find the one that fails:
> 
> tc -b tc.batch  -bs 50
> RTNETLINK answers: File exists
> We have an error talking to the kernel, -1 Command failed tc.batch:2-51
> 
> We should be able to tell them exactly which line failed.
Done.
> 
> Also, it would be better to call this rtnl_talk_iov, take an iov as an 
> argument
> and have a common rtnl_talk_msg for existing code and this new one.
> 
> As it stands you are having to add:
>struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
> 
> to tc functions when it really only needs to know about iov's.
Done.


RE: [patch iproute2 v6 0/3] tc: Add -bs option to batch mode

2018-01-08 Thread Chris Mi
> -Original Message-
> From: n...@orbyte.nwl.cc [mailto:n...@orbyte.nwl.cc] On Behalf Of Phil
> Sutter
> Sent: Monday, January 8, 2018 9:32 PM
> To: Chris Mi <chr...@mellanox.com>
> Cc: dsah...@gmail.com; marcelo.leit...@gmail.com;
> netdev@vger.kernel.org; gerlitz...@gmail.com;
> step...@networkplumber.org
> Subject: Re: [patch iproute2 v6 0/3] tc: Add -bs option to batch mode
> 
> Hi Chris,
> 
> On Mon, Jan 08, 2018 at 02:03:53AM +, Chris Mi wrote:
> > > On Thu, Jan 04, 2018 at 04:34:51PM +0900, Chris Mi wrote:
> > > > The insertion rate is improved more than 10%.
> > >
> > > Did you measure the effect of increasing batch sizes?
> > Yes. Even if we enlarge the batch size bigger than 10, there is no big
> improvement.
> > I think that's because current kernel doesn't process the requests in
> parallel.
> > If kernel processes the requests in parallel, I believe specifying a
> > bigger batch size will get a better result.
> 
> But throughput doesn't regress at some point, right? I think that's the 
> critical
> aspect when considering an "unlimited" batch size.
> 
> On Mon, Jan 08, 2018 at 08:00:00AM +, Chris Mi wrote:
> > After testing, I find that the message passed to kernel should not be too
> big.
> > If it is bigger than about 64K, sendmsg returns -1, errno is 90 (EMSGSIZE).
> > That is about 400 commands.  So how about set batch size to 128 which is
> big enough?
> 
> If that's the easiest way, why not. At first, I thought one could maybe send
> the collected messages in chunks of suitable size, but that's probably not
> worth the effort.
I ran a test. If we read a million commands into memory and then send them
in chunks of 128, we see a big regression: it takes about 21 seconds.




RE: [patch iproute2 v6 0/3] tc: Add -bs option to batch mode

2018-01-08 Thread Chris Mi
> -Original Message-
> From: Stephen Hemminger [mailto:step...@networkplumber.org]
> Sent: Monday, January 8, 2018 11:40 PM
> To: Chris Mi <chr...@mellanox.com>
> Cc: David Ahern <dsah...@gmail.com>; Phil Sutter <p...@nwl.cc>;
> marcelo.leit...@gmail.com; netdev@vger.kernel.org; gerlitz...@gmail.com
> Subject: Re: [patch iproute2 v6 0/3] tc: Add -bs option to batch mode
> 
> On Mon, 8 Jan 2018 08:00:00 +
> Chris Mi <chr...@mellanox.com> wrote:
> 
> > > >> I wonder whether specifying the batch size is necessary at all.
> > > >> Couldn't batch mode just collect messages until either EOF or an
> > > >> incompatible command is encountered which then triggers a commit
> > > >> to kernel? This might simplify code quite a bit.
> > > > That's a good suggestion.
> > >
> > > Thanks for your time on this, Chris.
> > After testing, I find that the message passed to kernel should not be too
> big.
> > If it is bigger than about 64K, sendmsg returns -1, errno is 90 (EMSGSIZE).
> > That is about 400 commands.  So how about set batch size to 128 which is
> big enough?
> 
> 
> Use sendmmsg?
Maybe we can try that, but there is also a limit on it.


RE: [patch iproute2 v6 0/3] tc: Add -bs option to batch mode

2018-01-08 Thread Chris Mi
> -Original Message-
> From: n...@orbyte.nwl.cc [mailto:n...@orbyte.nwl.cc] On Behalf Of Phil
> Sutter
> Sent: Monday, January 8, 2018 9:32 PM
> To: Chris Mi <chr...@mellanox.com>
> Cc: dsah...@gmail.com; marcelo.leit...@gmail.com;
> netdev@vger.kernel.org; gerlitz...@gmail.com;
> step...@networkplumber.org
> Subject: Re: [patch iproute2 v6 0/3] tc: Add -bs option to batch mode
> 
> Hi Chris,
> 
> On Mon, Jan 08, 2018 at 02:03:53AM +, Chris Mi wrote:
> > > On Thu, Jan 04, 2018 at 04:34:51PM +0900, Chris Mi wrote:
> > > > The insertion rate is improved more than 10%.
> > >
> > > Did you measure the effect of increasing batch sizes?
> > Yes. Even if we enlarge the batch size bigger than 10, there is no big
> improvement.
> > I think that's because current kernel doesn't process the requests in
> parallel.
> > If kernel processes the requests in parallel, I believe specifying a
> > bigger batch size will get a better result.
> 
> But throughput doesn't regress at some point, right? I think that's the 
> critical
> aspect when considering an "unlimited" batch size.
Yes.
> 
> On Mon, Jan 08, 2018 at 08:00:00AM +, Chris Mi wrote:
> > After testing, I find that the message passed to kernel should not be too
> big.
> > If it is bigger than about 64K, sendmsg returns -1, errno is 90 (EMSGSIZE).
> > That is about 400 commands.  So how about set batch size to 128 which is
> big enough?
> 
> If that's the easiest way, why not. At first, I thought one could maybe send
> the collected messages in chunks of suitable size, but that's probably not
> worth the effort.
OK.

-Chris


RE: [patch iproute2 v6 0/3] tc: Add -bs option to batch mode

2018-01-08 Thread Chris Mi
> >> I wonder whether specifying the batch size is necessary at all.
> >> Couldn't batch mode just collect messages until either EOF or an
> >> incompatible command is encountered which then triggers a commit to
> >> kernel? This might simplify code quite a bit.
> > That's a good suggestion.
> 
> Thanks for your time on this, Chris.
After testing, I find that the message passed to the kernel should not be too big.
If it is bigger than about 64K, sendmsg returns -1 and errno is 90 (EMSGSIZE).
That is about 400 commands. So how about setting the batch size to 128, which
is big enough?


RE: [patch iproute2 v6 0/3] tc: Add -bs option to batch mode

2018-01-07 Thread Chris Mi
> -Original Message-
> From: n...@orbyte.nwl.cc [mailto:n...@orbyte.nwl.cc] On Behalf Of Phil
> Sutter
> Sent: Saturday, January 6, 2018 1:25 AM
> To: Chris Mi <chr...@mellanox.com>
> Cc: netdev@vger.kernel.org; gerlitz...@gmail.com;
> step...@networkplumber.org; dsah...@gmail.com;
> marcelo.leit...@gmail.com
> Subject: Re: [patch iproute2 v6 0/3] tc: Add -bs option to batch mode
> 
> Hi Chris,
> 
> On Thu, Jan 04, 2018 at 04:34:51PM +0900, Chris Mi wrote:
> > Currently in tc batch mode, only one command is read from the batch
> > file and sent to kernel to process. With this patchset, we can
> > accumulate several commands before sending to kernel. The batch size
> > is specified using option -bs or -batchsize.
> >
> > To accumulate the commands in tc, client should allocate an array of
> > struct iovec. If batchsize is bigger than 1, only after the client has
> > accumulated enough commands, can the client call rtnl_talk_msg to send
> > the message that includes the iov array. One exception is that there
> > is no more command in the batch file.
> >
> > But please note that kernel still processes the requests one by one.
> > To process the requests in parallel in kernel is another effort.
> > The time we're saving in this patchset is the user mode and kernel
> > mode context switch. So this patchset works on top of the current kernel.
> >
> > Using the following script in kernel, we can generate 1,000,000 rules.
> > tools/testing/selftests/tc-testing/tdc_batch.py
> >
> > Without this patchset, 'tc -b $file' exection time is:
> >
> > real0m15.555s
> > user0m7.211s
> > sys 0m8.284s
> >
> > With this patchset, 'tc -b $file -bs 10' exection time is:
> >
> > real0m13.043s
> > user0m6.479s
> > sys 0m6.504s
> >
> > The insertion rate is improved more than 10%.
> 
> Did you measure the effect of increasing batch sizes?
Yes. Even if we increase the batch size beyond 10, there is no big improvement.
I think that's because the current kernel doesn't process the requests in parallel.
If the kernel processed the requests in parallel, I believe specifying a bigger
batch size would give a better result.
> 
> I wonder whether specifying the batch size is necessary at all. Couldn't batch
> mode just collect messages until either EOF or an incompatible command is
> encountered which then triggers a commit to kernel? This might simplify
> code quite a bit.
That's a good suggestion.

-Chris
> 
> Cheers, Phil


[patch iproute2 v6 2/3] tc: Add -bs option to batch mode

2018-01-03 Thread Chris Mi
Currently in tc batch mode, only one command is read from the batch
file and sent to kernel to process. With this support, we can accumulate
several commands before sending to kernel.

Now it only works for the following successive rules,
1. filter add
2. filter delete
3. actions add
4. actions delete

Otherwise, the batch size is still 1.

Signed-off-by: Chris Mi <chr...@mellanox.com>
---
 tc/m_action.c  |  93 ++--
 tc/tc.c|  96 +++--
 tc/tc_common.h |   8 +++-
 tc/tc_filter.c | 132 -
 4 files changed, 252 insertions(+), 77 deletions(-)

diff --git a/tc/m_action.c b/tc/m_action.c
index fc422364..cf5cc95d 100644
--- a/tc/m_action.c
+++ b/tc/m_action.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "utils.h"
 #include "tc_common.h"
@@ -546,40 +547,86 @@ bad_val:
return ret;
 }
 
+typedef struct {
+   struct nlmsghdr n;
+   struct tcamsg   t;
+   charbuf[MAX_MSG];
+} tc_action_req;
+
+static tc_action_req *action_reqs;
+static struct iovec msg_iov[MSG_IOV_MAX];
+
+void free_action_reqs(void)
+{
+   free(action_reqs);
+}
+
+static tc_action_req *get_action_req(int batch_size, int index)
+{
+   tc_action_req *req;
+
+   if (action_reqs == NULL) {
+   action_reqs = malloc(batch_size * sizeof (tc_action_req));
+   if (action_reqs == NULL)
+   return NULL;
+   }
+   req = _reqs[index];
+   memset(req, 0, sizeof (*req));
+
+   return req;
+}
+
 static int tc_action_modify(int cmd, unsigned int flags,
-   int *argc_p, char ***argv_p)
+   int *argc_p, char ***argv_p,
+   int batch_size, int index, bool send)
 {
-   int argc = *argc_p;
+   struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
+   struct iovec *iov = _iov[index];
char **argv = *argv_p;
-   int ret = 0;
-   struct {
-   struct nlmsghdr n;
-   struct tcamsg   t;
-   charbuf[MAX_MSG];
-   } req = {
-   .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)),
-   .n.nlmsg_flags = NLM_F_REQUEST | flags,
-   .n.nlmsg_type = cmd,
-   .t.tca_family = AF_UNSPEC,
+   struct msghdr msg = {
+   .msg_name = ,
+   .msg_namelen = sizeof(nladdr),
+   .msg_iov = msg_iov,
+   .msg_iovlen = index + 1,
};
-   struct rtattr *tail = NLMSG_TAIL();
+   struct rtattr *tail;
+   tc_action_req *req;
+   int argc = *argc_p;
+   int ret = 0;
+
+   req = get_action_req(batch_size, index);
+   if (req == NULL) {
+   fprintf(stderr, "get_action_req error: not enough buffer\n");
+   return -ENOMEM;
+   }
+   req->n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg));
+   req->n.nlmsg_flags = NLM_F_REQUEST | flags;
+   req->n.nlmsg_type = cmd;
+   req->t.tca_family = AF_UNSPEC;
+   tail = NLMSG_TAIL(>n);
 
argc -= 1;
argv += 1;
-   if (parse_action(, , TCA_ACT_TAB, )) {
+   if (parse_action(, , TCA_ACT_TAB, >n)) {
fprintf(stderr, "Illegal \"action\"\n");
return -1;
}
-   tail->rta_len = (void *) NLMSG_TAIL() - (void *) tail;
+   tail->rta_len = (void *) NLMSG_TAIL(>n) - (void *) tail;
 
-   if (rtnl_talk(, , NULL) < 0) {
+   *argc_p = argc;
+   *argv_p = argv;
+
+   iov->iov_base = >n;
+   iov->iov_len = req->n.nlmsg_len;
+
+   if (!send)
+   return 0;
+
+   if (rtnl_talk_msg(, , NULL) < 0) {
fprintf(stderr, "We have an error talking to the kernel\n");
ret = -1;
}
 
-   *argc_p = argc;
-   *argv_p = argv;
-
return ret;
 }
 
@@ -679,7 +726,7 @@ bad_val:
return ret;
 }
 
-int do_action(int argc, char **argv)
+int do_action(int argc, char **argv, int batch_size, int index, bool send)
 {
 
int ret = 0;
@@ -689,12 +736,14 @@ int do_action(int argc, char **argv)
if (matches(*argv, "add") == 0) {
ret =  tc_action_modify(RTM_NEWACTION,
NLM_F_EXCL | NLM_F_CREATE,
-   , );
+   , , batch_size,
+   index, send);
} else if (matches(*argv, "change") == 0 ||
  matches(*argv, "replace") == 0

[patch iproute2 v6 1/3] lib/libnetlink: Add a function rtnl_talk_msg

2018-01-03 Thread Chris Mi
rtnl_talk can only send a single message to kernel. Add a new function
rtnl_talk_msg that can send multiple messages to kernel.

Signed-off-by: Chris Mi <chr...@mellanox.com>
---
 include/libnetlink.h |  3 +++
 lib/libnetlink.c | 66 ++--
 2 files changed, 51 insertions(+), 18 deletions(-)

diff --git a/include/libnetlink.h b/include/libnetlink.h
index a4d83b9e..01d98b16 100644
--- a/include/libnetlink.h
+++ b/include/libnetlink.h
@@ -96,6 +96,9 @@ int rtnl_dump_filter_nc(struct rtnl_handle *rth,
 int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
  struct nlmsghdr **answer)
__attribute__((warn_unused_result));
+int rtnl_talk_msg(struct rtnl_handle *rtnl, struct msghdr *m,
+ struct nlmsghdr **answer)
+   __attribute__((warn_unused_result));
 int rtnl_talk_extack(struct rtnl_handle *rtnl, struct nlmsghdr *n,
  struct nlmsghdr **answer, nl_ext_ack_fn_t errfn)
__attribute__((warn_unused_result));
diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index 00e6ce0c..49ee1208 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -581,38 +581,40 @@ static void rtnl_talk_error(struct nlmsghdr *h, struct 
nlmsgerr *err,
strerror(-err->error));
 }
 
-static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
-  struct nlmsghdr **answer,
-  bool show_rtnl_err, nl_ext_ack_fn_t errfn)
+static int __rtnl_talk_msg(struct rtnl_handle *rtnl, struct msghdr *m,
+  struct nlmsghdr **answer,
+  bool show_rtnl_err, nl_ext_ack_fn_t errfn)
 {
-   int status;
-   unsigned int seq;
-   struct nlmsghdr *h;
struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
-   struct iovec iov = {
-   .iov_base = n,
-   .iov_len = n->nlmsg_len
-   };
+   int i, status, iovlen = m->msg_iovlen;
+   unsigned int seq = 0;
+   struct nlmsghdr *h;
+   struct iovec iov;
struct msghdr msg = {
.msg_name = ,
.msg_namelen = sizeof(nladdr),
.msg_iov = ,
.msg_iovlen = 1,
};
-   char *buf;
-
-   n->nlmsg_seq = seq = ++rtnl->seq;
 
-   if (answer == NULL)
-   n->nlmsg_flags |= NLM_F_ACK;
+   for (i = 0; i < iovlen; i++) {
+   struct iovec *v;
+   v = >msg_iov[i];
+   h = v->iov_base;
+   h->nlmsg_seq = seq = ++rtnl->seq;
+   if (answer == NULL)
+   h->nlmsg_flags |= NLM_F_ACK;
+   }
 
-   status = sendmsg(rtnl->fd, , 0);
+   status = sendmsg(rtnl->fd, m, 0);
if (status < 0) {
perror("Cannot talk to rtnetlink");
return -1;
}
 
while (1) {
+   char *buf;
+next:
status = rtnl_recvmsg(rtnl->fd, , );
 
if (status < 0)
@@ -642,7 +644,7 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct 
nlmsghdr *n,
 
if (nladdr.nl_pid != 0 ||
h->nlmsg_pid != rtnl->local.nl_pid ||
-   h->nlmsg_seq != seq) {
+   h->nlmsg_seq > seq || h->nlmsg_seq < seq - iovlen) {
/* Don't forget to skip that message. */
status -= NLMSG_ALIGN(len);
h = (struct nlmsghdr *)((char *)h + 
NLMSG_ALIGN(len));
@@ -662,7 +664,10 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct 
nlmsghdr *n,
*answer = (struct nlmsghdr 
*)buf;
else
free(buf);
-   return 0;
+   if (h->nlmsg_seq == seq)
+   return 0;
+   else
+   goto next;
}
 
if (rtnl->proto != NETLINK_SOCK_DIAG &&
@@ -698,12 +703,37 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct 
nlmsghdr *n,
}
 }
 
+static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
+  struct nlmsghdr **answer,
+  bool show_rtnl_err, nl_ext_ack_fn_t errfn)
+{
+   struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
+   struct iovec iov = {
+   .iov_base = n,
+   .iov_len = n->nlmsg_len
+   };
+   struct msghdr msg = {
+   .msg_name = ,
+   .msg_namelen = sizeof(nladdr),
+   .msg_iov = ,
+   .msg_iovlen = 1,
+   };
+
+   r

[patch iproute2 v6 0/3] tc: Add -bs option to batch mode

2018-01-03 Thread Chris Mi
Currently in tc batch mode, only one command is read from the batch
file and sent to kernel to process. With this patchset, we can accumulate
several commands before sending to kernel. The batch size is specified
using option -bs or -batchsize.

To accumulate the commands in tc, client should allocate an array of
struct iovec. If batchsize is bigger than 1, only after the client
has accumulated enough commands, can the client call rtnl_talk_msg
to send the message that includes the iov array. One exception is
that there is no more command in the batch file.

But please note that kernel still processes the requests one by one.
To process the requests in parallel in kernel is another effort.
The time we're saving in this patchset is the user mode and kernel mode
context switch. So this patchset works on top of the current kernel.

Using the following script in kernel, we can generate 1,000,000 rules.
tools/testing/selftests/tc-testing/tdc_batch.py

Without this patchset, 'tc -b $file' execution time is:

real    0m15.555s
user    0m7.211s
sys     0m8.284s

With this patchset, 'tc -b $file -bs 10' execution time is:

real    0m13.043s
user    0m6.479s
sys     0m6.504s

The insertion rate is improved more than 10%.

In this patchset, we still ack for every rule. If we don't ack at all,
'tc -b $file' execution time is:

real    0m14.748s
user    0m6.944s
sys     0m7.740s

'tc -b $file -bs 10' execution time is:

real    0m12.535s
user    0m6.587s
sys     0m5.888s

We can see that the performance win comes from sending multiple messages
at once rather than from skipping the acks. I think that's because in tc,
we don't spend much time processing the ack messages.


v3
==
1. Instead of hacking function rtnl_talk directly, add a new function
   rtnl_talk_msg.
2. remove most of global variables to use parameter passing
3. divide the previous patch into 4 patches.

v4
==
1. Remove function setcmdlinetotal. Now in function batch, we read one
   more line to determine if we are reaching the end of file.
2. Remove function __rtnl_check_ack. Now __rtnl_talk calls __rtnl_talk_msg
   directly.
3. if (batch_size < 1)
batch_size = 1;

v5
==
1. Fix a bug that can't deal with batch file with blank line.
2. Describe the limitation in man page.

v6
==
1. Add support for mixed commands.
2. Fix a bug that not all messages are acked if batch size > 1.


Chris Mi (3):
  lib/libnetlink: Add a function rtnl_talk_msg
  tc: Add -bs option to batch mode
  man: Add -bs option to tc manpage


 include/libnetlink.h |   3 ++
 lib/libnetlink.c |  66 +++---
 man/man8/tc.8|   7 +++
 tc/m_action.c|  93 +++-
 tc/tc.c  |  96 -
 tc/tc_common.h   |   8 +++-
 tc/tc_filter.c   | 132 +++
 7 files changed, 310 insertions(+), 95 deletions(-)

-- 
2.14.3



[patch iproute2 v6 3/3] man: Add -bs option to tc manpage

2018-01-03 Thread Chris Mi
Signed-off-by: Chris Mi <chr...@mellanox.com>
---
 man/man8/tc.8 | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/man/man8/tc.8 b/man/man8/tc.8
index ff071b33..23db730c 100644
--- a/man/man8/tc.8
+++ b/man/man8/tc.8
@@ -601,6 +601,13 @@ must exist already.
 read commands from provided file or standard input and invoke them.
 First failure will cause termination of tc.
 
+.TP
+.BR "\-bs", " \-bs size", " \-batchsize", " \-batchsize size"
+How many commands are accumulated before sending to kernel.
+By default, it is 1. It only takes effect in batch mode.
+Only successive rules of filter add and delete are supported.
+Otherwise, batch size is still 1.
+
 .TP
 .BR "\-force"
 don't terminate tc on errors in batch mode.
-- 
2.14.3



Re: [patch iproute2 v5 2/3] tc: Add -bs option to batch mode

2018-01-03 Thread Chris Mi

 2018/1/3 12:25, David Ahern:

You need a patch description here ...

Done.


On 1/2/18 7:55 PM, Chris Mi wrote:

  static int tc_action_modify(int cmd, unsigned int flags,
-   int *argc_p, char ***argv_p)
+   int *argc_p, char ***argv_p,
+   int batch_size, int index, bool send)
  {
int argc = *argc_p;
char **argv = *argv_p;
int ret = 0;
-   struct {
-   struct nlmsghdr n;
-   struct tcamsg   t;
-   charbuf[MAX_MSG];
-   } req = {
-   .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)),
-   .n.nlmsg_flags = NLM_F_REQUEST | flags,
-   .n.nlmsg_type = cmd,
-   .t.tca_family = AF_UNSPEC,
+   tc_action_req *req;
+   struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
+   struct iovec *iov = _iov[index];

Reverse xmas tree is the coding standard for net code. Please check all
new code to conform to this standard.

Done.


I have not reviewed all of this patch, but I firmly believe the batching
size option needs to be able to handle a file with mixed commands. Your use
case is filter and action adds and deletes, but you should allow users
(e.g., test suites) to benefit from this performance speed up with test
cases that have single files with all of the commands.
Done. There is a small performance penalty, but I think that is better
than a segfault.


For example,
$ cat tc.batch
qdisc add dev eth2 ingress
filter add dev eth2 ingress protocol ip pref 21 flower dst_ip
192.168.1.0/16 action drop
filter add dev eth2 ingress protocol ip pref 22 flower dst_ip
192.168.2.0/16 action drop
filter add dev eth2 ingress protocol ip pref 23 flower dst_ip
192.168.3.0/16 action drop
filter add dev eth2 ingress protocol ip pref 24 flower dst_ip
192.168.4.0/16 action drop
filter add dev eth2 ingress protocol ip pref 25 flower dst_ip
192.168.5.0/16 action drop
qdisc del dev eth2 ingress

(and consider this to be a huge file to really stress tc code paths for
example). Right now, the above file fails:

$ tc -b tc.batch -bs 5
Segmentation fault


Also, your changes fail to break out on an error:

$ tc -b tc.batch -bs 1
RTNETLINK answers: File exists
We have an error talking to the kernel, -1
RTNETLINK answers: File exists
We have an error talking to the kernel, -1
RTNETLINK answers: File exists
We have an error talking to the kernel, -1
RTNETLINK answers: File exists
We have an error talking to the kernel, -1
RTNETLINK answers: File exists
We have an error talking to the kernel, -1

whereas the existing command does this:
$ tc -b tc.batch
RTNETLINK answers: File exists
We have an error talking to the kernel
Command failed tc.batch:1




Re: [patch iproute2 v5 1/3] lib/libnetlink: Add a function rtnl_talk_msg

2018-01-03 Thread Chris Mi

2018/1/3 12:08, David Ahern:

On 1/2/18 7:55 PM, Chris Mi wrote:

diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index 00e6ce0c..cc02a139 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -581,32 +581,34 @@ static void rtnl_talk_error(struct nlmsghdr *h, struct 
nlmsgerr *err,
strerror(-err->error));
  }
  
-static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,

-  struct nlmsghdr **answer,
-  bool show_rtnl_err, nl_ext_ack_fn_t errfn)
+static int __rtnl_talk_msg(struct rtnl_handle *rtnl, struct msghdr *m,
+  struct nlmsghdr **answer,
+  bool show_rtnl_err, nl_ext_ack_fn_t errfn)
  {
-   int status;
-   unsigned int seq;
-   struct nlmsghdr *h;
+   int iovlen = m->msg_iovlen;
+   unsigned int seq = 0;
+   int i, status;
+   char *buf;
+
struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
-   struct iovec iov = {
-   .iov_base = n,
-   .iov_len = n->nlmsg_len
-   };
+   struct iovec iov, *v;
+   struct nlmsghdr *h;
struct msghdr msg = {
.msg_name = ,
.msg_namelen = sizeof(nladdr),
.msg_iov = ,
.msg_iovlen = 1,
};
-   char *buf;

Reverse xmas tree is the coding standard for net code. Please adhere to
it. Only dependencies between variables are an acceptable exception.

OK, got it.


Some of those (struct nlmsghdr *h and struct iovec *v) can be moved to
the for loop which aligns with your intentions of grouping variables.

Done.


  
-	n->nlmsg_seq = seq = ++rtnl->seq;

-
-   if (answer == NULL)
-   n->nlmsg_flags |= NLM_F_ACK;
+   for (i = 0; i < iovlen; i++) {
+   v = >msg_iov[i];
+   h = v->iov_base;
+   h->nlmsg_seq = seq = ++rtnl->seq;

doesn't seq need to track the recvmsg loop? I think for batching you
want it to start at the first seq number and then in the recvmsg loop
increment it.

Yes, it is a bug. Thanks for your test case.


As it stands this file:
$ cat tc.batch
filter add dev eth2 ingress protocol ip pref 21 flower dst_ip
192.168.1.0/16 action drop
filter add dev eth2 ingress protocol ip pref 22 flower dst_ip
192.168.2.0/16 action drop
filter add dev eth2 ingress protocol ip pref 22 flower dst_ip
192.168.3.0/16 action drop
filter add dev eth2 ingress protocol ip pref 24 flower dst_ip
192.168.4.0/16 action drop
filter add dev eth2 ingress protocol ip pref 25 flower dst_ip
192.168.5.0/16 action drop

does not give me an error message:
$ tc -b tc.batch -bs 5


Yet it failed to insert all filters:
$ tc filter show dev eth2 ingress
filter protocol ip pref 21 flower chain 0
filter protocol ip pref 21 flower chain 0 handle 0x1
   eth_type ipv4
   dst_ip 192.168.1.0/16
   not_in_hw
action order 1: gact action drop
 random type none pass val 0
 index 1 ref 1 bind 1

filter protocol ip pref 22 flower chain 0
filter protocol ip pref 22 flower chain 0 handle 0x1
   eth_type ipv4
   dst_ip 192.168.2.0/16
   not_in_hw
action order 1: gact action drop
 random type none pass val 0
 index 2 ref 1 bind 1

filter protocol ip pref 24 flower chain 0
filter protocol ip pref 24 flower chain 0 handle 0x1
   eth_type ipv4
   dst_ip 192.168.4.0/16
   not_in_hw
action order 1: gact action drop
 random type none pass val 0
 index 3 ref 1 bind 1

filter protocol ip pref 25 flower chain 0
filter protocol ip pref 25 flower chain 0 handle 0x1
   eth_type ipv4
   dst_ip 192.168.5.0/16
   not_in_hw
action order 1: gact action drop
 random type none pass val 0
 index 4 ref 1 bind 1


After fixing it, the test result is:

# tc -b tc.batch -bs 5
RTNETLINK answers: File exists
We have an error talking to the kernel, -1
Command failed 1.txt:0-4

We can't tell exactly which command causes this error, so we give a 
range which is less than the batch size.


[patch iproute2 v5 3/3] man: Add -bs option to tc manpage

2018-01-02 Thread Chris Mi
Signed-off-by: Chris Mi <chr...@mellanox.com>
---
 man/man8/tc.8 | 9 +
 1 file changed, 9 insertions(+)

diff --git a/man/man8/tc.8 b/man/man8/tc.8
index ff071b33..7338ed3b 100644
--- a/man/man8/tc.8
+++ b/man/man8/tc.8
@@ -601,6 +601,15 @@ must exist already.
 read commands from provided file or standard input and invoke them.
 First failure will cause termination of tc.
 
+.TP
+.BR "\-bs", " \-bs size", " \-batchsize", " \-batchsize size"
+How many commands are accumulated before sending to kernel.
+By default, it is 1. It only takes effect in batch mode.
+Currently, it only supports filter add or actions add.
+If there are mixed commands in the batch file, the result is unpredictable.
+And there is a limitation that the last line in the batch file should not be 
blank.
+Or you will lose at most batchsize - 1 rules.
+
 .TP
 .BR "\-force"
 don't terminate tc on errors in batch mode.
-- 
2.14.3



[patch iproute2 v5 1/3] lib/libnetlink: Add a function rtnl_talk_msg

2018-01-02 Thread Chris Mi
rtnl_talk can only send a single message to kernel. Add a new function
rtnl_talk_msg that can send multiple messages to kernel.

Signed-off-by: Chris Mi <chr...@mellanox.com>
---
 include/libnetlink.h |  3 +++
 lib/libnetlink.c | 59 ++--
 2 files changed, 46 insertions(+), 16 deletions(-)

diff --git a/include/libnetlink.h b/include/libnetlink.h
index a4d83b9e..01d98b16 100644
--- a/include/libnetlink.h
+++ b/include/libnetlink.h
@@ -96,6 +96,9 @@ int rtnl_dump_filter_nc(struct rtnl_handle *rth,
 int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
  struct nlmsghdr **answer)
__attribute__((warn_unused_result));
+int rtnl_talk_msg(struct rtnl_handle *rtnl, struct msghdr *m,
+ struct nlmsghdr **answer)
+   __attribute__((warn_unused_result));
 int rtnl_talk_extack(struct rtnl_handle *rtnl, struct nlmsghdr *n,
  struct nlmsghdr **answer, nl_ext_ack_fn_t errfn)
__attribute__((warn_unused_result));
diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index 00e6ce0c..cc02a139 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -581,32 +581,34 @@ static void rtnl_talk_error(struct nlmsghdr *h, struct 
nlmsgerr *err,
strerror(-err->error));
 }
 
-static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
-  struct nlmsghdr **answer,
-  bool show_rtnl_err, nl_ext_ack_fn_t errfn)
+static int __rtnl_talk_msg(struct rtnl_handle *rtnl, struct msghdr *m,
+  struct nlmsghdr **answer,
+  bool show_rtnl_err, nl_ext_ack_fn_t errfn)
 {
-   int status;
-   unsigned int seq;
-   struct nlmsghdr *h;
+   int iovlen = m->msg_iovlen;
+   unsigned int seq = 0;
+   int i, status;
+   char *buf;
+
struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
-   struct iovec iov = {
-   .iov_base = n,
-   .iov_len = n->nlmsg_len
-   };
+   struct iovec iov, *v;
+   struct nlmsghdr *h;
struct msghdr msg = {
.msg_name = ,
.msg_namelen = sizeof(nladdr),
.msg_iov = ,
.msg_iovlen = 1,
};
-   char *buf;
 
-   n->nlmsg_seq = seq = ++rtnl->seq;
-
-   if (answer == NULL)
-   n->nlmsg_flags |= NLM_F_ACK;
+   for (i = 0; i < iovlen; i++) {
+   v = >msg_iov[i];
+   h = v->iov_base;
+   h->nlmsg_seq = seq = ++rtnl->seq;
+   if (answer == NULL)
+   h->nlmsg_flags |= NLM_F_ACK;
+   }
 
-   status = sendmsg(rtnl->fd, , 0);
+   status = sendmsg(rtnl->fd, m, 0);
if (status < 0) {
perror("Cannot talk to rtnetlink");
return -1;
@@ -698,12 +700,37 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct 
nlmsghdr *n,
}
 }
 
+static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
+  struct nlmsghdr **answer,
+  bool show_rtnl_err, nl_ext_ack_fn_t errfn)
+{
+   struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
+   struct iovec iov = {
+   .iov_base = n,
+   .iov_len = n->nlmsg_len
+   };
+   struct msghdr msg = {
+   .msg_name = ,
+   .msg_namelen = sizeof(nladdr),
+   .msg_iov = ,
+   .msg_iovlen = 1,
+   };
+
+   return __rtnl_talk_msg(rtnl, , answer, show_rtnl_err, errfn);
+}
+
 int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
  struct nlmsghdr **answer)
 {
return __rtnl_talk(rtnl, n, answer, true, NULL);
 }
 
+int rtnl_talk_msg(struct rtnl_handle *rtnl, struct msghdr *m,
+ struct nlmsghdr **answer)
+{
+   return __rtnl_talk_msg(rtnl, m, answer, true, NULL);
+}
+
 int rtnl_talk_extack(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 struct nlmsghdr **answer,
 nl_ext_ack_fn_t errfn)
-- 
2.14.3



[patch iproute2 v5 0/3] tc: Add -bs option to batch mode

2018-01-02 Thread Chris Mi
Currently in tc batch mode, only one command is read from the batch
file and sent to kernel to process. With this patchset, we can accumulate
several commands before sending to kernel. The batch size is specified
using option -bs or -batchsize.

To accumulate the commands in tc, client should allocate an array of
struct iovec. If batchsize is bigger than 1, only after the client
has accumulated enough commands, can the client call rtnl_talk_msg
to send the message that includes the iov array. The one exception is
when there are no more commands in the batch file.

But please note that kernel still processes the requests one by one.
To process the requests in parallel in kernel is another effort.
The time we're saving in this patchset is the user mode and kernel mode
context switch. So this patchset works on top of the current kernel.

Using the following script in kernel, we can generate 1,000,000 rules.
tools/testing/selftests/tc-testing/tdc_batch.py

Without this patchset, 'tc -b $file' execution time is:

real    0m15.125s
user    0m6.982s
sys     0m8.080s

With this patchset, 'tc -b $file -bs 10' execution time is:

real    0m12.772s
user    0m5.984s
sys     0m6.723s

The insertion rate is improved more than 10%.

In this patchset, we still ack for every rule. If we don't ack at all,

'tc -b $file' execution time is:

real    0m14.484s
user    0m6.919s
sys     0m7.498s

'tc -b $file -bs 10' execution time is:

real    0m11.664s
user    0m6.017s
sys     0m5.578s

We can see that the performance win comes from sending multiple messages
at once rather than from skipping acks. I think that's because in tc, we
don't spend much time processing the ack messages.


v3
==
1. Instead of hacking function rtnl_talk directly, add a new function
   rtnl_talk_msg.
2. remove most of global variables to use parameter passing
3. divide the previous patch into 4 patches.

v4
==
1. Remove function setcmdlinetotal. Now in function batch, we read one
   more line to determine if we are reaching the end of file.
2. Remove function __rtnl_check_ack. Now __rtnl_talk calls __rtnl_talk_msg
   directly.
3. if (batch_size < 1)
batch_size = 1;

v5
==
1. Fix a bug where a batch file containing a blank line was not handled.
2. Describe the limitation in man page.


Chris Mi (3):
  lib/libnetlink: Add a function rtnl_talk_msg
  tc: Add -bs option to batch mode
  man: Add -bs option to tc manpage

 include/libnetlink.h |   3 ++
 lib/libnetlink.c |  59 ++---
 man/man8/tc.8|   9 
 tc/m_action.c|  90 +-
 tc/tc.c  |  70 +++--
 tc/tc_common.h   |   8 +++-
 tc/tc_filter.c   | 121 +--
 7 files changed, 276 insertions(+), 84 deletions(-)

-- 
2.14.3



[patch iproute2 v5 2/3] tc: Add -bs option to batch mode

2018-01-02 Thread Chris Mi
Signed-off-by: Chris Mi <chr...@mellanox.com>
---
 tc/m_action.c  |  90 --
 tc/tc.c|  70 ++---
 tc/tc_common.h |   8 +++-
 tc/tc_filter.c | 121 +
 4 files changed, 221 insertions(+), 68 deletions(-)

diff --git a/tc/m_action.c b/tc/m_action.c
index fc422364..2e79034d 100644
--- a/tc/m_action.c
+++ b/tc/m_action.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "utils.h"
 #include "tc_common.h"
@@ -546,40 +547,87 @@ bad_val:
return ret;
 }
 
+typedef struct {
+   struct nlmsghdr n;
+   struct tcamsg   t;
+   charbuf[MAX_MSG];
+} tc_action_req;
+
+static tc_action_req *action_reqs;
+static struct iovec msg_iov[MSG_IOV_MAX];
+
+void free_action_reqs(void)
+{
+   free(action_reqs);
+}
+
+static tc_action_req *get_action_req(int batch_size, int index)
+{
+   tc_action_req *req;
+
+   if (action_reqs == NULL) {
+   action_reqs = malloc(batch_size * sizeof (tc_action_req));
+   if (action_reqs == NULL)
+   return NULL;
+   }
+   req = _reqs[index];
+   memset(req, 0, sizeof (*req));
+
+   return req;
+}
+
 static int tc_action_modify(int cmd, unsigned int flags,
-   int *argc_p, char ***argv_p)
+   int *argc_p, char ***argv_p,
+   int batch_size, int index, bool send)
 {
int argc = *argc_p;
char **argv = *argv_p;
int ret = 0;
-   struct {
-   struct nlmsghdr n;
-   struct tcamsg   t;
-   charbuf[MAX_MSG];
-   } req = {
-   .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)),
-   .n.nlmsg_flags = NLM_F_REQUEST | flags,
-   .n.nlmsg_type = cmd,
-   .t.tca_family = AF_UNSPEC,
+   tc_action_req *req;
+   struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
+   struct iovec *iov = _iov[index];
+
+   req = get_action_req(batch_size, index);
+   if (req == NULL) {
+   fprintf(stderr, "get_action_req error: not enough buffer\n");
+   return -ENOMEM;
+   }
+
+   req->n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg));
+   req->n.nlmsg_flags = NLM_F_REQUEST | flags;
+   req->n.nlmsg_type = cmd;
+   req->t.tca_family = AF_UNSPEC;
+   struct rtattr *tail = NLMSG_TAIL(>n);
+
+   struct msghdr msg = {
+   .msg_name = ,
+   .msg_namelen = sizeof(nladdr),
+   .msg_iov = msg_iov,
+   .msg_iovlen = index + 1,
};
-   struct rtattr *tail = NLMSG_TAIL();
 
argc -= 1;
argv += 1;
-   if (parse_action(, , TCA_ACT_TAB, )) {
+   if (parse_action(, , TCA_ACT_TAB, >n)) {
fprintf(stderr, "Illegal \"action\"\n");
return -1;
}
-   tail->rta_len = (void *) NLMSG_TAIL() - (void *) tail;
+   tail->rta_len = (void *) NLMSG_TAIL(>n) - (void *) tail;
+
+   *argc_p = argc;
+   *argv_p = argv;
+
+   iov->iov_base = >n;
+   iov->iov_len = req->n.nlmsg_len;
+
+   if (!send)
+   return 0;
 
-   if (rtnl_talk(, , NULL) < 0) {
+   if (rtnl_talk_msg(, , NULL) < 0) {
fprintf(stderr, "We have an error talking to the kernel\n");
ret = -1;
}
 
-   *argc_p = argc;
-   *argv_p = argv;
-
return ret;
 }
 
@@ -679,7 +727,7 @@ bad_val:
return ret;
 }
 
-int do_action(int argc, char **argv)
+int do_action(int argc, char **argv, int batch_size, int index, bool send)
 {
 
int ret = 0;
@@ -689,12 +737,14 @@ int do_action(int argc, char **argv)
if (matches(*argv, "add") == 0) {
ret =  tc_action_modify(RTM_NEWACTION,
NLM_F_EXCL | NLM_F_CREATE,
-   , );
+   , , batch_size,
+   index, send);
} else if (matches(*argv, "change") == 0 ||
  matches(*argv, "replace") == 0) {
ret = tc_action_modify(RTM_NEWACTION,
   NLM_F_CREATE | NLM_F_REPLACE,
-  , );
+  , , batch_size,
+  index, send);
} else if (matches(*argv, "delete") == 0) {
argc -= 1;
argv += 1;
diff --git a/tc/tc.c b/tc/tc.c
index ad9f07e9..90

Re: [patch iproute2 v4 3/3] man: Add -bs option to tc manpage

2018-01-02 Thread Chris Mi

2018/1/3 4:07, Marcelo Ricardo Leitner:

On Tue, Jan 02, 2018 at 11:28:04PM +0900, Chris Mi wrote:

Signed-off-by: Chris Mi <chr...@mellanox.com>
---
  man/man8/tc.8 | 5 +
  1 file changed, 5 insertions(+)

diff --git a/man/man8/tc.8 b/man/man8/tc.8
index ff071b33..de137e16 100644
--- a/man/man8/tc.8
+++ b/man/man8/tc.8
@@ -601,6 +601,11 @@ must exist already.
  read commands from provided file or standard input and invoke them.
  First failure will cause termination of tc.
  
+.TP

+.BR "\-bs", " \-bs size", " \-batchsize", " \-batchsize size"
+How many commands are accumulated before sending to kernel.
+By default, it is 1. It only takes effect in batch mode.
+

You should also describe the limitations it has. Like, it only works
for action and filter and that it shouldn't be mixed with other
commands.

Done.

And maybe even do such check in the code: refuse to do other commands
if batch_size > 1.
I didn't add it because I'm afraid the performance benefit would be lost
if I added the check.

But I added a warning to the man page.



  .TP
  .BR "\-force"
  don't terminate tc on errors in batch mode.
--
2.14.3





Re: [patch iproute2 v4 2/3] tc: Add -bs option to batch mode

2018-01-02 Thread Chris Mi



On Tue, Jan 02, 2018 at 11:28:03PM +0900, Chris Mi wrote:

@@ -240,23 +244,49 @@ static int batch(const char *name)
}
  
  	cmdlineno = 0;

-   while (getcmdline(, , stdin) != -1) {
+   if (getcmdline(, , stdin) == -1)
+   goto Exit;
+   do {
char *largv[100];
int largc;
  
+		if (getcmdline(, , stdin) == -1)

+   lastline = true;
+
largc = makeargs(line, largv, 100);
if (largc == 0)
continue;   /* blank line */

If it reads a new line, it won't process anything else after it
because line won't get updated.
Indeed. Thanks for catching it. After fixing it, I found that it only
works if the blank line is at the beginning or in the middle of the file.
If the blank line is at the end, we may lose at most batchsize - 1 rules.
That is not easy to fix, and I think this issue is trivial; it is not
worth making the code more complex to fix it.

So I described this limitation in the man page.

-Chris


   Marcelo

  
-		if (do_cmd(largc, largv)) {

-   fprintf(stderr, "Command failed %s:%d\n", name, 
cmdlineno);
+   line = line2;
+   line2 = NULL;
+   len = 0;
+
+   /*
+* In batch mode, if we haven't accumulated enough commands
+* and this is not the last command, don't send the message
+* immediately.
+*/
+   if (batch_size > 1 && msg_iov_index + 1 != batch_size
+   && !lastline)
+   send = false;
+   else
+   send = true;
+
+   ret = do_cmd(largc, largv, batch_size, msg_iov_index++, send);
+   if (ret < 0) {
+   fprintf(stderr, "Command failed %s:%d\n", name,
+   cmdlineno);
ret = 1;
if (!force)
break;
}
-   }
-   if (line)
-   free(line);
+   msg_iov_index %= batch_size;
+   } while (!lastline);
+
+   free_filter_reqs();
+   free_action_reqs();
+Exit:
+   free(line);
  
  	rtnl_close();

return ret;




[patch iproute2 v4 3/3] man: Add -bs option to tc manpage

2018-01-02 Thread Chris Mi
Signed-off-by: Chris Mi <chr...@mellanox.com>
---
 man/man8/tc.8 | 5 +
 1 file changed, 5 insertions(+)

diff --git a/man/man8/tc.8 b/man/man8/tc.8
index ff071b33..de137e16 100644
--- a/man/man8/tc.8
+++ b/man/man8/tc.8
@@ -601,6 +601,11 @@ must exist already.
 read commands from provided file or standard input and invoke them.
 First failure will cause termination of tc.
 
+.TP
+.BR "\-bs", " \-bs size", " \-batchsize", " \-batchsize size"
+How many commands are accumulated before sending to kernel.
+By default, it is 1. It only takes effect in batch mode.
+
 .TP
 .BR "\-force"
 don't terminate tc on errors in batch mode.
-- 
2.14.3



[patch iproute2 v4 2/3] tc: Add -bs option to batch mode

2018-01-02 Thread Chris Mi
Signed-off-by: Chris Mi <chr...@mellanox.com>
---
 tc/m_action.c  |  90 --
 tc/tc.c|  69 +---
 tc/tc_common.h |   8 +++-
 tc/tc_filter.c | 121 +
 4 files changed, 220 insertions(+), 68 deletions(-)

diff --git a/tc/m_action.c b/tc/m_action.c
index fc422364..2e79034d 100644
--- a/tc/m_action.c
+++ b/tc/m_action.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "utils.h"
 #include "tc_common.h"
@@ -546,40 +547,87 @@ bad_val:
return ret;
 }
 
+typedef struct {
+   struct nlmsghdr n;
+   struct tcamsg   t;
+   charbuf[MAX_MSG];
+} tc_action_req;
+
+static tc_action_req *action_reqs;
+static struct iovec msg_iov[MSG_IOV_MAX];
+
+void free_action_reqs(void)
+{
+   free(action_reqs);
+}
+
+static tc_action_req *get_action_req(int batch_size, int index)
+{
+   tc_action_req *req;
+
+   if (action_reqs == NULL) {
+   action_reqs = malloc(batch_size * sizeof (tc_action_req));
+   if (action_reqs == NULL)
+   return NULL;
+   }
+   req = _reqs[index];
+   memset(req, 0, sizeof (*req));
+
+   return req;
+}
+
 static int tc_action_modify(int cmd, unsigned int flags,
-   int *argc_p, char ***argv_p)
+   int *argc_p, char ***argv_p,
+   int batch_size, int index, bool send)
 {
int argc = *argc_p;
char **argv = *argv_p;
int ret = 0;
-   struct {
-   struct nlmsghdr n;
-   struct tcamsg   t;
-   charbuf[MAX_MSG];
-   } req = {
-   .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)),
-   .n.nlmsg_flags = NLM_F_REQUEST | flags,
-   .n.nlmsg_type = cmd,
-   .t.tca_family = AF_UNSPEC,
+   tc_action_req *req;
+   struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
+   struct iovec *iov = _iov[index];
+
+   req = get_action_req(batch_size, index);
+   if (req == NULL) {
+   fprintf(stderr, "get_action_req error: not enough buffer\n");
+   return -ENOMEM;
+   }
+
+   req->n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg));
+   req->n.nlmsg_flags = NLM_F_REQUEST | flags;
+   req->n.nlmsg_type = cmd;
+   req->t.tca_family = AF_UNSPEC;
+   struct rtattr *tail = NLMSG_TAIL(>n);
+
+   struct msghdr msg = {
+   .msg_name = ,
+   .msg_namelen = sizeof(nladdr),
+   .msg_iov = msg_iov,
+   .msg_iovlen = index + 1,
};
-   struct rtattr *tail = NLMSG_TAIL();
 
argc -= 1;
argv += 1;
-   if (parse_action(, , TCA_ACT_TAB, )) {
+   if (parse_action(, , TCA_ACT_TAB, >n)) {
fprintf(stderr, "Illegal \"action\"\n");
return -1;
}
-   tail->rta_len = (void *) NLMSG_TAIL() - (void *) tail;
+   tail->rta_len = (void *) NLMSG_TAIL(>n) - (void *) tail;
+
+   *argc_p = argc;
+   *argv_p = argv;
+
+   iov->iov_base = >n;
+   iov->iov_len = req->n.nlmsg_len;
+
+   if (!send)
+   return 0;
 
-   if (rtnl_talk(, , NULL) < 0) {
+   if (rtnl_talk_msg(, , NULL) < 0) {
fprintf(stderr, "We have an error talking to the kernel\n");
ret = -1;
}
 
-   *argc_p = argc;
-   *argv_p = argv;
-
return ret;
 }
 
@@ -679,7 +727,7 @@ bad_val:
return ret;
 }
 
-int do_action(int argc, char **argv)
+int do_action(int argc, char **argv, int batch_size, int index, bool send)
 {
 
int ret = 0;
@@ -689,12 +737,14 @@ int do_action(int argc, char **argv)
if (matches(*argv, "add") == 0) {
ret =  tc_action_modify(RTM_NEWACTION,
NLM_F_EXCL | NLM_F_CREATE,
-   , );
+   , , batch_size,
+   index, send);
} else if (matches(*argv, "change") == 0 ||
  matches(*argv, "replace") == 0) {
ret = tc_action_modify(RTM_NEWACTION,
   NLM_F_CREATE | NLM_F_REPLACE,
-  , );
+  , , batch_size,
+  index, send);
} else if (matches(*argv, "delete") == 0) {
argc -= 1;
argv += 1;
diff --git a/tc/tc.c b/tc/tc.c
index ad9f07e9..61

[patch iproute2 v4 1/3] lib/libnetlink: Add a function rtnl_talk_msg

2018-01-02 Thread Chris Mi
rtnl_talk can only send a single message to kernel. Add a new function
rtnl_talk_msg that can send multiple messages to kernel.

Signed-off-by: Chris Mi <chr...@mellanox.com>
---
 include/libnetlink.h |  3 +++
 lib/libnetlink.c | 59 ++--
 2 files changed, 46 insertions(+), 16 deletions(-)

diff --git a/include/libnetlink.h b/include/libnetlink.h
index a4d83b9e..01d98b16 100644
--- a/include/libnetlink.h
+++ b/include/libnetlink.h
@@ -96,6 +96,9 @@ int rtnl_dump_filter_nc(struct rtnl_handle *rth,
 int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
  struct nlmsghdr **answer)
__attribute__((warn_unused_result));
+int rtnl_talk_msg(struct rtnl_handle *rtnl, struct msghdr *m,
+ struct nlmsghdr **answer)
+   __attribute__((warn_unused_result));
 int rtnl_talk_extack(struct rtnl_handle *rtnl, struct nlmsghdr *n,
  struct nlmsghdr **answer, nl_ext_ack_fn_t errfn)
__attribute__((warn_unused_result));
diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index 00e6ce0c..cc02a139 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -581,32 +581,34 @@ static void rtnl_talk_error(struct nlmsghdr *h, struct 
nlmsgerr *err,
strerror(-err->error));
 }
 
-static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
-  struct nlmsghdr **answer,
-  bool show_rtnl_err, nl_ext_ack_fn_t errfn)
+static int __rtnl_talk_msg(struct rtnl_handle *rtnl, struct msghdr *m,
+  struct nlmsghdr **answer,
+  bool show_rtnl_err, nl_ext_ack_fn_t errfn)
 {
-   int status;
-   unsigned int seq;
-   struct nlmsghdr *h;
+   int iovlen = m->msg_iovlen;
+   unsigned int seq = 0;
+   int i, status;
+   char *buf;
+
struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
-   struct iovec iov = {
-   .iov_base = n,
-   .iov_len = n->nlmsg_len
-   };
+   struct iovec iov, *v;
+   struct nlmsghdr *h;
struct msghdr msg = {
.msg_name = ,
.msg_namelen = sizeof(nladdr),
.msg_iov = ,
.msg_iovlen = 1,
};
-   char *buf;
 
-   n->nlmsg_seq = seq = ++rtnl->seq;
-
-   if (answer == NULL)
-   n->nlmsg_flags |= NLM_F_ACK;
+   for (i = 0; i < iovlen; i++) {
+   v = >msg_iov[i];
+   h = v->iov_base;
+   h->nlmsg_seq = seq = ++rtnl->seq;
+   if (answer == NULL)
+   h->nlmsg_flags |= NLM_F_ACK;
+   }
 
-   status = sendmsg(rtnl->fd, , 0);
+   status = sendmsg(rtnl->fd, m, 0);
if (status < 0) {
perror("Cannot talk to rtnetlink");
return -1;
@@ -698,12 +700,37 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct 
nlmsghdr *n,
}
 }
 
+static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
+  struct nlmsghdr **answer,
+  bool show_rtnl_err, nl_ext_ack_fn_t errfn)
+{
+   struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
+   struct iovec iov = {
+   .iov_base = n,
+   .iov_len = n->nlmsg_len
+   };
+   struct msghdr msg = {
+   .msg_name = ,
+   .msg_namelen = sizeof(nladdr),
+   .msg_iov = ,
+   .msg_iovlen = 1,
+   };
+
+   return __rtnl_talk_msg(rtnl, , answer, show_rtnl_err, errfn);
+}
+
 int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
  struct nlmsghdr **answer)
 {
return __rtnl_talk(rtnl, n, answer, true, NULL);
 }
 
+int rtnl_talk_msg(struct rtnl_handle *rtnl, struct msghdr *m,
+ struct nlmsghdr **answer)
+{
+   return __rtnl_talk_msg(rtnl, m, answer, true, NULL);
+}
+
 int rtnl_talk_extack(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 struct nlmsghdr **answer,
 nl_ext_ack_fn_t errfn)
-- 
2.14.3



[patch iproute2 v4 0/3] tc: Add -bs option to batch mode

2018-01-02 Thread Chris Mi
Currently in tc batch mode, only one command is read from the batch
file and sent to kernel to process. With this patchset, we can accumulate
several commands before sending to kernel. The batch size is specified
using option -bs or -batchsize.

To accumulate the commands in tc, client should allocate an array of
struct iovec. If batchsize is bigger than 1, only after the client
has accumulated enough commands, can the client call rtnl_talk_msg
to send the message that includes the iov array. The one exception is
when there are no more commands in the batch file.

But please note that kernel still processes the requests one by one.
To process the requests in parallel in kernel is another effort.
The time we're saving in this patchset is the user mode and kernel mode
context switch. So this patchset works on top of the current kernel.

Using the following script in kernel, we can generate 1,000,000 rules.
tools/testing/selftests/tc-testing/tdc_batch.py

Without this patchset, 'tc -b $file' execution time is:

real    0m15.125s
user    0m6.982s
sys     0m8.080s

With this patchset, 'tc -b $file -bs 10' execution time is:

real    0m12.772s
user    0m5.984s
sys     0m6.723s

The insertion rate is improved more than 10%.

In this patchset, we still ack for every rule. If we don't ack at all,

'tc -b $file' execution time is:

real    0m14.484s
user    0m6.919s
sys     0m7.498s

'tc -b $file -bs 10' execution time is:

real    0m11.664s
user    0m6.017s
sys     0m5.578s


We can see that the performance win comes from sending multiple messages
at once rather than from skipping acks. I think that's because in tc, we
don't spend much time processing the ack messages.


v3
==
1. Instead of hacking function rtnl_talk directly, add a new function
   rtnl_talk_msg.
2. remove most of global variables to use parameter passing
3. divide the previous patch into 4 patches.

v4
==
1. Remove function setcmdlinetotal. Now in function batch, we read one
   more line to determine if we are reaching the end of file.
2. Remove function __rtnl_check_ack. Now __rtnl_talk calls __rtnl_talk_msg
   directly.
3. if (batch_size < 1)
batch_size = 1;

Chris Mi (3):
  lib/libnetlink: Add a function rtnl_talk_msg
  tc: Add -bs option to batch mode
  man: Add -bs option to tc manpage

 include/libnetlink.h |   3 ++
 lib/libnetlink.c |  59 ++---
 man/man8/tc.8|   5 +++
 tc/m_action.c|  90 +-
 tc/tc.c  |  69 +++--
 tc/tc_common.h   |   8 +++-
 tc/tc_filter.c   | 121 +--
 7 files changed, 271 insertions(+), 84 deletions(-)

-- 
2.14.3



Re: [patch iproute2 v3 3/4] tc: Add -bs option to batch mode

2018-01-02 Thread Chris Mi



On Mon, Dec 25, 2017 at 05:46:57PM +0900, Chris Mi wrote:

@@ -267,6 +287,7 @@ int main(int argc, char **argv)
  {
int ret;
char *batch_file = NULL;
+   int batch_size = 1;
  
  	while (argc > 1) {

if (argv[1][0] != '-')
@@ -297,6 +318,14 @@ int main(int argc, char **argv)
if (argc <= 1)
usage();
batch_file = argv[1];
+   } else if (matches(argv[1], "-batchsize") == 0 ||
+   matches(argv[1], "-bs") == 0) {
+   argc--; argv++;
+   if (argc <= 1)
+   usage();
+   batch_size = atoi(argv[1]);
+   if (batch_size > MSG_IOV_MAX)
+   batch_size = MSG_IOV_MAX;

what about
if (batch_size < 1)
batch_size = 1;

Done.



} else if (matches(argv[1], "-netns") == 0) {
NEXT_ARG();
if (netns_switch(argv[1]))




Re: [patch iproute2 v3 3/4] tc: Add -bs option to batch mode

2018-01-02 Thread Chris Mi



On 12/25/17 2:46 AM, Chris Mi wrote:

Signed-off-by: Chris Mi <chr...@mellanox.com>
---
  tc/m_action.c  |  91 +--
  tc/tc.c|  47 ++
  tc/tc_common.h |   8 +++-
  tc/tc_filter.c | 121 +
  4 files changed, 204 insertions(+), 63 deletions(-)

diff --git a/tc/m_action.c b/tc/m_action.c
index fc422364..c4c3b862 100644
--- a/tc/m_action.c
+++ b/tc/m_action.c
@@ -23,6 +23,7 @@
  #include 
  #include 
  #include 
+#include 
  
  #include "utils.h"

  #include "tc_common.h"
@@ -546,40 +547,88 @@ bad_val:
return ret;
  }
  
+typedef struct {

+   struct nlmsghdr n;
+   struct tcamsg   t;
+   charbuf[MAX_MSG];
+} tc_action_req;
+
+static tc_action_req *action_reqs;
+static struct iovec msg_iov[MSG_IOV_MAX];
+
+void free_action_reqs(void)
+{
+   free(action_reqs);
+}
+
+static tc_action_req *get_action_req(int batch_size, int index)
+{
+   tc_action_req *req;
+
+   if (action_reqs == NULL) {
+   action_reqs = malloc(batch_size * sizeof (tc_action_req));
+   if (action_reqs == NULL)
+   return NULL;
+   }
+   req = _reqs[index];
+   memset(req, 0, sizeof (*req));
+
+   return req;
+}
+
  static int tc_action_modify(int cmd, unsigned int flags,
-   int *argc_p, char ***argv_p)
+   int *argc_p, char ***argv_p,
+   int batch_size, int index, bool send)
  {
int argc = *argc_p;
char **argv = *argv_p;
int ret = 0;
-   struct {
-   struct nlmsghdr n;
-   struct tcamsg   t;
-   charbuf[MAX_MSG];
-   } req = {
-   .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)),
-   .n.nlmsg_flags = NLM_F_REQUEST | flags,
-   .n.nlmsg_type = cmd,
-   .t.tca_family = AF_UNSPEC,
+   tc_action_req *req;
+   struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
+   struct iovec *iov = _iov[index];
+
+   req = get_action_req(batch_size, index);
+   if (req == NULL) {
+   fprintf(stderr, "get_action_req error: not enough buffer\n");
+   return -ENOMEM;
+   }
+
+   req->n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg));
+   req->n.nlmsg_flags = NLM_F_REQUEST | flags;
+   req->n.nlmsg_type = cmd;
+   req->t.tca_family = AF_UNSPEC;
+   struct rtattr *tail = NLMSG_TAIL(>n);
+
+   struct msghdr msg = {
+   .msg_name = ,
+   .msg_namelen = sizeof(nladdr),
+   .msg_iov = msg_iov,
+   .msg_iovlen = index + 1,
};
-   struct rtattr *tail = NLMSG_TAIL();
  
  	argc -= 1;

argv += 1;
-   if (parse_action(, , TCA_ACT_TAB, )) {
+   if (parse_action(, , TCA_ACT_TAB, >n)) {
fprintf(stderr, "Illegal \"action\"\n");
return -1;
}
-   tail->rta_len = (void *) NLMSG_TAIL() - (void *) tail;
+   tail->rta_len = (void *) NLMSG_TAIL(>n) - (void *) tail;
+
+   *argc_p = argc;
+   *argv_p = argv;
+
+   iov->iov_base = >n;
+   iov->iov_len = req->n.nlmsg_len;
+
+   if (!send)
+   return 0;
  
-	if (rtnl_talk(, , NULL) < 0) {

+   ret = rtnl_talk_msg(, , NULL);
+   if (ret < 0) {
fprintf(stderr, "We have an error talking to the kernel\n");
ret = -1;
}
  
-	*argc_p = argc;

-   *argv_p = argv;
-
return ret;
  }
  
@@ -679,7 +728,7 @@ bad_val:

return ret;
  }
  
-int do_action(int argc, char **argv)

+int do_action(int argc, char **argv, int batch_size, int index, bool send)
  {
  
  	int ret = 0;

@@ -689,12 +738,14 @@ int do_action(int argc, char **argv)
if (matches(*argv, "add") == 0) {
ret =  tc_action_modify(RTM_NEWACTION,
NLM_F_EXCL | NLM_F_CREATE,
-   , );
+   , , batch_size,
+   index, send);
} else if (matches(*argv, "change") == 0 ||
  matches(*argv, "replace") == 0) {
ret = tc_action_modify(RTM_NEWACTION,
   NLM_F_CREATE | NLM_F_REPLACE,
-  , );
+  , , batch_size,
+  index, send);
} else if (matches(*argv, "delete") == 0) {
argc -= 1;
   

Re: [patch iproute2 v3 2/4] utils: Add a function setcmdlinetotal

2018-01-02 Thread Chris Mi



On 12/25/17 2:46 AM, Chris Mi wrote:

This function calculates how many commands a batch file has and
sets the global variable cmdlinetotal to that count.

Signed-off-by: Chris Mi <chr...@mellanox.com>
---
  include/utils.h |  4 
  lib/utils.c | 20 
  2 files changed, 24 insertions(+)

diff --git a/include/utils.h b/include/utils.h
index d3895d56..113a8c31 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -235,6 +235,10 @@ void print_nlmsg_timestamp(FILE *fp, const struct nlmsghdr 
*n);
  
  extern int cmdlineno;

  ssize_t getcmdline(char **line, size_t *len, FILE *in);
+
+extern int cmdlinetotal;
+void setcmdlinetotal(const char *name);
+
  int makeargs(char *line, char *argv[], int maxargs);
  int inet_get_addr(const char *src, __u32 *dst, struct in6_addr *dst6);
  
diff --git a/lib/utils.c b/lib/utils.c

index 7ced8c06..53ca389f 100644
--- a/lib/utils.c
+++ b/lib/utils.c
@@ -1202,6 +1202,26 @@ ssize_t getcmdline(char **linep, size_t *lenp, FILE *in)
return cc;
  }
  
+int cmdlinetotal;

+
+void setcmdlinetotal(const char *name)
+{
+   char *line = NULL;
+   size_t len = 0;
+
+   if (name && strcmp(name, "-") != 0) {
+   if (freopen(name, "r", stdin) == NULL) {
+   fprintf(stderr, "Cannot open file \"%s\" for reading: 
%s\n",
+   name, strerror(errno));
+   return;
+   }
+   }
+
+   cmdlinetotal = 0;
+   while (getcmdline(, , stdin) != -1)
+   cmdlinetotal++;
+}
+
  /* split command line into argument vector */
  int makeargs(char *line, char *argv[], int maxargs)
  {


This helper should not be needed. There is no need to read what could be
a million+ line file multiple times.

Done. I removed this helper. But we can't simply use !feof directly.
I figured out a way to determine whether we have reached the end of the file
by reading one more line of the batch file.


Re: [patch iproute2 v3 1/4] lib/libnetlink: Add a function rtnl_talk_msg

2018-01-02 Thread Chris Mi



On 12/25/17 2:46 AM, Chris Mi wrote:

diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index 00e6ce0c..f5f675cf 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -581,36 +581,21 @@ static void rtnl_talk_error(struct nlmsghdr *h, struct 
nlmsgerr *err,
strerror(-err->error));
  }
  
-static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,

-  struct nlmsghdr **answer,
-  bool show_rtnl_err, nl_ext_ack_fn_t errfn)
+static int __rtnl_check_ack(struct rtnl_handle *rtnl, struct nlmsghdr **answer,

Make this function __rtnl_talk_msg. Include the assignment of nlmsg_seq
and ack setting using the for loop below and sendmsg() call. All of that
code can be common for both the single and multiple iov case.

Thanks for your suggestion. Done.



+  bool show_rtnl_err, nl_ext_ack_fn_t errfn,
+  unsigned int seq)
  {
int status;
-   unsigned int seq;
-   struct nlmsghdr *h;
+   char *buf;

Please order variables in the reverse xmas tree style used in the net code.
Actually, I divide the variables into two groups: non-struct variables and
struct variables.
I'm not sure whether that matches the reverse xmas tree style, but I think it is
more readable.



struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
-   struct iovec iov = {
-   .iov_base = n,
-   .iov_len = n->nlmsg_len
-   };
+   struct nlmsghdr *h;
+   struct iovec iov;
struct msghdr msg = {
.msg_name = ,
.msg_namelen = sizeof(nladdr),
.msg_iov = ,
.msg_iovlen = 1,
};
-   char *buf;
-
-   n->nlmsg_seq = seq = ++rtnl->seq;
-
-   if (answer == NULL)
-   n->nlmsg_flags |= NLM_F_ACK;
-
-   status = sendmsg(rtnl->fd, , 0);
-   if (status < 0) {
-   perror("Cannot talk to rtnetlink");
-   return -1;
-   }
  
  	while (1) {

status = rtnl_recvmsg(rtnl->fd, , );




[patch iproute2 v3 1/4] lib/libnetlink: Add a function rtnl_talk_msg

2017-12-25 Thread Chris Mi
rtnl_talk can only send a single message to the kernel. Add a new function,
rtnl_talk_msg, that can send multiple messages to the kernel.

Signed-off-by: Chris Mi <chr...@mellanox.com>
---
 include/libnetlink.h |  3 ++
 lib/libnetlink.c | 92 
 2 files changed, 74 insertions(+), 21 deletions(-)

diff --git a/include/libnetlink.h b/include/libnetlink.h
index a4d83b9e..e95fad75 100644
--- a/include/libnetlink.h
+++ b/include/libnetlink.h
@@ -96,6 +96,9 @@ int rtnl_dump_filter_nc(struct rtnl_handle *rth,
 int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
  struct nlmsghdr **answer)
__attribute__((warn_unused_result));
+int rtnl_talk_msg(struct rtnl_handle *rtnl, struct msghdr *m,
+ struct nlmsghdr **answer)
+   __attribute__((warn_unused_result));
 int rtnl_talk_extack(struct rtnl_handle *rtnl, struct nlmsghdr *n,
  struct nlmsghdr **answer, nl_ext_ack_fn_t errfn)
__attribute__((warn_unused_result));
diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index 00e6ce0c..f5f675cf 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -581,36 +581,21 @@ static void rtnl_talk_error(struct nlmsghdr *h, struct 
nlmsgerr *err,
strerror(-err->error));
 }
 
-static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
-  struct nlmsghdr **answer,
-  bool show_rtnl_err, nl_ext_ack_fn_t errfn)
+static int __rtnl_check_ack(struct rtnl_handle *rtnl, struct nlmsghdr **answer,
+  bool show_rtnl_err, nl_ext_ack_fn_t errfn,
+  unsigned int seq)
 {
int status;
-   unsigned int seq;
-   struct nlmsghdr *h;
+   char *buf;
struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
-   struct iovec iov = {
-   .iov_base = n,
-   .iov_len = n->nlmsg_len
-   };
+   struct nlmsghdr *h;
+   struct iovec iov;
struct msghdr msg = {
.msg_name = ,
.msg_namelen = sizeof(nladdr),
.msg_iov = ,
.msg_iovlen = 1,
};
-   char *buf;
-
-   n->nlmsg_seq = seq = ++rtnl->seq;
-
-   if (answer == NULL)
-   n->nlmsg_flags |= NLM_F_ACK;
-
-   status = sendmsg(rtnl->fd, , 0);
-   if (status < 0) {
-   perror("Cannot talk to rtnetlink");
-   return -1;
-   }
 
while (1) {
status = rtnl_recvmsg(rtnl->fd, , );
@@ -698,12 +683,77 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct 
nlmsghdr *n,
}
 }
 
+static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
+  struct nlmsghdr **answer,
+  bool show_rtnl_err, nl_ext_ack_fn_t errfn)
+{
+   unsigned int seq;
+   int status;
+   struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
+   struct iovec iov = {
+   .iov_base = n,
+   .iov_len = n->nlmsg_len
+   };
+   struct msghdr msg = {
+   .msg_name = ,
+   .msg_namelen = sizeof(nladdr),
+   .msg_iov = ,
+   .msg_iovlen = 1,
+   };
+
+   n->nlmsg_seq = seq = ++rtnl->seq;
+
+   if (answer == NULL)
+   n->nlmsg_flags |= NLM_F_ACK;
+
+   status = sendmsg(rtnl->fd, , 0);
+   if (status < 0) {
+   perror("Cannot talk to rtnetlink");
+   return -1;
+   }
+
+   return __rtnl_check_ack(rtnl, answer, show_rtnl_err, errfn, seq);
+}
+
+static int __rtnl_talk_msg(struct rtnl_handle *rtnl, struct msghdr *m,
+  struct nlmsghdr **answer,
+  bool show_rtnl_err, nl_ext_ack_fn_t errfn)
+{
+   int iovlen = m->msg_iovlen;
+   unsigned int seq = 0;
+   struct nlmsghdr *n;
+   struct iovec *v;
+   int i, status;
+
+   for (i = 0; i < iovlen; i++) {
+   v = >msg_iov[i];
+   n = v->iov_base;
+   n->nlmsg_seq = seq = ++rtnl->seq;
+   if (answer == NULL)
+   n->nlmsg_flags |= NLM_F_ACK;
+   }
+
+   status = sendmsg(rtnl->fd, m, 0);
+   if (status < 0) {
+   perror("Cannot talk to rtnetlink");
+   return -1;
+   }
+
+   return __rtnl_check_ack(rtnl, answer, show_rtnl_err, errfn, seq);
+}
+
 int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
  struct nlmsghdr **answer)
 {
return __rtnl_talk(rtnl, n, answer, true, NULL);
 }
 
+int rtnl_talk_msg(struct rtnl_handle *rtnl, struct msghdr *m,
+ struct nlmsghdr **answer)
+{
+   return __rtnl_talk_msg(rtnl, m, answer, true, NULL);
+}
+
 int rtnl_talk_extack(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 struct nlmsghdr **answer,
 nl_ext_ack_fn_t errfn)
-- 
2.14.3



[patch iproute2 v3 3/4] tc: Add -bs option to batch mode

2017-12-25 Thread Chris Mi
Signed-off-by: Chris Mi <chr...@mellanox.com>
---
 tc/m_action.c  |  91 +--
 tc/tc.c|  47 ++
 tc/tc_common.h |   8 +++-
 tc/tc_filter.c | 121 +
 4 files changed, 204 insertions(+), 63 deletions(-)

diff --git a/tc/m_action.c b/tc/m_action.c
index fc422364..c4c3b862 100644
--- a/tc/m_action.c
+++ b/tc/m_action.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "utils.h"
 #include "tc_common.h"
@@ -546,40 +547,88 @@ bad_val:
return ret;
 }
 
+typedef struct {
+   struct nlmsghdr n;
+   struct tcamsg   t;
+   charbuf[MAX_MSG];
+} tc_action_req;
+
+static tc_action_req *action_reqs;
+static struct iovec msg_iov[MSG_IOV_MAX];
+
+void free_action_reqs(void)
+{
+   free(action_reqs);
+}
+
+static tc_action_req *get_action_req(int batch_size, int index)
+{
+   tc_action_req *req;
+
+   if (action_reqs == NULL) {
+   action_reqs = malloc(batch_size * sizeof (tc_action_req));
+   if (action_reqs == NULL)
+   return NULL;
+   }
+   req = _reqs[index];
+   memset(req, 0, sizeof (*req));
+
+   return req;
+}
+
 static int tc_action_modify(int cmd, unsigned int flags,
-   int *argc_p, char ***argv_p)
+   int *argc_p, char ***argv_p,
+   int batch_size, int index, bool send)
 {
int argc = *argc_p;
char **argv = *argv_p;
int ret = 0;
-   struct {
-   struct nlmsghdr n;
-   struct tcamsg   t;
-   charbuf[MAX_MSG];
-   } req = {
-   .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)),
-   .n.nlmsg_flags = NLM_F_REQUEST | flags,
-   .n.nlmsg_type = cmd,
-   .t.tca_family = AF_UNSPEC,
+   tc_action_req *req;
+   struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
+   struct iovec *iov = _iov[index];
+
+   req = get_action_req(batch_size, index);
+   if (req == NULL) {
+   fprintf(stderr, "get_action_req error: not enough buffer\n");
+   return -ENOMEM;
+   }
+
+   req->n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg));
+   req->n.nlmsg_flags = NLM_F_REQUEST | flags;
+   req->n.nlmsg_type = cmd;
+   req->t.tca_family = AF_UNSPEC;
+   struct rtattr *tail = NLMSG_TAIL(>n);
+
+   struct msghdr msg = {
+   .msg_name = ,
+   .msg_namelen = sizeof(nladdr),
+   .msg_iov = msg_iov,
+   .msg_iovlen = index + 1,
};
-   struct rtattr *tail = NLMSG_TAIL();
 
argc -= 1;
argv += 1;
-   if (parse_action(, , TCA_ACT_TAB, )) {
+   if (parse_action(, , TCA_ACT_TAB, >n)) {
fprintf(stderr, "Illegal \"action\"\n");
return -1;
}
-   tail->rta_len = (void *) NLMSG_TAIL() - (void *) tail;
+   tail->rta_len = (void *) NLMSG_TAIL(>n) - (void *) tail;
+
+   *argc_p = argc;
+   *argv_p = argv;
+
+   iov->iov_base = >n;
+   iov->iov_len = req->n.nlmsg_len;
+
+   if (!send)
+   return 0;
 
-   if (rtnl_talk(, , NULL) < 0) {
+   ret = rtnl_talk_msg(, , NULL);
+   if (ret < 0) {
fprintf(stderr, "We have an error talking to the kernel\n");
ret = -1;
}
 
-   *argc_p = argc;
-   *argv_p = argv;
-
return ret;
 }
 
@@ -679,7 +728,7 @@ bad_val:
return ret;
 }
 
-int do_action(int argc, char **argv)
+int do_action(int argc, char **argv, int batch_size, int index, bool send)
 {
 
int ret = 0;
@@ -689,12 +738,14 @@ int do_action(int argc, char **argv)
if (matches(*argv, "add") == 0) {
ret =  tc_action_modify(RTM_NEWACTION,
NLM_F_EXCL | NLM_F_CREATE,
-   , );
+   , , batch_size,
+   index, send);
} else if (matches(*argv, "change") == 0 ||
  matches(*argv, "replace") == 0) {
ret = tc_action_modify(RTM_NEWACTION,
   NLM_F_CREATE | NLM_F_REPLACE,
-  , );
+  , , batch_size,
+  index, send);
} else if (matches(*argv, "delete") == 0) {
argc -= 1;
argv += 1;
diff --git a/tc/tc.c b/tc/tc.c
index ad

[patch iproute2 v3 4/4] man: Add -bs option to tc manpage

2017-12-25 Thread Chris Mi
Signed-off-by: Chris Mi <chr...@mellanox.com>
---
 man/man8/tc.8 | 5 +
 1 file changed, 5 insertions(+)

diff --git a/man/man8/tc.8 b/man/man8/tc.8
index ff071b33..de137e16 100644
--- a/man/man8/tc.8
+++ b/man/man8/tc.8
@@ -601,6 +601,11 @@ must exist already.
 read commands from provided file or standard input and invoke them.
 First failure will cause termination of tc.
 
+.TP
+.BR "\-bs", " \-bs size", " \-batchsize", " \-batchsize size"
+How many commands are accumulated before sending to kernel.
+By default, it is 1. It only takes effect in batch mode.
+
 .TP
 .BR "\-force"
 don't terminate tc on errors in batch mode.
-- 
2.14.3



[patch iproute2 v3 2/4] utils: Add a function setcmdlinetotal

2017-12-25 Thread Chris Mi
This function calculates how many commands a batch file has and
sets the global variable cmdlinetotal to that count.

Signed-off-by: Chris Mi <chr...@mellanox.com>
---
 include/utils.h |  4 
 lib/utils.c | 20 
 2 files changed, 24 insertions(+)

diff --git a/include/utils.h b/include/utils.h
index d3895d56..113a8c31 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -235,6 +235,10 @@ void print_nlmsg_timestamp(FILE *fp, const struct nlmsghdr 
*n);
 
 extern int cmdlineno;
 ssize_t getcmdline(char **line, size_t *len, FILE *in);
+
+extern int cmdlinetotal;
+void setcmdlinetotal(const char *name);
+
 int makeargs(char *line, char *argv[], int maxargs);
 int inet_get_addr(const char *src, __u32 *dst, struct in6_addr *dst6);
 
diff --git a/lib/utils.c b/lib/utils.c
index 7ced8c06..53ca389f 100644
--- a/lib/utils.c
+++ b/lib/utils.c
@@ -1202,6 +1202,26 @@ ssize_t getcmdline(char **linep, size_t *lenp, FILE *in)
return cc;
 }
 
+int cmdlinetotal;
+
+void setcmdlinetotal(const char *name)
+{
+   char *line = NULL;
+   size_t len = 0;
+
+   if (name && strcmp(name, "-") != 0) {
+   if (freopen(name, "r", stdin) == NULL) {
+   fprintf(stderr, "Cannot open file \"%s\" for reading: 
%s\n",
+   name, strerror(errno));
+   return;
+   }
+   }
+
+   cmdlinetotal = 0;
+   while (getcmdline(, , stdin) != -1)
+   cmdlinetotal++;
+}
+
 /* split command line into argument vector */
 int makeargs(char *line, char *argv[], int maxargs)
 {
-- 
2.14.3



[patch iproute2 v3 0/4] tc: Add -bs option to batch mode

2017-12-25 Thread Chris Mi
Currently in tc batch mode, only one command is read from the batch
file and sent to kernel to process. With this patchset, we can accumulate
several commands before sending to kernel. The batch size is specified
using option -bs or -batchsize.

To accumulate the commands in tc, the client should allocate an array of
struct iovec. If batchsize is bigger than 1, the client calls rtnl_talk_msg
to send the message that includes the iov array only after it has
accumulated enough commands. The one exception is when there are no more
commands in the batch file.

But please note that kernel still processes the requests one by one.
To process the requests in parallel in kernel is another effort.
The time we're saving in this patchset is the user mode and kernel mode
context switch. So this patchset works on top of the current kernel.

Using the following script in kernel, we can generate 1,000,000 rules.
tools/testing/selftests/tc-testing/tdc_batch.py

Without this patchset, 'tc -b $file' execution time is:

real    0m15.125s
user    0m6.982s
sys     0m8.080s

With this patchset, 'tc -b $file -bs 10' execution time is:

real    0m12.772s
user    0m5.984s
sys     0m6.723s

The insertion rate is improved more than 10%.

In this patchset, we still ack for every rule. If we don't ack at all,

'tc -b $file' execution time is:

real    0m14.484s
user    0m6.919s
sys     0m7.498s

'tc -b $file -bs 10' execution time is:

real    0m11.664s
user    0m6.017s
sys     0m5.578s

We can see that the performance win comes from sending multiple messages
rather than from not acking. I think that's because tc doesn't spend much time
processing the ack messages.


v3
==
1. Instead of hacking the function rtnl_talk directly, add a new function,
   rtnl_talk_msg.
2. Remove most of the global variables and use parameter passing instead.
3. Divide the previous patch into 4 patches.

Chris Mi (4):
  lib/libnetlink: Add a function rtnl_talk_msg
  utils: Add a function setcmdlinetotal
  tc: Add -bs option to batch mode
  man: Add -bs option to tc manpage

 include/libnetlink.h |   3 ++
 include/utils.h  |   4 ++
 lib/libnetlink.c |  92 ++-
 lib/utils.c  |  20 +
 man/man8/tc.8|   5 +++
 tc/m_action.c|  91 +-
 tc/tc.c  |  47 
 tc/tc_common.h   |   8 +++-
 tc/tc_filter.c   | 121 +--
 9 files changed, 307 insertions(+), 84 deletions(-)

-- 
2.14.3



RE: [patch iproute2 v2] tc: add -bs option for batch mode

2017-12-22 Thread Chris Mi
> -Original Message-
> From: David Ahern [mailto:dsah...@gmail.com]
> Sent: Friday, December 22, 2017 6:04 AM
> To: Chris Mi <chr...@mellanox.com>; netdev@vger.kernel.org
> Cc: gerlitz...@gmail.com
> Subject: Re: [patch iproute2 v2] tc: add -bs option for batch mode
> 
> On 12/20/17 2:26 AM, Chris Mi wrote:
> > Currently in tc batch mode, only one command is read from the batch
> > file and sent to kernel to process. With this patch, we can accumulate
> > several commands before sending to kernel. The batch size is specified
> > using option -bs or -batchsize.
> >
> > To accumulate the commands in tc, we allocate an array of struct iovec.
> > If batchsize is bigger than 1 and we haven't accumulated enough
> > commands, rtnl_talk() will return without actually sending the message.
> > One exception is that there is no more command in the batch file.
> >
> > But please note that kernel still processes the requests one by one.
> > To process the requests in parallel in kernel is another effort.
> > The time we're saving in this patch is the user mode and kernel mode
> > context switch. So this patch works on top of the current kernel.
> >
> > Using the following script in kernel, we can generate 1,000,000 rules.
> > tools/testing/selftests/tc-testing/tdc_batch.py
> >
> > Without this patch, 'tc -b $file' exection time is:
> >
> > real0m14.916s
> > user0m6.808s
> > sys 0m8.046s
> >
> > With this patch, 'tc -b $file -bs 10' exection time is:
> >
> > real0m12.286s
> > user0m5.903s
> > sys 0m6.312s
> >
> > The insertion rate is improved more than 10%.
> >
> > Signed-off-by: Chris Mi <chr...@mellanox.com>
> > ---
> >  include/libnetlink.h |  6 
> >  include/utils.h  |  4 +++
> >  lib/libnetlink.c | 25 ++-
> >  lib/utils.c  | 20 
> >  man/man8/tc.8|  5 +++
> >  tc/m_action.c| 62 +++-
> >  tc/tc.c  | 24 --
> >  tc/tc_common.h   |  3 ++
> >  tc/tc_filter.c   | 88 -
> ---
> >  9 files changed, 186 insertions(+), 51 deletions(-)
> >
> > diff --git a/include/libnetlink.h b/include/libnetlink.h index
> > a4d83b9e..775f830b 100644
> > --- a/include/libnetlink.h
> > +++ b/include/libnetlink.h
> > @@ -13,6 +13,8 @@
> >  #include 
> >  #include 
> >
> > +#define MSG_IOV_MAX 256
> > +
> >  struct rtnl_handle {
> > int fd;
> > struct sockaddr_nl  local;
> > @@ -93,6 +95,10 @@ int rtnl_dump_filter_nc(struct rtnl_handle *rth,
> > void *arg, __u16 nc_flags);
> >  #define rtnl_dump_filter(rth, filter, arg) \
> > rtnl_dump_filter_nc(rth, filter, arg, 0)
> > +
> > +extern int msg_iov_index;
> > +extern int batch_size;
> > +
> >  int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
> >   struct nlmsghdr **answer)
> > __attribute__((warn_unused_result));
> > diff --git a/include/utils.h b/include/utils.h index
> > d3895d56..113a8c31 100644
> > --- a/include/utils.h
> > +++ b/include/utils.h
> > @@ -235,6 +235,10 @@ void print_nlmsg_timestamp(FILE *fp, const struct
> > nlmsghdr *n);
> >
> >  extern int cmdlineno;
> >  ssize_t getcmdline(char **line, size_t *len, FILE *in);
> > +
> > +extern int cmdlinetotal;
> > +void setcmdlinetotal(const char *name);
> > +
> >  int makeargs(char *line, char *argv[], int maxargs);  int
> > inet_get_addr(const char *src, __u32 *dst, struct in6_addr *dst6);
> >
> > diff --git a/lib/libnetlink.c b/lib/libnetlink.c index
> > 00e6ce0c..7ff32d64 100644
> > --- a/lib/libnetlink.c
> > +++ b/lib/libnetlink.c
> > @@ -24,6 +24,7 @@
> >  #include 
> >
> >  #include "libnetlink.h"
> > +#include "utils.h"
> >
> >  #ifndef SOL_NETLINK
> >  #define SOL_NETLINK 270
> > @@ -581,6 +582,10 @@ static void rtnl_talk_error(struct nlmsghdr *h,
> struct nlmsgerr *err,
> > strerror(-err->error));
> >  }
> >
> > +static struct iovec msg_iov[MSG_IOV_MAX]; int msg_iov_index; int
> > +batch_size = 1;
> > +
> >  static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
> >struct nlmsghdr **answer,
> >bool show_rtnl_err, nl_ext_ack_fn_t errfn) @@ -589,23
> > +594,29 @@ static int _

RE: [patch iproute2] tc: add -bs option for batch mode

2017-12-20 Thread Chris Mi
> -Original Message-
> From: Stephen Hemminger [mailto:step...@networkplumber.org]
> Sent: Wednesday, December 20, 2017 11:18 PM
> To: Chris Mi <chr...@mellanox.com>
> Cc: netdev@vger.kernel.org; gerlitz...@gmail.com
> Subject: Re: [patch iproute2] tc: add -bs option for batch mode
> 
> On Wed, 20 Dec 2017 09:23:34 +
> Chris Mi <chr...@mellanox.com> wrote:
> 
> > > Your real performance win is just not asking for ACK for every rule.
> > No. Even if batch_size > 1, we ack every rule. The real performance
> > win is to send multiple rules in one system call. If we are not asking
> > for ACK for every rule, the performance will be improved further.
> 
> Try the no ACK method.
> 
> When we were optimizing routing daemons like Quagga, it was discovered
> that an ACK for every route insert was the main bottleneck. Doing
> asynchronous error handling got a bigger win than your batching.
> 
> Please try that, doing multiple messages using iov is not necessary.

This is my testing result to insert 1,000,000 rules:

1. batch_size = 1, acking

real    0m15.125s
user    0m6.982s
sys     0m8.080s

2. batch_size = 10, acking

real    0m12.772s
user    0m5.984s
sys     0m6.723s

3. batch_size = 1, no acking

real    0m14.484s
user    0m6.919s
sys     0m7.498s

4. batch_size = 10, no acking

real    0m11.664s
user    0m6.017s
sys     0m5.578s

As we can see from the test results above, the bottleneck is not in acking.
Without acking, or with asynchronous error handling, we can improve the
performance further.
That is worth doing, but I think it should be in another patch. This patch
only adds the -bs option.


[patch iproute2 v2] tc: add -bs option for batch mode

2017-12-20 Thread Chris Mi
Currently in tc batch mode, only one command is read from the batch
file and sent to kernel to process. With this patch, we can accumulate
several commands before sending to kernel. The batch size is specified
using option -bs or -batchsize.

To accumulate the commands in tc, we allocate an array of struct iovec.
If batchsize is bigger than 1 and we haven't accumulated enough
commands, rtnl_talk() will return without actually sending the message.
One exception is that there is no more command in the batch file.

But please note that kernel still processes the requests one by one.
To process the requests in parallel in kernel is another effort.
The time we're saving in this patch is the user mode and kernel mode
context switch. So this patch works on top of the current kernel.

Using the following script in kernel, we can generate 1,000,000 rules.
tools/testing/selftests/tc-testing/tdc_batch.py

Without this patch, 'tc -b $file' execution time is:

real    0m14.916s
user    0m6.808s
sys     0m8.046s

With this patch, 'tc -b $file -bs 10' execution time is:

real    0m12.286s
user    0m5.903s
sys     0m6.312s

The insertion rate is improved more than 10%.

Signed-off-by: Chris Mi <chr...@mellanox.com>
---
 include/libnetlink.h |  6 
 include/utils.h  |  4 +++
 lib/libnetlink.c | 25 ++-
 lib/utils.c  | 20 
 man/man8/tc.8|  5 +++
 tc/m_action.c| 62 +++-
 tc/tc.c  | 24 --
 tc/tc_common.h   |  3 ++
 tc/tc_filter.c   | 88 
 9 files changed, 186 insertions(+), 51 deletions(-)

diff --git a/include/libnetlink.h b/include/libnetlink.h
index a4d83b9e..775f830b 100644
--- a/include/libnetlink.h
+++ b/include/libnetlink.h
@@ -13,6 +13,8 @@
 #include 
 #include 
 
+#define MSG_IOV_MAX 256
+
 struct rtnl_handle {
int fd;
struct sockaddr_nl  local;
@@ -93,6 +95,10 @@ int rtnl_dump_filter_nc(struct rtnl_handle *rth,
void *arg, __u16 nc_flags);
 #define rtnl_dump_filter(rth, filter, arg) \
rtnl_dump_filter_nc(rth, filter, arg, 0)
+
+extern int msg_iov_index;
+extern int batch_size;
+
 int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
  struct nlmsghdr **answer)
__attribute__((warn_unused_result));
diff --git a/include/utils.h b/include/utils.h
index d3895d56..113a8c31 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -235,6 +235,10 @@ void print_nlmsg_timestamp(FILE *fp, const struct nlmsghdr 
*n);
 
 extern int cmdlineno;
 ssize_t getcmdline(char **line, size_t *len, FILE *in);
+
+extern int cmdlinetotal;
+void setcmdlinetotal(const char *name);
+
 int makeargs(char *line, char *argv[], int maxargs);
 int inet_get_addr(const char *src, __u32 *dst, struct in6_addr *dst6);
 
diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index 00e6ce0c..7ff32d64 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -24,6 +24,7 @@
 #include 
 
 #include "libnetlink.h"
+#include "utils.h"
 
 #ifndef SOL_NETLINK
 #define SOL_NETLINK 270
@@ -581,6 +582,10 @@ static void rtnl_talk_error(struct nlmsghdr *h, struct 
nlmsgerr *err,
strerror(-err->error));
 }
 
+static struct iovec msg_iov[MSG_IOV_MAX];
+int msg_iov_index;
+int batch_size = 1;
+
 static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
   struct nlmsghdr **answer,
   bool show_rtnl_err, nl_ext_ack_fn_t errfn)
@@ -589,23 +594,29 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct 
nlmsghdr *n,
unsigned int seq;
struct nlmsghdr *h;
struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
-   struct iovec iov = {
-   .iov_base = n,
-   .iov_len = n->nlmsg_len
-   };
+   struct iovec *iov = _iov[msg_iov_index];
+   char *buf;
+
+   iov->iov_base = n;
+   iov->iov_len = n->nlmsg_len;
+
struct msghdr msg = {
.msg_name = ,
.msg_namelen = sizeof(nladdr),
-   .msg_iov = ,
-   .msg_iovlen = 1,
+   .msg_iov = msg_iov,
+   .msg_iovlen = ++msg_iov_index,
};
-   char *buf;
 
n->nlmsg_seq = seq = ++rtnl->seq;
 
if (answer == NULL)
n->nlmsg_flags |= NLM_F_ACK;
 
+   msg_iov_index %= batch_size;
+   if (msg_iov_index != 0 && batch_size > 1 && cmdlineno != cmdlinetotal &&
+   (n->nlmsg_type == RTM_NEWTFILTER || n->nlmsg_type == RTM_NEWACTION))
+   return 3;
+
status = sendmsg(rtnl->fd, , 0);
if (status < 0) {
perror("Cannot talk to rtnetlink");
diff --git a/lib/utils.c b/lib/utils.c
index 7ced8c06..53ca389f 100644
--- a/lib/utils.c
+++ b/lib/utils.c
@@ -1202,6 +1202,26 @@ ssize_t 

RE: [patch iproute2] tc: add -bs option for batch mode

2017-12-20 Thread Chris Mi
> -Original Message-
> From: Stephen Hemminger [mailto:step...@networkplumber.org]
> Sent: Tuesday, December 19, 2017 11:23 PM
> To: Chris Mi <chr...@mellanox.com>
> Cc: netdev@vger.kernel.org; gerlitz...@gmail.com
> Subject: Re: [patch iproute2] tc: add -bs option for batch mode
> 
> On Tue, 19 Dec 2017 15:33:46 +0900
> Chris Mi <chr...@mellanox.com> wrote:
> 
> > Currently in tc batch mode, only one command is read from the batch
> > file and sent to kernel to process. With this patch, we can accumulate
> > several commands before sending to kernel. The batch size is specified
> > using option -bs or -batchsize.
> >
> > To accumulate the commands in tc, we allocate an array of struct iovec.
> > If batchsize is bigger than 1 and we haven't accumulated enough
> > commands, rtnl_talk() will return without actually sending the message.
> > One exception is that there is no more command in the batch file.
> >
> > But please note that kernel still processes the requests one by one.
> > To process the requests in parallel in kernel is another effort.
> > The time we're saving in this patch is the user mode and kernel mode
> > context switch. So this patch works on top of the current kernel.
> >
> > Using the following script in kernel, we can generate 1,000,000 rules.
> > tools/testing/selftests/tc-testing/tdc_batch.py
> >
> > Without this patch, 'tc -b $file' exection time is:
> >
> > real0m14.916s
> > user0m6.808s
> > sys 0m8.046s
> >
> > With this patch, 'tc -b $file -bs 10' exection time is:
> >
> > real0m12.286s
> > user0m5.903s
> > sys 0m6.312s
> >
> > The insertion rate is improved more than 10%.
> >
> > Signed-off-by: Chris Mi <chr...@mellanox.com>
> > ---
> >  include/libnetlink.h | 27 
> >  include/utils.h  |  8 +
> >  lib/libnetlink.c | 30 +-
> >  lib/utils.c  | 20 
> >  man/man8/tc.8|  5 +++
> >  tc/m_action.c| 63 -
> >  tc/tc.c  | 27 ++--
> >  tc/tc_common.h   |  3 ++
> >  tc/tc_filter.c   | 89 -
> ---
> >  9 files changed, 221 insertions(+), 51 deletions(-)
> 
> In addition to my earlier comments, these are the implementation issues.
> 
> >
> > diff --git a/include/libnetlink.h b/include/libnetlink.h index
> > a4d83b9e..07e88c94 100644
> > --- a/include/libnetlink.h
> > +++ b/include/libnetlink.h
> > @@ -13,6 +13,8 @@
> >  #include 
> >  #include 
> >
> > +#define MSG_IOV_MAX 256
> > +
> >  struct rtnl_handle {
> > int fd;
> > struct sockaddr_nl  local;
> > @@ -93,6 +95,31 @@ int rtnl_dump_filter_nc(struct rtnl_handle *rth,
> > void *arg, __u16 nc_flags);
> >  #define rtnl_dump_filter(rth, filter, arg) \
> > rtnl_dump_filter_nc(rth, filter, arg, 0)
> > +
> > +extern int msg_iov_index;
> > +static inline int get_msg_iov_index(void) {
> > +   return msg_iov_index;
> > +}
> > +static inline void set_msg_iov_index(int index) {
> > +   msg_iov_index = index;
> > +}
> > +static inline void incr_msg_iov_index(void) {
> > +   ++msg_iov_index;
> > +}
> > +
> > +extern int batch_size;
> > +static inline int get_batch_size(void) {
> > +   return batch_size;
> > +}
> > +static inline void set_batch_size(int size) {
> > +   batch_size = size;
> > +}
> 
> Iproute2 is C not C++; no accessors for every variable.
I have changed them to C style.
> 
> 
> >  int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
> >   struct nlmsghdr **answer)
> > __attribute__((warn_unused_result));
> > diff --git a/include/utils.h b/include/utils.h index
> > d3895d56..66cb4747 100644
> > --- a/include/utils.h
> > +++ b/include/utils.h
> > @@ -235,6 +235,14 @@ void print_nlmsg_timestamp(FILE *fp, const struct
> > nlmsghdr *n);
> >
> >  extern int cmdlineno;
> >  ssize_t getcmdline(char **line, size_t *len, FILE *in);
> > +
> > +extern int cmdlinetotal;
> > +static inline int getcmdlinetotal(void) {
> > +   return cmdlinetotal;
> > +}
> > +void setcmdlinetotal(const char *name);
> > +
> >  int makeargs(char *line, char *argv[], int maxargs);  int
> > inet_get_addr(const char *src, __u32 *dst, struct in6_addr *dst6);
> >
> > diff --git a/lib/libnetlink

[patch iproute2] tc: add -bs option for batch mode

2017-12-18 Thread Chris Mi
Currently in tc batch mode, only one command is read from the batch
file and sent to kernel to process. With this patch, we can accumulate
several commands before sending to kernel. The batch size is specified
using option -bs or -batchsize.

To accumulate the commands in tc, we allocate an array of struct iovec.
If batchsize is bigger than 1 and we haven't accumulated enough
commands, rtnl_talk() will return without actually sending the message.
One exception is when there are no more commands in the batch file.

But please note that kernel still processes the requests one by one.
To process the requests in parallel in kernel is another effort.
The time we're saving in this patch is the user mode and kernel mode
context switch. So this patch works on top of the current kernel.

Using the following script in kernel, we can generate 1,000,000 rules.
tools/testing/selftests/tc-testing/tdc_batch.py

Without this patch, 'tc -b $file' execution time is:

real    0m14.916s
user    0m6.808s
sys     0m8.046s

With this patch, 'tc -b $file -bs 10' execution time is:

real    0m12.286s
user    0m5.903s
sys     0m6.312s

The insertion rate is improved by more than 10%.

Signed-off-by: Chris Mi <chr...@mellanox.com>
---
 include/libnetlink.h | 27 
 include/utils.h  |  8 +
 lib/libnetlink.c | 30 +-
 lib/utils.c  | 20 
 man/man8/tc.8|  5 +++
 tc/m_action.c| 63 -
 tc/tc.c  | 27 ++--
 tc/tc_common.h   |  3 ++
 tc/tc_filter.c   | 89 
 9 files changed, 221 insertions(+), 51 deletions(-)

diff --git a/include/libnetlink.h b/include/libnetlink.h
index a4d83b9e..07e88c94 100644
--- a/include/libnetlink.h
+++ b/include/libnetlink.h
@@ -13,6 +13,8 @@
 #include 
 #include 
 
+#define MSG_IOV_MAX 256
+
 struct rtnl_handle {
int fd;
struct sockaddr_nl  local;
@@ -93,6 +95,31 @@ int rtnl_dump_filter_nc(struct rtnl_handle *rth,
void *arg, __u16 nc_flags);
 #define rtnl_dump_filter(rth, filter, arg) \
rtnl_dump_filter_nc(rth, filter, arg, 0)
+
+extern int msg_iov_index;
+static inline int get_msg_iov_index(void)
+{
+   return msg_iov_index;
+}
+static inline void set_msg_iov_index(int index)
+{
+   msg_iov_index = index;
+}
+static inline void incr_msg_iov_index(void)
+{
+   ++msg_iov_index;
+}
+
+extern int batch_size;
+static inline int get_batch_size(void)
+{
+   return batch_size;
+}
+static inline void set_batch_size(int size)
+{
+   batch_size = size;
+}
+
 int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
  struct nlmsghdr **answer)
__attribute__((warn_unused_result));
diff --git a/include/utils.h b/include/utils.h
index d3895d56..66cb4747 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -235,6 +235,14 @@ void print_nlmsg_timestamp(FILE *fp, const struct nlmsghdr 
*n);
 
 extern int cmdlineno;
 ssize_t getcmdline(char **line, size_t *len, FILE *in);
+
+extern int cmdlinetotal;
+static inline int getcmdlinetotal(void)
+{
+   return cmdlinetotal;
+}
+void setcmdlinetotal(const char *name);
+
 int makeargs(char *line, char *argv[], int maxargs);
 int inet_get_addr(const char *src, __u32 *dst, struct in6_addr *dst6);
 
diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index 00e6ce0c..f9be1c6d 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -24,6 +24,7 @@
 #include 
 
 #include "libnetlink.h"
+#include "utils.h"
 
 #ifndef SOL_NETLINK
 #define SOL_NETLINK 270
@@ -581,6 +582,10 @@ static void rtnl_talk_error(struct nlmsghdr *h, struct 
nlmsgerr *err,
strerror(-err->error));
 }
 
+static struct iovec msg_iov[MSG_IOV_MAX];
+int msg_iov_index;
+int batch_size = 1;
+
 static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
   struct nlmsghdr **answer,
   bool show_rtnl_err, nl_ext_ack_fn_t errfn)
@@ -589,23 +594,34 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct 
nlmsghdr *n,
unsigned int seq;
struct nlmsghdr *h;
struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
-   struct iovec iov = {
-   .iov_base = n,
-   .iov_len = n->nlmsg_len
-   };
+   struct iovec *iov = _iov[get_msg_iov_index()];
+   int index;
+   char *buf;
+
+   iov->iov_base = n;
+   iov->iov_len = n->nlmsg_len;
+
+   incr_msg_iov_index();
struct msghdr msg = {
.msg_name = ,
.msg_namelen = sizeof(nladdr),
-   .msg_iov = ,
-   .msg_iovlen = 1,
+   .msg_iov = msg_iov,
+   .msg_iovlen = get_msg_iov_index(),
};
-   char *buf;
 
n->nlmsg_seq = seq = ++rtnl->seq;
 
if (answer == NULL)
n->nlmsg_flags

RE: [patch iproute2] tc: fix command "tc actions del" hang issue

2017-12-17 Thread Chris Mi
> -Original Message-
> From: Stephen Hemminger [mailto:step...@networkplumber.org]
> Sent: Friday, December 15, 2017 1:17 PM
> To: Chris Mi <chr...@mellanox.com>
> Cc: netdev@vger.kernel.org; j...@resnulli.us
> Subject: Re: [patch iproute2] tc: fix command "tc actions del" hang issue
> 
> On Thu, 14 Dec 2017 18:09:00 +0900
> Chris Mi <chr...@mellanox.com> wrote:
> 
> > If command is RTM_DELACTION, a non-NULL pointer is passed to rtnl_talk().
> > Then flag NLM_F_ACK is not set on n->nlmsg_flags and netlink_ack()
> > will not be called. Command tc will wait for the reply for ever.
> >
> > Fixes: 86bf43c7c2fd ("lib/libnetlink: update rtnl_talk to support
> > malloc buff at run time")
> > Signed-off-by: Chris Mi <chr...@mellanox.com>
> > Reviewed-by: Jiri Pirko <j...@mellanox.com>
> 
> Thanks for fixing this.
> Applied, but please don't linewrap the fixes tag.
Thanks for fixing it. I'll pay attention to it next time.


[patch iproute2] tc: fix command "tc actions del" hang issue

2017-12-14 Thread Chris Mi
If command is RTM_DELACTION, a non-NULL pointer is passed to rtnl_talk().
Then flag NLM_F_ACK is not set on n->nlmsg_flags and netlink_ack() will
not be called. Command tc will wait for the reply for ever.

Fixes: 86bf43c7c2fd ("lib/libnetlink: update rtnl_talk to support malloc
buff at run time")
Signed-off-by: Chris Mi <chr...@mellanox.com>
Reviewed-by: Jiri Pirko <j...@mellanox.com>
---
 tc/m_action.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tc/m_action.c b/tc/m_action.c
index afb6cfad..986ef7d0 100644
--- a/tc/m_action.c
+++ b/tc/m_action.c
@@ -529,7 +529,7 @@ static int tc_action_gd(int cmd, unsigned int flags,
 
req.n.nlmsg_seq = rth.dump = ++rth.seq;
 
-   if (rtnl_talk(, , ) < 0) {
+   if (rtnl_talk(, , cmd == RTM_DELACTION ? NULL : ) < 0) {
fprintf(stderr, "We have an error talking to the kernel\n");
return 1;
}
-- 
2.14.3



RE: [patch net v2 1/4] net/sched: Change tc_action refcnt and bindcnt to atomic

2017-10-24 Thread Chris Mi
> -Original Message-
> From: Cong Wang [mailto:xiyou.wangc...@gmail.com]
> Sent: Monday, October 23, 2017 11:40 PM
> To: Chris Mi <chr...@mellanox.com>
> Cc: Jamal Hadi Salim <j...@mojatatu.com>; Linux Kernel Network Developers
> <netdev@vger.kernel.org>; Lucas Bates <luc...@mojatatu.com>; Jiri Pirko
> <j...@resnulli.us>; David Miller <da...@davemloft.net>
> Subject: Re: [patch net v2 1/4] net/sched: Change tc_action refcnt and
> bindcnt to atomic
> 
> On Sun, Oct 22, 2017 at 7:47 PM, Chris Mi <chr...@mellanox.com> wrote:
> >
> > It seems it is not easy to discard call_rcu().  I'm afraid even if we
> > have a final solution without call_rcu(), it is not mature at the
> > beginning as well. I mean we also need time
> 
> Why do you believe it is not easy? RTNL lock is already there,
> list_splice_init_rcu() is there too. I can naturally divide my patches for 
> each
> module so that they are much easier to backport than yours.
I tested your patches. It takes about 17 seconds to run the same test.
I believe your code is simpler and easier to maintain. If it is stable,
we will also benefit. Thanks!
> 
> 
> > to fix the possible bugs of the new design. And maybe to destroy the
> > filters in parallel is the right direction. If this bug is the last
> > bug brought by call_rcu(), then changing it may not be a good idea.
> 
> Again, you have to prove this is the last bug, I seriously doubt it is.
> 
> 
> >
> > Patch 1 is straightforward to use atomic. Patch 2 is to convert the list to
> array.
> 
> Both are big in size.
> 
> 
> > I think there is no harm to the new design.  Patch 3 and 4 are useful test
> case.
> 
> It definitely doesn't harm, but why do we waste time on it when we know
> there is a better way? It is clearly not easy for backport either, in fact it 
> is
> harder w.r.t. size.
> 
> 
> > We also need it with new design to make sure there is no regression.
> >
> 
> Are you saying I can't trust your test cases? ;)
> 
> 
> > So I think my patch set should not be held so long time.
> 
> I think your patches should be dropped except the last two, I will take the
> last two for you.
> 
> Thanks!


RE: [Patch net 14/15] selftests: Introduce a new script to generate tc batch file

2017-10-23 Thread Chris Mi
> -Original Message-
> From: Cong Wang [mailto:xiyou.wangc...@gmail.com]
> Sent: Tuesday, October 24, 2017 6:03 AM
> To: netdev@vger.kernel.org
> Cc: paul...@linux.vnet.ibm.com; j...@mojatatu.com;
> john.fastab...@gmail.com; Chris Mi <chr...@mellanox.com>; Cong Wang
> <xiyou.wangc...@gmail.com>
> Subject: [Patch net 14/15] selftests: Introduce a new script to generate tc
> batch file
> 
> From: Chris Mi <chr...@mellanox.com>
> 
> From: Chris Mi <chr...@mellanox.com>
> 
>   # ./tdc_batch.py -h
>   usage: tdc_batch.py [-h] [-n NUMBER] [-o] [-s] [-p] device file
> 
>   TC batch file generator
> 
>   positional arguments:
> devicedevice name
> file  batch file name
> 
>   optional arguments:
> -h, --helpshow this help message and exit
> -n NUMBER, --number NUMBER
>   how many lines in batch file
> -o, --skip_sw skip_sw (offload), by default skip_hw
> -s, --share_actionall filters share the same action
> -p, --prioall filters have different prio
> 
> Acked-by: Jamal Hadi Salim <j...@mojatatu.com>
> Acked-by: Lucas Bates <luc...@mojatatu.com>
> Signed-off-by: Chris Mi <chr...@mellanox.com>
> Signed-off-by: Cong Wang <xiyou.wangc...@gmail.com>
> ---
>  tools/testing/selftests/tc-testing/tdc_batch.py | 62
> +
>  1 file changed, 62 insertions(+)
>  create mode 100644 tools/testing/selftests/tc-testing/tdc_batch.py
> 
> diff --git a/tools/testing/selftests/tc-testing/tdc_batch.py
> b/tools/testing/selftests/tc-testing/tdc_batch.py
> new file mode 100644
File mode should be 755.
> index ..707c6bfef689
> --- /dev/null
> +++ b/tools/testing/selftests/tc-testing/tdc_batch.py
> @@ -0,0 +1,62 @@
> +#!/usr/bin/python3
> +
> +"""
> +tdc_batch.py - a script to generate TC batch file
> +
> +Copyright (C) 2017 Chris Mi <chr...@mellanox.com> """
> +
> +import argparse
> +
> +parser = argparse.ArgumentParser(description='TC batch file generator')
> +parser.add_argument("device", help="device name")
> +parser.add_argument("file", help="batch file name")
> +parser.add_argument("-n", "--number", type=int,
> +help="how many lines in batch file")
> +parser.add_argument("-o", "--skip_sw",
> +help="skip_sw (offload), by default skip_hw",
> +action="store_true") parser.add_argument("-s",
> +"--share_action",
> +help="all filters share the same action",
> +action="store_true") parser.add_argument("-p",
> +"--prio",
> +help="all filters have different prio",
> +action="store_true") args = parser.parse_args()
> +
> +device = args.device
> +file = open(args.file, 'w')
> +
> +number = 1
> +if args.number:
> +number = args.number
> +
> +skip = "skip_hw"
> +if args.skip_sw:
> +skip = "skip_sw"
> +
> +share_action = ""
> +if args.share_action:
> +share_action = "index 1"
> +
> +prio = "prio 1"
> +if args.prio:
> +prio = ""
> +if number > 0x4000:
> +number = 0x4000
> +
> +index = 0
> +for i in range(0x100):
> +for j in range(0x100):
> +for k in range(0x100):
> +mac = ("%02x:%02x:%02x" % (i, j, k))
> +src_mac = "e4:11:00:" + mac
> +dst_mac = "e4:12:00:" + mac
> +cmd = ("filter add dev %s %s protocol ip parent : flower %s "
> +   "src_mac %s dst_mac %s action drop %s" %
> +   (device, prio, skip, src_mac, dst_mac, share_action))
> +file.write("%s\n" % cmd)
> +index += 1
> +if index >= number:
> +file.close()
> +exit(0)
> --
> 2.13.0



RE: [patch net v2 1/4] net/sched: Change tc_action refcnt and bindcnt to atomic

2017-10-22 Thread Chris Mi
> -Original Message-
> From: Cong Wang [mailto:xiyou.wangc...@gmail.com]
> Sent: Friday, October 20, 2017 11:00 AM
> To: Jamal Hadi Salim <j...@mojatatu.com>
> Cc: Chris Mi <chr...@mellanox.com>; Linux Kernel Network Developers
> <netdev@vger.kernel.org>; Lucas Bates <luc...@mojatatu.com>; Jiri Pirko
> <j...@resnulli.us>; David Miller <da...@davemloft.net>
> Subject: Re: [patch net v2 1/4] net/sched: Change tc_action refcnt and
> bindcnt to atomic
> 
> On Thu, Oct 19, 2017 at 7:21 AM, Jamal Hadi Salim <j...@mojatatu.com>
> wrote:
> > On 17-10-18 12:43 PM, Cong Wang wrote:
> >>
> >> On Tue, Oct 17, 2017 at 6:03 PM, Chris Mi <chr...@mellanox.com> wrote:
> >>>>
> >>>> -Original Message-
> >
> >
> >>
> >> You listed 3 problems, and you think they are 3 different ones, here
> >> I argue problem 3 (using RCU callbacks) is the cause of problem 1
> >> (refcnt not atomic). This is why I mentioned I have been thinking
> >> about removing RCU callbacks, because it probably could fix all of them.
> >>
> >
> > Cong,
> > Given this is a known bug (the test case Chris presented crashes the
> > kernel) - would it make sense to have a patch that goes to -net to fix
> > this while your approach and discussion outcome goes into net-next?
> 
> I am not sure. Because Chris' patchset is large too and I don't think it 
> could fix
> all crashes, so it has little value to just apply them for -net.

It seems it is not easy to discard call_rcu(). I'm afraid that even if we
have a final solution without call_rcu(), it will not be mature at the
beginning either. I mean we will also need time to fix the possible bugs of
the new design. And maybe destroying the filters in parallel is the right
direction. If this bug is the last bug brought by call_rcu(), then changing
it may not be a good idea.
 
Patch 1 is a straightforward change to use atomics. Patch 2 converts the
list to an array. I think they do no harm to the new design. Patches 3 and 4
are useful test cases. We also need them with the new design to make sure
there is no regression.

So I think my patch set should not be held for so long.

My two cents.


RE: [patch net v2 1/4] net/sched: Change tc_action refcnt and bindcnt to atomic

2017-10-17 Thread Chris Mi
> -Original Message-
> From: Cong Wang [mailto:xiyou.wangc...@gmail.com]
> Sent: Tuesday, October 17, 2017 11:53 PM
> To: Chris Mi <chr...@mellanox.com>
> Cc: Linux Kernel Network Developers <netdev@vger.kernel.org>; Jamal Hadi
> Salim <j...@mojatatu.com>; Lucas Bates <luc...@mojatatu.com>; Jiri Pirko
> <j...@resnulli.us>; David Miller <da...@davemloft.net>
> Subject: Re: [patch net v2 1/4] net/sched: Change tc_action refcnt and
> bindcnt to atomic
> 
> On Mon, Oct 16, 2017 at 6:14 PM, Chris Mi <chr...@mellanox.com> wrote:
> > I don't think this bug were introduced by above two commits only.
> > Actually, this bug were introduced by several commits, at least the
> following:
> > 1. refcnt and bindcnt are not atomic
> 
> Nope, it is perfectly okay with non-atomic as long as no parallel, and without
> RCU callback they are perfectly serialized by RTNL.
Agree.
> 
> 
> > 2. passing actions using list instead of arrays (I think initially we
> > are using arrays)
> 
> We are discussing patch 1/4, this is patch 2/4, so irrelevant.
Agree.
> 
> 
> > 3. using RCU callbacks
> 
> This introduces problem 1.
I think this patch set only fixes one problem, that's the race and the panic.
What do you mean by problem 1?
> 
> 
> > So instead of blaming the latest commit, it is better to say it is a 
> > pre-git error.
> 
> You are wrong.
OK, you are right. But could I know what's your suggestion for this patch set?
1. reject it?
2. change the "Fixes" as you suggested?
3. something else?

Thanks,
Chris


RE: [patch net v3 2/4] net/sched: Use action array instead of action list as parameter

2017-10-17 Thread Chris Mi


> -Original Message-
> From: Cong Wang [mailto:xiyou.wangc...@gmail.com]
> Sent: Wednesday, October 18, 2017 12:56 AM
> To: Chris Mi <chr...@mellanox.com>
> Cc: Linux Kernel Network Developers <netdev@vger.kernel.org>; Jamal Hadi
> Salim <j...@mojatatu.com>; Lucas Bates <luc...@mojatatu.com>; Jiri Pirko
> <j...@resnulli.us>; David Miller <da...@davemloft.net>
> Subject: Re: [patch net v3 2/4] net/sched: Use action array instead of action
> list as parameter
> 
> On Mon, Oct 16, 2017 at 6:20 PM, Chris Mi <chr...@mellanox.com> wrote:
> > When destroying filters, actions should be destroyed first.
> > The pointers of each action are saved in an array. TC doesn't use the
> > array directly, but put all actions in a doubly linked list and use
> > that list as parameter.
> >
> > There is no problem if each filter has its own actions. But if some
> > filters share the same action, when these filters are destroyed, RCU
> > callback fl_destroy_filter() may be called at the same time. That
> > means the same action's 'struct list_head list'
> > could be manipulated at the same time. It may point to an invalid
> > address so that system will panic.
> 
> So if we remove these RCU callbacks (by adding a synchronize_rcu) this is not a
> problem, right? 
Maybe you are right. But do you think it will cause a performance issue?
I mean, will it take longer to destroy filters when using synchronize_rcu()?
Or are there any races other than the RCU callbacks?
We haven't found any. This is the only one we found.
> 
> 
> >
> > This patch uses the action array directly to fix this issue.
> >
> > Fixes commit in pre-git era.
> >
> > Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
> 
> This is wrong too. RCU callbacks were introduced very late.


[patch net v3 4/4] selftests: Introduce a new test case to tc testsuite

2017-10-16 Thread Chris Mi
In this patchset, we fixed a tc bug. This patch adds the test case
that reproduces the bug. To run this test case, user should specify
an existing NIC device:
  # sudo ./tdc.py -d enp4s0f0

This test case belongs to the "flower" category. If the user doesn't specify
a NIC device, the test cases belonging to "flower" will not be run.

In this test case, we create 1M filters and all filters share the same
action. When destroying all filters, kernel should not panic. It takes
about 18s to run it.

Signed-off-by: Chris Mi <chr...@mellanox.com>
Acked-by: Jamal Hadi Salim <j...@mojatatu.com>
Acked-by: Lucas Bates <luc...@mojatatu.com>
---
 .../tc-testing/tc-tests/filters/tests.json | 23 +-
 tools/testing/selftests/tc-testing/tdc.py  | 20 +++
 tools/testing/selftests/tc-testing/tdc_config.py   |  2 ++
 3 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json 
b/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json
index c727b96..5fa02d8 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json
@@ -17,5 +17,26 @@
 "teardown": [
 "$TC qdisc del dev $DEV1 ingress"
 ]
+},
+{
+"id": "d052",
+"name": "Add 1M filters with the same action",
+"category": [
+"filter",
+"flower"
+],
+"setup": [
+"$TC qdisc add dev $DEV2 ingress",
+"./tdc_batch.py $DEV2 $BATCH_FILE --share_action -n 100"
+],
+"cmdUnderTest": "$TC -b $BATCH_FILE",
+"expExitCode": "0",
+"verifyCmd": "$TC actions list action gact",
+"matchPattern": "action order 0: gact action drop.*index 1 ref 100 
bind 100",
+"matchCount": "1",
+"teardown": [
+"$TC qdisc del dev $DEV2 ingress",
+"/bin/rm $BATCH_FILE"
+]
 }
-]
\ No newline at end of file
+]
diff --git a/tools/testing/selftests/tc-testing/tdc.py 
b/tools/testing/selftests/tc-testing/tdc.py
index cd61b78..5f11f5d 100755
--- a/tools/testing/selftests/tc-testing/tdc.py
+++ b/tools/testing/selftests/tc-testing/tdc.py
@@ -88,7 +88,7 @@ def prepare_env(cmdlist):
 exit(1)
 
 
-def test_runner(filtered_tests):
+def test_runner(filtered_tests, args):
 """
 Driver function for the unit tests.
 
@@ -105,6 +105,8 @@ def test_runner(filtered_tests):
 for tidx in testlist:
 result = True
 tresult = ""
+if "flower" in tidx["category"] and args.device == None:
+continue
 print("Test " + tidx["id"] + ": " + tidx["name"])
 prepare_env(tidx["setup"])
 (p, procout) = exec_cmd(tidx["cmdUnderTest"])
@@ -152,6 +154,10 @@ def ns_create():
 exec_cmd(cmd, False)
 cmd = 'ip -s $NS link set $DEV1 up'
 exec_cmd(cmd, False)
+cmd = 'ip link set $DEV2 netns $NS'
+exec_cmd(cmd, False)
+cmd = 'ip -s $NS link set $DEV2 up'
+exec_cmd(cmd, False)
 
 
 def ns_destroy():
@@ -211,7 +217,8 @@ def set_args(parser):
 help='Execute the single test case with specified ID')
 parser.add_argument('-i', '--id', action='store_true', dest='gen_id',
 help='Generate ID numbers for new test cases')
-return parser
+parser.add_argument('-d', '--device',
+help='Execute the test case in flower category')
 return parser
 
 
@@ -225,6 +232,8 @@ def check_default_settings(args):
 
 if args.path != None:
  NAMES['TC'] = args.path
+if args.device != None:
+ NAMES['DEV2'] = args.device
 if not os.path.isfile(NAMES['TC']):
 print("The specified tc path " + NAMES['TC'] + " does not exist.")
 exit(1)
@@ -381,14 +390,17 @@ def set_operation_mode(args):
 if (len(alltests) == 0):
 print("Cannot find a test case with ID matching " + target_id)
 exit(1)
-catresults = test_runner(alltests)
+catresults = test_runner(alltests, args)
 print("All test results: " + "\n\n" + catresults)
 elif (len(target_category) > 0):
+if (target_category == "flower") and args.device == None:
+print("Please specify a NIC device (-d) to run category flower")
+exit(1)
 if (target_category not in ucat):
 print("Specifie

[patch net v3 3/4] selftests: Introduce a new script to generate tc batch file

2017-10-16 Thread Chris Mi
  # ./tdc_batch.py -h
  usage: tdc_batch.py [-h] [-n NUMBER] [-o] [-s] [-p] device file

  TC batch file generator

  positional arguments:
devicedevice name
file  batch file name

  optional arguments:
-h, --helpshow this help message and exit
-n NUMBER, --number NUMBER
  how many lines in batch file
-o, --skip_sw skip_sw (offload), by default skip_hw
-s, --share_actionall filters share the same action
-p, --prioall filters have different prio

Signed-off-by: Chris Mi <chr...@mellanox.com>
Acked-by: Jamal Hadi Salim <j...@mojatatu.com>
Acked-by: Lucas Bates <luc...@mojatatu.com>
---
 tools/testing/selftests/tc-testing/tdc_batch.py | 62 +
 1 file changed, 62 insertions(+)
 create mode 100755 tools/testing/selftests/tc-testing/tdc_batch.py

diff --git a/tools/testing/selftests/tc-testing/tdc_batch.py 
b/tools/testing/selftests/tc-testing/tdc_batch.py
new file mode 100755
index 000..707c6bf
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tdc_batch.py
@@ -0,0 +1,62 @@
+#!/usr/bin/python3
+
+"""
+tdc_batch.py - a script to generate TC batch file
+
+Copyright (C) 2017 Chris Mi <chr...@mellanox.com>
+"""
+
+import argparse
+
+parser = argparse.ArgumentParser(description='TC batch file generator')
+parser.add_argument("device", help="device name")
+parser.add_argument("file", help="batch file name")
+parser.add_argument("-n", "--number", type=int,
+help="how many lines in batch file")
+parser.add_argument("-o", "--skip_sw",
+help="skip_sw (offload), by default skip_hw",
+action="store_true")
+parser.add_argument("-s", "--share_action",
+help="all filters share the same action",
+action="store_true")
+parser.add_argument("-p", "--prio",
+help="all filters have different prio",
+action="store_true")
+args = parser.parse_args()
+
+device = args.device
+file = open(args.file, 'w')
+
+number = 1
+if args.number:
+number = args.number
+
+skip = "skip_hw"
+if args.skip_sw:
+skip = "skip_sw"
+
+share_action = ""
+if args.share_action:
+share_action = "index 1"
+
+prio = "prio 1"
+if args.prio:
+prio = ""
+if number > 0x4000:
+number = 0x4000
+
+index = 0
+for i in range(0x100):
+for j in range(0x100):
+for k in range(0x100):
+mac = ("%02x:%02x:%02x" % (i, j, k))
+src_mac = "e4:11:00:" + mac
+dst_mac = "e4:12:00:" + mac
+cmd = ("filter add dev %s %s protocol ip parent : flower %s "
+   "src_mac %s dst_mac %s action drop %s" %
+   (device, prio, skip, src_mac, dst_mac, share_action))
+file.write("%s\n" % cmd)
+index += 1
+if index >= number:
+file.close()
+exit(0)
-- 
1.8.3.1



[patch net v3 2/4] net/sched: Use action array instead of action list as parameter

2017-10-16 Thread Chris Mi
When destroying filters, actions should be destroyed first.
The pointers of each action are saved in an array. TC doesn't
use the array directly, but put all actions in a doubly linked
list and use that list as parameter.

There is no problem if each filter has its own actions. But if
some filters share the same action, when these filters are
destroyed, RCU callback fl_destroy_filter() may be called at the
same time. That means the same action's 'struct list_head list'
could be manipulated at the same time. It may point to an invalid
address so that system will panic.

This patch uses the action array directly to fix this issue.

Fixes commit in pre-git era.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Chris Mi <chr...@mellanox.com>
Acked-by: Jamal Hadi Salim <j...@mojatatu.com>
---
 include/net/act_api.h |   7 ++--
 net/sched/act_api.c   | 103 +++---
 net/sched/cls_api.c   |  18 +++--
 3 files changed, 75 insertions(+), 53 deletions(-)

diff --git a/include/net/act_api.h b/include/net/act_api.h
index a469ab6..081a313 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -148,16 +148,17 @@ static inline int tcf_idr_release(struct tc_action *a, 
bool bind)
 int tcf_register_action(struct tc_action_ops *a, struct pernet_operations 
*ops);
 int tcf_unregister_action(struct tc_action_ops *a,
  struct pernet_operations *ops);
-int tcf_action_destroy(struct list_head *actions, int bind);
+int tcf_action_destroy(struct tc_action **actions, int nr, int bind);
 int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions,
int nr_actions, struct tcf_result *res);
 int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
struct nlattr *est, char *name, int ovr, int bind,
-   struct list_head *actions);
+   struct tc_action **actions, int *nr);
 struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
struct nlattr *nla, struct nlattr *est,
char *name, int ovr, int bind);
-int tcf_action_dump(struct sk_buff *skb, struct list_head *, int, int);
+int tcf_action_dump(struct sk_buff *skb, struct tc_action **actions, int nr,
+   int bind, int ref);
 int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int);
 int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int);
 int tcf_action_copy_stats(struct sk_buff *, struct tc_action *, int);
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 9c0224d..391d560 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -513,13 +513,15 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action 
**actions,
 }
 EXPORT_SYMBOL(tcf_action_exec);
 
-int tcf_action_destroy(struct list_head *actions, int bind)
+int tcf_action_destroy(struct tc_action **actions, int nr, int bind)
 {
const struct tc_action_ops *ops;
-   struct tc_action *a, *tmp;
+   struct tc_action *a;
int ret = 0;
+   int i;
 
-   list_for_each_entry_safe(a, tmp, actions, list) {
+   for (i = 0; i < nr; i++) {
+   a = actions[i];
ops = a->ops;
ret = __tcf_idr_release(a, bind, true);
if (ret == ACT_P_DELETED)
@@ -568,14 +570,16 @@ int tcf_action_destroy(struct list_head *actions, int 
bind)
 }
 EXPORT_SYMBOL(tcf_action_dump_1);
 
-int tcf_action_dump(struct sk_buff *skb, struct list_head *actions,
+int tcf_action_dump(struct sk_buff *skb, struct tc_action **actions, int nr,
int bind, int ref)
 {
struct tc_action *a;
-   int err = -EINVAL;
struct nlattr *nest;
+   int err = -EINVAL;
+   int i;
 
-   list_for_each_entry(a, actions, list) {
+   for (i = 0; i < nr; i++) {
+   a = actions[i];
nest = nla_nest_start(skb, a->order);
if (nest == NULL)
goto nla_put_failure;
@@ -700,10 +704,7 @@ struct tc_action *tcf_action_init_1(struct net *net, 
struct tcf_proto *tp,
if (TC_ACT_EXT_CMP(a->tcfa_action, TC_ACT_GOTO_CHAIN)) {
err = tcf_action_goto_chain_init(a, tp);
if (err) {
-   LIST_HEAD(actions);
-
-   list_add_tail(>list, );
-   tcf_action_destroy(, bind);
+   tcf_action_destroy(, 1, bind);
return ERR_PTR(err);
}
}
@@ -720,23 +721,27 @@ struct tc_action *tcf_action_init_1(struct net *net, 
struct tcf_proto *tp,
return ERR_PTR(err);
 }
 
-static void cleanup_a(struct list_head *actions, int ovr)
+static void cleanup_a(struct tc_action **actions, int nr, int ovr)
 {
struct tc_action *a;
+   int i;
 
if (!ovr)
return;

[patch net v3 1/4] net/sched: Change tc_action refcnt and bindcnt to atomic

2017-10-16 Thread Chris Mi
If many filters share the same action, that action's refcnt and bindcnt
could be manipulated by many RCU callbacks at the same time. This patch
makes these operations atomic.

Fixes commit in pre-git era.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Chris Mi <chr...@mellanox.com>
Acked-by: Jamal Hadi Salim <j...@mojatatu.com>
---
 include/net/act_api.h  |  4 ++--
 net/sched/act_api.c| 21 +++--
 net/sched/act_bpf.c|  4 ++--
 net/sched/act_connmark.c   |  4 ++--
 net/sched/act_csum.c   |  4 ++--
 net/sched/act_gact.c   |  4 ++--
 net/sched/act_ife.c|  4 ++--
 net/sched/act_ipt.c|  4 ++--
 net/sched/act_mirred.c |  4 ++--
 net/sched/act_nat.c|  4 ++--
 net/sched/act_pedit.c  |  4 ++--
 net/sched/act_police.c |  4 ++--
 net/sched/act_sample.c |  4 ++--
 net/sched/act_simple.c |  4 ++--
 net/sched/act_skbedit.c|  4 ++--
 net/sched/act_skbmod.c |  4 ++--
 net/sched/act_tunnel_key.c |  4 ++--
 net/sched/act_vlan.c   |  4 ++--
 18 files changed, 45 insertions(+), 44 deletions(-)

diff --git a/include/net/act_api.h b/include/net/act_api.h
index b944e0eb..a469ab6 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -25,8 +25,8 @@ struct tc_action {
struct tcf_idrinfo  *idrinfo;
 
u32 tcfa_index;
-   int tcfa_refcnt;
-   int tcfa_bindcnt;
+   atomic_ttcfa_refcnt;
+   atomic_ttcfa_bindcnt;
u32 tcfa_capab;
int tcfa_action;
struct tcf_ttcfa_tm;
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index da6fa82..9c0224d 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -88,12 +88,13 @@ int __tcf_idr_release(struct tc_action *p, bool bind, bool 
strict)
 
if (p) {
if (bind)
-   p->tcfa_bindcnt--;
-   else if (strict && p->tcfa_bindcnt > 0)
+   atomic_dec(>tcfa_bindcnt);
+   else if (strict && atomic_read(>tcfa_bindcnt) > 0)
return -EPERM;
 
-   p->tcfa_refcnt--;
-   if (p->tcfa_bindcnt <= 0 && p->tcfa_refcnt <= 0) {
+   atomic_dec(>tcfa_refcnt);
+   if (atomic_read(>tcfa_bindcnt) == 0 &&
+   atomic_read(>tcfa_refcnt) == 0) {
if (p->ops->cleanup)
p->ops->cleanup(p, bind);
tcf_idr_remove(p->idrinfo, p);
@@ -245,8 +246,8 @@ bool tcf_idr_check(struct tc_action_net *tn, u32 index, 
struct tc_action **a,
 
if (index && p) {
if (bind)
-   p->tcfa_bindcnt++;
-   p->tcfa_refcnt++;
+   atomic_inc(>tcfa_bindcnt);
+   atomic_inc(>tcfa_refcnt);
*a = p;
return true;
}
@@ -274,9 +275,9 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, 
struct nlattr *est,
 
if (unlikely(!p))
return -ENOMEM;
-   p->tcfa_refcnt = 1;
+   atomic_set(>tcfa_refcnt, 1);
if (bind)
-   p->tcfa_bindcnt = 1;
+   atomic_set(>tcfa_bindcnt, 1);
 
if (cpustats) {
p->cpu_bstats = netdev_alloc_pcpu_stats(struct 
gnet_stats_basic_cpu);
@@ -727,7 +728,7 @@ static void cleanup_a(struct list_head *actions, int ovr)
return;
 
list_for_each_entry(a, actions, list)
-   a->tcfa_refcnt--;
+   atomic_dec(>tcfa_refcnt);
 }
 
 int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
@@ -751,7 +752,7 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, 
struct nlattr *nla,
}
act->order = i;
if (ovr)
-   act->tcfa_refcnt++;
+   atomic_inc(>tcfa_refcnt);
list_add_tail(>list, actions);
}
 
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index c0c707e..4ddf281 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -141,8 +141,8 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct 
tc_action *act,
struct tcf_bpf *prog = to_bpf(act);
struct tc_act_bpf opt = {
.index   = prog->tcf_index,
-   .refcnt  = prog->tcf_refcnt - ref,
-   .bindcnt = prog->tcf_bindcnt - bind,
+   .refcnt  = atomic_read(>tcf_refcnt) - ref,
+   .bindcnt = atomic_read(>tcf_bindcnt) - bind,
.action  = prog->tcf_action,
};
struct tcf_t tm;
diff --g

[patch net v3 0/4] net/sched: Fix a system panic when deleting filters

2017-10-16 Thread Chris Mi
If some filters share the same action, when deleting these filters,
it is possible to create a system panic. This is because deletions
could be manipulated by many RCU callbacks at the same time.

This patch set fixes these issues. To reproduce the issue run selftests
in patch 3 and 4. To test if the issue was fixed, apply patches 1 and 2
and then repeat the tests.

v2 changelog


Revise the comment and add Acked-by: Jamal Hadi Salim <j...@mojatatu.com>

v3 changelog

Add Acked-by: Lucas Bates <luc...@mojatatu.com> for patch 3 and 4

Chris Mi (4):
  net/sched: Change tc_action refcnt and bindcnt to atomic
  net/sched: Use action array instead of action list as parameter
  selftests: Introduce a new script to generate tc batch file
  selftests: Introduce a new test case to tc testsuite

 include/net/act_api.h  |  11 +-
 net/sched/act_api.c| 124 +
 net/sched/act_bpf.c|   4 +-
 net/sched/act_connmark.c   |   4 +-
 net/sched/act_csum.c   |   4 +-
 net/sched/act_gact.c   |   4 +-
 net/sched/act_ife.c|   4 +-
 net/sched/act_ipt.c|   4 +-
 net/sched/act_mirred.c |   4 +-
 net/sched/act_nat.c|   4 +-
 net/sched/act_pedit.c  |   4 +-
 net/sched/act_police.c |   4 +-
 net/sched/act_sample.c |   4 +-
 net/sched/act_simple.c |   4 +-
 net/sched/act_skbedit.c|   4 +-
 net/sched/act_skbmod.c |   4 +-
 net/sched/act_tunnel_key.c |   4 +-
 net/sched/act_vlan.c   |   4 +-
 net/sched/cls_api.c|  18 +--
 .../tc-testing/tc-tests/filters/tests.json |  23 +++-
 tools/testing/selftests/tc-testing/tdc.py  |  20 +++-
 tools/testing/selftests/tc-testing/tdc_batch.py|  62 +++
 tools/testing/selftests/tc-testing/tdc_config.py   |   2 +
 23 files changed, 222 insertions(+), 102 deletions(-)
 create mode 100755 tools/testing/selftests/tc-testing/tdc_batch.py

-- 
1.8.3.1



RE: [patch net v2 1/4] net/sched: Change tc_action refcnt and bindcnt to atomic

2017-10-16 Thread Chris Mi


> -Original Message-
> From: Cong Wang [mailto:xiyou.wangc...@gmail.com]
> Sent: Tuesday, October 17, 2017 1:06 AM
> To: Chris Mi <chr...@mellanox.com>
> Cc: Linux Kernel Network Developers <netdev@vger.kernel.org>; Jamal Hadi
> Salim <j...@mojatatu.com>; Lucas Bates <luc...@mojatatu.com>; Jiri Pirko
> <j...@resnulli.us>; David Miller <da...@davemloft.net>
> Subject: Re: [patch net v2 1/4] net/sched: Change tc_action refcnt and
> bindcnt to atomic
> 
> On Mon, Oct 16, 2017 at 4:18 AM, Chris Mi <chr...@mellanox.com> wrote:
> > If many filters share the same action. That action's refcnt and
> > bindcnt could be manipulated by many RCU callbacks at the same time.
> > This patch makes these operations atomic.
> 
> Actually I have been thinking about removing these RCU callbacks, they are
> not necessary AFAIK, callers hold RTNL lock so they are allowed to block. The
> only drawback is that adding a synchronize_rcu(), but these are slow paths
> anyway...
> 
> I am not sure, it is arguable anyway, essentially it is:
> 
> synchronize_rcu() in slow path vs.  multiple RCU callback races
> 
> 
> >
> > Fixes commit in pre-git era.
> >
> > Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
> 
> This is not true, the action RCU callbacks were introduced
> by:
> 
> commit c7de2cf053420d63bac85133469c965d4b1083e1
> Author: Eric Dumazet <eric.duma...@gmail.com>
> Date:   Wed Jun 9 02:09:23 2010 +
> 
> pkt_sched: gen_kill_estimator() rcu fixes
> 
> 
> and the filter RCU callbacks were introduced by the patchset like this one:
> 
> 
> commit 1ce87720d456e471de0fbd814dc5d1fe10fc1c44
> Author: John Fastabend <john.fastab...@gmail.com>
> Date:   Fri Sep 12 20:09:16 2014 -0700
> 
> net: sched: make cls_u32 lockless
I don't think this bug was introduced by the above two commits only.
Actually, this bug was introduced by several commits, at least the following:
1. refcnt and bindcnt are not atomic
2. passing actions using a list instead of arrays (I think initially we were
using arrays)
3. using RCU callbacks
So instead of blaming the latest commit, it is better to say it is a pre-git
error.


RE: [patch net v2 4/4] selftests: Introduce a new test case to tc testsuite

2017-10-16 Thread Chris Mi


> -Original Message-
> From: Lucas Bates [mailto:luc...@mojatatu.com]
> Sent: Tuesday, October 17, 2017 12:25 AM
> To: Chris Mi <chr...@mellanox.com>
> Cc: netdev@vger.kernel.org; Jamal Hadi Salim <j...@mojatatu.com>; Cong
> Wang <xiyou.wangc...@gmail.com>; Jiri Pirko <j...@resnulli.us>;
> da...@davemloft.net
> Subject: Re: [patch net v2 4/4] selftests: Introduce a new test case to tc
> testsuite
> 
> On Mon, Oct 16, 2017 at 7:18 AM, Chris Mi <chr...@mellanox.com> wrote:
> > In this patchset, we fixed a tc bug. This patch adds the test case
> > that reproduces the bug. To run this test case, user should specify an
> > existing NIC device:
> >   # sudo ./tdc.py -d enp4s0f0
> >
> > This test case belongs to category "flower". If user doesn't specify a
> > NIC device, the test cases belong to "flower" will not be run.
> >
> > In this test case, we create 1M filters and all filters share the same
> > action. When destroying all filters, kernel should not panic. It takes
> > about 18s to run it.
> >
> > Signed-off-by: Chris Mi <chr...@mellanox.com>
> > Acked-by: Jamal Hadi Salim <j...@mojatatu.com>
> 
> I'm a little wary about adding changes like these into tdc.py directly; I
> don't think it's going to be sustainable in the long run.
> Even the namespace creation I put in to the original version is too specific
> and limiting.
> 
> There are some upcoming changes to tdc to help address these particular
> issues.  I'll ack this for now, thanks.
OK. Thanks for your review, Lucas.
> 
> Acked-by: Lucas Bates <luc...@mojatatu.com>
> 
> 
> > ---
> >  .../tc-testing/tc-tests/filters/tests.json | 23
> +-
> >  tools/testing/selftests/tc-testing/tdc.py  | 20 +++
> >  tools/testing/selftests/tc-testing/tdc_config.py   |  2 ++
> >  3 files changed, 40 insertions(+), 5 deletions(-)
> >
> > diff --git
> > a/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json
> > b/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json
> > index c727b96..5fa02d8 100644
> > --- a/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json
> > +++ b/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json
> > @@ -17,5 +17,26 @@
> >  "teardown": [
> >  "$TC qdisc del dev $DEV1 ingress"
> >  ]
> > +},
> > +{
> > +"id": "d052",
> > +"name": "Add 1M filters with the same action",
> > +"category": [
> > +"filter",
> > +"flower"
> > +],
> > +"setup": [
> > +"$TC qdisc add dev $DEV2 ingress",
> > +"./tdc_batch.py $DEV2 $BATCH_FILE --share_action -n 100"
> > +],
> > +"cmdUnderTest": "$TC -b $BATCH_FILE",
> > +"expExitCode": "0",
> > +"verifyCmd": "$TC actions list action gact",
> > +"matchPattern": "action order 0: gact action drop.*index 1 ref 
> > 100
> bind 100",
> > +"matchCount": "1",
> > +"teardown": [
> > +"$TC qdisc del dev $DEV2 ingress",
> > +"/bin/rm $BATCH_FILE"
> > +]
> >  }
> > -]
> > \ No newline at end of file
> > +]
> > diff --git a/tools/testing/selftests/tc-testing/tdc.py
> > b/tools/testing/selftests/tc-testing/tdc.py
> > index cd61b78..5f11f5d 100755
> > --- a/tools/testing/selftests/tc-testing/tdc.py
> > +++ b/tools/testing/selftests/tc-testing/tdc.py
> > @@ -88,7 +88,7 @@ def prepare_env(cmdlist):
> >  exit(1)
> >
> >
> > -def test_runner(filtered_tests):
> > +def test_runner(filtered_tests, args):
> >  """
> >  Driver function for the unit tests.
> >
> > @@ -105,6 +105,8 @@ def test_runner(filtered_tests):
> >  for tidx in testlist:
> >  result = True
> >  tresult = ""
> > +if "flower" in tidx["category"] and args.device == None:
> > +continue
> >  print("Test " + tidx["id"] + ": " + tidx["name"])
> >  prepare_env(tidx["setup"])
> >  (p, procout) = exec_cmd(tidx["cmdU

RE: [patch net 0/4] net/sched: Fix a system panic when deleting filters

2017-10-16 Thread Chris Mi
Hi Jamal,

> -Original Message-
> From: Jamal Hadi Salim [mailto:j...@mojatatu.com]
> Sent: Monday, October 16, 2017 7:06 PM
> To: Chris Mi <chr...@mellanox.com>; netdev@vger.kernel.org
> Cc: luc...@mojatatu.com; xiyou.wangc...@gmail.com; j...@resnulli.us;
> da...@davemloft.net
> Subject: Re: [patch net 0/4] net/sched: Fix a system panic when deleting
> filters
> 
> 
> Hi Chris,
> 
> On 17-10-16 04:31 AM, Chris Mi wrote:
> > If some filters share the same action, when deleting these filters,
> > system may panic. This patchset fixes this issue. And the test case
> > finding this issue is also integrated into tc test suite of selftests.
> >
> 
> I think this would read better if re-worded as:
> ---
> If some filters share the same action, when deleting these filters, it is
> possible to create a system panic. This is because deletions could be
> manipulated by many RCU callbacks at the same time.
> This patch set fixes these issues.
> To reproduce the issue run selftests in patch 3 and 4.
> To test if the issue was fixed, apply patches 1 and 2 and then repeat the 
> tests.
> --
Done.
> 
> Other than that all look good to me and:
> 
> Acked-by: Jamal Hadi Salim <j...@mojatatu.com>
Done. And thanks for your review.

Thanks,
Chris
> 
> cheers,
> jamal


[patch net v2 3/4] selftests: Introduce a new script to generate tc batch file

2017-10-16 Thread Chris Mi
  # ./tdc_batch.py -h
  usage: tdc_batch.py [-h] [-n NUMBER] [-o] [-s] [-p] device file

  TC batch file generator

  positional arguments:
devicedevice name
file  batch file name

  optional arguments:
-h, --helpshow this help message and exit
-n NUMBER, --number NUMBER
  how many lines in batch file
-o, --skip_sw skip_sw (offload), by default skip_hw
-s, --share_actionall filters share the same action
-p, --prioall filters have different prio

Signed-off-by: Chris Mi <chr...@mellanox.com>
Acked-by: Jamal Hadi Salim <j...@mojatatu.com>
---
 tools/testing/selftests/tc-testing/tdc_batch.py | 62 +
 1 file changed, 62 insertions(+)
 create mode 100755 tools/testing/selftests/tc-testing/tdc_batch.py

diff --git a/tools/testing/selftests/tc-testing/tdc_batch.py 
b/tools/testing/selftests/tc-testing/tdc_batch.py
new file mode 100755
index 000..707c6bf
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tdc_batch.py
@@ -0,0 +1,62 @@
+#!/usr/bin/python3
+
+"""
+tdc_batch.py - a script to generate TC batch file
+
+Copyright (C) 2017 Chris Mi <chr...@mellanox.com>
+"""
+
+import argparse
+
+parser = argparse.ArgumentParser(description='TC batch file generator')
+parser.add_argument("device", help="device name")
+parser.add_argument("file", help="batch file name")
+parser.add_argument("-n", "--number", type=int,
+help="how many lines in batch file")
+parser.add_argument("-o", "--skip_sw",
+help="skip_sw (offload), by default skip_hw",
+action="store_true")
+parser.add_argument("-s", "--share_action",
+help="all filters share the same action",
+action="store_true")
+parser.add_argument("-p", "--prio",
+help="all filters have different prio",
+action="store_true")
+args = parser.parse_args()
+
+device = args.device
+file = open(args.file, 'w')
+
+number = 1
+if args.number:
+number = args.number
+
+skip = "skip_hw"
+if args.skip_sw:
+skip = "skip_sw"
+
+share_action = ""
+if args.share_action:
+share_action = "index 1"
+
+prio = "prio 1"
+if args.prio:
+prio = ""
+if number > 0x4000:
+number = 0x4000
+
+index = 0
+for i in range(0x100):
+for j in range(0x100):
+for k in range(0x100):
+mac = ("%02x:%02x:%02x" % (i, j, k))
+src_mac = "e4:11:00:" + mac
+dst_mac = "e4:12:00:" + mac
+cmd = ("filter add dev %s %s protocol ip parent : flower %s "
+   "src_mac %s dst_mac %s action drop %s" %
+   (device, prio, skip, src_mac, dst_mac, share_action))
+file.write("%s\n" % cmd)
+index += 1
+if index >= number:
+file.close()
+exit(0)
-- 
1.8.3.1



[patch net v2 1/4] net/sched: Change tc_action refcnt and bindcnt to atomic

2017-10-16 Thread Chris Mi
If many filters share the same action, that action's refcnt and bindcnt
could be manipulated by many RCU callbacks at the same time. This patch
makes these operations atomic.

Fixes commit in pre-git era.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Chris Mi <chr...@mellanox.com>
Acked-by: Jamal Hadi Salim <j...@mojatatu.com>
---
 include/net/act_api.h  |  4 ++--
 net/sched/act_api.c| 21 +++--
 net/sched/act_bpf.c|  4 ++--
 net/sched/act_connmark.c   |  4 ++--
 net/sched/act_csum.c   |  4 ++--
 net/sched/act_gact.c   |  4 ++--
 net/sched/act_ife.c|  4 ++--
 net/sched/act_ipt.c|  4 ++--
 net/sched/act_mirred.c |  4 ++--
 net/sched/act_nat.c|  4 ++--
 net/sched/act_pedit.c  |  4 ++--
 net/sched/act_police.c |  4 ++--
 net/sched/act_sample.c |  4 ++--
 net/sched/act_simple.c |  4 ++--
 net/sched/act_skbedit.c|  4 ++--
 net/sched/act_skbmod.c |  4 ++--
 net/sched/act_tunnel_key.c |  4 ++--
 net/sched/act_vlan.c   |  4 ++--
 18 files changed, 45 insertions(+), 44 deletions(-)

diff --git a/include/net/act_api.h b/include/net/act_api.h
index b944e0eb..a469ab6 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -25,8 +25,8 @@ struct tc_action {
struct tcf_idrinfo  *idrinfo;
 
u32 tcfa_index;
-   int tcfa_refcnt;
-   int tcfa_bindcnt;
+   atomic_ttcfa_refcnt;
+   atomic_ttcfa_bindcnt;
u32 tcfa_capab;
int tcfa_action;
struct tcf_ttcfa_tm;
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index da6fa82..9c0224d 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -88,12 +88,13 @@ int __tcf_idr_release(struct tc_action *p, bool bind, bool 
strict)
 
if (p) {
if (bind)
-   p->tcfa_bindcnt--;
-   else if (strict && p->tcfa_bindcnt > 0)
+   atomic_dec(>tcfa_bindcnt);
+   else if (strict && atomic_read(>tcfa_bindcnt) > 0)
return -EPERM;
 
-   p->tcfa_refcnt--;
-   if (p->tcfa_bindcnt <= 0 && p->tcfa_refcnt <= 0) {
+   atomic_dec(>tcfa_refcnt);
+   if (atomic_read(>tcfa_bindcnt) == 0 &&
+   atomic_read(>tcfa_refcnt) == 0) {
if (p->ops->cleanup)
p->ops->cleanup(p, bind);
tcf_idr_remove(p->idrinfo, p);
@@ -245,8 +246,8 @@ bool tcf_idr_check(struct tc_action_net *tn, u32 index, 
struct tc_action **a,
 
if (index && p) {
if (bind)
-   p->tcfa_bindcnt++;
-   p->tcfa_refcnt++;
+   atomic_inc(>tcfa_bindcnt);
+   atomic_inc(>tcfa_refcnt);
*a = p;
return true;
}
@@ -274,9 +275,9 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, 
struct nlattr *est,
 
if (unlikely(!p))
return -ENOMEM;
-   p->tcfa_refcnt = 1;
+   atomic_set(>tcfa_refcnt, 1);
if (bind)
-   p->tcfa_bindcnt = 1;
+   atomic_set(>tcfa_bindcnt, 1);
 
if (cpustats) {
p->cpu_bstats = netdev_alloc_pcpu_stats(struct 
gnet_stats_basic_cpu);
@@ -727,7 +728,7 @@ static void cleanup_a(struct list_head *actions, int ovr)
return;
 
list_for_each_entry(a, actions, list)
-   a->tcfa_refcnt--;
+   atomic_dec(>tcfa_refcnt);
 }
 
 int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
@@ -751,7 +752,7 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, 
struct nlattr *nla,
}
act->order = i;
if (ovr)
-   act->tcfa_refcnt++;
+   atomic_inc(>tcfa_refcnt);
list_add_tail(>list, actions);
}
 
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index c0c707e..4ddf281 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -141,8 +141,8 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct 
tc_action *act,
struct tcf_bpf *prog = to_bpf(act);
struct tc_act_bpf opt = {
.index   = prog->tcf_index,
-   .refcnt  = prog->tcf_refcnt - ref,
-   .bindcnt = prog->tcf_bindcnt - bind,
+   .refcnt  = atomic_read(>tcf_refcnt) - ref,
+   .bindcnt = atomic_read(>tcf_bindcnt) - bind,
.action  = prog->tcf_action,
};
struct tcf_t tm;
diff --g

[patch net v2 2/4] net/sched: Use action array instead of action list as parameter

2017-10-16 Thread Chris Mi
When destroying filters, actions should be destroyed first.
The pointers of each action are saved in an array. TC doesn't
use the array directly, but puts all actions in a doubly linked
list and uses that list as a parameter.

There is no problem if each filter has its own actions. But if
some filters share the same action, when these filters are
destroyed, the RCU callback fl_destroy_filter() may be called at the
same time. That means the same action's 'struct list_head list'
could be manipulated at the same time. It may point to an invalid
address, so that the system will panic.

This patch uses the action array directly to fix this issue.

Fixes commit in pre-git era.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Chris Mi <chr...@mellanox.com>
Acked-by: Jamal Hadi Salim <j...@mojatatu.com>
---
 include/net/act_api.h |   7 ++--
 net/sched/act_api.c   | 103 +++---
 net/sched/cls_api.c   |  18 +++--
 3 files changed, 75 insertions(+), 53 deletions(-)

diff --git a/include/net/act_api.h b/include/net/act_api.h
index a469ab6..081a313 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -148,16 +148,17 @@ static inline int tcf_idr_release(struct tc_action *a, 
bool bind)
 int tcf_register_action(struct tc_action_ops *a, struct pernet_operations 
*ops);
 int tcf_unregister_action(struct tc_action_ops *a,
  struct pernet_operations *ops);
-int tcf_action_destroy(struct list_head *actions, int bind);
+int tcf_action_destroy(struct tc_action **actions, int nr, int bind);
 int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions,
int nr_actions, struct tcf_result *res);
 int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
struct nlattr *est, char *name, int ovr, int bind,
-   struct list_head *actions);
+   struct tc_action **actions, int *nr);
 struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
struct nlattr *nla, struct nlattr *est,
char *name, int ovr, int bind);
-int tcf_action_dump(struct sk_buff *skb, struct list_head *, int, int);
+int tcf_action_dump(struct sk_buff *skb, struct tc_action **actions, int nr,
+   int bind, int ref);
 int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int);
 int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int);
 int tcf_action_copy_stats(struct sk_buff *, struct tc_action *, int);
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 9c0224d..391d560 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -513,13 +513,15 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action 
**actions,
 }
 EXPORT_SYMBOL(tcf_action_exec);
 
-int tcf_action_destroy(struct list_head *actions, int bind)
+int tcf_action_destroy(struct tc_action **actions, int nr, int bind)
 {
const struct tc_action_ops *ops;
-   struct tc_action *a, *tmp;
+   struct tc_action *a;
int ret = 0;
+   int i;
 
-   list_for_each_entry_safe(a, tmp, actions, list) {
+   for (i = 0; i < nr; i++) {
+   a = actions[i];
ops = a->ops;
ret = __tcf_idr_release(a, bind, true);
if (ret == ACT_P_DELETED)
@@ -568,14 +570,16 @@ int tcf_action_destroy(struct list_head *actions, int 
bind)
 }
 EXPORT_SYMBOL(tcf_action_dump_1);
 
-int tcf_action_dump(struct sk_buff *skb, struct list_head *actions,
+int tcf_action_dump(struct sk_buff *skb, struct tc_action **actions, int nr,
int bind, int ref)
 {
struct tc_action *a;
-   int err = -EINVAL;
struct nlattr *nest;
+   int err = -EINVAL;
+   int i;
 
-   list_for_each_entry(a, actions, list) {
+   for (i = 0; i < nr; i++) {
+   a = actions[i];
nest = nla_nest_start(skb, a->order);
if (nest == NULL)
goto nla_put_failure;
@@ -700,10 +704,7 @@ struct tc_action *tcf_action_init_1(struct net *net, 
struct tcf_proto *tp,
if (TC_ACT_EXT_CMP(a->tcfa_action, TC_ACT_GOTO_CHAIN)) {
err = tcf_action_goto_chain_init(a, tp);
if (err) {
-   LIST_HEAD(actions);
-
-   list_add_tail(>list, );
-   tcf_action_destroy(, bind);
+   tcf_action_destroy(, 1, bind);
return ERR_PTR(err);
}
}
@@ -720,23 +721,27 @@ struct tc_action *tcf_action_init_1(struct net *net, 
struct tcf_proto *tp,
return ERR_PTR(err);
 }
 
-static void cleanup_a(struct list_head *actions, int ovr)
+static void cleanup_a(struct tc_action **actions, int nr, int ovr)
 {
struct tc_action *a;
+   int i;
 
if (!ovr)
return;

[patch net v2 0/4] net/sched: Fix a system panic when deleting filters

2017-10-16 Thread Chris Mi
If some filters share the same action, when deleting these filters,
it is possible to create a system panic. This is because deletions
could be manipulated by many RCU callbacks at the same time.

This patch set fixes these issues. To reproduce the issue run selftests
in patch 3 and 4. To test if the issue was fixed, apply patches 1 and 2
and then repeat the tests.

v2 changelog


Revise the comment and add Acked-by: Jamal Hadi Salim <j...@mojatatu.com>

Chris Mi (4):
  net/sched: Change tc_action refcnt and bindcnt to atomic
  net/sched: Use action array instead of action list as parameter
  selftests: Introduce a new script to generate tc batch file
  selftests: Introduce a new test case to tc testsuite

 include/net/act_api.h  |  11 +-
 net/sched/act_api.c| 124 +
 net/sched/act_bpf.c|   4 +-
 net/sched/act_connmark.c   |   4 +-
 net/sched/act_csum.c   |   4 +-
 net/sched/act_gact.c   |   4 +-
 net/sched/act_ife.c|   4 +-
 net/sched/act_ipt.c|   4 +-
 net/sched/act_mirred.c |   4 +-
 net/sched/act_nat.c|   4 +-
 net/sched/act_pedit.c  |   4 +-
 net/sched/act_police.c |   4 +-
 net/sched/act_sample.c |   4 +-
 net/sched/act_simple.c |   4 +-
 net/sched/act_skbedit.c|   4 +-
 net/sched/act_skbmod.c |   4 +-
 net/sched/act_tunnel_key.c |   4 +-
 net/sched/act_vlan.c   |   4 +-
 net/sched/cls_api.c|  18 +--
 .../tc-testing/tc-tests/filters/tests.json |  23 +++-
 tools/testing/selftests/tc-testing/tdc.py  |  20 +++-
 tools/testing/selftests/tc-testing/tdc_batch.py|  62 +++
 tools/testing/selftests/tc-testing/tdc_config.py   |   2 +
 23 files changed, 222 insertions(+), 102 deletions(-)
 create mode 100755 tools/testing/selftests/tc-testing/tdc_batch.py

-- 
1.8.3.1



[patch net v2 4/4] selftests: Introduce a new test case to tc testsuite

2017-10-16 Thread Chris Mi
In this patchset, we fixed a tc bug. This patch adds the test case
that reproduces the bug. To run this test case, user should specify
an existing NIC device:
  # sudo ./tdc.py -d enp4s0f0

This test case belongs to category "flower". If the user doesn't specify
a NIC device, the test cases belonging to "flower" will not be run.

In this test case, we create 1M filters and all filters share the same
action. When destroying all filters, kernel should not panic. It takes
about 18s to run it.

Signed-off-by: Chris Mi <chr...@mellanox.com>
Acked-by: Jamal Hadi Salim <j...@mojatatu.com>
---
 .../tc-testing/tc-tests/filters/tests.json | 23 +-
 tools/testing/selftests/tc-testing/tdc.py  | 20 +++
 tools/testing/selftests/tc-testing/tdc_config.py   |  2 ++
 3 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json 
b/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json
index c727b96..5fa02d8 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json
@@ -17,5 +17,26 @@
 "teardown": [
 "$TC qdisc del dev $DEV1 ingress"
 ]
+},
+{
+"id": "d052",
+"name": "Add 1M filters with the same action",
+"category": [
+"filter",
+"flower"
+],
+"setup": [
+"$TC qdisc add dev $DEV2 ingress",
+"./tdc_batch.py $DEV2 $BATCH_FILE --share_action -n 100"
+],
+"cmdUnderTest": "$TC -b $BATCH_FILE",
+"expExitCode": "0",
+"verifyCmd": "$TC actions list action gact",
+"matchPattern": "action order 0: gact action drop.*index 1 ref 100 
bind 100",
+"matchCount": "1",
+"teardown": [
+"$TC qdisc del dev $DEV2 ingress",
+"/bin/rm $BATCH_FILE"
+]
 }
-]
\ No newline at end of file
+]
diff --git a/tools/testing/selftests/tc-testing/tdc.py 
b/tools/testing/selftests/tc-testing/tdc.py
index cd61b78..5f11f5d 100755
--- a/tools/testing/selftests/tc-testing/tdc.py
+++ b/tools/testing/selftests/tc-testing/tdc.py
@@ -88,7 +88,7 @@ def prepare_env(cmdlist):
 exit(1)
 
 
-def test_runner(filtered_tests):
+def test_runner(filtered_tests, args):
 """
 Driver function for the unit tests.
 
@@ -105,6 +105,8 @@ def test_runner(filtered_tests):
 for tidx in testlist:
 result = True
 tresult = ""
+if "flower" in tidx["category"] and args.device == None:
+continue
 print("Test " + tidx["id"] + ": " + tidx["name"])
 prepare_env(tidx["setup"])
 (p, procout) = exec_cmd(tidx["cmdUnderTest"])
@@ -152,6 +154,10 @@ def ns_create():
 exec_cmd(cmd, False)
 cmd = 'ip -s $NS link set $DEV1 up'
 exec_cmd(cmd, False)
+cmd = 'ip link set $DEV2 netns $NS'
+exec_cmd(cmd, False)
+cmd = 'ip -s $NS link set $DEV2 up'
+exec_cmd(cmd, False)
 
 
 def ns_destroy():
@@ -211,7 +217,8 @@ def set_args(parser):
 help='Execute the single test case with specified ID')
 parser.add_argument('-i', '--id', action='store_true', dest='gen_id',
 help='Generate ID numbers for new test cases')
-return parser
+parser.add_argument('-d', '--device',
+help='Execute the test case in flower category')
 return parser
 
 
@@ -225,6 +232,8 @@ def check_default_settings(args):
 
 if args.path != None:
  NAMES['TC'] = args.path
+if args.device != None:
+ NAMES['DEV2'] = args.device
 if not os.path.isfile(NAMES['TC']):
 print("The specified tc path " + NAMES['TC'] + " does not exist.")
 exit(1)
@@ -381,14 +390,17 @@ def set_operation_mode(args):
 if (len(alltests) == 0):
 print("Cannot find a test case with ID matching " + target_id)
 exit(1)
-catresults = test_runner(alltests)
+catresults = test_runner(alltests, args)
 print("All test results: " + "\n\n" + catresults)
 elif (len(target_category) > 0):
+if (target_category == "flower") and args.device == None:
+print("Please specify a NIC device (-d) to run category flower")
+exit(1)
 if (target_category not in ucat):
 print("Specified category is not present in this file.")
  

[patch net 1/4] net/sched: Change tc_action refcnt and bindcnt to atomic

2017-10-16 Thread Chris Mi
If many filters share the same action, that action's refcnt and bindcnt
could be manipulated by many RCU callbacks at the same time. This patch
makes these operations atomic.

Fixes commit in pre-git era.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Chris Mi <chr...@mellanox.com>
---
 include/net/act_api.h  |  4 ++--
 net/sched/act_api.c| 21 +++--
 net/sched/act_bpf.c|  4 ++--
 net/sched/act_connmark.c   |  4 ++--
 net/sched/act_csum.c   |  4 ++--
 net/sched/act_gact.c   |  4 ++--
 net/sched/act_ife.c|  4 ++--
 net/sched/act_ipt.c|  4 ++--
 net/sched/act_mirred.c |  4 ++--
 net/sched/act_nat.c|  4 ++--
 net/sched/act_pedit.c  |  4 ++--
 net/sched/act_police.c |  4 ++--
 net/sched/act_sample.c |  4 ++--
 net/sched/act_simple.c |  4 ++--
 net/sched/act_skbedit.c|  4 ++--
 net/sched/act_skbmod.c |  4 ++--
 net/sched/act_tunnel_key.c |  4 ++--
 net/sched/act_vlan.c   |  4 ++--
 18 files changed, 45 insertions(+), 44 deletions(-)

diff --git a/include/net/act_api.h b/include/net/act_api.h
index b944e0eb..a469ab6 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -25,8 +25,8 @@ struct tc_action {
struct tcf_idrinfo  *idrinfo;
 
u32 tcfa_index;
-   int tcfa_refcnt;
-   int tcfa_bindcnt;
+   atomic_ttcfa_refcnt;
+   atomic_ttcfa_bindcnt;
u32 tcfa_capab;
int tcfa_action;
struct tcf_ttcfa_tm;
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index da6fa82..9c0224d 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -88,12 +88,13 @@ int __tcf_idr_release(struct tc_action *p, bool bind, bool 
strict)
 
if (p) {
if (bind)
-   p->tcfa_bindcnt--;
-   else if (strict && p->tcfa_bindcnt > 0)
+   atomic_dec(>tcfa_bindcnt);
+   else if (strict && atomic_read(>tcfa_bindcnt) > 0)
return -EPERM;
 
-   p->tcfa_refcnt--;
-   if (p->tcfa_bindcnt <= 0 && p->tcfa_refcnt <= 0) {
+   atomic_dec(>tcfa_refcnt);
+   if (atomic_read(>tcfa_bindcnt) == 0 &&
+   atomic_read(>tcfa_refcnt) == 0) {
if (p->ops->cleanup)
p->ops->cleanup(p, bind);
tcf_idr_remove(p->idrinfo, p);
@@ -245,8 +246,8 @@ bool tcf_idr_check(struct tc_action_net *tn, u32 index, 
struct tc_action **a,
 
if (index && p) {
if (bind)
-   p->tcfa_bindcnt++;
-   p->tcfa_refcnt++;
+   atomic_inc(>tcfa_bindcnt);
+   atomic_inc(>tcfa_refcnt);
*a = p;
return true;
}
@@ -274,9 +275,9 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, 
struct nlattr *est,
 
if (unlikely(!p))
return -ENOMEM;
-   p->tcfa_refcnt = 1;
+   atomic_set(>tcfa_refcnt, 1);
if (bind)
-   p->tcfa_bindcnt = 1;
+   atomic_set(>tcfa_bindcnt, 1);
 
if (cpustats) {
p->cpu_bstats = netdev_alloc_pcpu_stats(struct 
gnet_stats_basic_cpu);
@@ -727,7 +728,7 @@ static void cleanup_a(struct list_head *actions, int ovr)
return;
 
list_for_each_entry(a, actions, list)
-   a->tcfa_refcnt--;
+   atomic_dec(>tcfa_refcnt);
 }
 
 int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
@@ -751,7 +752,7 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, 
struct nlattr *nla,
}
act->order = i;
if (ovr)
-   act->tcfa_refcnt++;
+   atomic_inc(>tcfa_refcnt);
list_add_tail(>list, actions);
}
 
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index c0c707e..4ddf281 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -141,8 +141,8 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct 
tc_action *act,
struct tcf_bpf *prog = to_bpf(act);
struct tc_act_bpf opt = {
.index   = prog->tcf_index,
-   .refcnt  = prog->tcf_refcnt - ref,
-   .bindcnt = prog->tcf_bindcnt - bind,
+   .refcnt  = atomic_read(>tcf_refcnt) - ref,
+   .bindcnt = atomic_read(>tcf_bindcnt) - bind,
.action  = prog->tcf_action,
};
struct tcf_t tm;
diff --git a/net/sched/act_connmark.c b/net/sched/act_co

[patch net 3/4] selftests: Introduce a new script to generate tc batch file

2017-10-16 Thread Chris Mi
  # ./tdc_batch.py -h
  usage: tdc_batch.py [-h] [-n NUMBER] [-o] [-s] [-p] device file

  TC batch file generator

  positional arguments:
devicedevice name
file  batch file name

  optional arguments:
-h, --helpshow this help message and exit
-n NUMBER, --number NUMBER
  how many lines in batch file
-o, --skip_sw skip_sw (offload), by default skip_hw
-s, --share_actionall filters share the same action
-p, --prioall filters have different prio

Signed-off-by: Chris Mi <chr...@mellanox.com>
---
 tools/testing/selftests/tc-testing/tdc_batch.py | 62 +
 1 file changed, 62 insertions(+)
 create mode 100755 tools/testing/selftests/tc-testing/tdc_batch.py

diff --git a/tools/testing/selftests/tc-testing/tdc_batch.py 
b/tools/testing/selftests/tc-testing/tdc_batch.py
new file mode 100755
index 000..707c6bf
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tdc_batch.py
@@ -0,0 +1,62 @@
+#!/usr/bin/python3
+
+"""
+tdc_batch.py - a script to generate TC batch file
+
+Copyright (C) 2017 Chris Mi <chr...@mellanox.com>
+"""
+
+import argparse
+
+parser = argparse.ArgumentParser(description='TC batch file generator')
+parser.add_argument("device", help="device name")
+parser.add_argument("file", help="batch file name")
+parser.add_argument("-n", "--number", type=int,
+help="how many lines in batch file")
+parser.add_argument("-o", "--skip_sw",
+help="skip_sw (offload), by default skip_hw",
+action="store_true")
+parser.add_argument("-s", "--share_action",
+help="all filters share the same action",
+action="store_true")
+parser.add_argument("-p", "--prio",
+help="all filters have different prio",
+action="store_true")
+args = parser.parse_args()
+
+device = args.device
+file = open(args.file, 'w')
+
+number = 1
+if args.number:
+number = args.number
+
+skip = "skip_hw"
+if args.skip_sw:
+skip = "skip_sw"
+
+share_action = ""
+if args.share_action:
+share_action = "index 1"
+
+prio = "prio 1"
+if args.prio:
+prio = ""
+if number > 0x4000:
+number = 0x4000
+
+index = 0
+for i in range(0x100):
+for j in range(0x100):
+for k in range(0x100):
+mac = ("%02x:%02x:%02x" % (i, j, k))
+src_mac = "e4:11:00:" + mac
+dst_mac = "e4:12:00:" + mac
+cmd = ("filter add dev %s %s protocol ip parent : flower %s "
+   "src_mac %s dst_mac %s action drop %s" %
+   (device, prio, skip, src_mac, dst_mac, share_action))
+file.write("%s\n" % cmd)
+index += 1
+if index >= number:
+file.close()
+exit(0)
-- 
1.8.3.1



[patch net 0/4] net/sched: Fix a system panic when deleting filters

2017-10-16 Thread Chris Mi
If some filters share the same action, the system may panic when these
filters are deleted. This patchset fixes the issue. The test case that
found it is also integrated into the tc test suite of selftests.

Chris Mi (4):
  net/sched: Change tc_action refcnt and bindcnt to atomic
  net/sched: Use action array instead of action list as parameter
  selftests: Introduce a new script to generate tc batch file
  selftests: Introduce a new test case to tc testsuite

 include/net/act_api.h  |  11 +-
 net/sched/act_api.c| 124 +
 net/sched/act_bpf.c|   4 +-
 net/sched/act_connmark.c   |   4 +-
 net/sched/act_csum.c   |   4 +-
 net/sched/act_gact.c   |   4 +-
 net/sched/act_ife.c|   4 +-
 net/sched/act_ipt.c|   4 +-
 net/sched/act_mirred.c |   4 +-
 net/sched/act_nat.c|   4 +-
 net/sched/act_pedit.c  |   4 +-
 net/sched/act_police.c |   4 +-
 net/sched/act_sample.c |   4 +-
 net/sched/act_simple.c |   4 +-
 net/sched/act_skbedit.c|   4 +-
 net/sched/act_skbmod.c |   4 +-
 net/sched/act_tunnel_key.c |   4 +-
 net/sched/act_vlan.c   |   4 +-
 net/sched/cls_api.c|  18 +--
 .../tc-testing/tc-tests/filters/tests.json |  23 +++-
 tools/testing/selftests/tc-testing/tdc.py  |  20 +++-
 tools/testing/selftests/tc-testing/tdc_batch.py|  62 +++
 tools/testing/selftests/tc-testing/tdc_config.py   |   2 +
 23 files changed, 222 insertions(+), 102 deletions(-)
 create mode 100755 tools/testing/selftests/tc-testing/tdc_batch.py

-- 
1.8.3.1



[patch net 4/4] selftests: Introduce a new test case to tc testsuite

2017-10-16 Thread Chris Mi
In this patchset, we fixed a tc bug. This patch adds the test case
that reproduces the bug. To run this test case, user should specify
an existing NIC device:
  # sudo ./tdc.py -d enp4s0f0

This test case belongs to the category "flower". If the user doesn't
specify a NIC device, the test cases that belong to "flower" will not be run.

In this test case, we create 1M filters and all filters share the same
action. When all filters are destroyed, the kernel should not panic. The
test takes about 18s to run.

Signed-off-by: Chris Mi <chr...@mellanox.com>
---
 .../tc-testing/tc-tests/filters/tests.json | 23 +-
 tools/testing/selftests/tc-testing/tdc.py  | 20 +++
 tools/testing/selftests/tc-testing/tdc_config.py   |  2 ++
 3 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json 
b/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json
index c727b96..5fa02d8 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json
@@ -17,5 +17,26 @@
 "teardown": [
 "$TC qdisc del dev $DEV1 ingress"
 ]
+},
+{
+"id": "d052",
+"name": "Add 1M filters with the same action",
+"category": [
+"filter",
+"flower"
+],
+"setup": [
+"$TC qdisc add dev $DEV2 ingress",
+"./tdc_batch.py $DEV2 $BATCH_FILE --share_action -n 100"
+],
+"cmdUnderTest": "$TC -b $BATCH_FILE",
+"expExitCode": "0",
+"verifyCmd": "$TC actions list action gact",
+"matchPattern": "action order 0: gact action drop.*index 1 ref 100 
bind 100",
+"matchCount": "1",
+"teardown": [
+"$TC qdisc del dev $DEV2 ingress",
+"/bin/rm $BATCH_FILE"
+]
 }
-]
\ No newline at end of file
+]
diff --git a/tools/testing/selftests/tc-testing/tdc.py 
b/tools/testing/selftests/tc-testing/tdc.py
index cd61b78..5f11f5d 100755
--- a/tools/testing/selftests/tc-testing/tdc.py
+++ b/tools/testing/selftests/tc-testing/tdc.py
@@ -88,7 +88,7 @@ def prepare_env(cmdlist):
 exit(1)
 
 
-def test_runner(filtered_tests):
+def test_runner(filtered_tests, args):
 """
 Driver function for the unit tests.
 
@@ -105,6 +105,8 @@ def test_runner(filtered_tests):
 for tidx in testlist:
 result = True
 tresult = ""
+if "flower" in tidx["category"] and args.device == None:
+continue
 print("Test " + tidx["id"] + ": " + tidx["name"])
 prepare_env(tidx["setup"])
 (p, procout) = exec_cmd(tidx["cmdUnderTest"])
@@ -152,6 +154,10 @@ def ns_create():
 exec_cmd(cmd, False)
 cmd = 'ip -s $NS link set $DEV1 up'
 exec_cmd(cmd, False)
+cmd = 'ip link set $DEV2 netns $NS'
+exec_cmd(cmd, False)
+cmd = 'ip -s $NS link set $DEV2 up'
+exec_cmd(cmd, False)
 
 
 def ns_destroy():
@@ -211,7 +217,8 @@ def set_args(parser):
 help='Execute the single test case with specified ID')
 parser.add_argument('-i', '--id', action='store_true', dest='gen_id',
 help='Generate ID numbers for new test cases')
-return parser
+parser.add_argument('-d', '--device',
+help='Execute the test case in flower category')
 return parser
 
 
@@ -225,6 +232,8 @@ def check_default_settings(args):
 
 if args.path != None:
  NAMES['TC'] = args.path
+if args.device != None:
+ NAMES['DEV2'] = args.device
 if not os.path.isfile(NAMES['TC']):
 print("The specified tc path " + NAMES['TC'] + " does not exist.")
 exit(1)
@@ -381,14 +390,17 @@ def set_operation_mode(args):
 if (len(alltests) == 0):
 print("Cannot find a test case with ID matching " + target_id)
 exit(1)
-catresults = test_runner(alltests)
+catresults = test_runner(alltests, args)
 print("All test results: " + "\n\n" + catresults)
 elif (len(target_category) > 0):
+if (target_category == "flower") and args.device == None:
+print("Please specify a NIC device (-d) to run category flower")
+exit(1)
 if (target_category not in ucat):
 print("Specified category is not present in this file.")
 exit(1)
 else:
-catresults = te

[patch net 2/4] net/sched: Use action array instead of action list as parameter

2017-10-16 Thread Chris Mi
When destroying filters, actions should be destroyed first.
A pointer to each action is saved in an array. TC doesn't
use the array directly, but puts all actions in a doubly linked
list and uses that list as the parameter.

There is no problem if each filter has its own actions. But if
some filters share the same action, when these filters are
destroyed, the RCU callback fl_destroy_filter() may be called for
them at the same time. That means the same action's
'struct list_head list' could be manipulated concurrently. It may then
point to an invalid address, causing the system to panic.

This patch uses the action array directly to fix this issue.

Fixes commit in pre-git era.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Chris Mi <chr...@mellanox.com>
---
 include/net/act_api.h |   7 ++--
 net/sched/act_api.c   | 103 +++---
 net/sched/cls_api.c   |  18 +++--
 3 files changed, 75 insertions(+), 53 deletions(-)

diff --git a/include/net/act_api.h b/include/net/act_api.h
index a469ab6..081a313 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -148,16 +148,17 @@ static inline int tcf_idr_release(struct tc_action *a, 
bool bind)
 int tcf_register_action(struct tc_action_ops *a, struct pernet_operations 
*ops);
 int tcf_unregister_action(struct tc_action_ops *a,
  struct pernet_operations *ops);
-int tcf_action_destroy(struct list_head *actions, int bind);
+int tcf_action_destroy(struct tc_action **actions, int nr, int bind);
 int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions,
int nr_actions, struct tcf_result *res);
 int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
struct nlattr *est, char *name, int ovr, int bind,
-   struct list_head *actions);
+   struct tc_action **actions, int *nr);
 struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
struct nlattr *nla, struct nlattr *est,
char *name, int ovr, int bind);
-int tcf_action_dump(struct sk_buff *skb, struct list_head *, int, int);
+int tcf_action_dump(struct sk_buff *skb, struct tc_action **actions, int nr,
+   int bind, int ref);
 int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int);
 int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int);
 int tcf_action_copy_stats(struct sk_buff *, struct tc_action *, int);
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 9c0224d..391d560 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -513,13 +513,15 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action 
**actions,
 }
 EXPORT_SYMBOL(tcf_action_exec);
 
-int tcf_action_destroy(struct list_head *actions, int bind)
+int tcf_action_destroy(struct tc_action **actions, int nr, int bind)
 {
const struct tc_action_ops *ops;
-   struct tc_action *a, *tmp;
+   struct tc_action *a;
int ret = 0;
+   int i;
 
-   list_for_each_entry_safe(a, tmp, actions, list) {
+   for (i = 0; i < nr; i++) {
+   a = actions[i];
ops = a->ops;
ret = __tcf_idr_release(a, bind, true);
if (ret == ACT_P_DELETED)
@@ -568,14 +570,16 @@ int tcf_action_destroy(struct list_head *actions, int 
bind)
 }
 EXPORT_SYMBOL(tcf_action_dump_1);
 
-int tcf_action_dump(struct sk_buff *skb, struct list_head *actions,
+int tcf_action_dump(struct sk_buff *skb, struct tc_action **actions, int nr,
int bind, int ref)
 {
struct tc_action *a;
-   int err = -EINVAL;
struct nlattr *nest;
+   int err = -EINVAL;
+   int i;
 
-   list_for_each_entry(a, actions, list) {
+   for (i = 0; i < nr; i++) {
+   a = actions[i];
nest = nla_nest_start(skb, a->order);
if (nest == NULL)
goto nla_put_failure;
@@ -700,10 +704,7 @@ struct tc_action *tcf_action_init_1(struct net *net, 
struct tcf_proto *tp,
if (TC_ACT_EXT_CMP(a->tcfa_action, TC_ACT_GOTO_CHAIN)) {
err = tcf_action_goto_chain_init(a, tp);
if (err) {
-   LIST_HEAD(actions);
-
-   list_add_tail(>list, );
-   tcf_action_destroy(, bind);
+   tcf_action_destroy(, 1, bind);
return ERR_PTR(err);
}
}
@@ -720,23 +721,27 @@ struct tc_action *tcf_action_init_1(struct net *net, 
struct tcf_proto *tp,
return ERR_PTR(err);
 }
 
-static void cleanup_a(struct list_head *actions, int ovr)
+static void cleanup_a(struct tc_action **actions, int nr, int ovr)
 {
struct tc_action *a;
+   int i;
 
if (!ovr)
return;
 
-   list_for_each_entry(a, actions, l

RE: Extended IDR API

2017-09-11 Thread Chris Mi
This improvement is good.  But I have a concern that
the parameters of idr_alloc and idr_alloc_ul are different.
I mean, idr_alloc takes start and end, and our new API
keeps them. Our design goal was to make the two consistent.
Your new API surely has its advantages.
If you want to change it, I personally don't object.
 
> -Original Message-
> From: Matthew Wilcox [mailto:wi...@infradead.org]
> Sent: Tuesday, September 12, 2017 5:14 AM
> To: Chris Mi <chr...@mellanox.com>
> Cc: Jiri Pirko <j...@mellanox.com>; David S. Miller <da...@davemloft.net>;
> Tejun Heo <t...@kernel.org>; linux-ker...@vger.kernel.org;
> netdev@vger.kernel.org; Rehas Sachdeva <aquan...@gmail.com>
> Subject: Extended IDR API
> 
> 
> I really don't like your new API.  I wish you'd discussed it before merging 
> it.
> Here's my redesign.  Does anybody have a suggestion for improvement?
> 
> We have a lovely new test-suite for the IDR (in tools/testing/radix-tree) ...
> when adding a new API, it's polite to update the test-suite too.
> Do you have any plans to add test cases?
OK, we will add it once these APIs are stabilized.

Thanks,
Chris
> 
> (Compile tested only; I'm at a conference.  Also, I didn't check the kerneldoc
> because I don't have Sphinx installed on my laptop.)
> 
> From ff45b2a6806cd0e4177c5a10f26c97999164c10c Mon Sep 17 00:00:00 2001
> From: Matthew Wilcox <mawil...@microsoft.com>
> Date: Mon, 11 Sep 2017 16:16:29 -0400
> Subject: [PATCH] idr: Rewrite extended IDR API
> 
>  - Rename the API to be 'ul' for unsigned long instead of 'ext'.  This
>fits better with other usage in the Linux kernel.
>  - idr_alloc() moves back to being a function instead of inline
>  - idr_alloc_ul() takes 'nextid' as an in-out parameter like idr_get_next(),
>instead of having 'index' as an out-only parameter.
>  - idr_alloc_ul() needs a __must_check to ensure that users don't look at
>the result without checking whether the function succeeded.
>  - idr_alloc_ul() takes 'max' rather than 'end', or it is impossible to
>allocate the ULONG_MAX id.
>  - idr_replace() can simply take an unsigned long parameter instead of
>an int.
>  - idr_remove() and idr_find() are the same as idr_replace().
>  - We don't need separate idr_get_free() and idr_get_free_ext().
>  - Add kerneldoc for idr_alloc_ul().
> 
> Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
> ---
>  include/linux/idr.h| 75 +--
>  include/linux/radix-tree.h | 17 +
>  lib/idr.c  | 88 
> +-
>  lib/radix-tree.c   |  2 +-
>  net/sched/act_api.c| 22 ++--
>  net/sched/cls_flower.c | 16 +
>  6 files changed, 95 insertions(+), 125 deletions(-)
> 
> diff --git a/include/linux/idr.h b/include/linux/idr.h index
> 7c3a365f7e12..90faf8279559 100644
> --- a/include/linux/idr.h
> +++ b/include/linux/idr.h
> @@ -81,74 +81,22 @@ static inline void idr_set_cursor(struct idr *idr,
> unsigned int val)
> 
>  void idr_preload(gfp_t gfp_mask);
> 
> -int idr_alloc_cmn(struct idr *idr, void *ptr, unsigned long *index,
> -   unsigned long start, unsigned long end, gfp_t gfp,
> -   bool ext);
> -
> -/**
> - * idr_alloc - allocate an id
> - * @idr: idr handle
> - * @ptr: pointer to be associated with the new id
> - * @start: the minimum id (inclusive)
> - * @end: the maximum id (exclusive)
> - * @gfp: memory allocation flags
> - *
> - * Allocates an unused ID in the range [start, end).  Returns -ENOSPC
> - * if there are no unused IDs in that range.
> - *
> - * Note that @end is treated as max when <= 0.  This is to always allow
> - * using @start + N as @end as long as N is inside integer range.
> - *
> - * Simultaneous modifications to the @idr are not allowed and should be
> - * prevented by the user, usually with a lock.  idr_alloc() may be called
> - * concurrently with read-only accesses to the @idr, such as idr_find() and
> - * idr_for_each_entry().
> - */
> -static inline int idr_alloc(struct idr *idr, void *ptr,
> - int start, int end, gfp_t gfp)
> -{
> - unsigned long id;
> - int ret;
> -
> - if (WARN_ON_ONCE(start < 0))
> - return -EINVAL;
> -
> - ret = idr_alloc_cmn(idr, ptr, , start, end, gfp, false);
> -
> - if (ret)
> - return ret;
> -
> - return id;
> -}
> -
> -static inline int idr_alloc_ext(struct idr *idr, void *ptr,
> - unsigned long *index,
> - unsigned long start,
> - unsigned 

RE: [patch net-next v2 0/3] net/sched: Improve getting objects by indexes

2017-08-30 Thread Chris Mi


> -Original Message-
> From: David Miller [mailto:da...@davemloft.net]
> Sent: Thursday, August 31, 2017 5:39 AM
> To: Chris Mi <chr...@mellanox.com>
> Cc: netdev@vger.kernel.org; j...@mojatatu.com;
> xiyou.wangc...@gmail.com; j...@resnulli.us; mawil...@microsoft.com
> Subject: Re: [patch net-next v2 0/3] net/sched: Improve getting objects by
> indexes
> 
> From: Chris Mi <chr...@mellanox.com>
> Date: Wed, 30 Aug 2017 02:31:56 -0400
> 
> > Using current TC code, it is very slow to insert a lot of rules.
> >
> > In order to improve the rules update rate in TC, we introduced the
> > following two changes:
> > 1) changed cls_flower to use IDR to manage the filters.
> > 2) changed all act_xxx modules to use IDR instead of
> >a small hash table
> >
> > But IDR has a limitation that it uses int. TC handle uses u32.
> > To make sure there is no regression, we add several new IDR APIs to
> > support unsigned long.
> >
> > v2
> > ==
> >
> > Addressed Hannes's comment:
> > express idr_alloc in terms of idr_alloc_ext and most of the other
> > functions
> 
> Series applied, thanks.

Thank you, David,

-Chris


RE: [patch net-next v2 3/3] net/sched: Change act_api and act_xxx modules to use IDR

2017-08-30 Thread Chris Mi


> -Original Message-
> From: Jamal Hadi Salim [mailto:j...@mojatatu.com]
> Sent: Wednesday, August 30, 2017 8:11 PM
> To: Chris Mi <chr...@mellanox.com>; netdev@vger.kernel.org
> Cc: xiyou.wangc...@gmail.com; j...@resnulli.us; da...@davemloft.net;
> mawil...@microsoft.com
> Subject: Re: [patch net-next v2 3/3] net/sched: Change act_api and act_xxx
> modules to use IDR
> 
> On 17-08-30 02:31 AM, Chris Mi wrote:
> > Typically, each TC filter has its own action. All the actions of the
> > same type are saved in its hash table. But the hash buckets are too
> > small that it degrades to a list. And the performance is greatly
> > affected. For example, it takes about 0m11.914s to insert 64K rules.
> > If we convert the hash table to IDR, it only takes about 0m1.500s.
> > The improvement is huge.
> >
> > But please note that the test result is based on previous patch that
> > cls_flower uses IDR.
> >
> > Signed-off-by: Chris Mi <chr...@mellanox.com>
> > Signed-off-by: Jiri Pirko <j...@mellanox.com>
> 
> Acked-by: Jamal Hadi Salim <j...@mojatatu.com>
> 
> Also already acked this before but you left it out in this version. If you 
> make
> changes to the patch then you will need a new ACK.
Sorry about that, Jamal. I think I need to make a note of the review comment
in case I forget it.
> 
> Don't forget to update selftests please.
Sure, we will work on that.

Thanks,
Chris
> 
> cheers,
> jamal


[patch net-next v2 0/3] net/sched: Improve getting objects by indexes

2017-08-30 Thread Chris Mi
Using current TC code, it is very slow to insert a lot of rules.

In order to improve the rules update rate in TC,
we introduced the following two changes:
1) changed cls_flower to use IDR to manage the filters.
2) changed all act_xxx modules to use IDR instead of
   a small hash table

But IDR has a limitation that it uses int. TC handle uses u32.
To make sure there is no regression, we add several new IDR APIs
to support unsigned long.

v2
==

Addressed Hannes's comment:
express idr_alloc in terms of idr_alloc_ext and most of the other functions

Chris Mi (3):
  idr: Add new APIs to support unsigned long
  net/sched: Change cls_flower to use IDR
  net/sched: Change act_api and act_xxx modules to use IDR

 include/linux/idr.h|  69 -
 include/linux/radix-tree.h |  21 +++-
 include/net/act_api.h  |  76 +-
 lib/idr.c  |  66 ++--
 lib/radix-tree.c   |   6 +-
 net/sched/act_api.c| 251 ++---
 net/sched/act_bpf.c|  17 ++-
 net/sched/act_connmark.c   |  16 ++-
 net/sched/act_csum.c   |  16 ++-
 net/sched/act_gact.c   |  16 ++-
 net/sched/act_ife.c|  20 ++--
 net/sched/act_ipt.c|  26 +++--
 net/sched/act_mirred.c |  19 ++--
 net/sched/act_nat.c|  16 ++-
 net/sched/act_pedit.c  |  18 ++--
 net/sched/act_police.c |  18 ++--
 net/sched/act_sample.c |  17 ++-
 net/sched/act_simple.c |  20 ++--
 net/sched/act_skbedit.c|  18 ++--
 net/sched/act_skbmod.c |  18 ++--
 net/sched/act_tunnel_key.c |  20 ++--
 net/sched/act_vlan.c   |  22 ++--
 net/sched/cls_flower.c |  55 +-
 23 files changed, 427 insertions(+), 414 deletions(-)

-- 
1.8.3.1



[patch net-next v2 2/3] net/sched: Change cls_flower to use IDR

2017-08-30 Thread Chris Mi
Currently, all filters with the same priority are linked in a doubly
linked list. Every filter should have a unique handle. To make the
handle unique, we need to iterate the list every time to see if the
handle exists or not when inserting a new filter. It is time-consuming.
For example, it takes about 5m3.169s to insert 64K rules.

This patch changes cls_flower to use IDR. With this patch, it
takes about 0m1.127s to insert 64K rules. The improvement is huge.

But please note that in this testing, all filters share the same action.
If every filter has a unique action, that is another bottleneck.
A follow-up patch in this patchset addresses that.

Signed-off-by: Chris Mi <chr...@mellanox.com>
Signed-off-by: Jiri Pirko <j...@mellanox.com>
---
 net/sched/cls_flower.c | 55 +-
 1 file changed, 23 insertions(+), 32 deletions(-)

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index bd9dab4..3d041d2 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -68,7 +68,6 @@ struct cls_fl_head {
struct rhashtable ht;
struct fl_flow_mask mask;
struct flow_dissector dissector;
-   u32 hgen;
bool mask_assigned;
struct list_head filters;
struct rhashtable_params ht_params;
@@ -76,6 +75,7 @@ struct cls_fl_head {
struct work_struct work;
struct rcu_head rcu;
};
+   struct idr handle_idr;
 };
 
 struct cls_fl_filter {
@@ -210,6 +210,7 @@ static int fl_init(struct tcf_proto *tp)
 
INIT_LIST_HEAD_RCU(>filters);
rcu_assign_pointer(tp->root, head);
+   idr_init(>handle_idr);
 
return 0;
 }
@@ -295,6 +296,9 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct 
cls_fl_filter *f)
 
 static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f)
 {
+   struct cls_fl_head *head = rtnl_dereference(tp->root);
+
+   idr_remove_ext(>handle_idr, f->handle);
list_del_rcu(>list);
if (!tc_skip_hw(f->flags))
fl_hw_destroy_filter(tp, f);
@@ -327,6 +331,7 @@ static void fl_destroy(struct tcf_proto *tp)
 
list_for_each_entry_safe(f, next, >filters, list)
__fl_delete(tp, f);
+   idr_destroy(>handle_idr);
 
__module_get(THIS_MODULE);
call_rcu(>rcu, fl_destroy_rcu);
@@ -335,12 +340,8 @@ static void fl_destroy(struct tcf_proto *tp)
 static void *fl_get(struct tcf_proto *tp, u32 handle)
 {
struct cls_fl_head *head = rtnl_dereference(tp->root);
-   struct cls_fl_filter *f;
 
-   list_for_each_entry(f, >filters, list)
-   if (f->handle == handle)
-   return f;
-   return NULL;
+   return idr_find_ext(>handle_idr, handle);
 }
 
 static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
@@ -859,27 +860,6 @@ static int fl_set_parms(struct net *net, struct tcf_proto 
*tp,
return 0;
 }
 
-static u32 fl_grab_new_handle(struct tcf_proto *tp,
- struct cls_fl_head *head)
-{
-   unsigned int i = 0x8000;
-   u32 handle;
-
-   do {
-   if (++head->hgen == 0x7FFF)
-   head->hgen = 1;
-   } while (--i > 0 && fl_get(tp, head->hgen));
-
-   if (unlikely(i == 0)) {
-   pr_err("Insufficient number of handles\n");
-   handle = 0;
-   } else {
-   handle = head->hgen;
-   }
-
-   return handle;
-}
-
 static int fl_change(struct net *net, struct sk_buff *in_skb,
 struct tcf_proto *tp, unsigned long base,
 u32 handle, struct nlattr **tca,
@@ -890,6 +870,7 @@ static int fl_change(struct net *net, struct sk_buff 
*in_skb,
struct cls_fl_filter *fnew;
struct nlattr **tb;
struct fl_flow_mask mask = {};
+   unsigned long idr_index;
int err;
 
if (!tca[TCA_OPTIONS])
@@ -920,13 +901,21 @@ static int fl_change(struct net *net, struct sk_buff 
*in_skb,
goto errout;
 
if (!handle) {
-   handle = fl_grab_new_handle(tp, head);
-   if (!handle) {
-   err = -EINVAL;
+   err = idr_alloc_ext(>handle_idr, fnew, _index,
+   1, 0x8000, GFP_KERNEL);
+   if (err)
goto errout;
-   }
+   fnew->handle = idr_index;
+   }
+
+   /* user specifies a handle and it doesn't exist */
+   if (handle && !fold) {
+   err = idr_alloc_ext(>handle_idr, fnew, _index,
+   handle, handle + 1, GFP_KERNEL);
+   if (err)
+   goto errout;
+   fnew->handle = idr_index;
}
-   fnew->handle = handle;
 
if (tb[TCA_FLOWER_FLAGS]) {

[patch net-next v2 1/3] idr: Add new APIs to support unsigned long

2017-08-30 Thread Chris Mi
The following new APIs are added:

int idr_alloc_ext(struct idr *idr, void *ptr, unsigned long *index,
  unsigned long start, unsigned long end, gfp_t gfp);
void *idr_remove_ext(struct idr *idr, unsigned long id);
void *idr_find_ext(const struct idr *idr, unsigned long id);
void *idr_replace_ext(struct idr *idr, void *ptr, unsigned long id);
void *idr_get_next_ext(struct idr *idr, unsigned long *nextid);

Signed-off-by: Chris Mi <chr...@mellanox.com>
Signed-off-by: Jiri Pirko <j...@mellanox.com>
---
 include/linux/idr.h| 69 --
 include/linux/radix-tree.h | 21 --
 lib/idr.c  | 66 +---
 lib/radix-tree.c   |  6 ++--
 4 files changed, 125 insertions(+), 37 deletions(-)

diff --git a/include/linux/idr.h b/include/linux/idr.h
index bf70b3e..7c3a365 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -80,19 +80,75 @@ static inline void idr_set_cursor(struct idr *idr, unsigned 
int val)
  */
 
 void idr_preload(gfp_t gfp_mask);
-int idr_alloc(struct idr *, void *entry, int start, int end, gfp_t);
+
+int idr_alloc_cmn(struct idr *idr, void *ptr, unsigned long *index,
+ unsigned long start, unsigned long end, gfp_t gfp,
+ bool ext);
+
+/**
+ * idr_alloc - allocate an id
+ * @idr: idr handle
+ * @ptr: pointer to be associated with the new id
+ * @start: the minimum id (inclusive)
+ * @end: the maximum id (exclusive)
+ * @gfp: memory allocation flags
+ *
+ * Allocates an unused ID in the range [start, end).  Returns -ENOSPC
+ * if there are no unused IDs in that range.
+ *
+ * Note that @end is treated as max when <= 0.  This is to always allow
+ * using @start + N as @end as long as N is inside integer range.
+ *
+ * Simultaneous modifications to the @idr are not allowed and should be
+ * prevented by the user, usually with a lock.  idr_alloc() may be called
+ * concurrently with read-only accesses to the @idr, such as idr_find() and
+ * idr_for_each_entry().
+ */
+static inline int idr_alloc(struct idr *idr, void *ptr,
+   int start, int end, gfp_t gfp)
+{
+   unsigned long id;
+   int ret;
+
+   if (WARN_ON_ONCE(start < 0))
+   return -EINVAL;
+
+   ret = idr_alloc_cmn(idr, ptr, , start, end, gfp, false);
+
+   if (ret)
+   return ret;
+
+   return id;
+}
+
+static inline int idr_alloc_ext(struct idr *idr, void *ptr,
+   unsigned long *index,
+   unsigned long start,
+   unsigned long end,
+   gfp_t gfp)
+{
+   return idr_alloc_cmn(idr, ptr, index, start, end, gfp, true);
+}
+
 int idr_alloc_cyclic(struct idr *, void *entry, int start, int end, gfp_t);
 int idr_for_each(const struct idr *,
 int (*fn)(int id, void *p, void *data), void *data);
 void *idr_get_next(struct idr *, int *nextid);
+void *idr_get_next_ext(struct idr *idr, unsigned long *nextid);
 void *idr_replace(struct idr *, void *, int id);
+void *idr_replace_ext(struct idr *idr, void *ptr, unsigned long id);
 void idr_destroy(struct idr *);
 
-static inline void *idr_remove(struct idr *idr, int id)
+static inline void *idr_remove_ext(struct idr *idr, unsigned long id)
 {
return radix_tree_delete_item(>idr_rt, id, NULL);
 }
 
+static inline void *idr_remove(struct idr *idr, int id)
+{
+   return idr_remove_ext(idr, id);
+}
+
 static inline void idr_init(struct idr *idr)
 {
INIT_RADIX_TREE(>idr_rt, IDR_RT_MARKER);
@@ -128,11 +184,16 @@ static inline void idr_preload_end(void)
  * This function can be called under rcu_read_lock(), given that the leaf
  * pointers lifetimes are correctly managed.
  */
-static inline void *idr_find(const struct idr *idr, int id)
+static inline void *idr_find_ext(const struct idr *idr, unsigned long id)
 {
return radix_tree_lookup(>idr_rt, id);
 }
 
+static inline void *idr_find(const struct idr *idr, int id)
+{
+   return idr_find_ext(idr, id);
+}
+
 /**
  * idr_for_each_entry - iterate over an idr's elements of a given type
  * @idr: idr handle
@@ -145,6 +206,8 @@ static inline void *idr_find(const struct idr *idr, int id)
  */
 #define idr_for_each_entry(idr, entry, id) \
for (id = 0; ((entry) = idr_get_next(idr, &(id))) != NULL; ++id)
+#define idr_for_each_entry_ext(idr, entry, id) \
+   for (id = 0; ((entry) = idr_get_next_ext(idr, &(id))) != NULL; ++id)
 
 /**
  * idr_for_each_entry_continue - continue iteration over an idr's elements of 
a given type
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 3e57350..567ebb5 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -357,8 +357,25 @@ int radix_tree_split(struct radix_tree_root *, unsigned 
long index,
  

[patch net-next v2 3/3] net/sched: Change act_api and act_xxx modules to use IDR

2017-08-30 Thread Chris Mi
Typically, each TC filter has its own action. All the actions of the
same type are saved in its hash table. But the hash table is so
small that it degrades to a list, and the performance is greatly
affected. For example, it takes about 0m11.914s to insert 64K rules.
If we convert the hash table to IDR, it only takes about 0m1.500s.
The improvement is huge.

But please note that the test result is based on previous patch that
cls_flower uses IDR.

Signed-off-by: Chris Mi <chr...@mellanox.com>
Signed-off-by: Jiri Pirko <j...@mellanox.com>
---
 include/net/act_api.h  |  76 +-
 net/sched/act_api.c| 251 ++---
 net/sched/act_bpf.c|  17 ++-
 net/sched/act_connmark.c   |  16 ++-
 net/sched/act_csum.c   |  16 ++-
 net/sched/act_gact.c   |  16 ++-
 net/sched/act_ife.c|  20 ++--
 net/sched/act_ipt.c|  26 +++--
 net/sched/act_mirred.c |  19 ++--
 net/sched/act_nat.c|  16 ++-
 net/sched/act_pedit.c  |  18 ++--
 net/sched/act_police.c |  18 ++--
 net/sched/act_sample.c |  17 ++-
 net/sched/act_simple.c |  20 ++--
 net/sched/act_skbedit.c|  18 ++--
 net/sched/act_skbmod.c |  18 ++--
 net/sched/act_tunnel_key.c |  20 ++--
 net/sched/act_vlan.c   |  22 ++--
 18 files changed, 279 insertions(+), 345 deletions(-)

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 26ffd83..8f3d5d8 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -10,12 +10,9 @@
 #include 
 #include 
 
-
-struct tcf_hashinfo {
-   struct hlist_head   *htab;
-   unsigned inthmask;
-   spinlock_t  lock;
-   u32 index;
+struct tcf_idrinfo {
+   spinlock_t  lock;
+   struct idr  action_idr;
 };
 
 struct tc_action_ops;
@@ -25,9 +22,8 @@ struct tc_action {
__u32   type; /* for backward 
compat(TCA_OLD_COMPAT) */
__u32   order;
struct list_headlist;
-   struct tcf_hashinfo *hinfo;
+   struct tcf_idrinfo  *idrinfo;
 
-   struct hlist_node   tcfa_head;
u32 tcfa_index;
int tcfa_refcnt;
int tcfa_bindcnt;
@@ -44,7 +40,6 @@ struct tc_action {
struct tc_cookie*act_cookie;
struct tcf_chain*goto_chain;
 };
-#define tcf_head   common.tcfa_head
 #define tcf_index  common.tcfa_index
 #define tcf_refcnt common.tcfa_refcnt
 #define tcf_bindcntcommon.tcfa_bindcnt
@@ -57,27 +52,6 @@ struct tc_action {
 #define tcf_lock   common.tcfa_lock
 #define tcf_rcucommon.tcfa_rcu
 
-static inline unsigned int tcf_hash(u32 index, unsigned int hmask)
-{
-   return index & hmask;
-}
-
-static inline int tcf_hashinfo_init(struct tcf_hashinfo *hf, unsigned int mask)
-{
-   int i;
-
-   spin_lock_init(>lock);
-   hf->index = 0;
-   hf->hmask = mask;
-   hf->htab = kzalloc((mask + 1) * sizeof(struct hlist_head),
-  GFP_KERNEL);
-   if (!hf->htab)
-   return -ENOMEM;
-   for (i = 0; i < mask + 1; i++)
-   INIT_HLIST_HEAD(>htab[i]);
-   return 0;
-}
-
 /* Update lastuse only if needed, to avoid dirtying a cache line.
  * We use a temp variable to avoid fetching jiffies twice.
  */
@@ -126,53 +100,51 @@ struct tc_action_ops {
 };
 
 struct tc_action_net {
-   struct tcf_hashinfo *hinfo;
+   struct tcf_idrinfo *idrinfo;
const struct tc_action_ops *ops;
 };
 
 static inline
 int tc_action_net_init(struct tc_action_net *tn,
-  const struct tc_action_ops *ops, unsigned int mask)
+  const struct tc_action_ops *ops)
 {
int err = 0;
 
-   tn->hinfo = kmalloc(sizeof(*tn->hinfo), GFP_KERNEL);
-   if (!tn->hinfo)
+   tn->idrinfo = kmalloc(sizeof(*tn->idrinfo), GFP_KERNEL);
+   if (!tn->idrinfo)
return -ENOMEM;
tn->ops = ops;
-   err = tcf_hashinfo_init(tn->hinfo, mask);
-   if (err)
-   kfree(tn->hinfo);
+   spin_lock_init(>idrinfo->lock);
+   idr_init(>idrinfo->action_idr);
return err;
 }
 
-void tcf_hashinfo_destroy(const struct tc_action_ops *ops,
- struct tcf_hashinfo *hinfo);
+void tcf_idrinfo_destroy(const struct tc_action_ops *ops,
+struct tcf_idrinfo *idrinfo);
 
 static inline void tc_action_net_exit(struct tc_action_net *tn)
 {
-   tcf_hashinfo_destroy(tn->ops, tn->hinfo);
-   kfree(tn->hinfo);
+   tcf_idrinfo_destroy(tn->ops, tn->idrinfo);
+   kfree(tn->idrinfo);
 }
 
 int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb,
   struct

RE: [patch net-next 1/3] idr: Add new APIs to support unsigned long

2017-08-29 Thread Chris Mi


> -Original Message-
> From: Jiri Pirko [mailto:j...@resnulli.us]
> Sent: Tuesday, August 29, 2017 3:57 PM
> To: Chris Mi <chr...@mellanox.com>
> Cc: Hannes Frederic Sowa <han...@stressinduktion.org>;
> netdev@vger.kernel.org; j...@mojatatu.com; xiyou.wangc...@gmail.com;
> da...@davemloft.net; mawil...@microsoft.com
> Subject: Re: [patch net-next 1/3] idr: Add new APIs to support unsigned long
> 
> Tue, Aug 29, 2017 at 09:34:47AM CEST, chr...@mellanox.com wrote:
> >Hi,
> >
> >> -Original Message-
> >> From: Hannes Frederic Sowa [mailto:han...@stressinduktion.org]
> >> Sent: Tuesday, August 29, 2017 3:14 PM
> >> To: Chris Mi <chr...@mellanox.com>
> >> Cc: netdev@vger.kernel.org; j...@mojatatu.com;
> >> xiyou.wangc...@gmail.com; j...@resnulli.us; da...@davemloft.net;
> >> mawil...@microsoft.com
> >> Subject: Re: [patch net-next 1/3] idr: Add new APIs to support
> >> unsigned long
> >>
> >> Hello,
> >>
> >> Chris Mi <chr...@mellanox.com> writes:
> >>
> >> > The following new APIs are added:
> >> >
> >> > int idr_alloc_ext(struct idr *idr, void *ptr, unsigned long *index,
> >> >   unsigned long start, unsigned long end, gfp_t
> >> > gfp); static inline void *idr_remove_ext(struct idr *idr, unsigned
> >> > long id); static inline void *idr_find_ext(const struct idr *idr,
> >> > unsigned long id); void *idr_replace_ext(struct idr *idr, void
> >> > *ptr, unsigned long id); void *idr_get_next_ext(struct idr *idr,
> >> > unsigned long *nextid);
> >> >
> >> > Signed-off-by: Chris Mi <chr...@mellanox.com>
> >> > Signed-off-by: Jiri Pirko <j...@mellanox.com>
> >> > ---
> >> >  include/linux/idr.h| 16 ++
> >> >  include/linux/radix-tree.h |  3 ++
> >> >  lib/idr.c  | 56 +++
> >> >  lib/radix-tree.c   | 73
> >> ++
> >> >  4 files changed, 148 insertions(+)
> >> >
> >>
> >> [...]
> >>
> >> > +int idr_alloc_ext(struct idr *idr, void *ptr, unsigned long *index,
> >> > +  unsigned long start, unsigned long end, gfp_t gfp) {
> >> > +void __rcu **slot;
> >> > +struct radix_tree_iter iter;
> >> > +
> >> > +if (WARN_ON_ONCE(radix_tree_is_internal_node(ptr)))
> >> > +return -EINVAL;
> >> > +
> >> > +radix_tree_iter_init(, start);
> >> > +slot = idr_get_free_ext(>idr_rt, , gfp, end);
> >> > +if (IS_ERR(slot))
> >> > +return PTR_ERR(slot);
> >> > +
> >> > +radix_tree_iter_replace(>idr_rt, , slot, ptr);
> >> > +radix_tree_iter_tag_clear(>idr_rt, , IDR_FREE);
> >> > +
> >> > +if (index)
> >> > +*index = iter.index;
> >> > +return 0;
> >> > +}
> >> > +EXPORT_SYMBOL_GPL(idr_alloc_ext);
> >>
> >> Can you express idr_alloc in terms of idr_alloc_ext? Same for most of
> >> the other functions (it seems that signed int was used as return
> >> value to indicate error cases, thus it should be easy to map those).
> >In idr_alloc(), we have the following check:
> >
> >if (WARN_ON_ONCE(start < 0))
> >return -EINVAL;
> >
> >But in idr_alloc_ext(), since we are using unsigned long, we needn't such
> check.
> 
> You can just check and call idr_alloc_ext then to do the actual work.
OK, will fix it.


RE: [patch net-next 1/3] idr: Add new APIs to support unsigned long

2017-08-29 Thread Chris Mi
Hi,

> -Original Message-
> From: Hannes Frederic Sowa [mailto:han...@stressinduktion.org]
> Sent: Tuesday, August 29, 2017 3:14 PM
> To: Chris Mi <chr...@mellanox.com>
> Cc: netdev@vger.kernel.org; j...@mojatatu.com;
> xiyou.wangc...@gmail.com; j...@resnulli.us; da...@davemloft.net;
> mawil...@microsoft.com
> Subject: Re: [patch net-next 1/3] idr: Add new APIs to support unsigned long
> 
> Hello,
> 
> Chris Mi <chr...@mellanox.com> writes:
> 
> > The following new APIs are added:
> >
> > int idr_alloc_ext(struct idr *idr, void *ptr, unsigned long *index,
> >   unsigned long start, unsigned long end, gfp_t gfp);
> > static inline void *idr_remove_ext(struct idr *idr, unsigned long id);
> > static inline void *idr_find_ext(const struct idr *idr, unsigned long
> > id); void *idr_replace_ext(struct idr *idr, void *ptr, unsigned long
> > id); void *idr_get_next_ext(struct idr *idr, unsigned long *nextid);
> >
> > Signed-off-by: Chris Mi <chr...@mellanox.com>
> > Signed-off-by: Jiri Pirko <j...@mellanox.com>
> > ---
> >  include/linux/idr.h| 16 ++
> >  include/linux/radix-tree.h |  3 ++
> >  lib/idr.c  | 56 +++
> >  lib/radix-tree.c   | 73
> ++
> >  4 files changed, 148 insertions(+)
> >
> 
> [...]
> 
> > +int idr_alloc_ext(struct idr *idr, void *ptr, unsigned long *index,
> > + unsigned long start, unsigned long end, gfp_t gfp) {
> > +   void __rcu **slot;
> > +   struct radix_tree_iter iter;
> > +
> > +   if (WARN_ON_ONCE(radix_tree_is_internal_node(ptr)))
> > +   return -EINVAL;
> > +
> > +   radix_tree_iter_init(, start);
> > +   slot = idr_get_free_ext(>idr_rt, , gfp, end);
> > +   if (IS_ERR(slot))
> > +   return PTR_ERR(slot);
> > +
> > +   radix_tree_iter_replace(>idr_rt, , slot, ptr);
> > +   radix_tree_iter_tag_clear(>idr_rt, , IDR_FREE);
> > +
> > +   if (index)
> > +   *index = iter.index;
> > +   return 0;
> > +}
> > +EXPORT_SYMBOL_GPL(idr_alloc_ext);
> 
> Can you express idr_alloc in terms of idr_alloc_ext? Same for most of the
> other functions (it seems that signed int was used as return value to indicate
> error cases, thus it should be easy to map those).
In idr_alloc(), we have the following check:

if (WARN_ON_ONCE(start < 0))
return -EINVAL;

But in idr_alloc_ext(), since we are using unsigned long, we don't need such a check.

Just to reuse several lines of code, I don't think it is worth expressing
idr_alloc() in terms of idr_alloc_ext().

Thanks,
Chris
> 
> [...]
> 
> Thanks,
> Hannes


RE: [patch net-next 2/3] net/sched: Change cls_flower to use IDR

2017-08-28 Thread Chris Mi


> -Original Message-
> From: Simon Horman [mailto:simon.hor...@netronome.com]
> Sent: Monday, August 28, 2017 7:37 PM
> To: Chris Mi <chr...@mellanox.com>
> Cc: netdev@vger.kernel.org; j...@mojatatu.com;
> xiyou.wangc...@gmail.com; j...@resnulli.us; da...@davemloft.net;
> mawil...@microsoft.com
> Subject: Re: [patch net-next 2/3] net/sched: Change cls_flower to use IDR
> 
> On Mon, Aug 28, 2017 at 02:41:16AM -0400, Chris Mi wrote:
> > Currently, all filters with the same priority are linked in a doubly
> > linked list. Every filter should have a unique handle. To make the
> > handle unique, we need to iterate the list every time to see if the
> > handle exists or not when inserting a new filter. It is time-consuming.
> > For example, it takes about 5m3.169s to insert 64K rules.
> >
> > This patch changes cls_flower to use IDR. With this patch, it takes
> > about 0m1.127s to insert 64K rules. The improvement is huge.
> 
> Very nice :)
> 
> > But please note that in this testing, all filters share the same action.
> > If every filter has a unique action, that is another bottleneck.
> > Follow-up patch in this patchset addresses that.
> >
> > Signed-off-by: Chris Mi <chr...@mellanox.com>
> > Signed-off-by: Jiri Pirko <j...@mellanox.com>
> > ---
> >  net/sched/cls_flower.c | 55
> > +-
> >  1 file changed, 23 insertions(+), 32 deletions(-)
> >
> > diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index
> > bd9dab4..3d041d2 100644
> > --- a/net/sched/cls_flower.c
> > +++ b/net/sched/cls_flower.c
> 
> ...
> 
> > @@ -890,6 +870,7 @@ static int fl_change(struct net *net, struct sk_buff
> *in_skb,
> > struct cls_fl_filter *fnew;
> > struct nlattr **tb;
> > struct fl_flow_mask mask = {};
> > +   unsigned long idr_index;
> > int err;
> >
> > if (!tca[TCA_OPTIONS])
> > @@ -920,13 +901,21 @@ static int fl_change(struct net *net, struct sk_buff
> *in_skb,
> > goto errout;
> >
> > if (!handle) {
> > -   handle = fl_grab_new_handle(tp, head);
> > -   if (!handle) {
> > -   err = -EINVAL;
> > +   err = idr_alloc_ext(>handle_idr, fnew, _index,
> > +   1, 0x8000, GFP_KERNEL);
> > +   if (err)
> > goto errout;
> > -   }
> > +   fnew->handle = idr_index;
> > +   }
> > +
> > +   /* user specifies a handle and it doesn't exist */
> > +   if (handle && !fold) {
> > +   err = idr_alloc_ext(>handle_idr, fnew, _index,
> > +   handle, handle + 1, GFP_KERNEL);
> > +   if (err)
> > +   goto errout;
> > +   fnew->handle = idr_index;
> > }
> > -   fnew->handle = handle;
> >
> > if (tb[TCA_FLOWER_FLAGS]) {
> > fnew->flags = nla_get_u32(tb[TCA_FLOWER_FLAGS]);
> > @@ -980,6 +969,8 @@ static int fl_change(struct net *net, struct sk_buff
> *in_skb,
> > *arg = fnew;
> >
> > if (fold) {
> > +   fnew->handle = handle;
> 
> Can it be the case that fold is non-NULL and handle is zero?
> The handling of that case seem to have changed in this patch.
I don't think that could happen. In function tc_ctl_tfilter(), fl_get()
will be called. If handle is zero, fl_get() will return NULL, which
means fold is NULL.

> 
> > +   idr_replace_ext(>handle_idr, fnew, fnew->handle);
> > list_replace_rcu(>list, >list);
> > tcf_unbind_filter(tp, >res);
> > call_rcu(>rcu, fl_destroy_filter);
> > --
> > 1.8.3.1
> >


RE: [patch net-next 2/3] net/sched: Change cls_flower to use IDR

2017-08-28 Thread Chris Mi
> -Original Message-
> From: Jamal Hadi Salim [mailto:j...@mojatatu.com]
> Sent: Tuesday, August 29, 2017 5:56 AM
> To: Chris Mi <chr...@mellanox.com>; netdev@vger.kernel.org
> Cc: xiyou.wangc...@gmail.com; j...@resnulli.us; da...@davemloft.net;
> mawil...@microsoft.com
> Subject: Re: [patch net-next 2/3] net/sched: Change cls_flower to use IDR
> 
> On 17-08-28 02:41 AM, Chris Mi wrote:
> > Currently, all filters with the same priority are linked in a doubly
> > linked list. Every filter should have a unique handle. To make the
> > handle unique, we need to iterate the list every time to see if the
> > handle exists or not when inserting a new filter. It is time-consuming.
> > For example, it takes about 5m3.169s to insert 64K rules.
> >
> > This patch changes cls_flower to use IDR. With this patch, it takes
> > about 0m1.127s to insert 64K rules. The improvement is huge.
> >
> > But please note that in this testing, all filters share the same action.
> > If every filter has a unique action, that is another bottleneck.
> > Follow-up patch in this patchset addresses that.
> >
> > Signed-off-by: Chris Mi <chr...@mellanox.com>
> > Signed-off-by: Jiri Pirko <j...@mellanox.com>
> 
> Acked-by: Jamal Hadi Salim <j...@mojatatu.com>
> 
> As Cong asked last time - any plans to add to other classifiers?
I think if other classifiers don't need so many items, a list is enough for them.
If we change all of them, we need to spend a lot of time testing them to make sure
there is no regression, but the benefit is not very big. If a certain classifier
needs to change in the future, flower can serve as a reference example.

-Chris
> 
> cheers,
> jamal


[patch net-next 3/3] net/sched: Change act_api and act_xxx modules to use IDR

2017-08-28 Thread Chris Mi
Typically, each TC filter has its own action. All the actions of the
same type are saved in that type's hash table. But the hash table is so
small that it degrades to a list, and the performance is greatly
affected. For example, it takes about 0m11.914s to insert 64K rules.
If we convert the hash table to IDR, it only takes about 0m1.500s.
The improvement is huge.

But please note that the test result is based on the previous patch,
which changes cls_flower to use IDR.

Signed-off-by: Chris Mi <chr...@mellanox.com>
Signed-off-by: Jiri Pirko <j...@mellanox.com>
---
 include/net/act_api.h  |  76 +-
 net/sched/act_api.c| 251 ++---
 net/sched/act_bpf.c|  17 ++-
 net/sched/act_connmark.c   |  16 ++-
 net/sched/act_csum.c   |  16 ++-
 net/sched/act_gact.c   |  16 ++-
 net/sched/act_ife.c|  20 ++--
 net/sched/act_ipt.c|  26 +++--
 net/sched/act_mirred.c |  19 ++--
 net/sched/act_nat.c|  16 ++-
 net/sched/act_pedit.c  |  18 ++--
 net/sched/act_police.c |  18 ++--
 net/sched/act_sample.c |  17 ++-
 net/sched/act_simple.c |  20 ++--
 net/sched/act_skbedit.c|  18 ++--
 net/sched/act_skbmod.c |  18 ++--
 net/sched/act_tunnel_key.c |  20 ++--
 net/sched/act_vlan.c   |  22 ++--
 18 files changed, 279 insertions(+), 345 deletions(-)

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 26ffd83..8f3d5d8 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -10,12 +10,9 @@
 #include 
 #include 
 
-
-struct tcf_hashinfo {
-   struct hlist_head   *htab;
-   unsigned inthmask;
-   spinlock_t  lock;
-   u32 index;
+struct tcf_idrinfo {
+   spinlock_t  lock;
+   struct idr  action_idr;
 };
 
 struct tc_action_ops;
@@ -25,9 +22,8 @@ struct tc_action {
__u32   type; /* for backward 
compat(TCA_OLD_COMPAT) */
__u32   order;
struct list_headlist;
-   struct tcf_hashinfo *hinfo;
+   struct tcf_idrinfo  *idrinfo;
 
-   struct hlist_node   tcfa_head;
u32 tcfa_index;
int tcfa_refcnt;
int tcfa_bindcnt;
@@ -44,7 +40,6 @@ struct tc_action {
struct tc_cookie*act_cookie;
struct tcf_chain*goto_chain;
 };
-#define tcf_head   common.tcfa_head
 #define tcf_index  common.tcfa_index
 #define tcf_refcnt common.tcfa_refcnt
 #define tcf_bindcntcommon.tcfa_bindcnt
@@ -57,27 +52,6 @@ struct tc_action {
 #define tcf_lock   common.tcfa_lock
 #define tcf_rcucommon.tcfa_rcu
 
-static inline unsigned int tcf_hash(u32 index, unsigned int hmask)
-{
-   return index & hmask;
-}
-
-static inline int tcf_hashinfo_init(struct tcf_hashinfo *hf, unsigned int mask)
-{
-   int i;
-
-   spin_lock_init(>lock);
-   hf->index = 0;
-   hf->hmask = mask;
-   hf->htab = kzalloc((mask + 1) * sizeof(struct hlist_head),
-  GFP_KERNEL);
-   if (!hf->htab)
-   return -ENOMEM;
-   for (i = 0; i < mask + 1; i++)
-   INIT_HLIST_HEAD(>htab[i]);
-   return 0;
-}
-
 /* Update lastuse only if needed, to avoid dirtying a cache line.
  * We use a temp variable to avoid fetching jiffies twice.
  */
@@ -126,53 +100,51 @@ struct tc_action_ops {
 };
 
 struct tc_action_net {
-   struct tcf_hashinfo *hinfo;
+   struct tcf_idrinfo *idrinfo;
const struct tc_action_ops *ops;
 };
 
 static inline
 int tc_action_net_init(struct tc_action_net *tn,
-  const struct tc_action_ops *ops, unsigned int mask)
+  const struct tc_action_ops *ops)
 {
int err = 0;
 
-   tn->hinfo = kmalloc(sizeof(*tn->hinfo), GFP_KERNEL);
-   if (!tn->hinfo)
+   tn->idrinfo = kmalloc(sizeof(*tn->idrinfo), GFP_KERNEL);
+   if (!tn->idrinfo)
return -ENOMEM;
tn->ops = ops;
-   err = tcf_hashinfo_init(tn->hinfo, mask);
-   if (err)
-   kfree(tn->hinfo);
+   spin_lock_init(&tn->idrinfo->lock);
+   idr_init(&tn->idrinfo->action_idr);
return err;
 }
 
-void tcf_hashinfo_destroy(const struct tc_action_ops *ops,
- struct tcf_hashinfo *hinfo);
+void tcf_idrinfo_destroy(const struct tc_action_ops *ops,
+struct tcf_idrinfo *idrinfo);
 
 static inline void tc_action_net_exit(struct tc_action_net *tn)
 {
-   tcf_hashinfo_destroy(tn->ops, tn->hinfo);
-   kfree(tn->hinfo);
+   tcf_idrinfo_destroy(tn->ops, tn->idrinfo);
+   kfree(tn->idrinfo);
 }
 
 int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb,
   struct

[patch net-next 0/3] net/sched: Improve getting objects by indexes

2017-08-28 Thread Chris Mi
Using current TC code, it is very slow to insert a lot of rules.

In order to improve the rules update rate in TC,
we introduced the following two changes:
1) changed cls_flower to use IDR to manage the filters.
2) changed all act_xxx modules to use IDR instead of
   a small hash table

But IDR has a limitation: it uses int, whereas TC handles use u32.
To make sure there is no regression, we add several new IDR APIs
to support unsigned long.

Chris Mi (3):
  idr: Add new APIs to support unsigned long
  net/sched: Change cls_flower to use IDR
  net/sched: Change act_api and act_xxx modules to use IDR

 include/linux/idr.h|  16 +++
 include/linux/radix-tree.h |   3 +
 include/net/act_api.h  |  76 +-
 lib/idr.c  |  56 ++
 lib/radix-tree.c   |  73 +
 net/sched/act_api.c| 251 ++---
 net/sched/act_bpf.c|  17 ++-
 net/sched/act_connmark.c   |  16 ++-
 net/sched/act_csum.c   |  16 ++-
 net/sched/act_gact.c   |  16 ++-
 net/sched/act_ife.c|  20 ++--
 net/sched/act_ipt.c|  26 +++--
 net/sched/act_mirred.c |  19 ++--
 net/sched/act_nat.c|  16 ++-
 net/sched/act_pedit.c  |  18 ++--
 net/sched/act_police.c |  18 ++--
 net/sched/act_sample.c |  17 ++-
 net/sched/act_simple.c |  20 ++--
 net/sched/act_skbedit.c|  18 ++--
 net/sched/act_skbmod.c |  18 ++--
 net/sched/act_tunnel_key.c |  20 ++--
 net/sched/act_vlan.c   |  22 ++--
 net/sched/cls_flower.c |  55 +-
 23 files changed, 450 insertions(+), 377 deletions(-)

-- 
1.8.3.1



[patch net-next 2/3] net/sched: Change cls_flower to use IDR

2017-08-28 Thread Chris Mi
Currently, all filters with the same priority are linked in a doubly
linked list. Every filter should have a unique handle. To make the
handle unique, we need to iterate the list every time to see if the
handle exists or not when inserting a new filter. It is time-consuming.
For example, it takes about 5m3.169s to insert 64K rules.

This patch changes cls_flower to use IDR. With this patch, it
takes about 0m1.127s to insert 64K rules. The improvement is huge.

But please note that in this testing, all filters share the same action.
If every filter has a unique action, that is another bottleneck.
Follow-up patch in this patchset addresses that.

Signed-off-by: Chris Mi <chr...@mellanox.com>
Signed-off-by: Jiri Pirko <j...@mellanox.com>
---
 net/sched/cls_flower.c | 55 +-
 1 file changed, 23 insertions(+), 32 deletions(-)

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index bd9dab4..3d041d2 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -68,7 +68,6 @@ struct cls_fl_head {
struct rhashtable ht;
struct fl_flow_mask mask;
struct flow_dissector dissector;
-   u32 hgen;
bool mask_assigned;
struct list_head filters;
struct rhashtable_params ht_params;
@@ -76,6 +75,7 @@ struct cls_fl_head {
struct work_struct work;
struct rcu_head rcu;
};
+   struct idr handle_idr;
 };
 
 struct cls_fl_filter {
@@ -210,6 +210,7 @@ static int fl_init(struct tcf_proto *tp)
 
INIT_LIST_HEAD_RCU(>filters);
rcu_assign_pointer(tp->root, head);
+   idr_init(>handle_idr);
 
return 0;
 }
@@ -295,6 +296,9 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct 
cls_fl_filter *f)
 
 static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f)
 {
+   struct cls_fl_head *head = rtnl_dereference(tp->root);
+
+   idr_remove_ext(>handle_idr, f->handle);
list_del_rcu(>list);
if (!tc_skip_hw(f->flags))
fl_hw_destroy_filter(tp, f);
@@ -327,6 +331,7 @@ static void fl_destroy(struct tcf_proto *tp)
 
list_for_each_entry_safe(f, next, >filters, list)
__fl_delete(tp, f);
+   idr_destroy(>handle_idr);
 
__module_get(THIS_MODULE);
call_rcu(>rcu, fl_destroy_rcu);
@@ -335,12 +340,8 @@ static void fl_destroy(struct tcf_proto *tp)
 static void *fl_get(struct tcf_proto *tp, u32 handle)
 {
struct cls_fl_head *head = rtnl_dereference(tp->root);
-   struct cls_fl_filter *f;
 
-   list_for_each_entry(f, >filters, list)
-   if (f->handle == handle)
-   return f;
-   return NULL;
+   return idr_find_ext(>handle_idr, handle);
 }
 
 static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
@@ -859,27 +860,6 @@ static int fl_set_parms(struct net *net, struct tcf_proto 
*tp,
return 0;
 }
 
-static u32 fl_grab_new_handle(struct tcf_proto *tp,
- struct cls_fl_head *head)
-{
-   unsigned int i = 0x8000;
-   u32 handle;
-
-   do {
-   if (++head->hgen == 0x7FFF)
-   head->hgen = 1;
-   } while (--i > 0 && fl_get(tp, head->hgen));
-
-   if (unlikely(i == 0)) {
-   pr_err("Insufficient number of handles\n");
-   handle = 0;
-   } else {
-   handle = head->hgen;
-   }
-
-   return handle;
-}
-
 static int fl_change(struct net *net, struct sk_buff *in_skb,
 struct tcf_proto *tp, unsigned long base,
 u32 handle, struct nlattr **tca,
@@ -890,6 +870,7 @@ static int fl_change(struct net *net, struct sk_buff 
*in_skb,
struct cls_fl_filter *fnew;
struct nlattr **tb;
struct fl_flow_mask mask = {};
+   unsigned long idr_index;
int err;
 
if (!tca[TCA_OPTIONS])
@@ -920,13 +901,21 @@ static int fl_change(struct net *net, struct sk_buff 
*in_skb,
goto errout;
 
if (!handle) {
-   handle = fl_grab_new_handle(tp, head);
-   if (!handle) {
-   err = -EINVAL;
+   err = idr_alloc_ext(>handle_idr, fnew, _index,
+   1, 0x8000, GFP_KERNEL);
+   if (err)
goto errout;
-   }
+   fnew->handle = idr_index;
+   }
+
+   /* user specifies a handle and it doesn't exist */
+   if (handle && !fold) {
+   err = idr_alloc_ext(>handle_idr, fnew, _index,
+   handle, handle + 1, GFP_KERNEL);
+   if (err)
+   goto errout;
+   fnew->handle = idr_index;
}
-   fnew->handle = handle;
 
if (tb[TCA_FLOWER_FLAGS]) {

[patch net-next 1/3] idr: Add new APIs to support unsigned long

2017-08-28 Thread Chris Mi
The following new APIs are added:

int idr_alloc_ext(struct idr *idr, void *ptr, unsigned long *index,
  unsigned long start, unsigned long end, gfp_t gfp);
static inline void *idr_remove_ext(struct idr *idr, unsigned long id);
static inline void *idr_find_ext(const struct idr *idr, unsigned long id);
void *idr_replace_ext(struct idr *idr, void *ptr, unsigned long id);
void *idr_get_next_ext(struct idr *idr, unsigned long *nextid);

Signed-off-by: Chris Mi <chr...@mellanox.com>
Signed-off-by: Jiri Pirko <j...@mellanox.com>
---
 include/linux/idr.h| 16 ++
 include/linux/radix-tree.h |  3 ++
 lib/idr.c  | 56 +++
 lib/radix-tree.c   | 73 ++
 4 files changed, 148 insertions(+)

diff --git a/include/linux/idr.h b/include/linux/idr.h
index bf70b3e..e0a030b 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -81,11 +81,15 @@ static inline void idr_set_cursor(struct idr *idr, unsigned 
int val)
 
 void idr_preload(gfp_t gfp_mask);
 int idr_alloc(struct idr *, void *entry, int start, int end, gfp_t);
+int idr_alloc_ext(struct idr *idr, void *ptr, unsigned long *index,
+ unsigned long start, unsigned long end, gfp_t gfp);
 int idr_alloc_cyclic(struct idr *, void *entry, int start, int end, gfp_t);
 int idr_for_each(const struct idr *,
 int (*fn)(int id, void *p, void *data), void *data);
 void *idr_get_next(struct idr *, int *nextid);
+void *idr_get_next_ext(struct idr *idr, unsigned long *nextid);
 void *idr_replace(struct idr *, void *, int id);
+void *idr_replace_ext(struct idr *idr, void *ptr, unsigned long id);
 void idr_destroy(struct idr *);
 
 static inline void *idr_remove(struct idr *idr, int id)
@@ -93,6 +97,11 @@ static inline void *idr_remove(struct idr *idr, int id)
return radix_tree_delete_item(>idr_rt, id, NULL);
 }
 
+static inline void *idr_remove_ext(struct idr *idr, unsigned long id)
+{
+   return radix_tree_delete_item(>idr_rt, id, NULL);
+}
+
 static inline void idr_init(struct idr *idr)
 {
INIT_RADIX_TREE(>idr_rt, IDR_RT_MARKER);
@@ -133,6 +142,11 @@ static inline void *idr_find(const struct idr *idr, int id)
return radix_tree_lookup(>idr_rt, id);
 }
 
+static inline void *idr_find_ext(const struct idr *idr, unsigned long id)
+{
+   return radix_tree_lookup(>idr_rt, id);
+}
+
 /**
  * idr_for_each_entry - iterate over an idr's elements of a given type
  * @idr: idr handle
@@ -145,6 +159,8 @@ static inline void *idr_find(const struct idr *idr, int id)
  */
 #define idr_for_each_entry(idr, entry, id) \
for (id = 0; ((entry) = idr_get_next(idr, &(id))) != NULL; ++id)
+#define idr_for_each_entry_ext(idr, entry, id) \
+   for (id = 0; ((entry) = idr_get_next_ext(idr, &(id))) != NULL; ++id)
 
 /**
  * idr_for_each_entry_continue - continue iteration over an idr's elements of 
a given type
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 3e57350..947299e 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -359,6 +359,9 @@ int radix_tree_join(struct radix_tree_root *, unsigned long 
index,
unsigned new_order, void *);
 void __rcu **idr_get_free(struct radix_tree_root *, struct radix_tree_iter *,
gfp_t, int end);
+void __rcu **idr_get_free_ext(struct radix_tree_root *root,
+ struct radix_tree_iter *iter,
+ gfp_t gfp, unsigned long end);
 
 enum {
RADIX_TREE_ITER_TAG_MASK = 0x0f,/* tag index in lower nybble */
diff --git a/lib/idr.c b/lib/idr.c
index b13682b..2a091b9 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -47,6 +47,29 @@ int idr_alloc(struct idr *idr, void *ptr, int start, int 
end, gfp_t gfp)
 }
 EXPORT_SYMBOL_GPL(idr_alloc);
 
+int idr_alloc_ext(struct idr *idr, void *ptr, unsigned long *index,
+ unsigned long start, unsigned long end, gfp_t gfp)
+{
+   void __rcu **slot;
+   struct radix_tree_iter iter;
+
+   if (WARN_ON_ONCE(radix_tree_is_internal_node(ptr)))
+   return -EINVAL;
+
+   radix_tree_iter_init(&iter, start);
+   slot = idr_get_free_ext(&idr->idr_rt, &iter, gfp, end);
+   if (IS_ERR(slot))
+   return PTR_ERR(slot);
+
+   radix_tree_iter_replace(&idr->idr_rt, &iter, slot, ptr);
+   radix_tree_iter_tag_clear(&idr->idr_rt, &iter, IDR_FREE);
+
+   if (index)
+   *index = iter.index;
+   return 0;
+}
+EXPORT_SYMBOL_GPL(idr_alloc_ext);
+
 /**
  * idr_alloc_cyclic - allocate new idr entry in a cyclical fashion
  * @idr: idr handle
@@ -134,6 +157,20 @@ void *idr_get_next(struct idr *idr, int *nextid)
 }
 EXPORT_SYMBOL(idr_get_next);
 
+void *idr_get_next_ext(struct idr *idr, unsigned long *nextid)
+{
+   struct radix_tree_iter iter;
+   void __rcu **slot;
+

[patch net-next] net/sched: Fix the logic error to decide the ingress qdisc

2017-08-18 Thread Chris Mi
The offending commit used a newly added helper function.
But the logic is wrong. Without this fix, the affected NICs
can't do HW offload. Error -EOPNOTSUPP will be returned directly.

Fixes: a2e8da9378cc ("net/sched: use newly added classid identity helpers")
Signed-off-by: Chris Mi <chr...@mellanox.com>
Acked-by: Jiri Pirko <j...@mellanox.com>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 2 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c   | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c   | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c| 2 +-
 drivers/net/ethernet/netronome/nfp/bpf/main.c   | 2 +-
 drivers/net/ethernet/netronome/nfp/flower/offload.c | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 77538cd..e55a929 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -2892,7 +2892,7 @@ static int cxgb_set_tx_maxrate(struct net_device *dev, 
int index, u32 rate)
 static int cxgb_setup_tc_cls_u32(struct net_device *dev,
 struct tc_cls_u32_offload *cls_u32)
 {
-   if (is_classid_clsact_ingress(cls_u32->common.classid) ||
+   if (!is_classid_clsact_ingress(cls_u32->common.classid) ||
cls_u32->common.chain_index)
return -EOPNOTSUPP;
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index f9fd8d8..56d7ef0 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -9230,7 +9230,7 @@ static int ixgbe_setup_tc_cls_u32(struct net_device *dev,
 {
struct ixgbe_adapter *adapter = netdev_priv(dev);
 
-   if (is_classid_clsact_ingress(cls_u32->common.classid) ||
+   if (!is_classid_clsact_ingress(cls_u32->common.classid) ||
cls_u32->common.chain_index)
return -EOPNOTSUPP;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 8633ca5..2fc3832 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3031,7 +3031,7 @@ static int mlx5e_setup_tc_cls_flower(struct net_device 
*dev,
 {
struct mlx5e_priv *priv = netdev_priv(dev);
 
-   if (is_classid_clsact_ingress(cls_flower->common.classid) ||
+   if (!is_classid_clsact_ingress(cls_flower->common.classid) ||
cls_flower->common.chain_index)
return -EOPNOTSUPP;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index f34c00f..7a9f53f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -657,7 +657,7 @@ static int mlx5e_rep_get_phys_port_name(struct net_device 
*dev,
 {
struct mlx5e_priv *priv = netdev_priv(dev);
 
-   if (is_classid_clsact_ingress(cls_flower->common.classid) ||
+   if (!is_classid_clsact_ingress(cls_flower->common.classid) ||
cls_flower->common.chain_index)
return -EOPNOTSUPP;
 
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c 
b/drivers/net/ethernet/netronome/nfp/bpf/main.c
index 0e68649..f4de3a7 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c
@@ -127,7 +127,7 @@ static int nfp_bpf_setup_tc(struct nfp_app *app, struct 
net_device *netdev,
struct nfp_net *nn = netdev_priv(netdev);
 
if (type != TC_SETUP_CLSBPF || !nfp_net_ebpf_capable(nn) ||
-   is_classid_clsact_ingress(cls_bpf->common.classid) ||
+   !is_classid_clsact_ingress(cls_bpf->common.classid) ||
cls_bpf->common.protocol != htons(ETH_P_ALL) ||
cls_bpf->common.chain_index)
return -EOPNOTSUPP;
diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c 
b/drivers/net/ethernet/netronome/nfp/flower/offload.c
index 3ad5aaa..d868a57 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c
@@ -390,7 +390,7 @@ int nfp_flower_setup_tc(struct nfp_app *app, struct 
net_device *netdev,
struct tc_cls_flower_offload *cls_flower = type_data;
 
if (type != TC_SETUP_CLSFLOWER ||
-   is_classid_clsact_ingress(cls_flower->common.classid) ||
+   !is_classid_clsact_ingress(cls_flower->common.classid) ||
!eth_proto_is_802_3(cls_flower->common.protocol) ||
cls_flower->common.chain_index)
return -EOPNOTSUPP;
-- 
1.8.3.1



[patch net-next repost 2/3] net/sched: Change cls_flower to use IDR

2017-08-16 Thread Chris Mi
Currently, all filters with the same priority are linked in a doubly
linked list. Every filter should have a unique handle. To make the
handle unique, we need to iterate the list every time to see if the
handle exists or not when inserting a new filter. It is time-consuming.
For example, it takes about 5m3.169s to insert 64K rules.

This patch changes cls_flower to use IDR. With this patch, it
takes about 0m1.127s to insert 64K rules. The improvement is huge.

But please note that in this testing, all filters share the same action.
If every filter has a unique action, that is another bottleneck.
Follow-up patch in this patchset addresses that.

Signed-off-by: Chris Mi <chr...@mellanox.com>
Signed-off-by: Jiri Pirko <j...@mellanox.com>
---
 net/sched/cls_flower.c | 55 +-
 1 file changed, 23 insertions(+), 32 deletions(-)

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 052e902..071f0ef 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -68,7 +68,6 @@ struct cls_fl_head {
struct rhashtable ht;
struct fl_flow_mask mask;
struct flow_dissector dissector;
-   u32 hgen;
bool mask_assigned;
struct list_head filters;
struct rhashtable_params ht_params;
@@ -76,6 +75,7 @@ struct cls_fl_head {
struct work_struct work;
struct rcu_head rcu;
};
+   struct idr handle_idr;
 };
 
 struct cls_fl_filter {
@@ -210,6 +210,7 @@ static int fl_init(struct tcf_proto *tp)
 
INIT_LIST_HEAD_RCU(&head->filters);
rcu_assign_pointer(tp->root, head);
+   idr_init(&head->handle_idr);
 
return 0;
 }
@@ -295,6 +296,9 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct 
cls_fl_filter *f)
 
 static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f)
 {
+   struct cls_fl_head *head = rtnl_dereference(tp->root);
+
+   idr_remove(&head->handle_idr, f->handle);
list_del_rcu(&f->list);
if (!tc_skip_hw(f->flags))
fl_hw_destroy_filter(tp, f);
@@ -327,6 +331,7 @@ static void fl_destroy(struct tcf_proto *tp)
 
list_for_each_entry_safe(f, next, &head->filters, list)
__fl_delete(tp, f);
+   idr_destroy(&head->handle_idr);
 
__module_get(THIS_MODULE);
call_rcu(&head->rcu, fl_destroy_rcu);
@@ -335,12 +340,8 @@ static void fl_destroy(struct tcf_proto *tp)
 static void *fl_get(struct tcf_proto *tp, u32 handle)
 {
struct cls_fl_head *head = rtnl_dereference(tp->root);
-   struct cls_fl_filter *f;
 
-   list_for_each_entry(f, &head->filters, list)
-   if (f->handle == handle)
-   return f;
-   return NULL;
+   return idr_find(&head->handle_idr, handle);
 }
 
 static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
@@ -859,27 +860,6 @@ static int fl_set_parms(struct net *net, struct tcf_proto 
*tp,
return 0;
 }
 
-static u32 fl_grab_new_handle(struct tcf_proto *tp,
- struct cls_fl_head *head)
-{
-   unsigned int i = 0x8000;
-   u32 handle;
-
-   do {
-   if (++head->hgen == 0x7FFF)
-   head->hgen = 1;
-   } while (--i > 0 && fl_get(tp, head->hgen));
-
-   if (unlikely(i == 0)) {
-   pr_err("Insufficient number of handles\n");
-   handle = 0;
-   } else {
-   handle = head->hgen;
-   }
-
-   return handle;
-}
-
 static int fl_change(struct net *net, struct sk_buff *in_skb,
 struct tcf_proto *tp, unsigned long base,
 u32 handle, struct nlattr **tca,
@@ -890,6 +870,7 @@ static int fl_change(struct net *net, struct sk_buff 
*in_skb,
struct cls_fl_filter *fnew;
struct nlattr **tb;
struct fl_flow_mask mask = {};
+   unsigned long idr_index;
int err;
 
if (!tca[TCA_OPTIONS])
@@ -920,13 +901,21 @@ static int fl_change(struct net *net, struct sk_buff 
*in_skb,
goto errout;
 
if (!handle) {
-   handle = fl_grab_new_handle(tp, head);
-   if (!handle) {
-   err = -EINVAL;
+   err = idr_alloc(&head->handle_idr, fnew, &idr_index,
+   1, 0x8000, GFP_KERNEL);
+   if (err)
goto errout;
-   }
+   fnew->handle = idr_index;
+   }
+
+   /* user specifies a handle and it doesn't exist */
+   if (handle && !fold) {
+   err = idr_alloc(&head->handle_idr, fnew, &idr_index,
+   handle, handle + 1, GFP_KERNEL);
+   if (err)
+   goto errout;
+   fnew->handle = idr_index;
}
-   fnew->handle = handle;
 
if (tb[TCA_FLOWER_FLAGS]) {
fnew->flags = 

  1   2   >