from:"Pablo Neira Ayuso"

[PATCH 4/8] netfilter: invoke synchronize_rcu after set the _hook_ to NULL

2017-03-29 Thread Pablo Neira Ayuso

From: Liping Zhang <zlpnob...@gmail.com>

Otherwise, another CPU may access the invalid pointer. For example:
CPU0CPU1
 -  rcu_read_lock();
 -  pfunc = _hook_;
  _hook_ = NULL;  -
  mod unload  -
 - pfunc(); // invalid, panic
 - rcu_read_unlock();

So we must call synchronize_rcu() to wait the rcu reader to finish.

Also note, in nf_nat_snmp_basic_fini, synchronize_rcu() will be invoked
by later nf_conntrack_helper_unregister, but I'm inclined to add a
explicit synchronize_rcu after set the nf_nat_snmp_hook to NULL. Depend
on such obscure assumptions is not a good idea.

Last, in nfnetlink_cttimeout, we use kfree_rcu to free the time object,
so in cttimeout_exit, invoking rcu_barrier() is not necessary at all,
remove it too.

Signed-off-by: Liping Zhang <zlpnob...@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 net/ipv4/netfilter/nf_nat_snmp_basic.c | 1 +
 net/netfilter/nf_conntrack_ecache.c| 2 ++
 net/netfilter/nf_conntrack_netlink.c   | 1 +
 net/netfilter/nf_nat_core.c| 2 ++
 net/netfilter/nfnetlink_cttimeout.c| 2 +-
 5 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c 
b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index c9b52c361da2..5a8f7c360887 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -1304,6 +1304,7 @@ static int __init nf_nat_snmp_basic_init(void)
 static void __exit nf_nat_snmp_basic_fini(void)
 {
RCU_INIT_POINTER(nf_nat_snmp_hook, NULL);
+   synchronize_rcu();
nf_conntrack_helper_unregister(_trap_helper);
 }
 
diff --git a/net/netfilter/nf_conntrack_ecache.c 
b/net/netfilter/nf_conntrack_ecache.c
index da9df2d56e66..22fc32143e9c 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -290,6 +290,7 @@ void nf_conntrack_unregister_notifier(struct net *net,
BUG_ON(notify != new);
RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL);
mutex_unlock(_ct_ecache_mutex);
+   /* synchronize_rcu() is called from ctnetlink_exit. */
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier);
 
@@ -326,6 +327,7 @@ void nf_ct_expect_unregister_notifier(struct net *net,
BUG_ON(notify != new);
RCU_INIT_POINTER(net->ct.nf_expect_event_cb, NULL);
mutex_unlock(_ct_ecache_mutex);
+   /* synchronize_rcu() is called from ctnetlink_exit. */
 }
 EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier);
 
diff --git a/net/netfilter/nf_conntrack_netlink.c 
b/net/netfilter/nf_conntrack_netlink.c
index 6806b5e73567..908d858034e4 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -3442,6 +3442,7 @@ static void __exit ctnetlink_exit(void)
 #ifdef CONFIG_NETFILTER_NETLINK_GLUE_CT
RCU_INIT_POINTER(nfnl_ct_hook, NULL);
 #endif
+   synchronize_rcu();
 }
 
 module_init(ctnetlink_init);
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 94b14c5a8b17..82802e4a6640 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -903,6 +903,8 @@ static void __exit nf_nat_cleanup(void)
 #ifdef CONFIG_XFRM
RCU_INIT_POINTER(nf_nat_decode_session_hook, NULL);
 #endif
+   synchronize_rcu();
+
for (i = 0; i < NFPROTO_NUMPROTO; i++)
kfree(nf_nat_l4protos[i]);
 
diff --git a/net/netfilter/nfnetlink_cttimeout.c 
b/net/netfilter/nfnetlink_cttimeout.c
index 139e0867e56e..47d6656c9119 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -646,8 +646,8 @@ static void __exit cttimeout_exit(void)
 #ifdef CONFIG_NF_CONNTRACK_TIMEOUT
RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, NULL);
RCU_INIT_POINTER(nf_ct_timeout_put_hook, NULL);
+   synchronize_rcu();
 #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
-   rcu_barrier();
 }
 
 module_init(cttimeout_init);
-- 
2.1.4

[PATCH 0/8] Netfilter fixes for net

2017-03-29 Thread Pablo Neira Ayuso

Hi David,

The following patchset contains a rather large update with Netfilter
fixes, specifically targeted to incorrect RCU usage in several spots and
the userspace conntrack helper infrastructure (nfnetlink_cthelper),
more specifically they are:

1) expect_class_max is incorrect set via cthelper, as in kernel semantics
   mandate that this represents the array of expectation classes minus 1.
   Patch from Liping Zhang.

2) Expectation policy updates via cthelper are currently broken for several
   reasons: This code allows illegal changes in the policy such as changing
   the number of expeciation classes, it is leaking the updated policy and
   such update occurs with no RCU protection at all. Fix this by adding a
   new nfnl_cthelper_update_policy() that describes what is really legal on
   the update path.

3) Fix several memory leaks in cthelper, from Jeffy Chen.

4) synchronize_rcu() is missing in the removal path of several modules,
   this may lead to races since CPU may still be running on code that has
   just gone. Also from Liping Zhang.

5) Don't use the helper hashtable from cthelper, it is not safe to walk
   over those bits without the helper mutex. Fix this by introducing a
   new independent list for userspace helpers. From Liping Zhang.

6) nf_ct_extend_unregister() needs synchronize_rcu() to make sure no
   packets are walking on any conntrack extension that is gone after
   module removal, again from Liping.

7) nf_nat_snmp may crash if we fail to unregister the helper due to
   accidental leftover code, from Gao Feng.

8) Fix leak in nfnetlink_queue with secctx support, from Liping Zhang.

You can pull these changes from:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git

Thanks!



The following changes since commit db7f00b8dba6d687b6ab1f2e9309acfd214fcb4b:

  tcp: tcp_get_info() should read tcp_time_stamp later (2017-03-16 21:37:13 
-0700)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git HEAD

for you to fetch changes up to 77c1c03c5b8ef28e55bb0aff29b1e006037ca645:

  netfilter: nfnetlink_queue: fix secctx memory leak (2017-03-29 12:20:50 +0200)


Gao Feng (1):
  netfilter: nf_nat_snmp: Fix panic when snmp_trap_helper fails to register

Jeffy Chen (1):
  netfilter: nfnl_cthelper: Fix memory leak

Liping Zhang (5):
  netfilter: nfnl_cthelper: fix incorrect helper->expect_class_max
  netfilter: invoke synchronize_rcu after set the _hook_ to NULL
  netfilter: nfnl_cthelper: fix a race when walk the nf_ct_helper_hash table
  netfilter: nf_ct_ext: fix possible panic after nf_ct_extend_unregister
  netfilter: nfnetlink_queue: fix secctx memory leak

Pablo Neira Ayuso (1):
  netfilter: nfnl_cthelper: fix runtime expectation policy updates

 net/ipv4/netfilter/nf_nat_snmp_basic.c |  20 +--
 net/netfilter/nf_conntrack_ecache.c|   2 +
 net/netfilter/nf_conntrack_extend.c|  13 +-
 net/netfilter/nf_conntrack_netlink.c   |   1 +
 net/netfilter/nf_nat_core.c|   2 +
 net/netfilter/nfnetlink_cthelper.c | 287 +
 net/netfilter/nfnetlink_cttimeout.c|   2 +-
 net/netfilter/nfnetlink_queue.c|   9 +-
 8 files changed, 206 insertions(+), 130 deletions(-)

[PATCH 2/8] netfilter: nfnl_cthelper: fix runtime expectation policy updates

2017-03-29 Thread Pablo Neira Ayuso

We only allow runtime updates of expectation policies for timeout and
maximum number of expectations, otherwise reject the update.

Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
Acked-by: Liping Zhang <zlpnob...@gmail.com>
---
 net/netfilter/nfnetlink_cthelper.c | 86 +-
 1 file changed, 84 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nfnetlink_cthelper.c 
b/net/netfilter/nfnetlink_cthelper.c
index 3cd41d105407..90f291e27eb1 100644
--- a/net/netfilter/nfnetlink_cthelper.c
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -256,6 +256,89 @@ nfnl_cthelper_create(const struct nlattr * const tb[],
 }
 
 static int
+nfnl_cthelper_update_policy_one(const struct nf_conntrack_expect_policy 
*policy,
+   struct nf_conntrack_expect_policy *new_policy,
+   const struct nlattr *attr)
+{
+   struct nlattr *tb[NFCTH_POLICY_MAX + 1];
+   int err;
+
+   err = nla_parse_nested(tb, NFCTH_POLICY_MAX, attr,
+  nfnl_cthelper_expect_pol);
+   if (err < 0)
+   return err;
+
+   if (!tb[NFCTH_POLICY_NAME] ||
+   !tb[NFCTH_POLICY_EXPECT_MAX] ||
+   !tb[NFCTH_POLICY_EXPECT_TIMEOUT])
+   return -EINVAL;
+
+   if (nla_strcmp(tb[NFCTH_POLICY_NAME], policy->name))
+   return -EBUSY;
+
+   new_policy->max_expected =
+   ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX]));
+   new_policy->timeout =
+   ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_TIMEOUT]));
+
+   return 0;
+}
+
+static int nfnl_cthelper_update_policy_all(struct nlattr *tb[],
+  struct nf_conntrack_helper *helper)
+{
+   struct nf_conntrack_expect_policy new_policy[helper->expect_class_max + 
1];
+   struct nf_conntrack_expect_policy *policy;
+   int i, err;
+
+   /* Check first that all policy attributes are well-formed, so we don't
+* leave things in inconsistent state on errors.
+*/
+   for (i = 0; i < helper->expect_class_max + 1; i++) {
+
+   if (!tb[NFCTH_POLICY_SET + i])
+   return -EINVAL;
+
+   err = nfnl_cthelper_update_policy_one(>expect_policy[i],
+ _policy[i],
+ tb[NFCTH_POLICY_SET + i]);
+   if (err < 0)
+   return err;
+   }
+   /* Now we can safely update them. */
+   for (i = 0; i < helper->expect_class_max + 1; i++) {
+   policy = (struct nf_conntrack_expect_policy *)
+   >expect_policy[i];
+   policy->max_expected = new_policy->max_expected;
+   policy->timeout = new_policy->timeout;
+   }
+
+   return 0;
+}
+
+static int nfnl_cthelper_update_policy(struct nf_conntrack_helper *helper,
+  const struct nlattr *attr)
+{
+   struct nlattr *tb[NFCTH_POLICY_SET_MAX + 1];
+   unsigned int class_max;
+   int err;
+
+   err = nla_parse_nested(tb, NFCTH_POLICY_SET_MAX, attr,
+  nfnl_cthelper_expect_policy_set);
+   if (err < 0)
+   return err;
+
+   if (!tb[NFCTH_POLICY_SET_NUM])
+   return -EINVAL;
+
+   class_max = ntohl(nla_get_be32(tb[NFCTH_POLICY_SET_NUM]));
+   if (helper->expect_class_max + 1 != class_max)
+   return -EBUSY;
+
+   return nfnl_cthelper_update_policy_all(tb, helper);
+}
+
+static int
 nfnl_cthelper_update(const struct nlattr * const tb[],
 struct nf_conntrack_helper *helper)
 {
@@ -265,8 +348,7 @@ nfnl_cthelper_update(const struct nlattr * const tb[],
return -EBUSY;
 
if (tb[NFCTH_POLICY]) {
-   ret = nfnl_cthelper_parse_expect_policy(helper,
-   tb[NFCTH_POLICY]);
+   ret = nfnl_cthelper_update_policy(helper, tb[NFCTH_POLICY]);
if (ret < 0)
return ret;
}
-- 
2.1.4

[PATCH 1/8] netfilter: nfnl_cthelper: fix incorrect helper->expect_class_max

2017-03-29 Thread Pablo Neira Ayuso

From: Liping Zhang <zlpnob...@gmail.com>

The helper->expect_class_max must be set to the total number of
expect_policy minus 1, since we will use the statement "if (class >
helper->expect_class_max)" to validate the CTA_EXPECT_CLASS attr in
ctnetlink_alloc_expect.

So for compatibility, set the helper->expect_class_max to the
NFCTH_POLICY_SET_NUM attr's value minus 1.

Also: it's invalid when the NFCTH_POLICY_SET_NUM attr's value is zero.
1. this will result "expect_policy = kzalloc(0, GFP_KERNEL);";
2. we cannot set the helper->expect_class_max to a proper value.

So if nla_get_be32(tb[NFCTH_POLICY_SET_NUM]) is zero, report -EINVAL to
the userspace.

Signed-off-by: Liping Zhang <zlpnob...@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 net/netfilter/nfnetlink_cthelper.c | 20 +++-
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/net/netfilter/nfnetlink_cthelper.c 
b/net/netfilter/nfnetlink_cthelper.c
index de8782345c86..3cd41d105407 100644
--- a/net/netfilter/nfnetlink_cthelper.c
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -161,6 +161,7 @@ nfnl_cthelper_parse_expect_policy(struct 
nf_conntrack_helper *helper,
int i, ret;
struct nf_conntrack_expect_policy *expect_policy;
struct nlattr *tb[NFCTH_POLICY_SET_MAX+1];
+   unsigned int class_max;
 
ret = nla_parse_nested(tb, NFCTH_POLICY_SET_MAX, attr,
   nfnl_cthelper_expect_policy_set);
@@ -170,19 +171,18 @@ nfnl_cthelper_parse_expect_policy(struct 
nf_conntrack_helper *helper,
if (!tb[NFCTH_POLICY_SET_NUM])
return -EINVAL;
 
-   helper->expect_class_max =
-   ntohl(nla_get_be32(tb[NFCTH_POLICY_SET_NUM]));
-
-   if (helper->expect_class_max != 0 &&
-   helper->expect_class_max > NF_CT_MAX_EXPECT_CLASSES)
+   class_max = ntohl(nla_get_be32(tb[NFCTH_POLICY_SET_NUM]));
+   if (class_max == 0)
+   return -EINVAL;
+   if (class_max > NF_CT_MAX_EXPECT_CLASSES)
return -EOVERFLOW;
 
expect_policy = kzalloc(sizeof(struct nf_conntrack_expect_policy) *
-   helper->expect_class_max, GFP_KERNEL);
+   class_max, GFP_KERNEL);
if (expect_policy == NULL)
return -ENOMEM;
 
-   for (i=0; iexpect_class_max; i++) {
+   for (i = 0; i < class_max; i++) {
if (!tb[NFCTH_POLICY_SET+i])
goto err;
 
@@ -191,6 +191,8 @@ nfnl_cthelper_parse_expect_policy(struct 
nf_conntrack_helper *helper,
if (ret < 0)
goto err;
}
+
+   helper->expect_class_max = class_max - 1;
helper->expect_policy = expect_policy;
return 0;
 err:
@@ -377,10 +379,10 @@ nfnl_cthelper_dump_policy(struct sk_buff *skb,
goto nla_put_failure;
 
if (nla_put_be32(skb, NFCTH_POLICY_SET_NUM,
-htonl(helper->expect_class_max)))
+htonl(helper->expect_class_max + 1)))
goto nla_put_failure;
 
-   for (i=0; iexpect_class_max; i++) {
+   for (i = 0; i < helper->expect_class_max + 1; i++) {
nest_parms2 = nla_nest_start(skb,
(NFCTH_POLICY_SET+i) | NLA_F_NESTED);
if (nest_parms2 == NULL)
-- 
2.1.4

Re: [Outreachy kernel] [PATCH] net: netfilter: Remove complexity

2017-03-28 Thread Pablo Neira Ayuso

On Tue, Mar 28, 2017 at 06:30:56PM +0530, Arushi Singhal wrote:
> To remove complexity of code the function is added in nfnetlink.h
> to make code more clear and readable.

Patch looks good, you can also use this new function from other
_fill_info() functions in the netfilter code, eg.

nfnl_cthelper_fill_info()

Take the time to make a careful look at the netfilter tree to see if
we can have a good bunch of new clients of this function in one go.
Most likely many of the nfnetlink_*.c files can use this.

BTW, you can also indicate in your patch description that this is
opencoded in a way that makes it error prone for future netfilter
netlink subsystems.

Please search for a better patch title, this one is too generic. I
understand selecting a good patch title is something that exercise.

I'd suggest this:

netfilter: add nfnl_msg_type() helper function

Never give up, revamp and resubmit, thanks!

Re: [PATCH 0/2] netfilter: Remove unnecessary cast on void pointer

2017-03-27 Thread Pablo Neira Ayuso

On Tue, Mar 21, 2017 at 05:49:52PM +0530, simran singhal wrote:
> This patch series remove unnecessary cast on void pointer.
> 
> simran singhal (2):
>   netfilter: ipset: Remove unnecessary cast on void pointer
>   netfilter: Remove unnecessary cast on void pointer

Please, merge this two patches in one single patch.

I have to pass up batches to David, and I would like this cleanups are
grouped into the same logical change.

Is there more occurrences of this in the Netfilter tree? Please have a
look at net/ipv4/netfilter/ net/ipv6/netfilter/ and
net/bridge/netfilter, we also have code there.

Thanks.

Re: [Outreachy kernel] Re: [PATCH] net: netfilter: Remove multiple assignment.

2017-03-27 Thread Pablo Neira Ayuso

On Mon, Mar 27, 2017 at 05:48:41PM +0530, Arushi Singhal wrote:
> On Mon, Mar 27, 2017 at 5:38 PM, Pablo Neira Ayuso <pa...@netfilter.org>
> wrote:
> 
> > On Sat, Mar 25, 2017 at 06:19:47PM +0530, Arushi Singhal wrote:
> > > This patch removes multiple assignments.
> > > Done using coccinelle.
> > > @@
> > > identifier i1,i2;
> > > constant c;
> > > @@
> > > - i1=i2=c;
> > > + i1=c;
> > > + i2=c;
> >
> > You have to explain why this is bad.
> >
> 
> It is against the kernel coding style and we have to avoid multiple
> assignments to make the code more readable.
> This error is found using Checkpatch.pl script.

Then, please place this information in your patch description.

Thanks!

Re: [PATCH] net: netfilter: Remove multiple assignment.

2017-03-27 Thread Pablo Neira Ayuso

On Sat, Mar 25, 2017 at 06:19:47PM +0530, Arushi Singhal wrote:
> This patch removes multiple assignments.
> Done using coccinelle.
> @@
> identifier i1,i2;
> constant c;
> @@
> - i1=i2=c;
> + i1=c;
> + i2=c;

You have to explain why this is bad.

Re: [PATCH] net: netfilters: Remove extra parenthesis

2017-03-27 Thread Pablo Neira Ayuso

Hi Arushi,

On Sat, Mar 25, 2017 at 07:23:13PM +0530, Arushi Singhal wrote:
> diff --git a/net/netfilter/nf_conntrack_netlink.c 
> b/net/netfilter/nf_conntrack_netlink.c
> index 6806b5e73567..aa344c5868c5 100644
> --- a/net/netfilter/nf_conntrack_netlink.c
> +++ b/net/netfilter/nf_conntrack_netlink.c
> @@ -467,7 +467,7 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 
> seq, u32 type,
>   struct nlattr *nest_parms;
>   unsigned int flags = portid ? NLM_F_MULTI : 0, event;
>  
> - event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_NEW);
> + event = NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_NEW;

Could you send us a unfront patch to add something like:

static inline u16 nfnl_msg_type(u8 subsys, u8 msg_type)
{
return subsys << 8 | msg_type;
}

I would suggest you place this in include/linux/netfilter/nfnetlink.h

Then, use it here.

>   nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
>   if (nlh == NULL)
>   goto nlmsg_failure;
> @@ -1983,7 +1983,7 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, 
> u32 portid, u32 seq,
>   struct nfgenmsg *nfmsg;
>   unsigned int flags = portid ? NLM_F_MULTI : 0, event;
>  
> - event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_GET_STATS_CPU);
> + event = NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_GET_STATS_CPU;

... And here too.

>   nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
>   if (nlh == NULL)
>   goto nlmsg_failure;
> @@ -2066,7 +2066,7 @@ ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 
> portid, u32 seq, u32 type,
>   unsigned int flags = portid ? NLM_F_MULTI : 0, event;
>   unsigned int nr_conntracks = atomic_read(>ct.count);
>  
> - event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_GET_STATS);
> + event = NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_GET_STATS;

And so on. Look for more spots where we can replace this opencoded
thing.

I guess there are more spots in all of the net/netfilter/ netlink
subsystems.

Once that patch gets in, you can follow up with this parens cleanup.

Thanks!

P.S: Cc'ing netfilter-de...@vger.kernel.org (and
lvs-de...@vger.kernel.org if you touch 'ipvs' bits) should be fine. No
need to Cc all those many lists, better to narrow down you target.

Re: [PATCH nf v3 1/1] netfilter: snmp: Fix one possible panic when snmp_trap_helper fail to register

2017-03-24 Thread Pablo Neira Ayuso

On Tue, Mar 21, 2017 at 08:22:29AM +0800, f...@ikuai8.com wrote:
> From: Gao Feng 
> 
> In the commit 93557f53e1fb ("netfilter: nf_conntrack: nf_conntrack snmp
> helper"), the snmp_helper is replaced by nf_nat_snmp_hook. So the
> snmp_helper is never registered. But it still tries to unregister the
> snmp_helper, it could cause the panic.
> 
> Now remove the useless snmp_helper and the unregister call in the
> error handler.
> 
> Fixes: 93557f53e1fb ("netfilter: nf_conntrack: nf_conntrack snmp helper")
> 
> Signed-off-by: Gao Feng 
> ---
>  v3: Remove the angle brackets in description, per Sergei
>  v2: Add the SHA1 ID in the description, per Sergei
>  v1: Initial version
> 
>  net/ipv4/netfilter/nf_nat_snmp_basic.c | 14 +-
>  1 file changed, 1 insertion(+), 13 deletions(-)
> 
> diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c 
> b/net/ipv4/netfilter/nf_nat_snmp_basic.c
> index c9b52c3..5787364 100644
> --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
> +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
> @@ -1260,16 +1260,6 @@ static int help(struct sk_buff *skb, unsigned int 
> protoff,
>   .timeout= 180,
>  };
>  
> -static struct nf_conntrack_helper snmp_helper __read_mostly = {
> - .me = THIS_MODULE,
> - .help   = help,
> - .expect_policy  = _exp_policy,
> - .name   = "snmp",
> - .tuple.src.l3num= AF_INET,
> - .tuple.src.u.udp.port   = cpu_to_be16(SNMP_PORT),
> - .tuple.dst.protonum = IPPROTO_UDP,
> -};
> -
>  static struct nf_conntrack_helper snmp_trap_helper __read_mostly = {
>   .me = THIS_MODULE,
>   .help   = help,
> @@ -1294,10 +1284,8 @@ static int __init nf_nat_snmp_basic_init(void)
>   RCU_INIT_POINTER(nf_nat_snmp_hook, help);
>  
>   ret = nf_conntrack_helper_register(_trap_helper);
> - if (ret < 0) {
> - nf_conntrack_helper_unregister(_helper);
> + if (ret < 0)
>   return ret;
> - }
>   return ret;

You can provide a more simplified version after this is removed, I
think:

@@ -1283,10 +1283,7 @@ static int __init nf_nat_snmp_basic_init(void)
BUG_ON(nf_nat_snmp_hook != NULL);
RCU_INIT_POINTER(nf_nat_snmp_hook, help);
 
-   ret = nf_conntrack_helper_register(_trap_helper);
-   if (ret < 0)
-   return ret;
-   return ret;
+   return nf_conntrack_helper_register(_trap_helper);
 }

Re: [PATCH net-next v3 0/2] GTP SGSN-side tunnels

2017-03-24 Thread Pablo Neira Ayuso

On Fri, Mar 24, 2017 at 02:57:41PM +0100, Jonas Bonn wrote:
> On 03/24/2017 11:41 AM, Pablo Neira Ayuso wrote:
> >On Fri, Mar 24, 2017 at 10:33:56AM +0100, Jonas Bonn wrote:
> >>Changes since v2:
> >>
> >>* Move #define of legacy netlink attribute into enum
> >>* Added note to commit message about load-testing use-case
> >>* Ack from Pablo
> >Thanks a lot Jonas!
> >
> >David, please apply. Thanks!
> 
> David, please apply v4 instead as it has a coding-style fixup requested by
> Harald.

Right, please apply v4, thanks!

Re: [PATCH nf v3 1/1] netfilter: snmp: Fix one possible panic when snmp_trap_helper fail to register

2017-03-24 Thread Pablo Neira Ayuso

On Fri, Mar 24, 2017 at 01:21:30PM +0100, Pablo Neira Ayuso wrote:
> On Tue, Mar 21, 2017 at 08:22:29AM +0800, f...@ikuai8.com wrote:
> > From: Gao Feng <f...@ikuai8.com>
> > 
> > In the commit 93557f53e1fb ("netfilter: nf_conntrack: nf_conntrack snmp
> > helper"), the snmp_helper is replaced by nf_nat_snmp_hook. So the
> > snmp_helper is never registered. But it still tries to unregister the
> > snmp_helper, it could cause the panic.
> 
> This patch looks correct to me.
> 
> But probably for some reason I don't manage to trigger, how do you
> reproduce this?

I'm refering to the panic.

Re: [PATCH nf v3 1/1] netfilter: snmp: Fix one possible panic when snmp_trap_helper fail to register

2017-03-24 Thread Pablo Neira Ayuso

On Tue, Mar 21, 2017 at 08:22:29AM +0800, f...@ikuai8.com wrote:
> From: Gao Feng 
> 
> In the commit 93557f53e1fb ("netfilter: nf_conntrack: nf_conntrack snmp
> helper"), the snmp_helper is replaced by nf_nat_snmp_hook. So the
> snmp_helper is never registered. But it still tries to unregister the
> snmp_helper, it could cause the panic.

This patch looks correct to me.

But probably for some reason I don't manage to trigger, how do you
reproduce this?

Re: [PATCH net-next v3 0/2] GTP SGSN-side tunnels

2017-03-24 Thread Pablo Neira Ayuso

On Fri, Mar 24, 2017 at 11:15:25AM +0100, Harald Welte wrote:
> Hi Jonas,
> 
> looks fine to me, but I haven't tested it.  Did you manually test it
> using the extended libgtpnl + tools?
> 
> Also, in code like this:
> 
> +   if (gtp->role == GTP_ROLE_SGSN) {
> +   pctx = ipv4_pdp_find(gtp, iph->saddr);
> +   } else {
> 
> I think general Linux kernel coding style is to not have curly-brackets
> around single-line blocks. See "Do not unnecessarily use braces where a
> single statement will do." in line 169 of
> Documentation/process/coding-style.rst
> 
> I won't mind your current style, and it is not a blocker issue to me,
> but still it would be nice for general consistency.

Harald is right, I overlook this comestic thing.

Re: [PATCH net-next v3 0/2] GTP SGSN-side tunnels

2017-03-24 Thread Pablo Neira Ayuso

On Fri, Mar 24, 2017 at 10:33:56AM +0100, Jonas Bonn wrote:
> Changes since v2:
> 
> * Move #define of legacy netlink attribute into enum
> * Added note to commit message about load-testing use-case
> * Ack from Pablo

Thanks a lot Jonas!

David, please apply. Thanks!

Re: [PATCH v2 1/2] gtp: rename SGSN netlink attribute

2017-03-23 Thread Pablo Neira Ayuso

On Thu, Mar 23, 2017 at 02:16:22PM -0700, David Miller wrote:
> Can I get a review of these two patches from some GTP experts?

I asked Jonas to resend indicating [PATCH net-next] in the subject and
some minor comestic change.

Apart from patches look good so Jonas, you can add this in your follow
up version.

Acked-by: Pablo Neira Ayuso <pa...@netfilter.org>

Re: [PATCH 0/5] netfilter: Clean up tests if NULL returned on failure

2017-03-22 Thread Pablo Neira Ayuso

On Tue, Mar 21, 2017 at 02:14:34PM +0530, simran singhal wrote:
> This patch series clean up tests if NULL returned on failure.

$ git grep "== NULL" net/netfilter/ | wc -l
461

This is cleaning up just some of them, we still seem to have quite a
bit of them.

Main problem with this changes is that it creates lots of work to
people backporting patches and stable maintainers, I remember that
they have complain about this.

Either way, I would prefer you fix this coding style issue in one go,
ie. one single patch, it will be a large one.

Re: [PATCH v2 1/2] gtp: rename SGSN netlink attribute

2017-03-21 Thread Pablo Neira Ayuso

On Tue, Mar 21, 2017 at 04:10:26PM +0100, Jonas Bonn wrote:
> On 03/21/2017 04:07 PM, Pablo Neira Ayuso wrote:
> >On Tue, Mar 21, 2017 at 04:04:29PM +0100, Jonas Bonn wrote:
> >>diff --git a/include/uapi/linux/gtp.h b/include/uapi/linux/gtp.h
> >>index 72a04a0..c51ebb0 100644
> >>--- a/include/uapi/linux/gtp.h
> >>+++ b/include/uapi/linux/gtp.h
> >>@@ -19,7 +19,7 @@ enum gtp_attrs {
> >>GTPA_LINK,
> >>GTPA_VERSION,
> >>GTPA_TID,   /* for GTPv0 only */
> >>-   GTPA_SGSN_ADDRESS,
> >>+   GTPA_PEER_ADDRESS,  /* Remote GSN peer, either SGSN or GGSN */
> >We need here:
> >
> >#define GTPA_SGSN_ADDRESS GTPA_PEER_ADDRESS
> >
> >for backward compatibility. Anything that is exposed through uapi
> >cannot be changed.
> 
> Yes... look a couple of lines further down...
> 
> >>GTPA_MS_ADDRESS,
> >>GTPA_FLOW,
> >>GTPA_NET_NS_FD,
> >>@@ -29,5 +29,6 @@ enum gtp_attrs {
> >>__GTPA_MAX,
> >>  };
> >>  #define GTPA_MAX (__GTPA_MAX + 1)
> >>+#define GTPA_SGSN_ADDRESS GTPA_PEER_ADDRESS /* maintain legacy attr name */
> 
> ... there it is! :)

Oh right!

Please, move it there to the enum definition, just after the new
GTPA_PEER_ADDRESS. We usually do this in other areas of the networking
code.

You also have to resubmit indicating net-next in your patch subject,
ie.  [PATCH net-next v3 1/2] gtp: rename SGSN netlink attribute

David usually requests that you explicitly indicate the target tree in
some way.

Thanks!

Re: [PATCH v2 1/2] gtp: rename SGSN netlink attribute

2017-03-21 Thread Pablo Neira Ayuso

On Tue, Mar 21, 2017 at 04:04:29PM +0100, Jonas Bonn wrote:
> diff --git a/include/uapi/linux/gtp.h b/include/uapi/linux/gtp.h
> index 72a04a0..c51ebb0 100644
> --- a/include/uapi/linux/gtp.h
> +++ b/include/uapi/linux/gtp.h
> @@ -19,7 +19,7 @@ enum gtp_attrs {
>   GTPA_LINK,
>   GTPA_VERSION,
>   GTPA_TID,   /* for GTPv0 only */
> - GTPA_SGSN_ADDRESS,
> + GTPA_PEER_ADDRESS,  /* Remote GSN peer, either SGSN or GGSN */

We need here:

#define GTPA_SGSN_ADDRESS GTPA_PEER_ADDRESS

for backward compatibility. Anything that is exposed through uapi
cannot be changed.

>   GTPA_MS_ADDRESS,
>   GTPA_FLOW,
>   GTPA_NET_NS_FD,
> @@ -29,5 +29,6 @@ enum gtp_attrs {
>   __GTPA_MAX,
>  };
>  #define GTPA_MAX (__GTPA_MAX + 1)
> +#define GTPA_SGSN_ADDRESS GTPA_PEER_ADDRESS /* maintain legacy attr name */
>  
>  #endif /* _UAPI_LINUX_GTP_H_ */
> -- 
> 2.9.3
>

Re: [Bridge] [PATCH net] bridge: ebtables: fix reception of frames DNAT-ed to bridge device

2017-03-21 Thread Pablo Neira Ayuso

On Tue, Mar 21, 2017 at 01:09:47AM +0100, Linus Lüssing wrote:
> On Sun, Mar 19, 2017 at 05:55:06PM +0100, Linus Lüssing wrote:
> > On Fri, Mar 17, 2017 at 02:10:44PM +0100, Pablo Neira Ayuso wrote:
> > > Wait.
> > > 
> > > May this break local multicast listener that are bound to the bridge
> > > interface? Assuming the bridge interface got an IP address, and that
> > > there is local multicast listener.
> > > 
> > > Missing anything here?
> > 
> > Hm, for multicast packets usually the code path a few lines
> > later in br_handle_frame_finish() should be taken instead.
> > 
> > But you might be right for IP multicast packets with a unicast MAC
> > destination (due to whatever reason, for instance via DNAT'ing
> > again).
> > 
> > Will check that - thanks!
> 
> Ok, I tested DNAT'ing an IP multicast packet to the unicast MAC address
> of the bridge interface.
> 
> Both ping-ing to an IPv4 and IPv6 multicast listener on br0 worked
> and was replied to fine, both with or without changing skb->pkt_type
> from PACKET_MULTICAST to PACKET_HOST.
> ("$ ping 224.1.0.123" and "$ ping6 ff02::1:ff40:707c%in0" from a
>  network namespace, tied into the bridge via veth)
> 
> Also, a DNAT'ed PACKET_BROADCAST worked, with or without changing
> it to PACKET_HOST.
> 
> I also checked via tcpdump that the destination MAC was changed
> successfully.

Thanks for looking into this more in depth.

> So, so far I wasn't able to find any bugs with the current
> patch. But I think I like the idea of leaving the skb->pkt_type
> unaltered for PACKET_MULTICAST and PACKET_BROADCAST, seems cleaner.

Yes, and matching on skb->pkt_type would not break from netfilter.

> I'd just add an "if (skb->pkt_type == PACKET_OTHERHOST)" check
> then and resend a PATCH v2.

Thanks.

Re: [PATCH] [netfilter-next] netfilter: remove unused refcount variable

2017-03-20 Thread Pablo Neira Ayuso

On Mon, Mar 20, 2017 at 01:37:01PM +0100, Arnd Bergmann wrote:
> The refcount variable was accidentally introduced without any reference
> to it. Removing it again avoids this warning:
> 
> net/netfilter/nfnetlink_acct.c: In function 'nfnl_acct_try_del':
> net/netfilter/nfnetlink_acct.c:329:15: error: unused variable 'refcount' 
> [-Werror=unused-variable]

Thanks Arnd. Elena already fixed it here:

https://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next.git/commit/

[PATCH 02/22] netfilter: nft_hash: rename nft_hash to nft_jhash

2017-03-20 Thread Pablo Neira Ayuso

From: Laura Garcia Liebana <nev...@gmail.com>

This patch renames the local nft_hash structure and functions
to nft_jhash in order to prepare the nft_hash module code to
add new hash functions.

Signed-off-by: Laura Garcia Liebana <laura.gar...@zevenet.com>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 net/netfilter/nft_hash.c | 36 ++--
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index eb2721af898d..ccb834ef049b 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -17,7 +17,7 @@
 #include 
 #include 
 
-struct nft_hash {
+struct nft_jhash {
enum nft_registers  sreg:8;
enum nft_registers  dreg:8;
u8  len;
@@ -26,11 +26,11 @@ struct nft_hash {
u32 offset;
 };
 
-static void nft_hash_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+static void nft_jhash_eval(const struct nft_expr *expr,
+  struct nft_regs *regs,
+  const struct nft_pktinfo *pkt)
 {
-   struct nft_hash *priv = nft_expr_priv(expr);
+   struct nft_jhash *priv = nft_expr_priv(expr);
const void *data = >data[priv->sreg];
u32 h;
 
@@ -47,11 +47,11 @@ static const struct nla_policy 
nft_hash_policy[NFTA_HASH_MAX + 1] = {
[NFTA_HASH_OFFSET]  = { .type = NLA_U32 },
 };
 
-static int nft_hash_init(const struct nft_ctx *ctx,
-const struct nft_expr *expr,
-const struct nlattr * const tb[])
+static int nft_jhash_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
 {
-   struct nft_hash *priv = nft_expr_priv(expr);
+   struct nft_jhash *priv = nft_expr_priv(expr);
u32 len;
int err;
 
@@ -92,10 +92,10 @@ static int nft_hash_init(const struct nft_ctx *ctx,
   NFT_DATA_VALUE, sizeof(u32));
 }
 
-static int nft_hash_dump(struct sk_buff *skb,
-const struct nft_expr *expr)
+static int nft_jhash_dump(struct sk_buff *skb,
+ const struct nft_expr *expr)
 {
-   const struct nft_hash *priv = nft_expr_priv(expr);
+   const struct nft_jhash *priv = nft_expr_priv(expr);
 
if (nft_dump_register(skb, NFTA_HASH_SREG, priv->sreg))
goto nla_put_failure;
@@ -117,17 +117,17 @@ static int nft_hash_dump(struct sk_buff *skb,
 }
 
 static struct nft_expr_type nft_hash_type;
-static const struct nft_expr_ops nft_hash_ops = {
+static const struct nft_expr_ops nft_jhash_ops = {
.type   = _hash_type,
-   .size   = NFT_EXPR_SIZE(sizeof(struct nft_hash)),
-   .eval   = nft_hash_eval,
-   .init   = nft_hash_init,
-   .dump   = nft_hash_dump,
+   .size   = NFT_EXPR_SIZE(sizeof(struct nft_jhash)),
+   .eval   = nft_jhash_eval,
+   .init   = nft_jhash_init,
+   .dump   = nft_jhash_dump,
 };
 
 static struct nft_expr_type nft_hash_type __read_mostly = {
.name   = "hash",
-   .ops= _hash_ops,
+   .ops= _jhash_ops,
.policy = nft_hash_policy,
.maxattr= NFTA_HASH_MAX,
.owner  = THIS_MODULE,
-- 
2.1.4

[PATCH 00/22] Netfilter/IPVS updates for net-next

2017-03-20 Thread Pablo Neira Ayuso

Hi David,

The following patchset contains Netfilter/IPVS updates for your
net-next tree. A couple of new features for nf_tables, and unsorted
cleanups and incremental updates for the Netfilter tree. More
specifically, they are:

1) Allow to check for TCP option presence via nft_exthdr, patch
   from Phil Sutter.

2) Add symmetric hash support to nft_hash, from Laura Garcia Liebana.

3) Use pr_cont() in ebt_log, from Joe Perches.

4) Remove some dead code in arp_tables reported via static analysis
   tool, from Colin Ian King.

5) Consolidate nf_tables expression validation, from Liping Zhang.

6) Consolidate set lookup via nft_set_lookup().

7) Remove unnecessary rcu read lock side in bridge netfilter, from
   Florian Westphal.

8) Remove unused variable in nf_reject_ipv4, from Tahee Yoo.

9) Pass nft_ctx struct to object initialization indirections, from
   Florian Westphal.

10) Add code to integrate conntrack helper into nf_tables, also from
Florian.

11) Allow to check if interface index or name exists via
NFTA_FIB_F_PRESENT, from Phil Sutter.

12) Simplify resolve_normal_ct(), from Florian.

13) Use per-limit spinlock in nft_limit and xt_limit, from Liping Zhang.

14) Use rwlock in nft_set_rbtree set, also from Liping Zhang.

15) One patch to remove a useless printk at netns init path in ipvs,
and several patches to document IPVS knobs.

16) Use refcount_t for reference counter in the Netfilter/IPVS code,
from Elena Reshetova.

You can pull these changes from:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next.git

Thanks!



The following changes since commit 8d70eeb84ab277377c017af6a21d0a337025dede:

  Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net (2017-03-04 
17:31:39 -0800)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next.git HEAD

for you to fetch changes up to 4485a841be171dbd8d3f0701b00f59d389e94ce6:

  netfilter: fix the warning on unused refcount variable (2017-03-20 10:49:12 
+0100)


Colin Ian King (1):
  netfilter: arp_tables: remove redundant check on ret being non-zero

Cong Wang (1):
  ipvs: remove an annoying printk in netns init

Florian Westphal (4):
  netfilter: bridge: remove unneeded rcu_read_lock
  netfilter: provide nft_ctx in object init function
  netfilter: nft_ct: add helper set support
  netfilter: nf_conntrack: reduce resolve_normal_ct args

Hangbin Liu (4):
  ipvs: fix sync_threshold description and add sync_refresh_period, 
sync_retries
  ipvs: Document sysctl sync_qlen_max and sync_sock_size
  ipvs: Document sysctl sync_ports
  ipvs: Document sysctl pmtu_disc

Joe Perches (1):
  netfilter: Use pr_cont where appropriate

Laura Garcia Liebana (2):
  netfilter: nft_hash: rename nft_hash to nft_jhash
  netfilter: nft_hash: support of symmetric hash

Liping Zhang (3):
  netfilter: nf_tables: validate the expr explicitly after init successfully
  netfilter: limit: use per-rule spinlock to improve the scalability
  netfilter: nft_set_rbtree: use per-set rwlock to improve the scalability

Pablo Neira Ayuso (1):
  netfilter: nf_tables: add nft_set_lookup()

Phil Sutter (2):
  netfilter: nft_exthdr: Allow checking TCP option presence, too
  netfilter: nft_fib: Support existence check

Reshetova, Elena (2):
  netfilter: refcounter conversions
  netfilter: fix the warning on unused refcount variable

Taehee Yoo (1):
  netfilter: nf_reject: remove unused variable

 Documentation/networking/ipvs-sysctl.txt |  68 +--
 include/net/ip_vs.h  |  16 +--
 include/net/netfilter/nf_conntrack_expect.h  |   4 +-
 include/net/netfilter/nf_conntrack_timeout.h |   3 +-
 include/net/netfilter/nf_tables.h|  12 +-
 include/net/netfilter/nft_fib.h  |   2 +-
 include/uapi/linux/netfilter/nf_tables.h |  26 +++-
 net/bridge/br_netfilter_hooks.c  |   3 -
 net/bridge/netfilter/ebt_log.c   |  34 +++---
 net/bridge/netfilter/nft_reject_bridge.c |   6 +-
 net/ipv4/netfilter/arp_tables.c  |   2 -
 net/ipv4/netfilter/ipt_CLUSTERIP.c   |  19 +--
 net/ipv4/netfilter/nf_nat_snmp_basic.c   |  15 +--
 net/ipv4/netfilter/nf_reject_ipv4.c  |   3 -
 net/ipv4/netfilter/nft_fib_ipv4.c|   4 +-
 net/ipv6/netfilter/nft_fib_ipv6.c|   2 +-
 net/netfilter/ipvs/ip_vs_conn.c  |  24 ++--
 net/netfilter/ipvs/ip_vs_core.c  |   6 +-
 net/netfilter/ipvs/ip_vs_ctl.c   |  12 +-
 net/netfilter/ipvs/ip_vs_lblc.c  |   2 +-
 net/netfilter/ipvs/ip_vs_lblcr.c |   6 +-
 net/netfilter/ipvs/ip_vs_nq.c|   2 +-
 net/netfilter/ipvs/ip_vs_proto_sctp.c|   2 +-
 net/netfilter/ipvs/ip_vs_proto_tcp.c |   2

[PATCH 05/22] netfilter: arp_tables: remove redundant check on ret being non-zero

2017-03-20 Thread Pablo Neira Ayuso

From: Colin Ian King <colin.k...@canonical.com>

ret is initialized to zero and if it is set to non-zero in the
xt_entry_foreach loop then we exit via the out_free label. Hence
the check for ret being non-zero is redundant and can be removed.

Detected by CoverityScan, CID#1357132 ("Logically Dead Code")

Signed-off-by: Colin Ian King <colin.k...@canonical.com>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 net/ipv4/netfilter/arp_tables.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 6241a81fd7f5..f17dab1dee6e 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -562,8 +562,6 @@ static int translate_table(struct xt_table_info *newinfo, 
void *entry0,
XT_ERROR_TARGET) == 0)
++newinfo->stacksize;
}
-   if (ret != 0)
-   goto out_free;
 
ret = -EINVAL;
if (i != repl->num_entries)
-- 
2.1.4

[PATCH 13/22] netfilter: nf_conntrack: reduce resolve_normal_ct args

2017-03-20 Thread Pablo Neira Ayuso

From: Florian Westphal <f...@strlen.de>

also mark init_conntrack noinline, in most cases resolve_normal_ct will
find an existing conntrack entry.

textdata bss dec hex filename
167355707 176   22618585a net/netfilter/nf_conntrack_core.o
166875707 176   22570582a net/netfilter/nf_conntrack_core.o

Signed-off-by: Florian Westphal <f...@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 net/netfilter/nf_conntrack_core.c | 57 ++-
 1 file changed, 26 insertions(+), 31 deletions(-)

diff --git a/net/netfilter/nf_conntrack_core.c 
b/net/netfilter/nf_conntrack_core.c
index 071b97fcbefb..b0f2e8e65084 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1129,7 +1129,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_free);
 
 /* Allocate a new conntrack: we return -ENOMEM if classification
failed due to stress.  Otherwise it really is unclassifiable. */
-static struct nf_conntrack_tuple_hash *
+static noinline struct nf_conntrack_tuple_hash *
 init_conntrack(struct net *net, struct nf_conn *tmpl,
   const struct nf_conntrack_tuple *tuple,
   struct nf_conntrack_l3proto *l3proto,
@@ -1237,21 +1237,20 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
return >tuplehash[IP_CT_DIR_ORIGINAL];
 }
 
-/* On success, returns conntrack ptr, sets skb->_nfct | ctinfo */
-static inline struct nf_conn *
+/* On success, returns 0, sets skb->_nfct | ctinfo */
+static int
 resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
  struct sk_buff *skb,
  unsigned int dataoff,
  u_int16_t l3num,
  u_int8_t protonum,
  struct nf_conntrack_l3proto *l3proto,
- struct nf_conntrack_l4proto *l4proto,
- int *set_reply,
- enum ip_conntrack_info *ctinfo)
+ struct nf_conntrack_l4proto *l4proto)
 {
const struct nf_conntrack_zone *zone;
struct nf_conntrack_tuple tuple;
struct nf_conntrack_tuple_hash *h;
+   enum ip_conntrack_info ctinfo;
struct nf_conntrack_zone tmp;
struct nf_conn *ct;
u32 hash;
@@ -1260,7 +1259,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
 dataoff, l3num, protonum, net, , l3proto,
 l4proto)) {
pr_debug("Can't get tuple\n");
-   return NULL;
+   return 0;
}
 
/* look for tuple match */
@@ -1271,33 +1270,30 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
h = init_conntrack(net, tmpl, , l3proto, l4proto,
   skb, dataoff, hash);
if (!h)
-   return NULL;
+   return 0;
if (IS_ERR(h))
-   return (void *)h;
+   return PTR_ERR(h);
}
ct = nf_ct_tuplehash_to_ctrack(h);
 
/* It exists; we have (non-exclusive) reference. */
if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
-   *ctinfo = IP_CT_ESTABLISHED_REPLY;
-   /* Please set reply bit if this packet OK */
-   *set_reply = 1;
+   ctinfo = IP_CT_ESTABLISHED_REPLY;
} else {
/* Once we've had two way comms, always ESTABLISHED. */
if (test_bit(IPS_SEEN_REPLY_BIT, >status)) {
pr_debug("normal packet for %p\n", ct);
-   *ctinfo = IP_CT_ESTABLISHED;
+   ctinfo = IP_CT_ESTABLISHED;
} else if (test_bit(IPS_EXPECTED_BIT, >status)) {
pr_debug("related packet for %p\n", ct);
-   *ctinfo = IP_CT_RELATED;
+   ctinfo = IP_CT_RELATED;
} else {
pr_debug("new packet for %p\n", ct);
-   *ctinfo = IP_CT_NEW;
+   ctinfo = IP_CT_NEW;
}
-   *set_reply = 0;
}
-   nf_ct_set(skb, ct, *ctinfo);
-   return ct;
+   nf_ct_set(skb, ct, ctinfo);
+   return 0;
 }
 
 unsigned int
@@ -1311,7 +1307,6 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned 
int hooknum,
unsigned int *timeouts;
unsigned int dataoff;
u_int8_t protonum;
-   int set_reply = 0;
int ret;
 
tmpl = nf_ct_get(skb, );
@@ -1354,23 +1349,22 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned 
int hooknum,
goto out;
}
 repeat:
-   ct = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum,
-  l3proto, l4proto, _reply, );
-   if (!ct) {
-   /* Not valid part of a connection */
-   NF_CT_STAT_INC_ATOMIC

[PATCH 11/22] netfilter: nft_ct: add helper set support

2017-03-20 Thread Pablo Neira Ayuso

From: Florian Westphal <f...@strlen.de>

this allows to assign connection tracking helpers to
connections via nft objref infrastructure.

The idea is to first specifiy a helper object:

 table ip filter {
ct helper some-name {
  type "ftp"
  protocol tcp
  l3proto ip
}
 }

and then assign it via

nft add ... ct helper set "some-name"

helper assignment works for new conntracks only as we cannot expand the
conntrack extension area once it has been committed to the main conntrack
table.

ipv4 and ipv6 protocols are tracked stored separately so
we can also handle families that observe both ipv4 and ipv6 traffic.

Signed-off-by: Florian Westphal <f...@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  12 ++-
 net/netfilter/nft_ct.c   | 171 +++
 2 files changed, 182 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/netfilter/nf_tables.h 
b/include/uapi/linux/netfilter/nf_tables.h
index 4f7d75682c59..34c8d08b687a 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1259,10 +1259,20 @@ enum nft_fib_flags {
NFTA_FIB_F_OIF  = 1 << 4,   /* restrict to oif */
 };
 
+enum nft_ct_helper_attributes {
+   NFTA_CT_HELPER_UNSPEC,
+   NFTA_CT_HELPER_NAME,
+   NFTA_CT_HELPER_L3PROTO,
+   NFTA_CT_HELPER_L4PROTO,
+   __NFTA_CT_HELPER_MAX,
+};
+#define NFTA_CT_HELPER_MAX (__NFTA_CT_HELPER_MAX - 1)
+
 #define NFT_OBJECT_UNSPEC  0
 #define NFT_OBJECT_COUNTER 1
 #define NFT_OBJECT_QUOTA   2
-#define __NFT_OBJECT_MAX   3
+#define NFT_OBJECT_CT_HELPER   3
+#define __NFT_OBJECT_MAX   4
 #define NFT_OBJECT_MAX (__NFT_OBJECT_MAX - 1)
 
 /**
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index bf548a7a71ec..4144ae845bdd 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -32,6 +32,12 @@ struct nft_ct {
};
 };
 
+struct nft_ct_helper_obj  {
+   struct nf_conntrack_helper *helper4;
+   struct nf_conntrack_helper *helper6;
+   u8 l4proto;
+};
+
 #ifdef CONFIG_NF_CONNTRACK_ZONES
 static DEFINE_PER_CPU(struct nf_conn *, nft_ct_pcpu_template);
 static unsigned int nft_ct_pcpu_template_refcnt __read_mostly;
@@ -730,6 +736,162 @@ static struct nft_expr_type nft_notrack_type 
__read_mostly = {
.owner  = THIS_MODULE,
 };
 
+static int nft_ct_helper_obj_init(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[],
+ struct nft_object *obj)
+{
+   struct nft_ct_helper_obj *priv = nft_obj_data(obj);
+   struct nf_conntrack_helper *help4, *help6;
+   char name[NF_CT_HELPER_NAME_LEN];
+   int family = ctx->afi->family;
+
+   if (!tb[NFTA_CT_HELPER_NAME] || !tb[NFTA_CT_HELPER_L4PROTO])
+   return -EINVAL;
+
+   priv->l4proto = nla_get_u8(tb[NFTA_CT_HELPER_L4PROTO]);
+   if (!priv->l4proto)
+   return -ENOENT;
+
+   nla_strlcpy(name, tb[NFTA_CT_HELPER_NAME], sizeof(name));
+
+   if (tb[NFTA_CT_HELPER_L3PROTO])
+   family = ntohs(nla_get_be16(tb[NFTA_CT_HELPER_L3PROTO]));
+
+   help4 = NULL;
+   help6 = NULL;
+
+   switch (family) {
+   case NFPROTO_IPV4:
+   if (ctx->afi->family == NFPROTO_IPV6)
+   return -EINVAL;
+
+   help4 = nf_conntrack_helper_try_module_get(name, family,
+  priv->l4proto);
+   break;
+   case NFPROTO_IPV6:
+   if (ctx->afi->family == NFPROTO_IPV4)
+   return -EINVAL;
+
+   help6 = nf_conntrack_helper_try_module_get(name, family,
+  priv->l4proto);
+   break;
+   case NFPROTO_NETDEV: /* fallthrough */
+   case NFPROTO_BRIDGE: /* same */
+   case NFPROTO_INET:
+   help4 = nf_conntrack_helper_try_module_get(name, NFPROTO_IPV4,
+  priv->l4proto);
+   help6 = nf_conntrack_helper_try_module_get(name, NFPROTO_IPV6,
+  priv->l4proto);
+   break;
+   default:
+   return -EAFNOSUPPORT;
+   }
+
+   /* && is intentional; only error if INET found neither ipv4 or ipv6 */
+   if (!help4 && !help6)
+   return -ENOENT;
+
+   priv->helper4 = help4;
+   priv->helper6 = help6;
+
+   return 0;
+}
+
+static void nft_ct_helper_obj_destroy(struct nft_object *obj)
+{
+   struct nft_ct_helper_obj *priv = nft_obj_data(obj);
+
+   if (priv->helper4)
+   module_put(priv->helper4->me);
+   if (priv->helpe

[PATCH 09/22] netfilter: nf_reject: remove unused variable

2017-03-20 Thread Pablo Neira Ayuso

From: Taehee Yoo <ap420...@gmail.com>

variable oiph is not used.

Signed-off-by: Taehee Yoo <ap420...@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 net/ipv4/netfilter/nf_reject_ipv4.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c 
b/net/ipv4/netfilter/nf_reject_ipv4.c
index 146d86105183..7cd8d0d918f8 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -104,7 +104,6 @@ EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put);
 void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook)
 {
struct sk_buff *nskb;
-   const struct iphdr *oiph;
struct iphdr *niph;
const struct tcphdr *oth;
struct tcphdr _oth;
@@ -116,8 +115,6 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, 
int hook)
if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
return;
 
-   oiph = ip_hdr(oldskb);
-
nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) +
 LL_MAX_HEADER, GFP_ATOMIC);
if (!nskb)
-- 
2.1.4

[PATCH 20/22] ipvs: Document sysctl pmtu_disc

2017-03-20 Thread Pablo Neira Ayuso

From: Hangbin Liu 

Document sysctl pmtu_disc based on commit 3654e61137db ("ipvs: add
pmtu_disc option to disable IP DF for TUN packets").

Signed-off-by: Hangbin Liu 
Signed-off-by: Simon Horman 
---
 Documentation/networking/ipvs-sysctl.txt | 8 
 1 file changed, 8 insertions(+)

diff --git a/Documentation/networking/ipvs-sysctl.txt 
b/Documentation/networking/ipvs-sysctl.txt
index a6feecd467cd..056898685d40 100644
--- a/Documentation/networking/ipvs-sysctl.txt
+++ b/Documentation/networking/ipvs-sysctl.txt
@@ -175,6 +175,14 @@ nat_icmp_send - BOOLEAN
 for VS/NAT when the load balancer receives packets from real
 servers but the connection entries don't exist.
 
+pmtu_disc - BOOLEAN
+   0 - disabled
+   not 0 - enabled (default)
+
+   By default, reject with FRAG_NEEDED all DF packets that exceed
+   the PMTU, irrespective of the forwarding method. For TUN method
+   the flag can be disabled to fragment such packets.
+
 secure_tcp - INTEGER
 0  - disabled (default)
 
-- 
2.1.4

[PATCH 06/22] netfilter: nf_tables: validate the expr explicitly after init successfully

2017-03-20 Thread Pablo Neira Ayuso

From: Liping Zhang <zlpnob...@gmail.com>

When we want to validate the expr's dependency or hooks, we must do two
things to accomplish it. First, write a X_validate callback function
and point ->validate to it. Second, call X_validate in init routine.
This is very common, such as fib, nat, reject expr and so on ...

It is a little ugly, since we will call X_validate in the expr's init
routine, it's better to do it in nf_tables_newexpr. So we can avoid to
do this again and again. After doing this, the second step listed above
is not useful anymore, remove them now.

Patch was tested by nftables/tests/py/nft-test.py and
nftables/tests/shell/run-tests.sh.

Signed-off-by: Liping Zhang <zlpnob...@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 net/bridge/netfilter/nft_reject_bridge.c |  6 +-
 net/netfilter/nf_tables_api.c| 11 +++
 net/netfilter/nft_compat.c   |  8 
 net/netfilter/nft_fib.c  |  2 +-
 net/netfilter/nft_masq.c |  4 
 net/netfilter/nft_meta.c |  4 
 net/netfilter/nft_nat.c  |  4 
 net/netfilter/nft_redir.c|  4 
 net/netfilter/nft_reject.c   |  5 -
 net/netfilter/nft_reject_inet.c  |  6 +-
 10 files changed, 14 insertions(+), 40 deletions(-)

diff --git a/net/bridge/netfilter/nft_reject_bridge.c 
b/net/bridge/netfilter/nft_reject_bridge.c
index 206dc266ecd2..346ef6b00b8f 100644
--- a/net/bridge/netfilter/nft_reject_bridge.c
+++ b/net/bridge/netfilter/nft_reject_bridge.c
@@ -375,11 +375,7 @@ static int nft_reject_bridge_init(const struct nft_ctx 
*ctx,
  const struct nlattr * const tb[])
 {
struct nft_reject *priv = nft_expr_priv(expr);
-   int icmp_code, err;
-
-   err = nft_reject_bridge_validate(ctx, expr, NULL);
-   if (err < 0)
-   return err;
+   int icmp_code;
 
if (tb[NFTA_REJECT_TYPE] == NULL)
return -EINVAL;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 5e0ccfd5bb37..fd8789eccc92 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1772,8 +1772,19 @@ static int nf_tables_newexpr(const struct nft_ctx *ctx,
goto err1;
}
 
+   if (ops->validate) {
+   const struct nft_data *data = NULL;
+
+   err = ops->validate(ctx, expr, );
+   if (err < 0)
+   goto err2;
+   }
+
return 0;
 
+err2:
+   if (ops->destroy)
+   ops->destroy(ctx, expr);
 err1:
expr->ops = NULL;
return err;
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index c21e7eb8dce0..fab6bf3f955e 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -230,10 +230,6 @@ nft_target_init(const struct nft_ctx *ctx, const struct 
nft_expr *expr,
union nft_entry e = {};
int ret;
 
-   ret = nft_compat_chain_validate_dependency(target->table, ctx->chain);
-   if (ret < 0)
-   goto err;
-
target_compat_from_user(target, nla_data(tb[NFTA_TARGET_INFO]), info);
 
if (ctx->nla[NFTA_RULE_COMPAT]) {
@@ -419,10 +415,6 @@ nft_match_init(const struct nft_ctx *ctx, const struct 
nft_expr *expr,
union nft_entry e = {};
int ret;
 
-   ret = nft_compat_chain_validate_dependency(match->table, ctx->chain);
-   if (ret < 0)
-   goto err;
-
match_compat_from_user(match, nla_data(tb[NFTA_MATCH_INFO]), info);
 
if (ctx->nla[NFTA_RULE_COMPAT]) {
diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c
index 29a4906adc27..fd0b19303b0d 100644
--- a/net/netfilter/nft_fib.c
+++ b/net/netfilter/nft_fib.c
@@ -112,7 +112,7 @@ int nft_fib_init(const struct nft_ctx *ctx, const struct 
nft_expr *expr,
if (err < 0)
return err;
 
-   return nft_fib_validate(ctx, expr, NULL);
+   return 0;
 }
 EXPORT_SYMBOL_GPL(nft_fib_init);
 
diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c
index 11ce016cd479..6ac03d4266c9 100644
--- a/net/netfilter/nft_masq.c
+++ b/net/netfilter/nft_masq.c
@@ -46,10 +46,6 @@ int nft_masq_init(const struct nft_ctx *ctx,
struct nft_masq *priv = nft_expr_priv(expr);
int err;
 
-   err = nft_masq_validate(ctx, expr, NULL);
-   if (err)
-   return err;
-
if (tb[NFTA_MASQ_FLAGS]) {
priv->flags = ntohl(nla_get_be32(tb[NFTA_MASQ_FLAGS]));
if (priv->flags & ~NF_NAT_RANGE_MASK)
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index e1f5ca9b423b..d14417aaf5d4 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -370,10 +370,6 @@ int nft_meta_set_init(const struct nft_ctx *ctx,
return -EOPNOTSUPP;

[PATCH 07/22] netfilter: nf_tables: add nft_set_lookup()

2017-03-20 Thread Pablo Neira Ayuso

This new function consolidates set lookup via either name or ID by
introducing a new nft_set_lookup() function. Replace existing spots
where we can use this too.

Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  9 +
 net/netfilter/nf_tables_api.c | 31 ---
 net/netfilter/nft_dynset.c| 14 --
 net/netfilter/nft_lookup.c| 14 --
 net/netfilter/nft_objref.c| 14 --
 5 files changed, 41 insertions(+), 41 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h 
b/include/net/netfilter/nf_tables.h
index 2aa8a9d80fbe..f0d46726d06e 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -385,10 +385,11 @@ static inline struct nft_set *nft_set_container_of(const 
void *priv)
return (void *)priv - offsetof(struct nft_set, data);
 }
 
-struct nft_set *nf_tables_set_lookup(const struct nft_table *table,
-const struct nlattr *nla, u8 genmask);
-struct nft_set *nf_tables_set_lookup_byid(const struct net *net,
- const struct nlattr *nla, u8 genmask);
+struct nft_set *nft_set_lookup(const struct net *net,
+  const struct nft_table *table,
+  const struct nlattr *nla_set_name,
+  const struct nlattr *nla_set_id,
+  u8 genmask);
 
 static inline unsigned long nft_set_gc_interval(const struct nft_set *set)
 {
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index fd8789eccc92..4559f5d66bcc 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2534,8 +2534,8 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, 
struct net *net,
return 0;
 }
 
-struct nft_set *nf_tables_set_lookup(const struct nft_table *table,
-const struct nlattr *nla, u8 genmask)
+static struct nft_set *nf_tables_set_lookup(const struct nft_table *table,
+   const struct nlattr *nla, u8 
genmask)
 {
struct nft_set *set;
 
@@ -2549,11 +2549,10 @@ struct nft_set *nf_tables_set_lookup(const struct 
nft_table *table,
}
return ERR_PTR(-ENOENT);
 }
-EXPORT_SYMBOL_GPL(nf_tables_set_lookup);
 
-struct nft_set *nf_tables_set_lookup_byid(const struct net *net,
- const struct nlattr *nla,
- u8 genmask)
+static struct nft_set *nf_tables_set_lookup_byid(const struct net *net,
+const struct nlattr *nla,
+u8 genmask)
 {
struct nft_trans *trans;
u32 id = ntohl(nla_get_be32(nla));
@@ -2568,7 +2567,25 @@ struct nft_set *nf_tables_set_lookup_byid(const struct 
net *net,
}
return ERR_PTR(-ENOENT);
 }
-EXPORT_SYMBOL_GPL(nf_tables_set_lookup_byid);
+
+struct nft_set *nft_set_lookup(const struct net *net,
+  const struct nft_table *table,
+  const struct nlattr *nla_set_name,
+  const struct nlattr *nla_set_id,
+  u8 genmask)
+{
+   struct nft_set *set;
+
+   set = nf_tables_set_lookup(table, nla_set_name, genmask);
+   if (IS_ERR(set)) {
+   if (!nla_set_id)
+   return set;
+
+   set = nf_tables_set_lookup_byid(net, nla_set_id, genmask);
+   }
+   return set;
+}
+EXPORT_SYMBOL_GPL(nft_set_lookup);
 
 static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set,
const char *name)
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 049ad2d9ee66..3948da380259 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -133,16 +133,10 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
priv->invert = true;
}
 
-   set = nf_tables_set_lookup(ctx->table, tb[NFTA_DYNSET_SET_NAME],
-  genmask);
-   if (IS_ERR(set)) {
-   if (tb[NFTA_DYNSET_SET_ID])
-   set = nf_tables_set_lookup_byid(ctx->net,
-   tb[NFTA_DYNSET_SET_ID],
-   genmask);
-   if (IS_ERR(set))
-   return PTR_ERR(set);
-   }
+   set = nft_set_lookup(ctx->net, ctx->table, tb[NFTA_DYNSET_SET_NAME],
+tb[NFTA_DYNSET_SET_ID], genmask);
+   if (IS_ERR(set))
+   return PTR_ERR(set);
 
if (set->ops->update == NULL)
return -EOPNOTSUPP;
diff --git a/net/netfilter/nft_lookup.c b/net/n

[PATCH 16/22] ipvs: remove an annoying printk in netns init

2017-03-20 Thread Pablo Neira Ayuso

From: Cong Wang 

At most it is used for debugging purpose, but I don't think
it is even useful for debugging, just remove it.

Signed-off-by: Cong Wang 
Signed-off-by: Simon Horman 
---
 net/netfilter/ipvs/ip_vs_core.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index db40050f8785..9aaa49025cdc 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -2231,8 +2231,6 @@ static int __net_init __ip_vs_init(struct net *net)
if (ip_vs_sync_net_init(ipvs) < 0)
goto sync_fail;
 
-   printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n",
-sizeof(struct netns_ipvs), ipvs->gen);
return 0;
 /*
  * Error handling
-- 
2.1.4

[PATCH 19/22] ipvs: Document sysctl sync_ports

2017-03-20 Thread Pablo Neira Ayuso

From: Hangbin Liu 

Document sysctl sync_ports based on commit f73181c8288f ("ipvs: add support
for sync threads").

Signed-off-by: Hangbin Liu 
Signed-off-by: Simon Horman 
---
 Documentation/networking/ipvs-sysctl.txt | 8 
 1 file changed, 8 insertions(+)

diff --git a/Documentation/networking/ipvs-sysctl.txt 
b/Documentation/networking/ipvs-sysctl.txt
index 159d70b6dff3..a6feecd467cd 100644
--- a/Documentation/networking/ipvs-sysctl.txt
+++ b/Documentation/networking/ipvs-sysctl.txt
@@ -231,6 +231,14 @@ sync_sock_size - INTEGER
Configuration of SNDBUF (master) or RCVBUF (slave) socket limit.
Default value is 0 (preserve system defaults).
 
+sync_ports - INTEGER
+   default 1
+
+   The number of threads that master and backup servers can use for
+   sync traffic. Every thread will use single UDP port, thread 0 will
+   use the default port 8848 while last thread will use port
+   8848+sync_ports-1.
+
 snat_reroute - BOOLEAN
0 - disabled
not 0 - enabled (default)
-- 
2.1.4

[PATCH 03/22] netfilter: nft_hash: support of symmetric hash

2017-03-20 Thread Pablo Neira Ayuso

From: Laura Garcia Liebana <nev...@gmail.com>

This patch provides symmetric hash support according to source
ip address and port, and destination ip address and port.

For this purpose, the __skb_get_hash_symmetric() is used to
identify the flow as it uses FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL
flag by default.

The new attribute NFTA_HASH_TYPE has been included to support
different types of hashing functions. Currently supported
NFT_HASH_JENKINS through jhash and NFT_HASH_SYM through symhash.

The main difference between both types are:
 - jhash requires an expression with sreg, symhash doesn't.
 - symhash supports modulus and offset, but not seed.

Examples:

 nft add rule ip nat prerouting ct mark set jhash ip saddr mod 2
 nft add rule ip nat prerouting ct mark set symhash mod 2

By default, jenkins hash will be used if no hash type is
provided for compatibility reasons.

Signed-off-by: Laura Garcia Liebana <laura.gar...@zevenet.com>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h | 13 +
 net/netfilter/nft_hash.c | 99 +++-
 2 files changed, 111 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/netfilter/nf_tables.h 
b/include/uapi/linux/netfilter/nf_tables.h
index 05215d30fe5c..4f7d75682c59 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -816,6 +816,17 @@ enum nft_rt_keys {
 };
 
 /**
+ * enum nft_hash_types - nf_tables hash expression types
+ *
+ * @NFT_HASH_JENKINS: Jenkins Hash
+ * @NFT_HASH_SYM: Symmetric Hash
+ */
+enum nft_hash_types {
+   NFT_HASH_JENKINS,
+   NFT_HASH_SYM,
+};
+
+/**
  * enum nft_hash_attributes - nf_tables hash expression netlink attributes
  *
  * @NFTA_HASH_SREG: source register (NLA_U32)
@@ -824,6 +835,7 @@ enum nft_rt_keys {
  * @NFTA_HASH_MODULUS: modulus value (NLA_U32)
  * @NFTA_HASH_SEED: seed value (NLA_U32)
  * @NFTA_HASH_OFFSET: add this offset value to hash result (NLA_U32)
+ * @NFTA_HASH_TYPE: hash operation (NLA_U32: nft_hash_types)
  */
 enum nft_hash_attributes {
NFTA_HASH_UNSPEC,
@@ -833,6 +845,7 @@ enum nft_hash_attributes {
NFTA_HASH_MODULUS,
NFTA_HASH_SEED,
NFTA_HASH_OFFSET,
+   NFTA_HASH_TYPE,
__NFTA_HASH_MAX,
 };
 #define NFTA_HASH_MAX  (__NFTA_HASH_MAX - 1)
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index ccb834ef049b..a6a4633725bb 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -38,6 +38,25 @@ static void nft_jhash_eval(const struct nft_expr *expr,
regs->data[priv->dreg] = h + priv->offset;
 }
 
+struct nft_symhash {
+   enum nft_registers  dreg:8;
+   u32 modulus;
+   u32 offset;
+};
+
+static void nft_symhash_eval(const struct nft_expr *expr,
+struct nft_regs *regs,
+const struct nft_pktinfo *pkt)
+{
+   struct nft_symhash *priv = nft_expr_priv(expr);
+   struct sk_buff *skb = pkt->skb;
+   u32 h;
+
+   h = reciprocal_scale(__skb_get_hash_symmetric(skb), priv->modulus);
+
+   regs->data[priv->dreg] = h + priv->offset;
+}
+
 static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = {
[NFTA_HASH_SREG]= { .type = NLA_U32 },
[NFTA_HASH_DREG]= { .type = NLA_U32 },
@@ -45,6 +64,7 @@ static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX 
+ 1] = {
[NFTA_HASH_MODULUS] = { .type = NLA_U32 },
[NFTA_HASH_SEED]= { .type = NLA_U32 },
[NFTA_HASH_OFFSET]  = { .type = NLA_U32 },
+   [NFTA_HASH_TYPE]= { .type = NLA_U32 },
 };
 
 static int nft_jhash_init(const struct nft_ctx *ctx,
@@ -92,6 +112,32 @@ static int nft_jhash_init(const struct nft_ctx *ctx,
   NFT_DATA_VALUE, sizeof(u32));
 }
 
+static int nft_symhash_init(const struct nft_ctx *ctx,
+   const struct nft_expr *expr,
+   const struct nlattr * const tb[])
+{
+   struct nft_symhash *priv = nft_expr_priv(expr);
+
+   if (!tb[NFTA_HASH_DREG]||
+   !tb[NFTA_HASH_MODULUS])
+   return -EINVAL;
+
+   if (tb[NFTA_HASH_OFFSET])
+   priv->offset = ntohl(nla_get_be32(tb[NFTA_HASH_OFFSET]));
+
+   priv->dreg = nft_parse_register(tb[NFTA_HASH_DREG]);
+
+   priv->modulus = ntohl(nla_get_be32(tb[NFTA_HASH_MODULUS]));
+   if (priv->modulus <= 1)
+   return -ERANGE;
+
+   if (priv->offset + priv->modulus - 1 < priv->offset)
+   return -EOVERFLOW;
+
+   return nft_validate_register_store(ctx, priv->dreg, NULL,
+  NFT_DATA_VALUE, sizeof(u32));
+}
+
 static int nft_jhash_dump(struct sk_buff *skb,
  const struct nft_expr

[PATCH 04/22] netfilter: Use pr_cont where appropriate

2017-03-20 Thread Pablo Neira Ayuso

From: Joe Perches <j...@perches.com>

Logging output was changed when simple printks without KERN_CONT
are now emitted on a new line and KERN_CONT is required to continue
lines so use pr_cont.

Miscellanea:

o realign arguments
o use print_hex_dump instead of a local variant

Signed-off-by: Joe Perches <j...@perches.com>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 net/bridge/netfilter/ebt_log.c | 34 +-
 net/ipv4/netfilter/nf_nat_snmp_basic.c | 15 ++-
 2 files changed, 19 insertions(+), 30 deletions(-)

diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index 98b9c8e8615e..707caea39743 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -62,10 +62,10 @@ print_ports(const struct sk_buff *skb, uint8_t protocol, 
int offset)
pptr = skb_header_pointer(skb, offset,
  sizeof(_ports), &_ports);
if (pptr == NULL) {
-   printk(" INCOMPLETE TCP/UDP header");
+   pr_cont(" INCOMPLETE TCP/UDP header");
return;
}
-   printk(" SPT=%u DPT=%u", ntohs(pptr->src), ntohs(pptr->dst));
+   pr_cont(" SPT=%u DPT=%u", ntohs(pptr->src), ntohs(pptr->dst));
}
 }
 
@@ -100,11 +100,11 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int 
hooknum,
 
ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
if (ih == NULL) {
-   printk(" INCOMPLETE IP header");
+   pr_cont(" INCOMPLETE IP header");
goto out;
}
-   printk(" IP SRC=%pI4 IP DST=%pI4, IP tos=0x%02X, IP proto=%d",
-  >saddr, >daddr, ih->tos, ih->protocol);
+   pr_cont(" IP SRC=%pI4 IP DST=%pI4, IP tos=0x%02X, IP proto=%d",
+   >saddr, >daddr, ih->tos, ih->protocol);
print_ports(skb, ih->protocol, ih->ihl*4);
goto out;
}
@@ -120,11 +120,11 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int 
hooknum,
 
ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
if (ih == NULL) {
-   printk(" INCOMPLETE IPv6 header");
+   pr_cont(" INCOMPLETE IPv6 header");
goto out;
}
-   printk(" IPv6 SRC=%pI6 IPv6 DST=%pI6, IPv6 priority=0x%01X, 
Next Header=%d",
-  >saddr, >daddr, ih->priority, ih->nexthdr);
+   pr_cont(" IPv6 SRC=%pI6 IPv6 DST=%pI6, IPv6 priority=0x%01X, 
Next Header=%d",
+   >saddr, >daddr, ih->priority, ih->nexthdr);
nexthdr = ih->nexthdr;
offset_ph = ipv6_skip_exthdr(skb, sizeof(_iph), , 
_off);
if (offset_ph == -1)
@@ -142,12 +142,12 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int 
hooknum,
 
ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph);
if (ah == NULL) {
-   printk(" INCOMPLETE ARP header");
+   pr_cont(" INCOMPLETE ARP header");
goto out;
}
-   printk(" ARP HTYPE=%d, PTYPE=0x%04x, OPCODE=%d",
-  ntohs(ah->ar_hrd), ntohs(ah->ar_pro),
-  ntohs(ah->ar_op));
+   pr_cont(" ARP HTYPE=%d, PTYPE=0x%04x, OPCODE=%d",
+   ntohs(ah->ar_hrd), ntohs(ah->ar_pro),
+   ntohs(ah->ar_op));
 
/* If it's for Ethernet and the lengths are OK,
 * then log the ARP payload
@@ -161,17 +161,17 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int 
hooknum,
ap = skb_header_pointer(skb, sizeof(_arph),
sizeof(_arpp), &_arpp);
if (ap == NULL) {
-   printk(" INCOMPLETE ARP payload");
+   pr_cont(" INCOMPLETE ARP payload");
goto out;
}
-   printk(" ARP MAC SRC=%pM ARP IP SRC=%pI4 ARP MAC 
DST=%pM ARP IP DST=%pI4",
-   ap->mac_src, ap->ip_src, ap->mac_dst, 
ap->ip_dst);
+   pr_cont(" ARP MAC SRC=%pM ARP IP SRC=%pI4 ARP MAC 
DST=%pM ARP IP DST=%pI4",
+   ap->mac_src, ap->ip_src,
+   ap->mac_dst, ap->ip_ds

[PATCH 21/22] netfilter: refcounter conversions

2017-03-20 Thread Pablo Neira Ayuso

From: "Reshetova, Elena" <elena.reshet...@intel.com>

refcount_t type and corresponding API (see include/linux/refcount.h)
should be used instead of atomic_t when the variable is used as
a reference counter. This allows to avoid accidental
refcounter overflows that might lead to use-after-free
situations.

Signed-off-by: Elena Reshetova <elena.reshet...@intel.com>
Signed-off-by: Hans Liljestrand <ishkam...@gmail.com>
Signed-off-by: Kees Cook <keesc...@chromium.org>
Signed-off-by: David Windsor <dwind...@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 include/net/ip_vs.h  | 16 +---
 include/net/netfilter/nf_conntrack_expect.h  |  4 +++-
 include/net/netfilter/nf_conntrack_timeout.h |  3 ++-
 net/ipv4/netfilter/ipt_CLUSTERIP.c   | 19 ++-
 net/netfilter/ipvs/ip_vs_conn.c  | 24 
 net/netfilter/ipvs/ip_vs_core.c  |  4 ++--
 net/netfilter/ipvs/ip_vs_ctl.c   | 12 ++--
 net/netfilter/ipvs/ip_vs_lblc.c  |  2 +-
 net/netfilter/ipvs/ip_vs_lblcr.c |  6 +++---
 net/netfilter/ipvs/ip_vs_nq.c|  2 +-
 net/netfilter/ipvs/ip_vs_proto_sctp.c|  2 +-
 net/netfilter/ipvs/ip_vs_proto_tcp.c |  2 +-
 net/netfilter/ipvs/ip_vs_rr.c|  2 +-
 net/netfilter/ipvs/ip_vs_sed.c   |  2 +-
 net/netfilter/ipvs/ip_vs_wlc.c   |  2 +-
 net/netfilter/ipvs/ip_vs_wrr.c   |  2 +-
 net/netfilter/nf_conntrack_expect.c  | 10 +-
 net/netfilter/nf_conntrack_netlink.c |  4 ++--
 net/netfilter/nfnetlink_acct.c   | 16 +---
 net/netfilter/nfnetlink_cttimeout.c  | 12 ++--
 net/netfilter/nfnetlink_log.c| 14 --
 21 files changed, 85 insertions(+), 75 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 7bdfa7d78363..8a4a57b887fb 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -12,6 +12,8 @@
 #include  /* for struct list_head */
 #include  /* for struct rwlock_t */
 #include/* for struct atomic_t */
+#include  /* for struct refcount_t */
+
 #include 
 #include 
 #include 
@@ -525,7 +527,7 @@ struct ip_vs_conn {
struct netns_ipvs   *ipvs;
 
/* counter and timer */
-   atomic_trefcnt; /* reference count */
+   refcount_t  refcnt; /* reference count */
struct timer_list   timer;  /* Expiration timer */
volatile unsigned long  timeout;/* timeout */
 
@@ -667,7 +669,7 @@ struct ip_vs_dest {
atomic_tconn_flags; /* flags to copy to conn */
atomic_tweight; /* server weight */
 
-   atomic_trefcnt; /* reference counter */
+   refcount_t  refcnt; /* reference counter */
struct ip_vs_stats  stats;  /* statistics */
unsigned long   idle_start; /* start time, jiffies */
 
@@ -1211,14 +1213,14 @@ struct ip_vs_conn * ip_vs_conn_out_get_proto(struct 
netns_ipvs *ipvs, int af,
  */
 static inline bool __ip_vs_conn_get(struct ip_vs_conn *cp)
 {
-   return atomic_inc_not_zero(>refcnt);
+   return refcount_inc_not_zero(>refcnt);
 }
 
 /* put back the conn without restarting its timer */
 static inline void __ip_vs_conn_put(struct ip_vs_conn *cp)
 {
smp_mb__before_atomic();
-   atomic_dec(>refcnt);
+   refcount_dec(>refcnt);
 }
 void ip_vs_conn_put(struct ip_vs_conn *cp);
 void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport);
@@ -1410,18 +1412,18 @@ void ip_vs_try_bind_dest(struct ip_vs_conn *cp);
 
 static inline void ip_vs_dest_hold(struct ip_vs_dest *dest)
 {
-   atomic_inc(>refcnt);
+   refcount_inc(>refcnt);
 }
 
 static inline void ip_vs_dest_put(struct ip_vs_dest *dest)
 {
smp_mb__before_atomic();
-   atomic_dec(>refcnt);
+   refcount_dec(>refcnt);
 }
 
 static inline void ip_vs_dest_put_and_free(struct ip_vs_dest *dest)
 {
-   if (atomic_dec_and_test(>refcnt))
+   if (refcount_dec_and_test(>refcnt))
kfree(dest);
 }
 
diff --git a/include/net/netfilter/nf_conntrack_expect.h 
b/include/net/netfilter/nf_conntrack_expect.h
index 5ed33ea4718e..65cc2cb005d9 100644
--- a/include/net/netfilter/nf_conntrack_expect.h
+++ b/include/net/netfilter/nf_conntrack_expect.h
@@ -5,6 +5,8 @@
 #ifndef _NF_CONNTRACK_EXPECT_H
 #define _NF_CONNTRACK_EXPECT_H
 
+#include 
+
 #include 
 #include 
 
@@ -37,7 +39,7 @@ struct nf_conntrack_expect {
struct timer_list timeout;
 
/* Usage count. */
-   atomic_t use;
+   refcount_t use;
 
/* Flags */
unsigned int flags;
diff --git a/include/net/netfilter/nf_conntrack_

[PATCH 15/22] netfilter: nft_set_rbtree: use per-set rwlock to improve the scalability

2017-03-20 Thread Pablo Neira Ayuso

From: Liping Zhang <zlpnob...@gmail.com>

Karel Rericha reported that in his test case, ICMP packets going through
boxes had normally about 5ms latency. But when running nft, actually
listing the sets with interval flags, latency would go up to 30-100ms.
This was observed when router throughput is from 600Mbps to 2Gbps.

This is because we use a single global spinlock to protect the whole
rbtree sets, so "dumping sets" will race with the "key lookup" inevitably.
But actually they are all _readers_, so it's ok to convert the spinlock
to rwlock to avoid competition between them. Also use per-set rwlock since
each set is independent.

Reported-by: Karel Rericha <ka...@unitednetworks.cz>
Tested-by: Karel Rericha <ka...@unitednetworks.cz>
Signed-off-by: Liping Zhang <zlpnob...@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 net/netfilter/nft_set_rbtree.c | 31 ---
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 78dfbf9588b3..e97e2fb53f0a 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -18,9 +18,8 @@
 #include 
 #include 
 
-static DEFINE_SPINLOCK(nft_rbtree_lock);
-
 struct nft_rbtree {
+   rwlock_tlock;
struct rb_root  root;
 };
 
@@ -44,14 +43,14 @@ static bool nft_rbtree_equal(const struct nft_set *set, 
const void *this,
 static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
  const u32 *key, const struct nft_set_ext **ext)
 {
-   const struct nft_rbtree *priv = nft_set_priv(set);
+   struct nft_rbtree *priv = nft_set_priv(set);
const struct nft_rbtree_elem *rbe, *interval = NULL;
u8 genmask = nft_genmask_cur(net);
const struct rb_node *parent;
const void *this;
int d;
 
-   spin_lock_bh(_rbtree_lock);
+   read_lock_bh(>lock);
parent = priv->root.rb_node;
while (parent != NULL) {
rbe = rb_entry(parent, struct nft_rbtree_elem, node);
@@ -75,7 +74,7 @@ static bool nft_rbtree_lookup(const struct net *net, const 
struct nft_set *set,
}
if (nft_rbtree_interval_end(rbe))
goto out;
-   spin_unlock_bh(_rbtree_lock);
+   read_unlock_bh(>lock);
 
*ext = >ext;
return true;
@@ -85,12 +84,12 @@ static bool nft_rbtree_lookup(const struct net *net, const 
struct nft_set *set,
if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
nft_set_elem_active(>ext, genmask) &&
!nft_rbtree_interval_end(interval)) {
-   spin_unlock_bh(_rbtree_lock);
+   read_unlock_bh(>lock);
*ext = >ext;
return true;
}
 out:
-   spin_unlock_bh(_rbtree_lock);
+   read_unlock_bh(>lock);
return false;
 }
 
@@ -140,12 +139,13 @@ static int nft_rbtree_insert(const struct net *net, const 
struct nft_set *set,
 const struct nft_set_elem *elem,
 struct nft_set_ext **ext)
 {
+   struct nft_rbtree *priv = nft_set_priv(set);
struct nft_rbtree_elem *rbe = elem->priv;
int err;
 
-   spin_lock_bh(_rbtree_lock);
+   write_lock_bh(>lock);
err = __nft_rbtree_insert(net, set, rbe, ext);
-   spin_unlock_bh(_rbtree_lock);
+   write_unlock_bh(>lock);
 
return err;
 }
@@ -157,9 +157,9 @@ static void nft_rbtree_remove(const struct net *net,
struct nft_rbtree *priv = nft_set_priv(set);
struct nft_rbtree_elem *rbe = elem->priv;
 
-   spin_lock_bh(_rbtree_lock);
+   write_lock_bh(>lock);
rb_erase(>node, >root);
-   spin_unlock_bh(_rbtree_lock);
+   write_unlock_bh(>lock);
 }
 
 static void nft_rbtree_activate(const struct net *net,
@@ -224,12 +224,12 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
struct nft_set *set,
struct nft_set_iter *iter)
 {
-   const struct nft_rbtree *priv = nft_set_priv(set);
+   struct nft_rbtree *priv = nft_set_priv(set);
struct nft_rbtree_elem *rbe;
struct nft_set_elem elem;
struct rb_node *node;
 
-   spin_lock_bh(_rbtree_lock);
+   read_lock_bh(>lock);
for (node = rb_first(>root); node != NULL; node = rb_next(node)) {
rbe = rb_entry(node, struct nft_rbtree_elem, node);
 
@@ -242,13 +242,13 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
 
iter->err = iter->fn(ctx, set, iter, );
if (iter->err < 0) {
-   spin_unlock_bh

[PATCH 22/22] netfilter: fix the warning on unused refcount variable

2017-03-20 Thread Pablo Neira Ayuso

From: "Reshetova, Elena" <elena.reshet...@intel.com>

net/netfilter/nfnetlink_acct.c: In function 'nfnl_acct_try_del':
net/netfilter/nfnetlink_acct.c:329:15: warning: unused variable 'refcount' 
[-Wunused-variable]
unsigned int refcount;
 ^

Fixes: b54ab92b84b6 ("netfilter: refcounter conversions")
Signed-off-by: Elena Reshetova <elena.reshet...@intel.com>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 net/netfilter/nfnetlink_acct.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index f44cbd35357f..c86da174a5fc 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -326,7 +326,6 @@ static int nfnl_acct_get(struct net *net, struct sock *nfnl,
 static int nfnl_acct_try_del(struct nf_acct *cur)
 {
int ret = 0;
-   unsigned int refcount;
 
/* We want to avoid races with nfnl_acct_put. So only when the current
 * refcnt is 1, we decrease it to 0.
-- 
2.1.4

[PATCH 14/22] netfilter: limit: use per-rule spinlock to improve the scalability

2017-03-20 Thread Pablo Neira Ayuso

From: Liping Zhang <zlpnob...@gmail.com>

The limit token is independent between each rules, so there's no
need to use a global spinlock.

Signed-off-by: Liping Zhang <zlpnob...@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 net/netfilter/nft_limit.c | 10 +-
 net/netfilter/xt_limit.c  | 11 ++-
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c
index c6baf412236d..18dd57a52651 100644
--- a/net/netfilter/nft_limit.c
+++ b/net/netfilter/nft_limit.c
@@ -17,9 +17,8 @@
 #include 
 #include 
 
-static DEFINE_SPINLOCK(limit_lock);
-
 struct nft_limit {
+   spinlock_t  lock;
u64 last;
u64 tokens;
u64 tokens_max;
@@ -34,7 +33,7 @@ static inline bool nft_limit_eval(struct nft_limit *limit, 
u64 cost)
u64 now, tokens;
s64 delta;
 
-   spin_lock_bh(_lock);
+   spin_lock_bh(>lock);
now = ktime_get_ns();
tokens = limit->tokens + now - limit->last;
if (tokens > limit->tokens_max)
@@ -44,11 +43,11 @@ static inline bool nft_limit_eval(struct nft_limit *limit, 
u64 cost)
delta = tokens - cost;
if (delta >= 0) {
limit->tokens = delta;
-   spin_unlock_bh(_lock);
+   spin_unlock_bh(>lock);
return limit->invert;
}
limit->tokens = tokens;
-   spin_unlock_bh(_lock);
+   spin_unlock_bh(>lock);
return !limit->invert;
 }
 
@@ -86,6 +85,7 @@ static int nft_limit_init(struct nft_limit *limit,
limit->invert = true;
}
limit->last = ktime_get_ns();
+   spin_lock_init(>lock);
 
return 0;
 }
diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c
index dab962df1787..d27b5f1ea619 100644
--- a/net/netfilter/xt_limit.c
+++ b/net/netfilter/xt_limit.c
@@ -18,6 +18,7 @@
 #include 
 
 struct xt_limit_priv {
+   spinlock_t lock;
unsigned long prev;
uint32_t credit;
 };
@@ -32,8 +33,6 @@ MODULE_ALIAS("ip6t_limit");
  * see net/sched/sch_tbf.c in the linux source tree
  */
 
-static DEFINE_SPINLOCK(limit_lock);
-
 /* Rusty: This is my (non-mathematically-inclined) understanding of
this algorithm.  The `average rate' in jiffies becomes your initial
amount of credit `credit' and the most credit you can ever have
@@ -72,7 +71,7 @@ limit_mt(const struct sk_buff *skb, struct xt_action_param 
*par)
struct xt_limit_priv *priv = r->master;
unsigned long now = jiffies;
 
-   spin_lock_bh(_lock);
+   spin_lock_bh(>lock);
priv->credit += (now - xchg(>prev, now)) * CREDITS_PER_JIFFY;
if (priv->credit > r->credit_cap)
priv->credit = r->credit_cap;
@@ -80,11 +79,11 @@ limit_mt(const struct sk_buff *skb, struct xt_action_param 
*par)
if (priv->credit >= r->cost) {
/* We're not limited. */
priv->credit -= r->cost;
-   spin_unlock_bh(_lock);
+   spin_unlock_bh(>lock);
return true;
}
 
-   spin_unlock_bh(_lock);
+   spin_unlock_bh(>lock);
return false;
 }
 
@@ -126,6 +125,8 @@ static int limit_mt_check(const struct xt_mtchk_param *par)
r->credit_cap = priv->credit; /* Credits full. */
r->cost = user2credits(r->avg);
}
+   spin_lock_init(>lock);
+
return 0;
 }
 
-- 
2.1.4

[PATCH 12/22] netfilter: nft_fib: Support existence check

2017-03-20 Thread Pablo Neira Ayuso

From: Phil Sutter <p...@nwl.cc>

Instead of the actual interface index or name, set destination register
to just 1 or 0 depending on whether the lookup succeeded or not if
NFTA_FIB_F_PRESENT was set in userspace.

Signed-off-by: Phil Sutter <p...@nwl.cc>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 include/net/netfilter/nft_fib.h  |  2 +-
 include/uapi/linux/netfilter/nf_tables.h |  1 +
 net/ipv4/netfilter/nft_fib_ipv4.c|  4 ++--
 net/ipv6/netfilter/nft_fib_ipv6.c|  2 +-
 net/netfilter/nft_fib.c  | 14 +-
 5 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/include/net/netfilter/nft_fib.h b/include/net/netfilter/nft_fib.h
index 5ceb2205e4e3..381af9469e6a 100644
--- a/include/net/netfilter/nft_fib.h
+++ b/include/net/netfilter/nft_fib.h
@@ -32,6 +32,6 @@ void nft_fib6_eval_type(const struct nft_expr *expr, struct 
nft_regs *regs,
 void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
   const struct nft_pktinfo *pkt);
 
-void nft_fib_store_result(void *reg, enum nft_fib_result r,
+void nft_fib_store_result(void *reg, const struct nft_fib *priv,
  const struct nft_pktinfo *pkt, int index);
 #endif
diff --git a/include/uapi/linux/netfilter/nf_tables.h 
b/include/uapi/linux/netfilter/nf_tables.h
index 34c8d08b687a..8f3842690d17 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1257,6 +1257,7 @@ enum nft_fib_flags {
NFTA_FIB_F_MARK = 1 << 2,   /* use skb->mark */
NFTA_FIB_F_IIF  = 1 << 3,   /* restrict to iif */
NFTA_FIB_F_OIF  = 1 << 4,   /* restrict to oif */
+   NFTA_FIB_F_PRESENT  = 1 << 5,   /* check existence only */
 };
 
 enum nft_ct_helper_attributes {
diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c 
b/net/ipv4/netfilter/nft_fib_ipv4.c
index 2981291910dd..f4e4462cb5bb 100644
--- a/net/ipv4/netfilter/nft_fib_ipv4.c
+++ b/net/ipv4/netfilter/nft_fib_ipv4.c
@@ -90,7 +90,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct 
nft_regs *regs,
 
if (nft_hook(pkt) == NF_INET_PRE_ROUTING &&
nft_fib_is_loopback(pkt->skb, nft_in(pkt))) {
-   nft_fib_store_result(dest, priv->result, pkt,
+   nft_fib_store_result(dest, priv, pkt,
 nft_in(pkt)->ifindex);
return;
}
@@ -99,7 +99,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct 
nft_regs *regs,
if (ipv4_is_zeronet(iph->saddr)) {
if (ipv4_is_lbcast(iph->daddr) ||
ipv4_is_local_multicast(iph->daddr)) {
-   nft_fib_store_result(dest, priv->result, pkt,
+   nft_fib_store_result(dest, priv, pkt,
 get_ifindex(pkt->skb->dev));
return;
}
diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c 
b/net/ipv6/netfilter/nft_fib_ipv6.c
index 765facf03d45..e8d88d82636b 100644
--- a/net/ipv6/netfilter/nft_fib_ipv6.c
+++ b/net/ipv6/netfilter/nft_fib_ipv6.c
@@ -159,7 +159,7 @@ void nft_fib6_eval(const struct nft_expr *expr, struct 
nft_regs *regs,
 
if (nft_hook(pkt) == NF_INET_PRE_ROUTING &&
nft_fib_is_loopback(pkt->skb, nft_in(pkt))) {
-   nft_fib_store_result(dest, priv->result, pkt,
+   nft_fib_store_result(dest, priv, pkt,
 nft_in(pkt)->ifindex);
return;
}
diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c
index fd0b19303b0d..21df8cccea65 100644
--- a/net/netfilter/nft_fib.c
+++ b/net/netfilter/nft_fib.c
@@ -24,7 +24,8 @@ const struct nla_policy nft_fib_policy[NFTA_FIB_MAX + 1] = {
 EXPORT_SYMBOL(nft_fib_policy);
 
 #define NFTA_FIB_F_ALL (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR | \
-   NFTA_FIB_F_MARK | NFTA_FIB_F_IIF | NFTA_FIB_F_OIF)
+   NFTA_FIB_F_MARK | NFTA_FIB_F_IIF | NFTA_FIB_F_OIF | \
+   NFTA_FIB_F_PRESENT)
 
 int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr,
 const struct nft_data **data)
@@ -133,19 +134,22 @@ int nft_fib_dump(struct sk_buff *skb, const struct 
nft_expr *expr)
 }
 EXPORT_SYMBOL_GPL(nft_fib_dump);
 
-void nft_fib_store_result(void *reg, enum nft_fib_result r,
+void nft_fib_store_result(void *reg, const struct nft_fib *priv,
  const struct nft_pktinfo *pkt, int index)
 {
struct net_device *dev;
u32 *dreg = reg;
 
-   switch (r) {
+   switch (priv->result) {
case NFT_FIB_RESULT_OIF:
-   *dreg = index;
+   *dreg = (priv->flags & NFTA_FIB_F_PRESENT) ? !!index : index;
break;
case NFT_FIB_RESULT_OIFNAME:

[PATCH 17/22] ipvs: fix sync_threshold description and add sync_refresh_period, sync_retries

2017-03-20 Thread Pablo Neira Ayuso

From: Hangbin Liu 

Fix sync_threshold description which should have two values. Also add
sync_refresh_period and sync_retries based on commit 749c42b620a9
("ipvs: reduce sync rate with time thresholds").

Signed-off-by: Hangbin Liu 
Signed-off-by: Simon Horman 
---
 Documentation/networking/ipvs-sysctl.txt | 40 +---
 1 file changed, 31 insertions(+), 9 deletions(-)

diff --git a/Documentation/networking/ipvs-sysctl.txt 
b/Documentation/networking/ipvs-sysctl.txt
index e6b1c025fdd8..7acaaa65451e 100644
--- a/Documentation/networking/ipvs-sysctl.txt
+++ b/Documentation/networking/ipvs-sysctl.txt
@@ -185,15 +185,37 @@ secure_tcp - INTEGER
 The value definition is the same as that of drop_entry and
 drop_packet.
 
-sync_threshold - INTEGER
-default 3
-
-It sets synchronization threshold, which is the minimum number
-of incoming packets that a connection needs to receive before
-the connection will be synchronized. A connection will be
-synchronized, every time the number of its incoming packets
-modulus 50 equals the threshold. The range of the threshold is
-from 0 to 49.
+sync_threshold - vector of 2 INTEGERs: sync_threshold, sync_period
+   default 3 50
+
+   It sets synchronization threshold, which is the minimum number
+   of incoming packets that a connection needs to receive before
+   the connection will be synchronized. A connection will be
+   synchronized, every time the number of its incoming packets
+   modulus sync_period equals the threshold. The range of the
+   threshold is from 0 to sync_period.
+
+   When sync_period and sync_refresh_period are 0, send sync only
+   for state changes or only once when pkts matches sync_threshold
+
+sync_refresh_period - UNSIGNED INTEGER
+   default 0
+
+   In seconds, difference in reported connection timer that triggers
+   new sync message. It can be used to avoid sync messages for the
+   specified period (or half of the connection timeout if it is lower)
+   if connection state is not changed since last sync.
+
+   This is useful for normal connections with high traffic to reduce
+   sync rate. Additionally, retry sync_retries times with period of
+   sync_refresh_period/8.
+
+sync_retries - INTEGER
+   default 0
+
+   Defines sync retries with period of sync_refresh_period/8. Useful
+   to protect against loss of sync messages. The range of the
+   sync_retries is from 0 to 3.
 
 snat_reroute - BOOLEAN
0 - disabled
-- 
2.1.4

[PATCH 10/22] netfilter: provide nft_ctx in object init function

2017-03-20 Thread Pablo Neira Ayuso

From: Florian Westphal <f...@strlen.de>

this is needed by the upcoming ct helper object type --
we'd like to be able use the table family (ip, ip6, inet) to figure
out which helper has to be requested.

Signed-off-by: Florian Westphal <f...@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 3 ++-
 net/netfilter/nf_tables_api.c | 7 ---
 net/netfilter/nft_counter.c   | 3 ++-
 net/netfilter/nft_quota.c | 3 ++-
 4 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h 
b/include/net/netfilter/nf_tables.h
index f0d46726d06e..49436849d7d7 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1017,7 +1017,8 @@ struct nft_object_type {
unsigned intmaxattr;
struct module   *owner;
const struct nla_policy *policy;
-   int (*init)(const struct nlattr * const 
tb[],
+   int (*init)(const struct nft_ctx *ctx,
+   const struct nlattr *const tb[],
struct nft_object *obj);
void(*destroy)(struct nft_object *obj);
int (*dump)(struct sk_buff *skb,
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 4559f5d66bcc..12cc5218de96 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4095,7 +4095,8 @@ static const struct nla_policy 
nft_obj_policy[NFTA_OBJ_MAX + 1] = {
[NFTA_OBJ_DATA] = { .type = NLA_NESTED },
 };
 
-static struct nft_object *nft_obj_init(const struct nft_object_type *type,
+static struct nft_object *nft_obj_init(const struct nft_ctx *ctx,
+  const struct nft_object_type *type,
   const struct nlattr *attr)
 {
struct nlattr *tb[type->maxattr + 1];
@@ -4115,7 +4116,7 @@ static struct nft_object *nft_obj_init(const struct 
nft_object_type *type,
if (obj == NULL)
goto err1;
 
-   err = type->init((const struct nlattr * const *)tb, obj);
+   err = type->init(ctx, (const struct nlattr * const *)tb, obj);
if (err < 0)
goto err2;
 
@@ -4223,7 +4224,7 @@ static int nf_tables_newobj(struct net *net, struct sock 
*nlsk,
if (IS_ERR(type))
return PTR_ERR(type);
 
-   obj = nft_obj_init(type, nla[NFTA_OBJ_DATA]);
+   obj = nft_obj_init(, type, nla[NFTA_OBJ_DATA]);
if (IS_ERR(obj)) {
err = PTR_ERR(obj);
goto err1;
diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
index 7f8422213341..67a710ebde09 100644
--- a/net/netfilter/nft_counter.c
+++ b/net/netfilter/nft_counter.c
@@ -82,7 +82,8 @@ static int nft_counter_do_init(const struct nlattr * const 
tb[],
return 0;
 }
 
-static int nft_counter_obj_init(const struct nlattr * const tb[],
+static int nft_counter_obj_init(const struct nft_ctx *ctx,
+   const struct nlattr * const tb[],
struct nft_object *obj)
 {
struct nft_counter_percpu_priv *priv = nft_obj_data(obj);
diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c
index 2d6fe3559912..25e33159be57 100644
--- a/net/netfilter/nft_quota.c
+++ b/net/netfilter/nft_quota.c
@@ -99,7 +99,8 @@ static int nft_quota_do_init(const struct nlattr * const tb[],
return 0;
 }
 
-static int nft_quota_obj_init(const struct nlattr * const tb[],
+static int nft_quota_obj_init(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[],
  struct nft_object *obj)
 {
struct nft_quota *priv = nft_obj_data(obj);
-- 
2.1.4

[PATCH 18/22] ipvs: Document sysctl sync_qlen_max and sync_sock_size

2017-03-20 Thread Pablo Neira Ayuso

From: Hangbin Liu 

Document sysctl sync_qlen_max and sync_sock_size based on
commit 1c003b1580e2 ("ipvs: wakeup master thread").

Signed-off-by: Hangbin Liu 
Signed-off-by: Simon Horman 
---
 Documentation/networking/ipvs-sysctl.txt | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/Documentation/networking/ipvs-sysctl.txt 
b/Documentation/networking/ipvs-sysctl.txt
index 7acaaa65451e..159d70b6dff3 100644
--- a/Documentation/networking/ipvs-sysctl.txt
+++ b/Documentation/networking/ipvs-sysctl.txt
@@ -217,6 +217,20 @@ sync_retries - INTEGER
to protect against loss of sync messages. The range of the
sync_retries is from 0 to 3.
 
+sync_qlen_max - UNSIGNED LONG
+
+   Hard limit for queued sync messages that are not sent yet. It
+   defaults to 1/32 of the memory pages but actually represents
+   number of messages. It will protect us from allocating large
+   parts of memory when the sending rate is lower than the queuing
+   rate.
+
+sync_sock_size - INTEGER
+   default 0
+
+   Configuration of SNDBUF (master) or RCVBUF (slave) socket limit.
+   Default value is 0 (preserve system defaults).
+
 snat_reroute - BOOLEAN
0 - disabled
not 0 - enabled (default)
-- 
2.1.4

[PATCH 08/22] netfilter: bridge: remove unneeded rcu_read_lock

2017-03-20 Thread Pablo Neira Ayuso

From: Florian Westphal <f...@strlen.de>

as comment says, the function is always called with rcu read lock held.

Signed-off-by: Florian Westphal <f...@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 net/bridge/br_netfilter_hooks.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 95087e6e8258..52739e6c610e 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -1016,13 +1016,10 @@ int br_nf_hook_thresh(unsigned int hook, struct net 
*net,
if (!elem)
return okfn(net, sk, skb);
 
-   /* We may already have this, but read-locks nest anyway */
-   rcu_read_lock();
nf_hook_state_init(, hook, NFPROTO_BRIDGE, indev, outdev,
   sk, net, okfn);
 
ret = nf_hook_slow(skb, , elem);
-   rcu_read_unlock();
if (ret == 1)
ret = okfn(net, sk, skb);
 
-- 
2.1.4

[PATCH 01/22] netfilter: nft_exthdr: Allow checking TCP option presence, too

2017-03-20 Thread Pablo Neira Ayuso

From: Phil Sutter <p...@nwl.cc>

Honor NFT_EXTHDR_F_PRESENT flag so we check if the TCP option is
present.

Signed-off-by: Phil Sutter <p...@nwl.cc>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 net/netfilter/nft_exthdr.c | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index c308920b194c..d212a85d2f33 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -98,14 +98,21 @@ static void nft_exthdr_tcp_eval(const struct nft_expr *expr,
goto err;
 
offset = i + priv->offset;
-   dest[priv->len / NFT_REG32_SIZE] = 0;
-   memcpy(dest, opt + offset, priv->len);
+   if (priv->flags & NFT_EXTHDR_F_PRESENT) {
+   *dest = 1;
+   } else {
+   dest[priv->len / NFT_REG32_SIZE] = 0;
+   memcpy(dest, opt + offset, priv->len);
+   }
 
return;
}
 
 err:
-   regs->verdict.code = NFT_BREAK;
+   if (priv->flags & NFT_EXTHDR_F_PRESENT)
+   *dest = 0;
+   else
+   regs->verdict.code = NFT_BREAK;
 }
 
 static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = {
-- 
2.1.4

Re: [PATCH net] bridge: ebtables: fix reception of frames DNAT-ed to bridge device

2017-03-17 Thread Pablo Neira Ayuso

On Wed, Mar 15, 2017 at 11:06:05PM +0100, Pablo Neira Ayuso wrote:
> On Wed, Mar 15, 2017 at 10:16:19PM +0100, Linus Lüssing wrote:
> > On Wed, Mar 15, 2017 at 07:15:39PM +0100, Pablo Neira Ayuso wrote:
> > > Could you update ebtables dnat to check if the ethernet address
> > > matches the one of the input bridge interface, so we mangle the
> > > ->pkt_type accordingly from there, instead of doing this from the
> > > core?
> > 
> > Actually, that was the approach I thought about and went for first
> > (and it would probably work for me). Just checking against the
> > bridge device's net_device::dev_addr.
> > 
> > I scratched it though, as I was afraid that the issue might still
> > exist for people using some other upper device on top of the bridge
> > device. For instance, macvlan? And iterating over the
> > net_device::dev_addrs list seemed too costly for fast path to me.
> 
> I was more thinking of following the simple approach that we follow in
> ebt_redirect_tg() by taking the input interface.
> 
> Anyway, I'm ok with this.

Wait.

May this break local multicast listener that are bound to the bridge
interface? Assuming the bridge interface got an IP address, and that
there is local multicast listener.

Missing anything here?

Re: [GIT PULL 0/5] IPVS Updates for v4.12

2017-03-17 Thread Pablo Neira Ayuso

On Thu, Mar 16, 2017 at 01:43:10PM +0100, Simon Horman wrote:
> Hi Pablo,
> 
> please consider these enhancements to the IPVS for v4.12.
> 
> * Update sysctl documentation
> * Remove unnecessary printk in __ip_vs_init
> 
> The following changes since commit 03e5fd0e9bcc1f34b7a542786b34b8f771e7c260:
> 
>   netfilter: nft_set_rbtree: use per-set rwlock to improve the scalability 
> (2017-03-13 19:30:43 +0100)
> 
> are available in the git repository at:
> 
>   https://git.kernel.org/pub/scm/linux/kernel/git/horms/ipvs-next.git 
> tags/ipvs-for-v4.12

Pulled, thanks Simon!

Re: [PATCH net] bridge: ebtables: fix reception of frames DNAT-ed to bridge device

2017-03-15 Thread Pablo Neira Ayuso

On Wed, Mar 15, 2017 at 10:16:19PM +0100, Linus Lüssing wrote:
> On Wed, Mar 15, 2017 at 07:15:39PM +0100, Pablo Neira Ayuso wrote:
> > Could you update ebtables dnat to check if the ethernet address
> > matches the one of the input bridge interface, so we mangle the
> > ->pkt_type accordingly from there, instead of doing this from the
> > core?
> 
> Actually, that was the approach I thought about and went for first
> (and it would probably work for me). Just checking against the
> bridge device's net_device::dev_addr.
> 
> I scratched it though, as I was afraid that the issue might still
> exist for people using some other upper device on top of the bridge
> device. For instance, macvlan? And iterating over the
> net_device::dev_addrs list seemed too costly for fast path to me.

I was more thinking of following the simple approach that we follow in
ebt_redirect_tg() by taking the input interface.

Anyway, I'm ok with this.

Re: [PATCH 1/1] gtp: support SGSN-side tunnels

2017-03-15 Thread Pablo Neira Ayuso

Hi Harald,

On Wed, Mar 15, 2017 at 08:10:38PM +0100, Harald Welte wrote:
> I've modified the patch slightly, see below (compile-tested, but not
> otherwise tested yet).  Basically rename the flags attribute to 'role',
> expand the commit log and removed unrelated cosmetic changes.
> 
> I've also prepared a corresponding change to libgtpnl into the
> laforge/sgsn-rol branch, see
> http://git.osmocom.org/libgtpnl/commit/?h=laforge/sgsn-role
> 
> This is not yet tested in any way, but I'm planning to add some
> associated support to the command line tools and then give it some
> testing (both against the kernel GTP in GGSN mode, as well as an
> independent userspace GTP implementation).

Thanks Harald.

> > It would be good if we provide a way to configure GTP via iproute2 for
> > testing purposes.
> 
> I don't really care about which tool is used, as long as it is easily
> available [and FOSS, of course].
>
> > We would need to create some dummy socket from
> > kernel too though so we don't need any userspace daemon for this
> > testing mode.
> 
> I don't really like that latter idea. It sounds too much like a hack to
> me.  But then, I don't have enough phantasy right now ti imagine how an
> actual implementation would look like.

It's not that far away, we can just create the udp socket from
kernelspace via udp_sock_create() in the test mode. So we don't need
to pass the file descriptor from userspace. But not asking you to work
on this, just an idea.

> To me, it is perfectly fine to run a simple, small utility in userspace
> even for testing.

No problem.

[PATCH net] MAINTAINERS: remove MACVLAN and VLAN entries

2017-03-15 Thread Pablo Neira Ayuso

macvlan.c file seems to be both in VLAN and MACVLAN DRIVER, so remove
the MACVLAN DRIVER since this is redundant.

I propose with this patch to remove the VLAN (802.1Q) entry so this just
falls into the NETWORKING [GENERAL].

Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
@David, please just place this in net-next if you find it more pertinent.
Thanks.

 MAINTAINERS | 15 ---
 1 file changed, 15 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index c265a5fe4848..4d68d9657ed0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7774,13 +7774,6 @@ F:   include/net/mac80211.h
 F: net/mac80211/
 F: drivers/net/wireless/mac80211_hwsim.[ch]
 
-MACVLAN DRIVER
-M: Patrick McHardy <ka...@trash.net>
-L: netdev@vger.kernel.org
-S: Maintained
-F: drivers/net/macvlan.c
-F: include/linux/if_macvlan.h
-
 MAILBOX API
 M: Jassi Brar <jassisinghb...@gmail.com>
 L: linux-ker...@vger.kernel.org
@@ -13384,14 +13377,6 @@ W: https://linuxtv.org
 S: Maintained
 F: drivers/media/platform/vivid/*
 
-VLAN (802.1Q)
-M: Patrick McHardy <ka...@trash.net>
-L: netdev@vger.kernel.org
-S: Maintained
-F: drivers/net/macvlan.c
-F: include/linux/if_*vlan.h
-F: net/8021q/
-
 VLYNQ BUS
 M: Florian Fainelli <f.faine...@gmail.com>
 L: openwrt-de...@lists.openwrt.org (subscribers-only)
-- 
2.1.4

Re: [PATCH net] bridge: ebtables: fix reception of frames DNAT-ed to bridge device

2017-03-15 Thread Pablo Neira Ayuso

On Wed, Mar 15, 2017 at 03:27:20PM +0100, Linus Lüssing wrote:
> On Wed, Mar 15, 2017 at 11:42:11AM +0100, Pablo Neira Ayuso wrote:
> > I'm missing then why redirect is not then just enough for Linus usecase.
> 
> For my usecase, the MAC address is configured by the user from a
> Web-UI. It may or may not be the one from the bridge device.
> 
> Besides, found it counter intuitive that DNAT did not work here
> and took me some time to find out why. At least I didn't read about
> any such known limitations of the dnat target in the ebtables
> manpage.

Could you update ebtables dnat to check if the ethernet address
matches the one of the input bridge interface, so we mangle the
->pkt_type accordingly from there, instead of doing this from the
core?

Re: [PATCH 1/1] gtp: support SGSN-side tunnels

2017-03-15 Thread Pablo Neira Ayuso

On Wed, Mar 15, 2017 at 05:39:16PM +0100, Harald Welte wrote:
> Hi Jonas,
> 
> are you working on the review feedback that was provided back in early
> February?  I think there were some comments like
> * remove unrelated cosmetic change in comment
> * change from FLAGS to a dedicated MODE netlink attribute
> * add libgtpnl code and some usage information or even sample scripts
> 
> I would definitely like to see this move forward, particularly in order
> to test the GGSN-side code.

Agreed.

It would be good if we provide a way to configure GTP via iproute2 for
testing purposes. We would need to create some dummy socket from
kernel too though so we don't need any userspace daemon for this
testing mode.

[PATCH 04/10] netfilter: nft_set_bitmap: fetch the element key based on the set->klen

2017-03-15 Thread Pablo Neira Ayuso

From: Liping Zhang <zlpnob...@gmail.com>

Currently we just assume the element key as a u32 integer, regardless of
the set key length.

This is incorrect, for example, the tcp port number is only 16 bits.
So when we use the nft_payload expr to get the tcp dport and store
it to dreg, the dport will be stored at 0~15 bits, and 16~31 bits
will be padded with zero.

So the reg->data[dreg] will be looked like as below:
  0  15   31
  +-+-+-+-+-+-+-+-+-+-+-+-+
  | tcp dport |  0|
  +-+-+-+-+-+-+-+-+-+-+-+-+
But for these big-endian systems, if we treate this register as a u32
integer, the element key will be larger than 65535, so the following
lookup in bitmap set will cause out of bound access.

Another issue is that if we add element with comments in bitmap
set(although the comments will be ignored eventually), the element will
vanish strangely. Because we treate the element key as a u32 integer, so
the comments will become the part of the element key, then the element
key will also be larger than 65535 and out of bound access will happen:
  # nft add element t s { 1 comment test }

Since set->klen is 1 or 2, it's fine to treate the element key as a u8 or
u16 integer.

Fixes: 665153ff5752 ("netfilter: nf_tables: add bitmap set type")
Signed-off-by: Liping Zhang <zlpnob...@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 net/netfilter/nft_set_bitmap.c | 27 +--
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
index 152d226552c1..9b024e22717b 100644
--- a/net/netfilter/nft_set_bitmap.c
+++ b/net/netfilter/nft_set_bitmap.c
@@ -45,9 +45,17 @@ struct nft_bitmap {
u8  bitmap[];
 };
 
-static inline void nft_bitmap_location(u32 key, u32 *idx, u32 *off)
+static inline void nft_bitmap_location(const struct nft_set *set,
+  const void *key,
+  u32 *idx, u32 *off)
 {
-   u32 k = (key << 1);
+   u32 k;
+
+   if (set->klen == 2)
+   k = *(u16 *)key;
+   else
+   k = *(u8 *)key;
+   k <<= 1;
 
*idx = k / BITS_PER_BYTE;
*off = k % BITS_PER_BYTE;
@@ -69,7 +77,7 @@ static bool nft_bitmap_lookup(const struct net *net, const 
struct nft_set *set,
u8 genmask = nft_genmask_cur(net);
u32 idx, off;
 
-   nft_bitmap_location(*key, , );
+   nft_bitmap_location(set, key, , );
 
return nft_bitmap_active(priv->bitmap, idx, off, genmask);
 }
@@ -83,7 +91,7 @@ static int nft_bitmap_insert(const struct net *net, const 
struct nft_set *set,
u8 genmask = nft_genmask_next(net);
u32 idx, off;
 
-   nft_bitmap_location(nft_set_ext_key(ext)->data[0], , );
+   nft_bitmap_location(set, nft_set_ext_key(ext), , );
if (nft_bitmap_active(priv->bitmap, idx, off, genmask))
return -EEXIST;
 
@@ -102,7 +110,7 @@ static void nft_bitmap_remove(const struct net *net,
u8 genmask = nft_genmask_next(net);
u32 idx, off;
 
-   nft_bitmap_location(nft_set_ext_key(ext)->data[0], , );
+   nft_bitmap_location(set, nft_set_ext_key(ext), , );
/* Enter 00 state. */
priv->bitmap[idx] &= ~(genmask << off);
 }
@@ -116,7 +124,7 @@ static void nft_bitmap_activate(const struct net *net,
u8 genmask = nft_genmask_next(net);
u32 idx, off;
 
-   nft_bitmap_location(nft_set_ext_key(ext)->data[0], , );
+   nft_bitmap_location(set, nft_set_ext_key(ext), , );
/* Enter 11 state. */
priv->bitmap[idx] |= (genmask << off);
 }
@@ -128,7 +136,7 @@ static bool nft_bitmap_flush(const struct net *net,
u8 genmask = nft_genmask_next(net);
u32 idx, off;
 
-   nft_bitmap_location(nft_set_ext_key(ext)->data[0], , );
+   nft_bitmap_location(set, nft_set_ext_key(ext), , );
/* Enter 10 state, similar to deactivation. */
priv->bitmap[idx] &= ~(genmask << off);
 
@@ -161,10 +169,9 @@ static void *nft_bitmap_deactivate(const struct net *net,
struct nft_bitmap *priv = nft_set_priv(set);
u8 genmask = nft_genmask_next(net);
struct nft_set_ext *ext;
-   u32 idx, off, key = 0;
+   u32 idx, off;
 
-   memcpy(, elem->key.val.data, set->klen);
-   nft_bitmap_location(key, , );
+   nft_bitmap_location(set, elem->key.val.data, , );
 
if (!nft_bitmap_active(priv->bitmap, idx, off, genmask))
return NULL;
-- 
2.1.4

[PATCH 06/10] netfilter: bridge: honor frag_max_size when refragmenting

2017-03-15 Thread Pablo Neira Ayuso

From: Florian Westphal <f...@strlen.de>

consider a bridge with mtu 9000, but end host sending smaller
packets to another host with mtu < 9000.

In this case, after reassembly, bridge+defrag would refragment,
and then attempt to send the reassembled packet as long as it
was below 9k.

Instead we have to cap by the largest fragment size seen.

Signed-off-by: Florian Westphal <f...@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 net/bridge/br_netfilter_hooks.c | 12 +++-
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 95087e6e8258..3c5185021c1c 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -721,18 +721,20 @@ static unsigned int nf_bridge_mtu_reduction(const struct 
sk_buff *skb)
 
 static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct 
sk_buff *skb)
 {
-   struct nf_bridge_info *nf_bridge;
-   unsigned int mtu_reserved;
+   struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+   unsigned int mtu, mtu_reserved;
 
mtu_reserved = nf_bridge_mtu_reduction(skb);
+   mtu = skb->dev->mtu;
+
+   if (nf_bridge->frag_max_size && nf_bridge->frag_max_size < mtu)
+   mtu = nf_bridge->frag_max_size;
 
-   if (skb_is_gso(skb) || skb->len + mtu_reserved <= skb->dev->mtu) {
+   if (skb_is_gso(skb) || skb->len + mtu_reserved <= mtu) {
nf_bridge_info_free(skb);
return br_dev_queue_push_xmit(net, sk, skb);
}
 
-   nf_bridge = nf_bridge_info_get(skb);
-
/* This is wrong! We should preserve the original fragment
 * boundaries by preserving frag_list rather than refragmenting.
 */
-- 
2.1.4

[PATCH 03/10] netfilter: nf_tables: set pktinfo->thoff at AH header if found

2017-03-15 Thread Pablo Neira Ayuso

Phil Sutter reports that IPv6 AH header matching is broken. From
userspace, nft generates bytecode that expects to find the AH header at
NFT_PAYLOAD_TRANSPORT_HEADER both for IPv4 and IPv6. However,
pktinfo->thoff is set to the inner header after the AH header in IPv6,
while in IPv4 pktinfo->thoff points to the AH header indeed. This
behaviour is inconsistent. This patch fixes this problem by updating
ipv6_find_hdr() to get the IP6_FH_F_AUTH flag so this function stops at
the AH header, so both IPv4 and IPv6 pktinfo->thoff point to the AH
header.

This is also inconsistent when trying to match encapsulated headers:

1) A packet that looks like IPv4 + AH + TCP dport 22 will *not* match.
2) A packet that looks like IPv6 + AH + TCP dport 22 will match.

Reported-by: Phil Sutter <p...@nwl.cc>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 include/net/netfilter/nf_tables_ipv6.h | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/net/netfilter/nf_tables_ipv6.h 
b/include/net/netfilter/nf_tables_ipv6.h
index d150b5066201..97983d1c05e4 100644
--- a/include/net/netfilter/nf_tables_ipv6.h
+++ b/include/net/netfilter/nf_tables_ipv6.h
@@ -9,12 +9,13 @@ nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
 struct sk_buff *skb,
 const struct nf_hook_state *state)
 {
+   unsigned int flags = IP6_FH_F_AUTH;
int protohdr, thoff = 0;
unsigned short frag_off;
 
nft_set_pktinfo(pkt, skb, state);
 
-   protohdr = ipv6_find_hdr(pkt->skb, , -1, _off, NULL);
+   protohdr = ipv6_find_hdr(pkt->skb, , -1, _off, );
if (protohdr < 0) {
nft_set_pktinfo_proto_unspec(pkt, skb);
return;
@@ -32,6 +33,7 @@ __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt,
const struct nf_hook_state *state)
 {
 #if IS_ENABLED(CONFIG_IPV6)
+   unsigned int flags = IP6_FH_F_AUTH;
struct ipv6hdr *ip6h, _ip6h;
unsigned int thoff = 0;
unsigned short frag_off;
@@ -50,7 +52,7 @@ __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt,
if (pkt_len + sizeof(*ip6h) > skb->len)
return -1;
 
-   protohdr = ipv6_find_hdr(pkt->skb, , -1, _off, NULL);
+   protohdr = ipv6_find_hdr(pkt->skb, , -1, _off, );
if (protohdr < 0)
return -1;
 
-- 
2.1.4

[PATCH 00/10] Netfilter fixes for net

2017-03-15 Thread Pablo Neira Ayuso

Hi David,

The following patchset contains Netfilter fixes for your net tree, a
rather large batch of fixes targeted to nf_tables, conntrack and bridge
netfilter. More specifically, they are:

1) Don't track fragmented packets if the socket option IP_NODEFRAG is set.
   From Florian Westphal.

2) SCTP protocol tracker assumes that ICMP error messages contain the
   checksum field, what results in packet drops. From Ying Xue.

3) Fix inconsistent handling of AH traffic from nf_tables.

4) Fix new bitmap set representation with big endian. Fix mismatches in
   nf_tables due to incorrect big endian handling too. Both patches
   from Liping Zhang.

5) Bridge netfilter doesn't honor maximum fragment size field, cap to
   largest fragment seen. From Florian Westphal.

6) Fake conntrack entry needs to be aligned to 8 bytes since the 3 LSB
   bits are now used to store the ctinfo. From Steven Rostedt.

7) Fix element comments with the bitmap set type. Revert the flush
   field in the nft_set_iter structure, not required anymore after
   fixing up element comments.

8) Missing error on invalid conntrack direction from nft_ct, also from
   Liping Zhang.

You can pull these changes from:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git

Thanks!



The following changes since commit 8d70eeb84ab277377c017af6a21d0a337025dede:

  Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net (2017-03-04 
17:31:39 -0800)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git HEAD

for you to fetch changes up to 4494dbc6dec37817f2cc2aa7604039a9e87ada18:

  netfilter: nft_ct: do cleanup work when NFTA_CT_DIRECTION is invalid 
(2017-03-15 17:15:54 +0100)


Florian Westphal (2):
  netfilter: don't track fragmented packets
  netfilter: bridge: honor frag_max_size when refragmenting

Liping Zhang (3):
  netfilter: nft_set_bitmap: fetch the element key based on the set->klen
  netfilter: nf_tables: fix mismatch in big-endian system
  netfilter: nft_ct: do cleanup work when NFTA_CT_DIRECTION is invalid

Pablo Neira Ayuso (3):
  netfilter: nf_tables: set pktinfo->thoff at AH header if found
  netfilter: nft_set_bitmap: keep a list of dummy elements
  Revert "netfilter: nf_tables: add flush field to struct nft_set_iter"

Steven Rostedt (VMware) (1):
  netfilter: Force fake conntrack entry to be at least 8 bytes aligned

Ying Xue (1):
  netfilter: nf_nat_sctp: fix ICMP packet to be dropped accidently

 include/net/netfilter/nf_conntrack.h   |   2 +-
 include/net/netfilter/nf_tables.h  |  30 -
 include/net/netfilter/nf_tables_ipv6.h |   6 +-
 net/bridge/br_netfilter_hooks.c|  12 +-
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c |   4 +
 net/ipv4/netfilter/nf_nat_l3proto_ipv4.c   |   5 -
 net/ipv4/netfilter/nft_masq_ipv4.c |   8 +-
 net/ipv4/netfilter/nft_redir_ipv4.c|   8 +-
 net/ipv6/netfilter/nft_masq_ipv6.c |   8 +-
 net/ipv6/netfilter/nft_redir_ipv6.c|   8 +-
 net/netfilter/nf_conntrack_core.c  |   6 +-
 net/netfilter/nf_nat_proto_sctp.c  |  13 +-
 net/netfilter/nf_tables_api.c  |   4 -
 net/netfilter/nft_ct.c |  21 ++--
 net/netfilter/nft_meta.c   |  40 +++---
 net/netfilter/nft_nat.c|   8 +-
 net/netfilter/nft_set_bitmap.c | 165 -
 17 files changed, 194 insertions(+), 154 deletions(-)

[PATCH 05/10] netfilter: nf_tables: fix mismatch in big-endian system

2017-03-15 Thread Pablo Neira Ayuso

From: Liping Zhang <zlpnob...@gmail.com>

Currently, there are two different methods to store an u16 integer to
the u32 data register. For example:
  u32 *dest = >data[priv->dreg];
  1. *dest = 0; *(u16 *) dest = val_u16;
  2. *dest = val_u16;

For method 1, the u16 value will be stored like this, either in
big-endian or little-endian system:
  0  15   31
  +-+-+-+-+-+-+-+-+-+-+-+-+
  |   Value   | 0 |
  +-+-+-+-+-+-+-+-+-+-+-+-+

For method 2, in little-endian system, the u16 value will be the same
as listed above. But in big-endian system, the u16 value will be stored
like this:
  0  15   31
  +-+-+-+-+-+-+-+-+-+-+-+-+
  | 0 |   Value   |
  +-+-+-+-+-+-+-+-+-+-+-+-+

So later we use "memcmp(>data[priv->sreg], data, 2);" to do
compare in nft_cmp, nft_lookup expr ..., method 2 will get the wrong
result in big-endian system, as 0~15 bits will always be zero.

For the similar reason, when loading an u16 value from the u32 data
register, we should use "*(u16 *) sreg;" instead of "(u16)*sreg;",
the 2nd method will get the wrong value in the big-endian system.

So introduce some wrapper functions to store/load an u8 or u16
integer to/from the u32 data register, and use them in the right
place.

Signed-off-by: Liping Zhang <zlpnob...@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 include/net/netfilter/nf_tables.h   | 29 +++
 net/ipv4/netfilter/nft_masq_ipv4.c  |  8 
 net/ipv4/netfilter/nft_redir_ipv4.c |  8 
 net/ipv6/netfilter/nft_masq_ipv6.c  |  8 
 net/ipv6/netfilter/nft_redir_ipv6.c |  8 
 net/netfilter/nft_ct.c  | 18 +
 net/netfilter/nft_meta.c| 40 +++--
 net/netfilter/nft_nat.c |  8 
 8 files changed, 80 insertions(+), 47 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h 
b/include/net/netfilter/nf_tables.h
index 2aa8a9d80fbe..70c5ca0c60b1 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -103,6 +103,35 @@ struct nft_regs {
};
 };
 
+/* Store/load an u16 or u8 integer to/from the u32 data register.
+ *
+ * Note, when using concatenations, register allocation happens at 32-bit
+ * level. So for store instruction, pad the rest part with zero to avoid
+ * garbage values.
+ */
+
+static inline void nft_reg_store16(u32 *dreg, u16 val)
+{
+   *dreg = 0;
+   *(u16 *)dreg = val;
+}
+
+static inline void nft_reg_store8(u32 *dreg, u8 val)
+{
+   *dreg = 0;
+   *(u8 *)dreg = val;
+}
+
+static inline u16 nft_reg_load16(u32 *sreg)
+{
+   return *(u16 *)sreg;
+}
+
+static inline u8 nft_reg_load8(u32 *sreg)
+{
+   return *(u8 *)sreg;
+}
+
 static inline void nft_data_copy(u32 *dst, const struct nft_data *src,
 unsigned int len)
 {
diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c 
b/net/ipv4/netfilter/nft_masq_ipv4.c
index a0ea8aad1bf1..f18677277119 100644
--- a/net/ipv4/netfilter/nft_masq_ipv4.c
+++ b/net/ipv4/netfilter/nft_masq_ipv4.c
@@ -26,10 +26,10 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr,
memset(, 0, sizeof(range));
range.flags = priv->flags;
if (priv->sreg_proto_min) {
-   range.min_proto.all =
-   *(__be16 *)>data[priv->sreg_proto_min];
-   range.max_proto.all =
-   *(__be16 *)>data[priv->sreg_proto_max];
+   range.min_proto.all = (__force __be16)nft_reg_load16(
+   >data[priv->sreg_proto_min]);
+   range.max_proto.all = (__force __be16)nft_reg_load16(
+   >data[priv->sreg_proto_max]);
}
regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, nft_hook(pkt),
, nft_out(pkt));
diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c 
b/net/ipv4/netfilter/nft_redir_ipv4.c
index 1650ed23c15d..5120be1d3118 100644
--- a/net/ipv4/netfilter/nft_redir_ipv4.c
+++ b/net/ipv4/netfilter/nft_redir_ipv4.c
@@ -26,10 +26,10 @@ static void nft_redir_ipv4_eval(const struct nft_expr *expr,
 
memset(, 0, sizeof(mr));
if (priv->sreg_proto_min) {
-   mr.range[0].min.all =
-   *(__be16 *)>data[priv->sreg_proto_min];
-   mr.range[0].max.all =
-   *(__be16 *)>data[priv->sreg_proto_max];
+   mr.range[0].min.all = (__force __be16)nft_reg_load16(
+   >data[priv->sreg_proto_min]);
+   mr.range[0].max.all = (__force __be16)nft_reg_load16(
+   >data[priv->sreg_proto_max]);
mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
}
 
diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c 
b/net/ipv6/netfilte

[PATCH 01/10] netfilter: don't track fragmented packets

2017-03-15 Thread Pablo Neira Ayuso

From: Florian Westphal <f...@strlen.de>

Andrey reports syzkaller splat caused by

NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb)));

in ipv4 nat.  But this assertion (and the comment) are wrong, this function
does see fragments when IP_NODEFRAG setsockopt is used.

As conntrack doesn't track packets without complete l4 header, only the
first fragment is tracked.

Because applying nat to first packet but not the rest makes no sense this
also turns off tracking of all fragments.

Reported-by: Andrey Konovalov <andreyk...@google.com>
Signed-off-by: Florian Westphal <f...@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 4 
 net/ipv4/netfilter/nf_nat_l3proto_ipv4.c   | 5 -
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c 
b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index bc1486f2c064..2e14ed11a35c 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -165,6 +165,10 @@ static unsigned int ipv4_conntrack_local(void *priv,
if (skb->len < sizeof(struct iphdr) ||
ip_hdrlen(skb) < sizeof(struct iphdr))
return NF_ACCEPT;
+
+   if (ip_is_fragment(ip_hdr(skb))) /* IP_NODEFRAG setsockopt set */
+   return NF_ACCEPT;
+
return nf_conntrack_in(state->net, PF_INET, state->hook, skb);
 }
 
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c 
b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index f8aad03d674b..6f5e8d01b876 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -255,11 +255,6 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
/* maniptype == SRC for postrouting. */
enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);
 
-   /* We never see fragments: conntrack defrags on pre-routing
-* and local-out, and nf_nat_out protects post-routing.
-*/
-   NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb)));
-
ct = nf_ct_get(skb, );
/* Can't track?  It's not due to stress, or conntrack would
 * have dropped it.  Hence it's the user's responsibilty to
-- 
2.1.4

[PATCH 08/10] netfilter: nft_set_bitmap: keep a list of dummy elements

2017-03-15 Thread Pablo Neira Ayuso

Element comments may come without any prior set flag, so we have to keep
a list of dummy struct nft_set_ext to keep this information around. This
is only useful for set dumps to userspace. From the packet path, this
set type relies on the bitmap representation. This patch simplifies the
logic since we don't need to allocate the dummy nft_set_ext structure
anymore on the fly at the cost of increasing memory consumption because
of the list of dummy struct nft_set_ext.

Fixes: 665153ff5752 ("netfilter: nf_tables: add bitmap set type")
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 net/netfilter/nft_set_bitmap.c | 146 +++--
 1 file changed, 66 insertions(+), 80 deletions(-)

diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
index 9b024e22717b..8ebbc2940f4c 100644
--- a/net/netfilter/nft_set_bitmap.c
+++ b/net/netfilter/nft_set_bitmap.c
@@ -15,6 +15,11 @@
 #include 
 #include 
 
+struct nft_bitmap_elem {
+   struct list_headhead;
+   struct nft_set_ext  ext;
+};
+
 /* This bitmap uses two bits to represent one element. These two bits determine
  * the element state in the current and the future generation.
  *
@@ -41,8 +46,9 @@
  *  restore its previous state.
  */
 struct nft_bitmap {
-   u16 bitmap_size;
-   u8  bitmap[];
+   struct  list_head   list;
+   u16 bitmap_size;
+   u8  bitmap[];
 };
 
 static inline void nft_bitmap_location(const struct nft_set *set,
@@ -82,21 +88,43 @@ static bool nft_bitmap_lookup(const struct net *net, const 
struct nft_set *set,
return nft_bitmap_active(priv->bitmap, idx, off, genmask);
 }
 
+static struct nft_bitmap_elem *
+nft_bitmap_elem_find(const struct nft_set *set, struct nft_bitmap_elem *this,
+u8 genmask)
+{
+   const struct nft_bitmap *priv = nft_set_priv(set);
+   struct nft_bitmap_elem *be;
+
+   list_for_each_entry_rcu(be, >list, head) {
+   if (memcmp(nft_set_ext_key(>ext),
+  nft_set_ext_key(>ext), set->klen) ||
+   !nft_set_elem_active(>ext, genmask))
+   continue;
+
+   return be;
+   }
+   return NULL;
+}
+
 static int nft_bitmap_insert(const struct net *net, const struct nft_set *set,
 const struct nft_set_elem *elem,
-struct nft_set_ext **_ext)
+struct nft_set_ext **ext)
 {
struct nft_bitmap *priv = nft_set_priv(set);
-   struct nft_set_ext *ext = elem->priv;
+   struct nft_bitmap_elem *new = elem->priv, *be;
u8 genmask = nft_genmask_next(net);
u32 idx, off;
 
-   nft_bitmap_location(set, nft_set_ext_key(ext), , );
-   if (nft_bitmap_active(priv->bitmap, idx, off, genmask))
+   be = nft_bitmap_elem_find(set, new, genmask);
+   if (be) {
+   *ext = >ext;
return -EEXIST;
+   }
 
+   nft_bitmap_location(set, nft_set_ext_key(>ext), , );
/* Enter 01 state. */
priv->bitmap[idx] |= (genmask << off);
+   list_add_tail_rcu(>head, >list);
 
return 0;
 }
@@ -106,13 +134,14 @@ static void nft_bitmap_remove(const struct net *net,
  const struct nft_set_elem *elem)
 {
struct nft_bitmap *priv = nft_set_priv(set);
-   struct nft_set_ext *ext = elem->priv;
+   struct nft_bitmap_elem *be = elem->priv;
u8 genmask = nft_genmask_next(net);
u32 idx, off;
 
-   nft_bitmap_location(set, nft_set_ext_key(ext), , );
+   nft_bitmap_location(set, nft_set_ext_key(>ext), , );
/* Enter 00 state. */
priv->bitmap[idx] &= ~(genmask << off);
+   list_del_rcu(>head);
 }
 
 static void nft_bitmap_activate(const struct net *net,
@@ -120,73 +149,52 @@ static void nft_bitmap_activate(const struct net *net,
const struct nft_set_elem *elem)
 {
struct nft_bitmap *priv = nft_set_priv(set);
-   struct nft_set_ext *ext = elem->priv;
+   struct nft_bitmap_elem *be = elem->priv;
u8 genmask = nft_genmask_next(net);
u32 idx, off;
 
-   nft_bitmap_location(set, nft_set_ext_key(ext), , );
+   nft_bitmap_location(set, nft_set_ext_key(>ext), , );
/* Enter 11 state. */
priv->bitmap[idx] |= (genmask << off);
+   nft_set_elem_change_active(net, set, >ext);
 }
 
 static bool nft_bitmap_flush(const struct net *net,
-const struct nft_set *set, void *ext)
+const struct nft_set *set, void *_be)
 {
struct nft_bitmap *priv = nft_set_priv(set);
u8 genmask = nft_genmask_next(net);
+   struct nft_bitmap_elem *be = _be;
u32 idx, off;
 
-   nft_bitmap_location(se

[PATCH 09/10] Revert "netfilter: nf_tables: add flush field to struct nft_set_iter"

2017-03-15 Thread Pablo Neira Ayuso

This reverts commit 1f48ff6c5393aa7fe290faf5d633164f105b0aa7.

This patch is not required anymore now that we keep a dummy list of
set elements in the bitmap set implementation, so revert this before
we forget this code has no clients.

Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 1 -
 net/netfilter/nf_tables_api.c | 4 
 2 files changed, 5 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h 
b/include/net/netfilter/nf_tables.h
index 70c5ca0c60b1..0136028652bd 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -232,7 +232,6 @@ struct nft_set_elem {
 struct nft_set;
 struct nft_set_iter {
u8  genmask;
-   boolflush;
unsigned intcount;
unsigned intskip;
int err;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 5e0ccfd5bb37..434c739dfeca 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3145,7 +3145,6 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct 
nft_set *set,
iter.count  = 0;
iter.err= 0;
iter.fn = nf_tables_bind_check_setelem;
-   iter.flush  = false;
 
set->ops->walk(ctx, set, );
if (iter.err < 0)
@@ -3399,7 +3398,6 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct 
netlink_callback *cb)
args.iter.count = 0;
args.iter.err   = 0;
args.iter.fn= nf_tables_dump_setelem;
-   args.iter.flush = false;
set->ops->walk(, set, );
 
nla_nest_end(skb, nest);
@@ -3963,7 +3961,6 @@ static int nf_tables_delsetelem(struct net *net, struct 
sock *nlsk,
struct nft_set_iter iter = {
.genmask= genmask,
.fn = nft_flush_set,
-   .flush  = true,
};
set->ops->walk(, set, );
 
@@ -5114,7 +5111,6 @@ static int nf_tables_check_loops(const struct nft_ctx 
*ctx,
iter.count  = 0;
iter.err= 0;
iter.fn = nf_tables_loop_check_setelem;
-   iter.flush  = false;
 
set->ops->walk(ctx, set, );
if (iter.err < 0)
-- 
2.1.4

[PATCH 10/10] netfilter: nft_ct: do cleanup work when NFTA_CT_DIRECTION is invalid

2017-03-15 Thread Pablo Neira Ayuso

From: Liping Zhang <zlpnob...@gmail.com>

We should jump to invoke __nft_ct_set_destroy() instead of just
return error.

Fixes: edee4f1e9245 ("netfilter: nft_ct: add zone id set support")
Signed-off-by: Liping Zhang <zlpnob...@gmail.com>
Acked-by: Florian Westphal <f...@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 net/netfilter/nft_ct.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 91585b5e5307..0264258c46fe 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -544,7 +544,8 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
case IP_CT_DIR_REPLY:
break;
default:
-   return -EINVAL;
+   err = -EINVAL;
+   goto err1;
}
}
 
-- 
2.1.4

[PATCH 02/10] netfilter: nf_nat_sctp: fix ICMP packet to be dropped accidently

2017-03-15 Thread Pablo Neira Ayuso

From: Ying Xue <ying@windriver.com>

Regarding RFC 792, the first 64 bits of the original SCTP datagram's
data could be contained in ICMP packet, such as:

0   1   2   3
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   | Type  | Code  |  Checksum |
   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   | unused|
   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   |  Internet Header + 64 bits of Original Data Datagram  |
   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+

However, according to RFC 4960, SCTP datagram header is as below:

0   1   2   3
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   | Source Port Number| Destination Port Number   |
   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   |  Verification Tag |
   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   |   Checksum|
   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+

It means only the first three fields of SCTP header can be carried in
ICMP packet except for Checksum field.

At present in sctp_manip_pkt(), no matter whether the packet is ICMP or
not, it always calculates SCTP packet checksum. However, not only the
calculation of checksum is unnecessary for ICMP, but also it causes
another fatal issue that ICMP packet is dropped. The header size of
SCTP is used to identify whether the writeable length of skb is bigger
than skb->len through skb_make_writable() in sctp_manip_pkt(). But
when it deals with ICMP packet, skb_make_writable() directly returns
false as the writeable length of skb is bigger than skb->len.
Subsequently ICMP is dropped.

Now we correct this misbahavior. When sctp_manip_pkt() handles ICMP
packet, 8 bytes rather than the whole SCTP header size is used to check
if writeable length of skb is overflowed. Meanwhile, as it's meaningless
to calculate checksum when packet is ICMP, the computation of checksum
is ignored as well.

Signed-off-by: Ying Xue <ying@windriver.com>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 net/netfilter/nf_nat_proto_sctp.c | 13 -
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/nf_nat_proto_sctp.c 
b/net/netfilter/nf_nat_proto_sctp.c
index 31d358691af0..804e8a0ab36e 100644
--- a/net/netfilter/nf_nat_proto_sctp.c
+++ b/net/netfilter/nf_nat_proto_sctp.c
@@ -33,8 +33,16 @@ sctp_manip_pkt(struct sk_buff *skb,
   enum nf_nat_manip_type maniptype)
 {
sctp_sctphdr_t *hdr;
+   int hdrsize = 8;
 
-   if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+   /* This could be an inner header returned in imcp packet; in such
+* cases we cannot update the checksum field since it is outside
+* of the 8 bytes of transport layer headers we are guaranteed.
+*/
+   if (skb->len >= hdroff + sizeof(*hdr))
+   hdrsize = sizeof(*hdr);
+
+   if (!skb_make_writable(skb, hdroff + hdrsize))
return false;
 
hdr = (struct sctphdr *)(skb->data + hdroff);
@@ -47,6 +55,9 @@ sctp_manip_pkt(struct sk_buff *skb,
hdr->dest = tuple->dst.u.sctp.port;
}
 
+   if (hdrsize < sizeof(*hdr))
+   return true;
+
if (skb->ip_summed != CHECKSUM_PARTIAL) {
hdr->checksum = sctp_compute_cksum(skb, hdroff);
skb->ip_summed = CHECKSUM_NONE;
-- 
2.1.4

[PATCH 07/10] netfilter: Force fake conntrack entry to be at least 8 bytes aligned

2017-03-15 Thread Pablo Neira Ayuso

From: "Steven Rostedt (VMware)" <rost...@goodmis.org>

Since the nfct and nfctinfo have been combined, the nf_conn structure
must be at least 8 bytes aligned, as the 3 LSB bits are used for the
nfctinfo. But there's a fake nf_conn structure to denote untracked
connections, which is created by a PER_CPU construct. This does not
guarantee that it will be 8 bytes aligned and can break the logic in
determining the correct nfctinfo.

I triggered this on a 32bit machine with the following error:

BUG: unable to handle kernel NULL pointer dereference at 0af4
IP: nf_ct_deliver_cached_events+0x1b/0xfb
*pdpt = 31962001 *pde = 

Oops:  [#1] SMP
[Modules linked in: ip6t_REJECT nf_reject_ipv6 nf_conntrack_ipv6 nf_defrag_ipv6 
ip6table_filter ip6_tables ipv6 crc_ccitt ppdev r8169 parport_pc parport
  OK  ]
CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.10.0-test+ #75
Hardware name: MSI MS-7823/CSM-H87M-G43 (MS-7823), BIOS V1.6 02/22/2014
task: c126ec00 task.stack: c1258000
EIP: nf_ct_deliver_cached_events+0x1b/0xfb
EFLAGS: 00010202 CPU: 0
EAX: 0021cd01 EBX:  ECX: 27b0c767 EDX: 32bcb17a
ESI: f34135c0 EDI: f34135c0 EBP: f2debd60 ESP: f2debd3c
 DS: 007b ES: 007b FS: 00d8 GS:  SS: 0068
CR0: 80050033 CR2: 0af4 CR3: 309a0440 CR4: 001406f0
Call Trace:
 
 ? ipv6_skip_exthdr+0xac/0xcb
 ipv6_confirm+0x10c/0x119 [nf_conntrack_ipv6]
 nf_hook_slow+0x22/0xc7
 nf_hook+0x9a/0xad [ipv6]
 ? ip6t_do_table+0x356/0x379 [ip6_tables]
 ? ip6_fragment+0x9e9/0x9e9 [ipv6]
 ip6_output+0xee/0x107 [ipv6]
 ? ip6_fragment+0x9e9/0x9e9 [ipv6]
 dst_output+0x36/0x4d [ipv6]
 NF_HOOK.constprop.37+0xb2/0xba [ipv6]
 ? icmp6_dst_alloc+0x2c/0xfd [ipv6]
 ? local_bh_enable+0x14/0x14 [ipv6]
 mld_sendpack+0x1c5/0x281 [ipv6]
 ? mark_held_locks+0x40/0x5c
 mld_ifc_timer_expire+0x1f6/0x21e [ipv6]
 call_timer_fn+0x135/0x283
 ? detach_if_pending+0x55/0x55
 ? mld_dad_timer_expire+0x3e/0x3e [ipv6]
 __run_timers+0x111/0x14b
 ? mld_dad_timer_expire+0x3e/0x3e [ipv6]
 run_timer_softirq+0x1c/0x36
 __do_softirq+0x185/0x37c
 ? test_ti_thread_flag.constprop.19+0xd/0xd
 do_softirq_own_stack+0x22/0x28
 
 irq_exit+0x5a/0xa4
 smp_apic_timer_interrupt+0x2a/0x34
 apic_timer_interrupt+0x37/0x3c

By using DEFINE/DECLARE_PER_CPU_ALIGNED we can enforce at least 8 byte
alignment as all cache line sizes are at least 8 bytes or more.

Fixes: a9e419dc7be6 ("netfilter: merge ctinfo into nfct pointer storage area")
Signed-off-by: Steven Rostedt (VMware) <rost...@goodmis.org>
Acked-by: Florian Westphal <f...@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 include/net/netfilter/nf_conntrack.h | 2 +-
 net/netfilter/nf_conntrack_core.c| 6 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack.h 
b/include/net/netfilter/nf_conntrack.h
index f540f9ad2af4..19605878da47 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -244,7 +244,7 @@ extern s32 (*nf_ct_nat_offset)(const struct nf_conn *ct,
   u32 seq);
 
 /* Fake conntrack entry for untracked connections */
-DECLARE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
+DECLARE_PER_CPU_ALIGNED(struct nf_conn, nf_conntrack_untracked);
 static inline struct nf_conn *nf_ct_untracked_get(void)
 {
return raw_cpu_ptr(_conntrack_untracked);
diff --git a/net/netfilter/nf_conntrack_core.c 
b/net/netfilter/nf_conntrack_core.c
index 071b97fcbefb..ffb78e5f7b70 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -181,7 +181,11 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
 unsigned int nf_conntrack_max __read_mostly;
 seqcount_t nf_conntrack_generation __read_mostly;
 
-DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
+/* nf_conn must be 8 bytes aligned, as the 3 LSB bits are used
+ * for the nfctinfo. We cheat by (ab)using the PER CPU cache line
+ * alignment to enforce this.
+ */
+DEFINE_PER_CPU_ALIGNED(struct nf_conn, nf_conntrack_untracked);
 EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
 
 static unsigned int nf_conntrack_hash_rnd __read_mostly;
-- 
2.1.4

Re: [PATCH net-next] net/gtp: Add udp source port generation according to flow hash

2017-03-15 Thread Pablo Neira Ayuso

On Wed, Mar 15, 2017 at 06:14:02PM +0200, Or Gerlitz wrote:
> On Thu, Feb 23, 2017 at 7:54 PM, David Miller <da...@davemloft.net> wrote:
> > From: Andreas Schultz <aschu...@tpip.net>
> > Date: Thu, 23 Feb 2017 18:19:16 +0100 (CET)
> >
> >> When we are talking about the xmit path, then currently none of the
> >> receivers we are talking to is going to be Linux and we have no
> >> idea how they will behave nor do we have any influence on them. Do
> >> we really need to make assumptions about other vendors implementations?
> >>
> >> Traces on live GRX networks show that about 90% of the SGSN/S-GW
> >> that would talk to us always use the default GTP-U port as source
> >> port. Some multi chassis GSN's seem to assign source port ranges to
> >> chassis, but that has nothing todo with DDOS protection.
> >
> > This is exactly what other UDP tunnel implementations did before
> > flow separation was prevelant.
> >
> > I don't see the point of any of this discussion discouraging the
> > enablement of proper flow separation.
> 
> Hi Dave,
> 
> So where do we  go from here? should I resubmit the patch?

IIRC this patch didn't get into the merge window in time, so it's
reasonable to resubmit I think.

You may want to add this to the patch:

Acked-by: Pablo Neira Ayuso <pa...@netfilter.org>

Thanks Or.

Re: [PATCH net] bridge: ebtables: fix reception of frames DNAT-ed to bridge device

2017-03-15 Thread Pablo Neira Ayuso

On Wed, Mar 15, 2017 at 11:26:08AM +0100, Florian Westphal wrote:
> Linus Lüssing  wrote:
> > When trying to redirect bridged frames to the bridge device itself
> > via the ebtables nat-prerouting chain and the dnat target then this
> > currently fails:
> > 
> > The ethernet destination of the frame is dnat'ed to the MAC address of
> > the bridge itself just fine and the correctly altered frame can even
> > be captured via a tcpdump on br0 (with or without promisc mode).
> >
> > However, the IP code drops it in the beginning of ip_input.c/ip_rcv()
> > as the dnat target did not update the skb->pkt_type.
> 
> Right, thats the reason why ebtables also has ebt_redirect target
> which does this pkt_type fixup.

I'm missing then why redirect is not then just enough for Linus usecase.

Re: [PATCH net] bridge: ebtables: fix reception of frames DNAT-ed to bridge device

2017-03-15 Thread Pablo Neira Ayuso

On Wed, Mar 15, 2017 at 04:18:11AM +0100, Linus Lüssing wrote:
> When trying to redirect bridged frames to the bridge device itself
> via the ebtables nat-prerouting chain and the dnat target then this
> currently fails:
> 
> The ethernet destination of the frame is dnat'ed to the MAC address of
> the bridge itself just fine and the correctly altered frame can even
> be captured via a tcpdump on br0 (with or without promisc mode).
> 
> However, the IP code drops it in the beginning of ip_input.c/ip_rcv()
> as the dnat target did not update the skb->pkt_type. If after
> dnat'ing the packet is now destined to us then the skb->pkt_type
> needs to be updated from PACKET_OTHERHOST to PACKET_HOST, too.
> 
> Signed-off-by: Linus Lüssing 
> ---
>  net/bridge/br_input.c | 6 +-
>  1 file changed, 5 insertions(+), 1 deletion(-)
> 
> diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
> index 013f2290b..ec83175 100644
> --- a/net/bridge/br_input.c
> +++ b/net/bridge/br_input.c
> @@ -198,8 +198,12 @@ int br_handle_frame_finish(struct net *net, struct sock 
> *sk, struct sk_buff *skb
>   if (dst) {
>   unsigned long now = jiffies;
>  
> - if (dst->is_local)
> + if (dst->is_local) {
> + /* fix up potential DNAT mess */
> + skb->pkt_type = PACKET_HOST;

I would like to find a way to fix this from ebtables itself, so we
don't need to add this code to the bridge core path. AFAICS, from
prerouting we don't know the dst yet, so we cannot know if this packet
is local from there.

Re: [PATCH net-next 2/4] gtp: add genl cmd to enable GTP encapsulation on UDP socket

2017-03-14 Thread Pablo Neira Ayuso

On Tue, Mar 14, 2017 at 01:28:25PM +0100, Andreas Schultz wrote:
> - On Mar 14, 2017, at 12:43 PM, pablo pa...@netfilter.org wrote:
[...]
> A GTP entity serves multiple local IP endpoints, It manages outgoing tunnels
> by the local APN/VRF, source IP, destination IP and remote tunnel id, incoming
> tunnels are managed by the local destination IP, local tunnel id and VRF/APN.
> 
> Therefor a PDP context needs the following attributes:
> 
>  * local source/destination IP (and port - but that's for different series)
>  * remote destination IP
>  * local and remote TEID
>  * VRF/APN
[...]
> I'm not sure if this pseudo GTP entity root device fits well with
> other networking concepts. And more over, I can't really see the need
> for such an construct.

Some sort of top-level structure that wraps all these objects is
needed, and that can be a new VRF object itself.

You can add a netlink interface to add/dump/delete VRFs, this VRF
database would be *global* to the GSN. At VRF creation, you attach the
socket and the GTP device. You can share sockets between VRFs. PDP
context objects would be added to the corresponding VRF *not to the
socket*, but actually this will result in inserting this PDP context
into the socket hashtable and the GTP device hashtable.

We need to introduce a default VRF that is assumed to always exist,
and that userspace cannot remove, so things don't break backward. If
no VRF is specified, then we attach things to this default VRF.
Actually, look at this from a different angle: the existing driver is
just supporting *one single VRF* at this moment so we just have to
represent this explicitly. Breaking existing API is a no-go.

This explicit VRF concept would also allow us to dump PDP contexts
that belong to a given VRF, by simply indicating the VRF unique id.
Jamal already requested that we extend iproute2 to have command to
inspect the gtp driver we cannot escape this, we should allow
standalone tools to inspect the gtp datapath as we do with other
existing tunnel drivers, no matter what daemon in userspace implements
the control plane.

[...]
> I think that the user space control instance should own the tunnels and
> only use the kernel facility to manage them. When the user space instance
> goes away, so should the tunnels.

This doesn't allow daemon hot restart for whatever administrative
reason without affecting existing traffic. The kernel owns the
datapath indeed, that include tunnels.

> From that perspective, I want to keep the kernel facilities to the
> absolute needed minimum.

If some simple abstraction that we can insert makes this whole thing
more maintainable, then it makes sense to consider it.

This is all about not exposing the internal layout of the
representation you use for a very specific reason: The more you expose
internal details to userspace, the more problems we'll have to extend
things later on in the future. And don't try to be smart and say:
"Hey, I already know every usecase we will have in the future" because
that is not true.

Re: [PATCH net-next 0/4] gtp: support multiple APN's per GTP endpoint

2017-03-14 Thread Pablo Neira Ayuso

On Tue, Mar 14, 2017 at 01:42:44PM +0100, Andreas Schultz wrote:
> > It is definitely not acceptable to break the existing API.
> 
> The specific use case of the API that is no longer supported was never used by
> anyone. [...]

Yes, this was used openggsn and I tested this with a full blown FOSS
setup. Yes, a toy thing compared to the proprietary equipment you deal
with, but we always started with things like this.

> The only supported and documented API for the GTP module is libgtpnl.

No, the netlink interface itself if the API.

Stopping trying to find a reason to break API, that is a no-go.

Re: [PATCH net-next 0/4] gtp: support multiple APN's per GTP endpoint

2017-03-14 Thread Pablo Neira Ayuso

On Tue, Mar 14, 2017 at 12:25:44PM +0100, Andreas Schultz wrote:
[...]
> API impact:
> ---
> 
> This is probably the most problematic part of this series...
> 
> The removeal of the TEID form the netdevice also means that the gtp genl API
> for retriving tunnel information and removing tunnels needs to be adjusted.
> 
> Before this change it was possible to change a GTP tunnel using the gtp
> netdevice id and the teid. The teid is no longer unique per gtp netdevice.
> After this change it has to be either the netdevice and MS IP or the GTP
> socket and teid.

Then we have to introduce some explicit VRF concept or such to sort
out this.

It is definitely not acceptable to break the existing API.

Re: [PATCH net-next 2/4] gtp: add genl cmd to enable GTP encapsulation on UDP socket

2017-03-14 Thread Pablo Neira Ayuso

On Tue, Mar 14, 2017 at 12:25:46PM +0100, Andreas Schultz wrote:

> @@ -1254,6 +1293,8 @@ static struct nla_policy gtp_genl_policy[GTPA_MAX + 1] 
> = {
>   [GTPA_NET_NS_FD]= { .type = NLA_U32, },
>   [GTPA_I_TEI]= { .type = NLA_U32, },
>   [GTPA_O_TEI]= { .type = NLA_U32, },
> + [GTPA_PDP_HASHSIZE] = { .type = NLA_U32, },

This per PDP hashsize attribute clearly doesn't belong here.

Moreover, we now have a rhashtable implementation, so we hopefully we
can get rid of this. It should be very easy to convert this to use
rhashtable, and it is very much desiderable.

> + [GTPA_FD]   = { .type = NLA_U32, },

This new atttribute has nothing to do with the PDP context.
And enum gtp_attrs *only* describe a PDP context. Adding more
attributes there to mix semantics is not the way to go.

You likely have to inaugurate a new enum. This gtp_attrs enum only
related to the PDP description.

Why not add some interface to attach more sockets to the gtp device
globally? So still the gtp device is the top-level structure. Then add
a netlink attribute to specify to what VRF this tunnel belongs to,
instead of implicitly using the socket to achieve this.

Another possibility is to explicitly have an interface to add
new/delete VRFs, attach sockets to them.

In general, I'm still not convinced this is the right design for this.

Re: [PATCH net-next 1/4] gtp: move TEID hash to per socket structure

2017-03-14 Thread Pablo Neira Ayuso

On Tue, Mar 14, 2017 at 12:25:45PM +0100, Andreas Schultz wrote:
> @@ -275,9 +280,9 @@ static int gtp1u_udp_encap_recv(struct gtp_dev *gtp, 
> struct sk_buff *skb)
>  
>   gtp1 = (struct gtp1_header *)(skb->data + sizeof(struct udphdr));
>  
> - pctx = gtp1_pdp_find(gtp, ntohl(gtp1->tid));
> + pctx = gtp1_pdp_find(gsk, ntohl(gtp1->tid));
>   if (!pctx) {
> - netdev_dbg(gtp->dev, "No PDP ctx to decap skb=%p\n", skb);
> + pr_debug("No PDP ctx to decap skb=%p\n", skb);
>   return 1;

Again the pr_debug() change has resurrected.

I already told you: If we are going to have more than one gtp device,
then this doesn't make sense. I have to repeat things over and over
again, just because you don't want to rebase your patchset for some
reason. I don't find any other explaination for this.

So please remove this debugging rather than rendering this completely
useful.

Moreover this change has nothing to this patch, so this doesn't break
the one logical change per patch.

Re: [PATCH net] bridge: drop netfilter fake rtable unconditionally

2017-03-13 Thread Pablo Neira Ayuso

On Mon, Mar 13, 2017 at 05:38:17PM +0100, Florian Westphal wrote:
> Andreas reports kernel oops during rmmod of the br_netfilter module.
> Hannes debugged the oops down to a NULL rt6info->rt6i_indev.
> 
> Problem is that br_netfilter has the nasty concept of adding a fake
> rtable to skb->dst; this happens in a br_netfilter prerouting hook.
> 
> A second hook (in bridge LOCAL_IN) is supposed to remove these again
> before the skb is handed up the stack.
> 
> However, on module unload hooks get unregistered which means an
> skb could traverse the prerouting hook that attaches the fake_rtable,
> while the 'fake rtable remove' hook gets removed from the hooklist
> immediately after.
> 
> Fixes: 34666d467cbf1e2e3c7 ("netfilter: bridge: move br_netfilter out of the 
> core")
> Reported-by: Andreas Karis <aka...@redhat.com>
> Debugged-by: Hannes Frederic Sowa <han...@stressinduktion.org>
> Signed-off-by: Florian Westphal <f...@strlen.de>
> ---
>  David, I know you *love* the fake_rtable turd, but I think
>  this patch should go directly via net and not nf tree.
>  After all, this adds to bridge and removes from the netfilter part.

If this helps speed up this.

Acked-by: Pablo Neira Ayuso <pa...@netfilter.org>

I also love this bridge netfilter Frankenstein a lot. Looking forward
you kill this via native bridge conntrack support for nf_tables.

Thanks.

Re: [PATCH net-next v5 0/7] gtp: misc improvements

2017-03-13 Thread Pablo Neira Ayuso

On Thu, Mar 09, 2017 at 05:42:55PM +0100, Andreas Schultz wrote:
> Hi Pablo,
> 
> This is a resent of last series that missed the merge window. There
> are no changes compared to v4.
> 
> v4: Compared to v3 it contains mostly smallish naming and spelling fixes.
> It also drops the documentation patch, Harald did a better job with the
> documentation and the some things I described do not yet match the 
> implementation.
> I'll readd the relevant parts with a follow up series.
> 
> This series lays the groundwork for removing the socket references from
> the GTP netdevice by removing duplicate code and simplifying the logic on
> some code paths.
> 
> It slighly changes the GTP genl API by making the socket parameters optional
> (though one of them is still required).
> 
> The removal of the socket references will break the 1:1 releation between
> GTP netdevice and GTP socket that prevents us to support multiple VRFs with
> overlapping IP addresse spaces attached to the same GTP-U entity (needed for
> multi APN support, coming a follow up series).
> 
> Pablo found a socket hold problem in v2. In order to solve that I had to
> switch the socket references from the struct socket to the internal
> struct sock. This should have no functionl impact, but we can now hang
> on to the reference without blocking user space from closing the GTP socket.

Acked-by: Pablo Neira Ayuso <pa...@netfilter.org>

I personally don't like this podge hodge unsorted submissions, I don't
think they belong to the same series but you keep pushing with this
patchset in this same way, which is annoying.

In your follow up patchsets, please split them in smaller series that
are related.

I would like you take the time to develop the VRF idea that you want
to introduce, I just would like that we avoid bloating the GTP tunnel
with features that we can just achieve via composite of different
subsystems.

Thank you.

Re: [PATCH] netfilter: Force fake conntrack entry to be at least 8 bytes aligned

2017-03-13 Thread Pablo Neira Ayuso

On Sat, Mar 11, 2017 at 10:12:22AM +0100, Florian Westphal wrote:
> Steven Rostedt (VMware)  wrote:
> > Since the nfct and nfctinfo have been combined, the nf_conn structure
> > must be at least 8 bytes aligned, as the 3 LSB bits are used for the
> > nfctinfo. But there's a fake nf_conn structure to denote untracked
> > connections, which is created by a PER_CPU construct. This does not
> > guarantee that it will be 8 bytes aligned and can break the logic in
> > determining the correct nfctinfo.
> > 
> > I triggered this on a 32bit machine with the following error:
> [..]
> 
> Ugh.  Originally I had planned to also submit followup changes
> to get rid of the untracked objects but that part got delayed.
> 
> > By using DEFINE/DECLARE_PER_CPU_ALIGNED we can enforce at least 8 byte
> > alignment as all cache line sizes are at least 8 bytes or more.
> 
> Thanks for fixing this!
> 
> Acked-by: Florian Westphal 

Applied, thanks.

Re: [net] netfilter: nat: sctp: fix ICMP packet to be dropped accidently

2017-03-08 Thread Pablo Neira Ayuso

On Sat, Mar 04, 2017 at 06:00:02PM +0800, Ying Xue wrote:
> Regarding RFC 792, the first 64 bits of the original SCTP datagram's
> data could be contained in ICMP packet, such as:
> 
> 0   1   2   3
> 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
>+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>| Type  | Code  |  Checksum |
>+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>| unused|
>+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>|  Internet Header + 64 bits of Original Data Datagram  |
>+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> 
> However, according to RFC 4960, SCTP datagram header is as below:
> 
> 0   1   2   3
> 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
>+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>| Source Port Number| Destination Port Number   |
>+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>|  Verification Tag |
>+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>|   Checksum|
>+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> 
> It means only the first three fields of SCTP header can be carried in
> ICMP packet except for Checksum field.
> 
> At present in sctp_manip_pkt(), no matter whether the packet is ICMP or
> not, it always calculates SCTP packet checksum. However, not only the
> calculation of checksum is unnecessary for ICMP, but also it causes
> another fatal issue that ICMP packet is dropped. The header size of
> SCTP is used to identify whether the writeable length of skb is bigger
> than skb->len through skb_make_writable() in sctp_manip_pkt(). But
> when it deals with ICMP packet, skb_make_writable() directly returns
> false as the writeable length of skb is bigger than skb->len.
> Subsequently ICMP is dropped.
> 
> Now we correct this misbahavior. When sctp_manip_pkt() handles ICMP
> packet, 8 bytes rather than the whole SCTP header size is used to check
> if writeable length of skb is overflowed. Meanwhile, as it's meaningless
> to calculate checksum when packet is ICMP, the computation of checksum
> is ignored as well.

Applied, thanks.

Re: [PATCH] netfilter: Use pr_cont where appropriate

2017-03-06 Thread Pablo Neira Ayuso

On Tue, Feb 28, 2017 at 02:09:24PM -0800, Joe Perches wrote:
> Logging output was changed when simple printks without KERN_CONT
> are now emitted on a new line and KERN_CONT is required to continue
> lines so use pr_cont.
> 
> Miscellanea:
> 
> o realign arguments
> o use print_hex_dump instead of a local variant

Applied, thanks Joe.

Re: [PATCH] netfilter: remove redundant check on ret being non-zero

2017-03-06 Thread Pablo Neira Ayuso

On Tue, Feb 28, 2017 at 11:31:15AM +, Colin King wrote:
> From: Colin Ian King 
> 
> ret is initialized to zero and if it is set to non-zero in the
> xt_entry_foreach loop then we exit via the out_free label. Hence
> the check for ret being non-zero is redundant and can be removed.
> 
> Detected by CoverityScan, CID#1357132 ("Logically Dead Code")

Applied, thanks.

[PATCH 0/4] Netfilter fixes for net

2017-03-03 Thread Pablo Neira Ayuso

Hi David,

The following patchset contains Netfilter fixes for your net tree,
they are:

1) Missing check for full sock in ip_route_me_harder(), from
   Florian Westphal.

2) Incorrect sip helper structure initilization that breaks it when
   several ports are used, from Christophe Leroy.

3) Fix incorrect assumption when looking up for matching with adjacent
   intervals in the nft_set_rbtree.

4) Fix broken netlink event error reporting in nf_tables that results
   in misleading ESRCH errors propagated to userspace listeners.

You can pull these changes from:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git

Thanks!



The following changes since commit 2f44f75257d57f0d5668dba3a6ada0f4872132c9:

  Merge branch 'qed-fixes' (2017-02-27 09:22:10 -0500)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git HEAD

for you to fetch changes up to 25e94a997b324b5f167f56d56d7106d38b78c9de:

  netfilter: nf_tables: don't call nfnetlink_set_err() if nfnetlink_send() 
fails (2017-03-03 13:48:34 +0100)


Christophe Leroy (1):
  netfilter: nf_conntrack_sip: fix wrong memory initialisation

Florian Westphal (1):
  netfilter: use skb_to_full_sk in ip_route_me_harder

Pablo Neira Ayuso (2):
  netfilter: nft_set_rbtree: incorrect assumption on lower interval lookups
  netfilter: nf_tables: don't call nfnetlink_set_err() if nfnetlink_send() 
fails

 include/net/netfilter/nf_tables.h |   6 +-
 net/ipv4/netfilter.c  |   7 +-
 net/netfilter/nf_conntrack_sip.c  |   2 -
 net/netfilter/nf_tables_api.c | 133 --
 net/netfilter/nft_set_rbtree.c|   9 ++-
 5 files changed, 66 insertions(+), 91 deletions(-)

[PATCH 1/4] netfilter: use skb_to_full_sk in ip_route_me_harder

2017-03-03 Thread Pablo Neira Ayuso

From: Florian Westphal <f...@strlen.de>

inet_sk(skb->sk) is illegal in case skb is attached to request socket.

Fixes: ca6fb0651883 ("tcp: attach SYNACK messages to request sockets instead of 
listener")
Reported by: Daniel J Blueman <dan...@quora.org>
Signed-off-by: Florian Westphal <f...@strlen.de>
Tested-by: Daniel J Blueman <dan...@quora.org>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 net/ipv4/netfilter.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index b3cc1335adbc..c0cc6aa8cfaa 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -23,7 +23,8 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, 
unsigned int addr_t
struct rtable *rt;
struct flowi4 fl4 = {};
__be32 saddr = iph->saddr;
-   __u8 flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0;
+   const struct sock *sk = skb_to_full_sk(skb);
+   __u8 flags = sk ? inet_sk_flowi_flags(sk) : 0;
struct net_device *dev = skb_dst(skb)->dev;
unsigned int hh_len;
 
@@ -40,7 +41,7 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, 
unsigned int addr_t
fl4.daddr = iph->daddr;
fl4.saddr = saddr;
fl4.flowi4_tos = RT_TOS(iph->tos);
-   fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0;
+   fl4.flowi4_oif = sk ? sk->sk_bound_dev_if : 0;
if (!fl4.flowi4_oif)
fl4.flowi4_oif = l3mdev_master_ifindex(dev);
fl4.flowi4_mark = skb->mark;
@@ -61,7 +62,7 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, 
unsigned int addr_t
xfrm_decode_session(skb, flowi4_to_flowi(), AF_INET) == 0) {
struct dst_entry *dst = skb_dst(skb);
skb_dst_set(skb, NULL);
-   dst = xfrm_lookup(net, dst, flowi4_to_flowi(), skb->sk, 0);
+   dst = xfrm_lookup(net, dst, flowi4_to_flowi(), sk, 0);
if (IS_ERR(dst))
return PTR_ERR(dst);
skb_dst_set(skb, dst);
-- 
2.1.4

[PATCH 4/4] netfilter: nf_tables: don't call nfnetlink_set_err() if nfnetlink_send() fails

2017-03-03 Thread Pablo Neira Ayuso

The underlying nlmsg_multicast() already sets sk->sk_err for us to
notify socket overruns, so we should not do anything with this return
value. So we just call nfnetlink_set_err() if:

1) We fail to allocate the netlink message.

or

2) We don't have enough space in the netlink message to place attributes,
   which means that we likely need to allocate a larger message.

Before this patch, the internal ESRCH netlink error code was propagated
to userspace, which is quite misleading. Netlink semantics mandate that
listeners just hit ENOBUFS if the socket buffer overruns.

Reported-by: Alexander Alemayhu <alexan...@alemayhu.com>
Tested-by: Alexander Alemayhu <alexan...@alemayhu.com>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 include/net/netfilter/nf_tables.h |   6 +-
 net/netfilter/nf_tables_api.c | 133 --
 2 files changed, 58 insertions(+), 81 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h 
b/include/net/netfilter/nf_tables.h
index ac84686aaafb..2aa8a9d80fbe 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -988,9 +988,9 @@ struct nft_object *nf_tables_obj_lookup(const struct 
nft_table *table,
const struct nlattr *nla, u32 objtype,
u8 genmask);
 
-int nft_obj_notify(struct net *net, struct nft_table *table,
-  struct nft_object *obj, u32 portid, u32 seq,
-  int event, int family, int report, gfp_t gfp);
+void nft_obj_notify(struct net *net, struct nft_table *table,
+   struct nft_object *obj, u32 portid, u32 seq,
+   int event, int family, int report, gfp_t gfp);
 
 /**
  * struct nft_object_type - stateful object type
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index ff7304ae58ac..5e0ccfd5bb37 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -461,16 +461,15 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, 
struct net *net,
return -1;
 }
 
-static int nf_tables_table_notify(const struct nft_ctx *ctx, int event)
+static void nf_tables_table_notify(const struct nft_ctx *ctx, int event)
 {
struct sk_buff *skb;
int err;
 
if (!ctx->report &&
!nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
-   return 0;
+   return;
 
-   err = -ENOBUFS;
skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
if (skb == NULL)
goto err;
@@ -482,14 +481,11 @@ static int nf_tables_table_notify(const struct nft_ctx 
*ctx, int event)
goto err;
}
 
-   err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
-ctx->report, GFP_KERNEL);
+   nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
+  ctx->report, GFP_KERNEL);
+   return;
 err:
-   if (err < 0) {
-   nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES,
- err);
-   }
-   return err;
+   nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
 }
 
 static int nf_tables_dump_tables(struct sk_buff *skb,
@@ -1050,16 +1046,15 @@ static int nf_tables_fill_chain_info(struct sk_buff 
*skb, struct net *net,
return -1;
 }
 
-static int nf_tables_chain_notify(const struct nft_ctx *ctx, int event)
+static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event)
 {
struct sk_buff *skb;
int err;
 
if (!ctx->report &&
!nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
-   return 0;
+   return;
 
-   err = -ENOBUFS;
skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
if (skb == NULL)
goto err;
@@ -1072,14 +1067,11 @@ static int nf_tables_chain_notify(const struct nft_ctx 
*ctx, int event)
goto err;
}
 
-   err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
-ctx->report, GFP_KERNEL);
+   nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
+  ctx->report, GFP_KERNEL);
+   return;
 err:
-   if (err < 0) {
-   nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES,
- err);
-   }
-   return err;
+   nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
 }
 
 static int nf_tables_dump_chains(struct sk_buff *skb,
@@ -1934,18 +1926,16 @@ static int nf_tables_fill_rule_info(struct sk_buff 
*skb, struct net *net,
return -1;
 }
 
-static int nf_tables_rule_notify(const struct nft_ctx *ctx,
-const struct nft_rule *rule,
-int

[PATCH 3/4] netfilter: nft_set_rbtree: incorrect assumption on lower interval lookups

2017-03-03 Thread Pablo Neira Ayuso

In case of adjacent ranges, we may indeed see either the high part of
the range in first place or the low part of it. Remove this incorrect
assumption, let's make sure we annotate the low part of the interval in
case of we have adjacent interva intervals so we hit a matching in
lookups.

Reported-by: Simon Hanisch <hani...@wh2.tu-dresden.de>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 net/netfilter/nft_set_rbtree.c | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 71e8fb886a73..78dfbf9588b3 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -60,11 +60,10 @@ static bool nft_rbtree_lookup(const struct net *net, const 
struct nft_set *set,
d = memcmp(this, key, set->klen);
if (d < 0) {
parent = parent->rb_left;
-   /* In case of adjacent ranges, we always see the high
-* part of the range in first place, before the low one.
-* So don't update interval if the keys are equal.
-*/
-   if (interval && nft_rbtree_equal(set, this, interval))
+   if (interval &&
+   nft_rbtree_equal(set, this, interval) &&
+   nft_rbtree_interval_end(this) &&
+   !nft_rbtree_interval_end(interval))
continue;
interval = rbe;
} else if (d > 0)
-- 
2.1.4

[PATCH 2/4] netfilter: nf_conntrack_sip: fix wrong memory initialisation

2017-03-03 Thread Pablo Neira Ayuso

From: Christophe Leroy <christophe.le...@c-s.fr>

In commit 82de0be6862cd ("netfilter: Add helper array
register/unregister functions"),
struct nf_conntrack_helper sip[MAX_PORTS][4] was changed to
sip[MAX_PORTS * 4], so the memory init should have been changed to
memset([4 * i], 0, 4 * sizeof(sip[i]));

But as the sip[] table is allocated in the BSS, it is already set to 0

Fixes: 82de0be6862cd ("netfilter: Add helper array register/unregister 
functions")
Signed-off-by: Christophe Leroy <christophe.le...@c-s.fr>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 net/netfilter/nf_conntrack_sip.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 24174c520239..0d17894798b5 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -1628,8 +1628,6 @@ static int __init nf_conntrack_sip_init(void)
ports[ports_c++] = SIP_PORT;
 
for (i = 0; i < ports_c; i++) {
-   memset([i], 0, sizeof(sip[i]));
-
nf_ct_helper_init([4 * i], AF_INET, IPPROTO_UDP, "sip",
  SIP_PORT, ports[i], i, sip_exp_policy,
  SIP_EXPECT_MAX,
-- 
2.1.4

Re: [PATCH] netfilter: nf_conntrack_sip: fix wrong memory initialisation

2017-03-03 Thread Pablo Neira Ayuso

On Wed, Mar 01, 2017 at 03:33:26PM +0100, Christophe Leroy wrote:
> In commit 82de0be6862cd ("netfilter: Add helper array
> register/unregister functions"),
> struct nf_conntrack_helper sip[MAX_PORTS][4] was changed to
> sip[MAX_PORTS * 4], so the memory init should have been changed to
> memset([4 * i], 0, 4 * sizeof(sip[i]));
> 
> But as the sip[] table is allocated in the BSS, it is already set to 0

Applied, thanks.

Re: [PATCH 09/14] netfilter: conntrack: refine gc worker heuristics, redux

2017-03-01 Thread Pablo Neira Ayuso

On Wed, Mar 01, 2017 at 04:02:53PM +0100, Nicolas Dichtel wrote:
> Le 27/01/2017 à 17:51, Nicolas Dichtel a écrit :
> > Le 26/01/2017 à 17:38, Pablo Neira Ayuso a écrit :
> >> From: Florian Westphal <f...@strlen.de>
> >>
> >> This further refines the changes made to conntrack gc_worker in
> >> commit e0df8cae6c16 ("netfilter: conntrack: refine gc worker heuristics").
> >>
> >> The main idea of that change was to reduce the scan interval when evictions
> >> take place.
> >>
> >> However, on the reporters' setup, there are 1-2 million conntrack entries
> >> in total and roughly 8k new (and closing) connections per second.
> >>
> >> In this case we'll always evict at least one entry per gc cycle and scan
> >> interval is always at 1 jiffy because of this test:
> >>
> >>  } else if (expired_count) {
> >>  gc_work->next_gc_run /= 2U;
> >>  next_run = msecs_to_jiffies(1);
> >>
> >> being true almost all the time.
> >>
> >> Given we scan ~10k entries per run its clearly wrong to reduce interval
> >> based on nonzero eviction count, it will only waste cpu cycles since a vast
> >> majorities of conntracks are not timed out.
> >>
> >> Thus only look at the ratio (scanned entries vs. evicted entries) to make
> >> a decision on whether to reduce or not.
> >>
> >> Because evictor is supposed to only kick in when system turns idle after
> >> a busy period, pick a high ratio -- this makes it 50%.  We thus keep
> >> the idea of increasing scan rate when its likely that table contains many
> >> expired entries.
> >>
> >> In order to not let timed-out entries hang around for too long
> >> (important when using event logging, in which case we want to timely
> >> destroy events), we now scan the full table within at most
> >> GC_MAX_SCAN_JIFFIES (16 seconds) even in worst-case scenario where all
> >> timed-out entries sit in same slot.
> >>
> >> I tested this with a vm under synflood (with
> >> sysctl net.netfilter.nf_conntrack_tcp_timeout_syn_recv=3).
> >>
> >> While flood is ongoing, interval now stays at its max rate
> >> (GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV -> 125ms).
> >>
> >> With feedback from Nicolas Dichtel.
> >>
> >> Reported-by: Denys Fedoryshchenko <nuclear...@nuclearcat.com>
> >> Cc: Nicolas Dichtel <nicolas.dich...@6wind.com>
> >> Fixes: b87a2f9199ea82eaadc ("netfilter: conntrack: add gc worker to remove 
> >> timed-out entries")
> >> Signed-off-by: Florian Westphal <f...@strlen.de>
> >> Tested-by: Nicolas Dichtel <nicolas.dich...@6wind.com>
> >> Acked-by: Nicolas Dichtel <nicolas.dich...@6wind.com>
> >> Tested-by: Denys Fedoryshchenko <nuclear...@nuclearcat.com>
> >> Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
> > Pablo, is it possible to queue this patch (and the previous: 08/14) for the 
> > 4.9
> > stable?
> 
> Pablo, should I submit it formally?

Just requested this to Greg, sorry this didn't happen so far.

Thanks.

Re: [4.9.10] ip_route_me_harder() reading off-slab

2017-02-28 Thread Pablo Neira Ayuso

On Mon, Feb 27, 2017 at 10:41:48PM +0800, Daniel J Blueman wrote:
> On 17 February 2017 at 15:39, Florian Westphal  wrote:
> > Daniel J Blueman  wrote:
> >
> > [ CC nf-devel, pablo ]
> >
> >> When booting a VM in libvirt/KVM attached to a local bridge and KASAN
> >> enabled on 4.9.10, we see a stream of KASAN warnings about off-slab
> >> access [1].
> >>
> >> Let me know if you'd like more debug.
> >
> > Does this patch help?
> >
> > Subject: [PATCH nf] netfilter: use skb_to_full_sk in ip_route_me_harder
> >
> > inet_sk(skb->sk) is illegal in case skb is attached to request socket.
> >
> > Fixes: ca6fb0651883 ("tcp: attach SYNACK messages to request sockets 
> > instead of listener")
> > Reported by: Daniel J Blueman 
> > Signed-off-by: Florian Westphal 
[...]
> Apologies for the delays; this also addresses the issue just fine.
> 
> Tested-by: Daniel J Blueman 

Applied, thanks for testing.

[PATCH 2/6] uapi: stop including linux/sysctl.h in uapi/linux/netfilter.h

2017-02-27 Thread Pablo Neira Ayuso

From: "Dmitry V. Levin" <l...@altlinux.org>

linux/netfilter.h is the last uapi header file that includes
linux/sysctl.h but it does not depend on definitions provided
by this essentially dead header file.

Suggested-by: Eric W. Biederman <ebied...@xmission.com>
Signed-off-by: Dmitry V. Levin <l...@altlinux.org>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 include/uapi/linux/netfilter.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/uapi/linux/netfilter.h b/include/uapi/linux/netfilter.h
index 7550e9176a54..c111a91adcc0 100644
--- a/include/uapi/linux/netfilter.h
+++ b/include/uapi/linux/netfilter.h
@@ -3,7 +3,6 @@
 
 #include 
 #include 
-#include 
 #include 
 #include 
 
-- 
2.1.4

[PATCH 3/6] uapi: fix linux/netfilter/xt_hashlimit.h userspace compilation error

2017-02-27 Thread Pablo Neira Ayuso

From: "Dmitry V. Levin" <l...@altlinux.org>

Include  like some of uapi/linux/netfilter/xt_*.h
headers do to fix the following linux/netfilter/xt_hashlimit.h
userspace compilation error:

/usr/include/linux/netfilter/xt_hashlimit.h:90:12: error: 'NAME_MAX' undeclared 
here (not in a function)
  char name[NAME_MAX];

Signed-off-by: Dmitry V. Levin <l...@altlinux.org>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 include/uapi/linux/netfilter/xt_hashlimit.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/netfilter/xt_hashlimit.h 
b/include/uapi/linux/netfilter/xt_hashlimit.h
index 3efc0ca18345..79da349f1060 100644
--- a/include/uapi/linux/netfilter/xt_hashlimit.h
+++ b/include/uapi/linux/netfilter/xt_hashlimit.h
@@ -2,6 +2,7 @@
 #define _UAPI_XT_HASHLIMIT_H
 
 #include 
+#include 
 #include 
 
 /* timings are in milliseconds. */
-- 
2.1.4

[PATCH 4/6] netfilter: nf_ct_expect: nf_ct_expect_related_report(): Return zero on success.

2017-02-27 Thread Pablo Neira Ayuso

From: Jarno Rajahalme <ja...@ovn.org>

Commit 4dee62b1b9b4 ("netfilter: nf_ct_expect: nf_ct_expect_insert()
returns void") inadvertently changed the successful return value of
nf_ct_expect_related_report() from 0 to 1, which caused openvswitch
conntrack integration fail in FTP test cases.

Fix this by always returning zero on the success code path.

Fixes: 4dee62b1b9b4 ("netfilter: nf_ct_expect: nf_ct_expect_insert() returns 
void")
Signed-off-by: Jarno Rajahalme <ja...@ovn.org>
Acked-by: Joe Stringer <j...@ovn.org>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 net/netfilter/nf_conntrack_expect.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nf_conntrack_expect.c 
b/net/netfilter/nf_conntrack_expect.c
index e19a69787d99..d6ace69d57dc 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -467,7 +467,7 @@ int nf_ct_expect_related_report(struct nf_conntrack_expect 
*expect,
 
spin_unlock_bh(_conntrack_expect_lock);
nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report);
-   return ret;
+   return 0;
 out:
spin_unlock_bh(_conntrack_expect_lock);
return ret;
-- 
2.1.4

[PATCH 0/6] Netfilter fixes for net

2017-02-27 Thread Pablo Neira Ayuso

Hi David,

The following patchset contains netfilter fixes for you net tree,
they are:

1) Missing ct zone size in the nft_ct initialization path, patch
   from Florian Westphal.

2) Two patches for netfilter uapi headers, one to remove unnecessary
   sysctl.h inclusion and another to fix compilation of xt_hashlimit.h
   in userspace, from Dmitry V. Levin.

3) Patch to fix a sloppy change in nf_ct_expect that incorrectly
   simplified nf_ct_expect_related_report() in the previous nf-next
   batch. This also includes another patch for __nf_ct_expect_check()
   to report success by returning 0 to keep it consistent with other
   existing functions. From Jarno Rajahalme.

4) The ->walk() iterator of the new bitmap set type goes over the real
   bitmap size, this results in incorrect dumps when NFTA_SET_USERDATA
   is used.

You can pull these changes from:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git

Thanks!



The following changes since commit 9c4713701c01e4cef6e2315c2818abc919ffb0de:

  bpf: Fix bpf_xdp_event_output (2017-02-23 13:53:42 -0500)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git HEAD

for you to fetch changes up to 13aa5a8f498dacd5f1a8e35be72af47e630fb8c6:

  netfilter: nft_set_bitmap: incorrect bitmap size (2017-02-26 21:00:19 +0100)


Dmitry V. Levin (2):
  uapi: stop including linux/sysctl.h in uapi/linux/netfilter.h
  uapi: fix linux/netfilter/xt_hashlimit.h userspace compilation error

Florian Westphal (1):
  netfilter: nft_ct: fix random validation errors for zone set support

Jarno Rajahalme (2):
  netfilter: nf_ct_expect: nf_ct_expect_related_report(): Return zero on 
success.
  netfilter: nf_ct_expect: Change __nf_ct_expect_check() return value.

Pablo Neira Ayuso (1):
  netfilter: nft_set_bitmap: incorrect bitmap size

 include/uapi/linux/netfilter.h  | 1 -
 include/uapi/linux/netfilter/xt_hashlimit.h | 1 +
 net/netfilter/nf_conntrack_expect.c | 6 +++---
 net/netfilter/nft_ct.c  | 1 +
 net/netfilter/nft_set_bitmap.c  | 2 +-
 5 files changed, 6 insertions(+), 5 deletions(-)

[PATCH 6/6] netfilter: nft_set_bitmap: incorrect bitmap size

2017-02-27 Thread Pablo Neira Ayuso

priv->bitmap_size stores the real bitmap size, instead of the full
struct nft_bitmap object.

Fixes: 665153ff5752 ("netfilter: nf_tables: add bitmap set type")
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 net/netfilter/nft_set_bitmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
index 97f9649bcc7e..152d226552c1 100644
--- a/net/netfilter/nft_set_bitmap.c
+++ b/net/netfilter/nft_set_bitmap.c
@@ -258,7 +258,7 @@ static int nft_bitmap_init(const struct nft_set *set,
 {
struct nft_bitmap *priv = nft_set_priv(set);
 
-   priv->bitmap_size = nft_bitmap_total_size(set->klen);
+   priv->bitmap_size = nft_bitmap_size(set->klen);
 
return 0;
 }
-- 
2.1.4

[PATCH 5/6] netfilter: nf_ct_expect: Change __nf_ct_expect_check() return value.

2017-02-27 Thread Pablo Neira Ayuso

From: Jarno Rajahalme <ja...@ovn.org>

Commit 4dee62b1b9b4 ("netfilter: nf_ct_expect: nf_ct_expect_insert()
returns void") inadvertently changed the successful return value of
nf_ct_expect_related_report() from 0 to 1 due to
__nf_ct_expect_check() returning 1 on success.  Prevent this
regression in the future by changing the return value of
__nf_ct_expect_check() to 0 on success.

Signed-off-by: Jarno Rajahalme <ja...@ovn.org>
Acked-by: Joe Stringer <j...@ovn.org>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 net/netfilter/nf_conntrack_expect.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nf_conntrack_expect.c 
b/net/netfilter/nf_conntrack_expect.c
index d6ace69d57dc..4b2e1fb28bb4 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -410,7 +410,7 @@ static inline int __nf_ct_expect_check(struct 
nf_conntrack_expect *expect)
struct net *net = nf_ct_exp_net(expect);
struct hlist_node *next;
unsigned int h;
-   int ret = 1;
+   int ret = 0;
 
if (!master_help) {
ret = -ESHUTDOWN;
@@ -460,7 +460,7 @@ int nf_ct_expect_related_report(struct nf_conntrack_expect 
*expect,
 
spin_lock_bh(_conntrack_expect_lock);
ret = __nf_ct_expect_check(expect);
-   if (ret <= 0)
+   if (ret < 0)
goto out;
 
nf_ct_expect_insert(expect);
-- 
2.1.4

[PATCH 1/6] netfilter: nft_ct: fix random validation errors for zone set support

2017-02-27 Thread Pablo Neira Ayuso

From: Florian Westphal <f...@strlen.de>

Dan reports:
 net/netfilter/nft_ct.c:549 nft_ct_set_init()
 error: uninitialized symbol 'len'.

Reported-by: Dan Carpenter <dan.carpen...@oracle.com>
Fixes: edee4f1e924582 ("netfilter: nft_ct: add zone id set support")
Signed-off-by: Florian Westphal <f...@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 net/netfilter/nft_ct.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index c6b8022c0e47..bf548a7a71ec 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -528,6 +528,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
if (!nft_ct_tmpl_alloc_pcpu())
return -ENOMEM;
nft_ct_pcpu_template_refcnt++;
+   len = sizeof(u16);
break;
 #endif
default:
-- 
2.1.4

Re: [PATCH net-next] net/gtp: Add udp source port generation according to flow hash

2017-02-23 Thread Pablo Neira Ayuso

On Thu, Feb 23, 2017 at 03:21:13PM +0100, Andreas Schultz wrote:
> - On Feb 23, 2017, at 2:49 PM, pablo pa...@netfilter.org wrote:
[...]
> > According to specs, section 4.4.2.3 Encapsulated T-PDU, TS 29.281.
> > 
> > "The UDP Source Port is a locally allocated port number at the sending
> > GTP-U entity."
> > 
> > Older specs that refer to GTP-U such as TS 09.60 and TS 29.060 also
> > state the same.
> 
> It is absolutely valid the choose any sending port you want. I only
> think you should know which port you did send on.
> 
> TS 29.281, Sect. 5.2.2.1 defines the UDP port extension to be used
> in error indications. It provides the UDP source port of a G-PDU
> that triggered an error.
> 
> If the send side does not know which port it uses to send, how
> can it use this indication to correlate an error? That's the reason
> I thought it would be better to add the UDP source port to the
> PDP context and allow the control path to assign the source port
> on context creation.
> 
> Of course, this header is optional and the receiving side is not
> required to use it.
> 
> About the RSS spreading in the receive side, I would think that
> a receiver would prefer to process all packets that belong to a
> give TEID with the same receive instance. So keeping the UDP
> source port for a given TEID stable would be beneficial. As far
> as I understand it, the hash used in the patch uses the source
> and destination values from the original flow. This would mean
> that GTP packets that belong to the same TEID would end up with
> different UDP source ports.

The receiver needs to scale up, and that happens if packets are
distributed between CPUs in a way that make sense. I don't think it
makes sense to pass packets that belong to the same tunnel to the same
CPU, this is exactly the scenario that makes DDOS easier to trigger.

> So what about this as a compromise, we dd a UDP source port field
> to the PDP context, it defaults to 0 (zero), the control instance
> can optionally initialize this field, when we hit the xmit code
> and the port is non zero, use that value, if it is zero use the hash?

You want to use this for your VRF concept? I would like that you take
the time to explain us your usecases. How are you going to use this
mapping between tunnels and UDP source ports? An explaination would be
better than searching at some optional (corner case) extension in the
specs whose usage is questionable.

In any case, I think we want a good default behaviour, and I think
Or's patch provides it.

Re: [PATCH net-next] net/gtp: Add udp source port generation according to flow hash

2017-02-23 Thread Pablo Neira Ayuso

On Thu, Feb 23, 2017 at 10:35:56AM +0100, Andreas Schultz wrote:
> - On Feb 22, 2017, at 10:47 PM, Tom Herbert t...@herbertland.com wrote:
[...]
> > This shouldn't be taken as a HW requirement and it's unlikely we'd add
> > explicit GTP support in flow_dissector. If we can't get entropy in the
> > UDP source port then IPv6 flow label is a potential alternative (so
> > that should be supported in NICs for RSS).
> > 
> > I'll also reiterate my previous point about the need for GTP testing--
> > in order for us to be able to evaluate the GTP datapath for things
> > like performance or how they withstand against DDOS we really need an
> > easy way to isolate the datapath.
> 
> GTP as specified is very unsecure by definition. It is meant to be run
> only on *private* mobile carrier and intra mobile carrier EPC networks.
> Running it openly on the public internet would be extremly foolish.
> 
> There are some mechanisms in GTPv1-C on how to handle overload and
> more extensive mechanisms in GTPv2-C for overload handling. The basic
> guiding principle is to simply drop any traffic that it can't handle.
> 
> Anyhow, I havn't seen anything in 3GPP or GSMA documents that deals
> with DDOS.
> 
> There are guidelines like the GSMA's IR.88 that describe how the intra
> carrier roaming should work and what security measures should be
> implemented.
> 
> Traffic coming in at Gi/SGi or form the UE could create a DDOS on tunnel.
> However, on the UE side you still have the RAN (eNODE, SGSN, S-GW) or
> an ePDG that has to apply QoS and thereby limit traffic. On the Gi/SGi
> side side you have the PCEF that does the same.
> 
> So in a complete 3GPP node (GGSN, P-GW) that uses the GTP tunnel
> implementation, malicious traffic should be blocked before it can reach
> the tunnel.
> 
> And as I stated before, the GTP tunnel module is not supposed to be
> use without any of those components. So the DDOS concern should not
> be handled at the tunnel level.

I think that Tom's point is that this tunnel driver will have to deal
with DDOS scenarios anyway, because reality is that you can't always
block it before it can reach the tunnel.

Or's patch helps us deal with this scenario.

Re: [PATCH net-next] net/gtp: Add udp source port generation according to flow hash

2017-02-23 Thread Pablo Neira Ayuso

On Wed, Feb 22, 2017 at 01:47:17PM -0800, Tom Herbert wrote:
> On Wed, Feb 22, 2017 at 1:29 PM, Or Gerlitz  wrote:
> > On Thu, Feb 16, 2017 at 11:58 PM, Andreas Schultz  wrote:
> >> Hi Or,
> >> - On Feb 16, 2017, at 3:59 PM, Or Gerlitz ogerl...@mellanox.com wrote:
> >>
> >>> Generate the source udp header according to the flow represented by
> >>> the packet we are encapsulating, as done for other udp tunnels. This
> >>> helps on the receiver side to apply RSS spreading.
> >>
> >> This might work for GTPv0-U, However, for GTPv1-U this could interfere
> >> with error handling in the user space control process when the UDP port
> >> extension  header is used in error indications.
> >
> >
> > in the document you posted there's this quote "The source IP and port
> > have no meaning and can change at any time" -- I assume it refers to
> > v0? can we identify in the kernel code that we're on v0 and have the
> > patch come into play?
> >
> >> 3GPP TS 29.281 Rel 13, section 5.2.2.1 defines the UDP port extension and
> >> section 7.3.1 says that the UDP source port extension can be used to
> >> mitigate DOS attacks. This would IMHO imply that the user space control
> >> process needs to know the TEID to UDP source port mapping.
> >
> >> The other question is, on what is this actually hashing. When I understand
> >> the code correctly, this will hash on the source/destination of the orignal
> >> flow. I would expect that a SGSN/SGW/eNodeB would like the keep flow
> >> processing on a per TEID base, so the port hashing should be base on the 
> >> TEID.
> >
> > is it possible for packets belonging to the same TCP session or UDP
> > "pseudo session" (given pair of src/dst ip/port) to be encapsulated
> > using different TEID?
> >
> > hashing on the TEID imposes a harder requirement on the NIC HW vs.
> > just UDP based RSS.
> 
> This shouldn't be taken as a HW requirement and it's unlikely we'd add
> explicit GTP support in flow_dissector. If we can't get entropy in the
> UDP source port then IPv6 flow label is a potential alternative (so
> that should be supported in NICs for RSS).

According to specs, section 4.4.2.3 Encapsulated T-PDU, TS 29.281.

"The UDP Source Port is a locally allocated port number at the sending
GTP-U entity."

Older specs that refer to GTP-U such as TS 09.60 and TS 29.060 also
state the same.

So Or patch looks fine to me.

Re: netfilter: nft_ct: add zone id set support

2017-02-23 Thread Pablo Neira Ayuso

On Thu, Feb 23, 2017 at 12:34:35PM +0100, Florian Westphal wrote:
> Geert Uytterhoeven <ge...@linux-m68k.org> wrote:
> > On Wed, Feb 22, 2017 at 8:02 PM, Linux Kernel Mailing List
> > <linux-ker...@vger.kernel.org> wrote:
> > > Web:
> > > https://git.kernel.org/torvalds/c/edee4f1e92458299505ff007733f676b00c516a1
> > > Commit: edee4f1e92458299505ff007733f676b00c516a1
> > > Parent: 5c178d81b69f08ca3195427a6ea9a46d9af23127
> > > Refname:refs/heads/master
> > > Author: Florian Westphal <f...@strlen.de>
> > > AuthorDate: Fri Feb 3 13:35:50 2017 +0100
> > > Committer:  Pablo Neira Ayuso <pa...@netfilter.org>
> > > CommitDate: Wed Feb 8 14:16:23 2017 +0100
> > >
> > Unlike for the other cases of the switch statement, "len" is not initialized
> > here...
> > 
> > > +   break;
> > > priv->sreg = nft_parse_register(tb[NFTA_CT_SREG]);
> > > err = nft_validate_register_load(priv->sreg, len);
> > 
> > ... and used here, which may lead to spurious failures of
> > nft_validate_register_load().
> 
> Yes, Dan reported this and a patch is queued at
> http://patchwork.ozlabs.org/patch/727573/
> 
> Pablo, any reason why this is still waiting?

I just flushing out my nf.git tree via pull request.

Once these changes are pulled, I'll fetch recent net-next changes that
were just merged via net. Then, I'll pick this so we can calm down
these compilation warnings.

Are you OK with this procedure? Thanks!

[PATCH 0/8] Netfilter fixes for net

2017-02-23 Thread Pablo Neira Ayuso

Hi David,

The following patchset contains Netfilter fixes for your net tree,
they are:

1) Revisit warning logic when not applying default helper assignment.
   Jiri Kosina considers we are breaking existing setups and not warning
   our users accordinly now that automatic helper assignment has been
   turned off by default. So let's make him happy by spotting the warning
   by when we find a helper but we cannot attach, instead of warning on the
   former deprecated behaviour. Patch from Jiri Kosina.

2) Two patches to fix regression in ctnetlink interfaces with
   nfnetlink_queue. Specifically, perform more relaxed in CTA_STATUS
   and do not bail out if CTA_HELP indicates the same helper that we
   already have. Patches from Kevin Cernekee.

3) A couple of bugfixes for ipset via Jozsef Kadlecsik. Due to wrong
   index logic in hash set types and null pointer exception in the
   list:set type.

4) hashlimit bails out with correct userspace parameters due to wrong
   arithmetics in the code that avoids "divide by zero" when
   transforming the userspace timing in milliseconds to token credits.
   Patch from Alban Browaeys.

5) Fix incorrect NFQA_VLAN_MAX definition, patch from
   Ken-ichirou MATSUZAWA.

6) Don't not declare nfnetlink batch error list as static, since this
   may be used by several subsystems at the same time. Patch from
   Liping Zhang.

You can pull these changes from:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git

Thanks!



The following changes since commit cafe8df8b9bc9aa3dffa827c1a6757c6cd36f657:

  net: phy: Fix lack of reference count on PHY driver (2017-02-02 22:59:43 
-0500)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git HEAD

for you to fetch changes up to 3ef767e5cbd405abfd01339c7e5daaf98e037be2:

  Merge branch 'master' of git://blackhole.kfki.hu/nf (2017-02-21 14:01:05 
+0100)


Alban Browaeys (1):
  netfilter: xt_hashlimit: Fix integer divide round to zero.

Jiri Kosina (1):
  netfilter: nf_ct_helper: warn when not applying default helper assignment

Jozsef Kadlecsik (1):
  Fix bug: sometimes valid entries in hash:* types of sets were evicted

Ken-ichirou MATSUZAWA (1):
  netfilter: nfnetlink_queue: fix NFQA_VLAN_MAX definition

Kevin Cernekee (2):
  netfilter: ctnetlink: Fix regression in CTA_STATUS processing
  netfilter: ctnetlink: Fix regression in CTA_HELP processing

Liping Zhang (1):
  netfilter: nfnetlink: remove static declaration from err_list

Pablo Neira Ayuso (1):
  Merge branch 'master' of git://blackhole.kfki.hu/nf

Vishwanath Pai (1):
  netfilter: ipset: Null pointer exception in ipset list:set

 include/uapi/linux/netfilter/nf_conntrack_common.h |  4 ++
 include/uapi/linux/netfilter/nfnetlink_queue.h |  2 +-
 net/netfilter/ipset/ip_set_hash_gen.h  |  2 +-
 net/netfilter/ipset/ip_set_list_set.c  |  9 +++--
 net/netfilter/nf_conntrack_helper.c| 39 +---
 net/netfilter/nf_conntrack_netlink.c   | 43 +++---
 net/netfilter/nfnetlink.c  |  2 +-
 net/netfilter/xt_hashlimit.c   | 25 +
 8 files changed, 86 insertions(+), 40 deletions(-)

[PATCH 5/8] netfilter: ipset: Null pointer exception in ipset list:set

2017-02-23 Thread Pablo Neira Ayuso

From: Vishwanath Pai 

If we use before/after to add an element to an empty list it will cause
a kernel panic.

$> cat crash.restore
create a hash:ip
create b hash:ip
create test list:set timeout 5 size 4
add test b before a

$> ipset -R < crash.restore

Executing the above will crash the kernel.

Signed-off-by: Vishwanath Pai 
Reviewed-by: Josh Hunt 
Signed-off-by: Jozsef Kadlecsik 
---
 net/netfilter/ipset/ip_set_list_set.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/net/netfilter/ipset/ip_set_list_set.c 
b/net/netfilter/ipset/ip_set_list_set.c
index 51077c53d76b..178d4eba013b 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -260,11 +260,14 @@ list_set_uadd(struct ip_set *set, void *value, const 
struct ip_set_ext *ext,
else
prev = e;
}
+
+   /* If before/after is used on an empty set */
+   if ((d->before > 0 && !next) ||
+   (d->before < 0 && !prev))
+   return -IPSET_ERR_REF_EXIST;
+
/* Re-add already existing element */
if (n) {
-   if ((d->before > 0 && !next) ||
-   (d->before < 0 && !prev))
-   return -IPSET_ERR_REF_EXIST;
if (!flag_exist)
return -IPSET_ERR_EXIST;
/* Update extensions */
-- 
2.1.4

[PATCH 1/8] netfilter: nf_ct_helper: warn when not applying default helper assignment

2017-02-23 Thread Pablo Neira Ayuso

From: Jiri Kosina <jkos...@suse.cz>

Commit 3bb398d925 ("netfilter: nf_ct_helper: disable automatic helper
assignment") is causing behavior regressions in firewalls, as traffic
handled by conntrack helpers is now by default not passed through even
though it was before due to missing CT targets (which were not necessary
before this commit).

The default had to be switched off due to security reasons [1] [2] and
therefore should stay the way it is, but let's be friendly to firewall
admins and issue a warning the first time we're in situation where packet
would be likely passed through with the old default but we're likely going
to drop it on the floor now.

Rewrite the code a little bit as suggested by Linus, so that we avoid
spaghettiing the code even more -- namely the whole decision making
process regarding helper selection (either automatic or not) is being
separated, so that the whole logic can be simplified and code (condition)
duplication reduced.

[1] https://cansecwest.com/csw12/conntrack-attack.pdf
[2] https://home.regit.org/netfilter-en/secure-use-of-helpers/

Signed-off-by: Jiri Kosina <jkos...@suse.cz>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 net/netfilter/nf_conntrack_helper.c | 39 -
 1 file changed, 26 insertions(+), 13 deletions(-)

diff --git a/net/netfilter/nf_conntrack_helper.c 
b/net/netfilter/nf_conntrack_helper.c
index 7341adf7059d..6dc44d9b4190 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -188,6 +188,26 @@ nf_ct_helper_ext_add(struct nf_conn *ct,
 }
 EXPORT_SYMBOL_GPL(nf_ct_helper_ext_add);
 
+static struct nf_conntrack_helper *
+nf_ct_lookup_helper(struct nf_conn *ct, struct net *net)
+{
+   if (!net->ct.sysctl_auto_assign_helper) {
+   if (net->ct.auto_assign_helper_warned)
+   return NULL;
+   if (!__nf_ct_helper_find(>tuplehash[IP_CT_DIR_REPLY].tuple))
+   return NULL;
+   pr_info("nf_conntrack: default automatic helper assignment "
+   "has been turned off for security reasons and CT-based "
+   " firewall rule not found. Use the iptables CT target "
+   "to attach helpers instead.\n");
+   net->ct.auto_assign_helper_warned = 1;
+   return NULL;
+   }
+
+   return __nf_ct_helper_find(>tuplehash[IP_CT_DIR_REPLY].tuple);
+}
+
+
 int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
  gfp_t flags)
 {
@@ -213,21 +233,14 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct 
nf_conn *tmpl,
}
 
help = nfct_help(ct);
-   if (net->ct.sysctl_auto_assign_helper && helper == NULL) {
-   helper = 
__nf_ct_helper_find(>tuplehash[IP_CT_DIR_REPLY].tuple);
-   if (unlikely(!net->ct.auto_assign_helper_warned && helper)) {
-   pr_info("nf_conntrack: automatic helper "
-   "assignment is deprecated and it will "
-   "be removed soon. Use the iptables CT target "
-   "to attach helpers instead.\n");
-   net->ct.auto_assign_helper_warned = true;
-   }
-   }
 
if (helper == NULL) {
-   if (help)
-   RCU_INIT_POINTER(help->helper, NULL);
-   return 0;
+   helper = nf_ct_lookup_helper(ct, net);
+   if (helper == NULL) {
+   if (help)
+   RCU_INIT_POINTER(help->helper, NULL);
+   return 0;
+   }
}
 
if (help == NULL) {
-- 
2.1.4

[PATCH 8/8] netfilter: nfnetlink: remove static declaration from err_list

2017-02-23 Thread Pablo Neira Ayuso

From: Liping Zhang <zlpnob...@gmail.com>

Otherwise, different subsys will race to access the err_list, with holding
the different nfnl_lock(subsys_id).

But this will not happen now, since ->call_batch is only implemented by
nftables, so the err_list is protected by nfnl_lock(NFNL_SUBSYS_NFTABLES).

Signed-off-by: Liping Zhang <zlpnob...@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 net/netfilter/nfnetlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index a09fa9fd8f3d..6fa448478cba 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -279,7 +279,7 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct 
nlmsghdr *nlh,
struct net *net = sock_net(skb->sk);
const struct nfnetlink_subsystem *ss;
const struct nfnl_callback *nc;
-   static LIST_HEAD(err_list);
+   LIST_HEAD(err_list);
u32 status;
int err;
 
-- 
2.1.4

[PATCH 2/8] netfilter: ctnetlink: Fix regression in CTA_STATUS processing

2017-02-23 Thread Pablo Neira Ayuso

From: Kevin Cernekee <cerne...@chromium.org>

The libnetfilter_conntrack userland library always sets IPS_CONFIRMED
when building a CTA_STATUS attribute.  If this toggles the bit from
0->1, the parser will return an error.  On Linux 4.4+ this will cause any
NFQA_EXP attribute in the packet to be ignored.  This breaks conntrackd's
userland helpers because they operate on unconfirmed connections.

Instead of returning -EBUSY if the user program asks to modify an
unchangeable bit, simply ignore the change.

Also, fix the logic so that user programs are allowed to clear
the bits that they are allowed to change.

Signed-off-by: Kevin Cernekee <cerne...@chromium.org>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
---
 include/uapi/linux/netfilter/nf_conntrack_common.h |  4 
 net/netfilter/nf_conntrack_netlink.c   | 26 +-
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h 
b/include/uapi/linux/netfilter/nf_conntrack_common.h
index 6d074d14ee27..6a8e33dd4ecb 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_common.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_common.h
@@ -82,6 +82,10 @@ enum ip_conntrack_status {
IPS_DYING_BIT = 9,
IPS_DYING = (1 << IPS_DYING_BIT),
 
+   /* Bits that cannot be altered from userland. */
+   IPS_UNCHANGEABLE_MASK = (IPS_NAT_DONE_MASK | IPS_NAT_MASK |
+IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING),
+
/* Connection has fixed timeout. */
IPS_FIXED_TIMEOUT_BIT = 10,
IPS_FIXED_TIMEOUT = (1 << IPS_FIXED_TIMEOUT_BIT),
diff --git a/net/netfilter/nf_conntrack_netlink.c 
b/net/netfilter/nf_conntrack_netlink.c
index 27540455dc62..bf04b7e9d6f7 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -2270,6 +2270,30 @@ ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn 
*ct,
 }
 
 static int
+ctnetlink_update_status(struct nf_conn *ct, const struct nlattr * const cda[])
+{
+   unsigned int status = ntohl(nla_get_be32(cda[CTA_STATUS]));
+   unsigned long d = ct->status ^ status;
+
+   if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
+   /* SEEN_REPLY bit can only be set */
+   return -EBUSY;
+
+   if (d & IPS_ASSURED && !(status & IPS_ASSURED))
+   /* ASSURED bit can only be set */
+   return -EBUSY;
+
+   /* This check is less strict than ctnetlink_change_status()
+* because callers often flip IPS_EXPECTED bits when sending
+* an NFQA_CT attribute to the kernel.  So ignore the
+* unchangeable bits but do not error out.
+*/
+   ct->status = (status & ~IPS_UNCHANGEABLE_MASK) |
+(ct->status & IPS_UNCHANGEABLE_MASK);
+   return 0;
+}
+
+static int
 ctnetlink_glue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct)
 {
int err;
@@ -2280,7 +2304,7 @@ ctnetlink_glue_parse_ct(const struct nlattr *cda[], 
struct nf_conn *ct)
return err;
}
if (cda[CTA_STATUS]) {
-   err = ctnetlink_change_status(ct, cda);
+   err = ctnetlink_update_status(ct, cda);
if (err < 0)
return err;
}
-- 
2.1.4

< 6 7 8 9 10 11 12 13 14 15 >

1001 - 1100 of 2305 matches

Mail list logo