[PATCH V2 1/2] netfilter: ctnetlink: Fix regression in CTA_STATUS processing

2017-01-26 Thread Kevin Cernekee
The libnetfilter_conntrack userland library always sets IPS_CONFIRMED
when building a CTA_STATUS attribute.  If this toggles the bit from
0->1, the parser will return an error.  On Linux 4.4+ this will cause any
NFQA_EXP attribute in the packet to be ignored.  This breaks conntrackd's
userland helpers because they operate on unconfirmed connections.

Instead of returning -EBUSY if the user program asks to modify an
unchangeable bit, simply ignore the change.

Also, fix the logic so that user programs are allowed to clear
the bits that they are allowed to change.

Signed-off-by: Kevin Cernekee 
---


V1->V2:

 - Create a new ctnetlink_update_status() function, per code review feedback
 - Add comment and update changelog, per code review feedback
 - Rebase + retest on net-next

(Note that the original patch 1/3, which fixed a related problem on
CTA_TIMEOUT, is only needed on old kernels like 4.4.)


 include/uapi/linux/netfilter/nf_conntrack_common.h |  4 
 net/netfilter/nf_conntrack_netlink.c   | 27 +-
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h 
b/include/uapi/linux/netfilter/nf_conntrack_common.h
index 6d074d1..6a8e33d 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_common.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_common.h
@@ -82,6 +82,10 @@ enum ip_conntrack_status {
IPS_DYING_BIT = 9,
IPS_DYING = (1 << IPS_DYING_BIT),
 
+   /* Bits that cannot be altered from userland. */
+   IPS_UNCHANGEABLE_MASK = (IPS_NAT_DONE_MASK | IPS_NAT_MASK |
+IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING),
+
/* Connection has fixed timeout. */
IPS_FIXED_TIMEOUT_BIT = 10,
IPS_FIXED_TIMEOUT = (1 << IPS_FIXED_TIMEOUT_BIT),
diff --git a/net/netfilter/nf_conntrack_netlink.c 
b/net/netfilter/nf_conntrack_netlink.c
index 2754045..e8f704a6 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1446,6 +1446,31 @@ ctnetlink_change_status(struct nf_conn *ct, const struct 
nlattr * const cda[])
 }
 
 static int
+ctnetlink_update_status(struct nf_conn *ct, const struct nlattr * const cda[])
+{
+   unsigned long d;
+   unsigned int status = ntohl(nla_get_be32(cda[CTA_STATUS]));
+   d = ct->status ^ status;
+
+   if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
+   /* SEEN_REPLY bit can only be set */
+   return -EBUSY;
+
+   if (d & IPS_ASSURED && !(status & IPS_ASSURED))
+   /* ASSURED bit can only be set */
+   return -EBUSY;
+
+   /* This check is less strict than ctnetlink_change_status()
+* because callers often flip IPS_EXPECTED bits when sending
+* an NFQA_CT attribute to the kernel.  So ignore the
+* unchangeable bits but do not error out.
+*/
+   ct->status = (status & ~IPS_UNCHANGEABLE_MASK) |
+(ct->status & IPS_UNCHANGEABLE_MASK);
+   return 0;
+}
+
+static int
 ctnetlink_setup_nat(struct nf_conn *ct, const struct nlattr * const cda[])
 {
 #ifdef CONFIG_NF_NAT_NEEDED
@@ -2280,7 +2305,7 @@ ctnetlink_glue_parse_ct(const struct nlattr *cda[], 
struct nf_conn *ct)
return err;
}
if (cda[CTA_STATUS]) {
-   err = ctnetlink_change_status(ct, cda);
+   err = ctnetlink_update_status(ct, cda);
if (err < 0)
return err;
}
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 net] net: free ip_vs_dest structs when refcnt=0

2017-01-26 Thread Julian Anastasov

Hello,

On Mon, 23 Jan 2017, David Windsor wrote:

> Currently, the ip_vs_dest cache frees ip_vs_dest objects when their
> reference count becomes < 0.  Aside from not being semantically sound,
> this is problematic for the new type refcount_t, which will be introduced
> shortly in a separate patch. refcount_t is the new kernel type for
> holding reference counts, and provides overflow protection and a
> constrained interface relative to atomic_t (the type currently being
> used for kernel reference counts).
> 
> Per Julian Anastasov: "The problem is that dest_trash currently holds
> deleted dests (unlinked from RCU lists) with refcnt=0."  Changing
> dest_trash to hold dest with refcnt=1 will allow us to free ip_vs_dest
> structs when their refcnt=0, in ip_vs_dest_put_and_free().
> 
> Signed-off-by: David Windsor 

Thanks! I tested the first version and this one
just adds the needed changes in comments, so

Signed-off-by: Julian Anastasov 

Simon and Pablo, this is more appropriate for
ipvs-next/nf-next. Please apply!

> ---
>  include/net/ip_vs.h| 2 +-
>  net/netfilter/ipvs/ip_vs_ctl.c | 8 +++-
>  2 files changed, 4 insertions(+), 6 deletions(-)
> 
> diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
> index cd6018a..a3e78ad 100644
> --- a/include/net/ip_vs.h
> +++ b/include/net/ip_vs.h
> @@ -1421,7 +1421,7 @@ static inline void ip_vs_dest_put(struct ip_vs_dest 
> *dest)
>  
>  static inline void ip_vs_dest_put_and_free(struct ip_vs_dest *dest)
>  {
> - if (atomic_dec_return(>refcnt) < 0)
> + if (atomic_dec_and_test(>refcnt))
>   kfree(dest);
>  }
>  
> diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
> index 55e0169..5fc4836 100644
> --- a/net/netfilter/ipvs/ip_vs_ctl.c
> +++ b/net/netfilter/ipvs/ip_vs_ctl.c
> @@ -711,7 +711,6 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, int 
> dest_af,
> dest->vport == svc->port))) {
>   /* HIT */
>   list_del(>t_list);
> - ip_vs_dest_hold(dest);
>   goto out;
>   }
>   }
> @@ -741,7 +740,7 @@ static void ip_vs_dest_free(struct ip_vs_dest *dest)
>   *  When the ip_vs_control_clearup is activated by ipvs module exit,
>   *  the service tables must have been flushed and all the connections
>   *  are expired, and the refcnt of each destination in the trash must
> - *  be 0, so we simply release them here.
> + *  be 1, so we simply release them here.
>   */
>  static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs)
>  {
> @@ -1080,11 +1079,10 @@ static void __ip_vs_del_dest(struct netns_ipvs *ipvs, 
> struct ip_vs_dest *dest,
>   if (list_empty(>dest_trash) && !cleanup)
>   mod_timer(>dest_trash_timer,
> jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
> - /* dest lives in trash without reference */
> + /* dest lives in trash with reference */
>   list_add(>t_list, >dest_trash);
>   dest->idle_start = 0;
>   spin_unlock_bh(>dest_trash_lock);
> - ip_vs_dest_put(dest);
>  }
>  
>  
> @@ -1160,7 +1158,7 @@ static void ip_vs_dest_trash_expire(unsigned long data)
>  
>   spin_lock(>dest_trash_lock);
>   list_for_each_entry_safe(dest, next, >dest_trash, t_list) {
> - if (atomic_read(>refcnt) > 0)
> + if (atomic_read(>refcnt) > 1)
>   continue;
>   if (dest->idle_start) {
>   if (time_before(now, dest->idle_start +
> -- 
> 2.7.4

Regards

--
Julian Anastasov 
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 02/14] tcp: fix mark propagation with fwmark_reflect enabled

2017-01-26 Thread Eric Dumazet
On Thu, 2017-01-26 at 20:19 +0100, Pablo Neira Ayuso wrote:

> Right. This is not percpu as in IPv4.
> 
> I can send a follow up patch to get this in sync with the way we do it
> in IPv4, ie. add percpu socket.
> 
> Fine with this approach? Thanks!

Not really.

percpu sockets are going to slow down network namespace creation /
deletion and increase memory foot print.

IPv6 is cleaner because it does not really have to use different
sockets.

Ultimately would would like to have the same for IPv4.

I would rather carry the mark either in an additional parameter,
or in the flow that is already passed as a parameter.



--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 02/14] tcp: fix mark propagation with fwmark_reflect enabled

2017-01-26 Thread Pablo Neira Ayuso
On Thu, Jan 26, 2017 at 10:02:40AM -0800, Eric Dumazet wrote:
> On Thu, 2017-01-26 at 17:37 +0100, Pablo Neira Ayuso wrote:
> > From: Pau Espin Pedrol 
> > 
> > Otherwise, RST packets generated by the TCP stack for non-existing
> > sockets always have mark 0.
> > The mark from the original packet is assigned to the netns_ipv4/6
> > socket used to send the response so that it can get copied into the
> > response skb when the socket sends it.
> > 
> > Fixes: e110861f8609 ("net: add a sysctl to reflect the fwmark on replies")
> > Cc: Lorenzo Colitti 
> > Signed-off-by: Pau Espin Pedrol 
> > Signed-off-by: Pablo Neira Ayuso 
> > ---
> >  net/ipv4/ip_output.c | 1 +
> >  net/ipv6/tcp_ipv6.c  | 1 +
> >  2 files changed, 2 insertions(+)
> > 
> > diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
> > index fac275c48108..b67719f45953 100644
> > --- a/net/ipv4/ip_output.c
> > +++ b/net/ipv4/ip_output.c
> > @@ -1629,6 +1629,7 @@ void ip_send_unicast_reply(struct sock *sk, struct 
> > sk_buff *skb,
> > sk->sk_protocol = ip_hdr(skb)->protocol;
> > sk->sk_bound_dev_if = arg->bound_dev_if;
> > sk->sk_sndbuf = sysctl_wmem_default;
> > +   sk->sk_mark = fl4.flowi4_mark;
> > err = ip_append_data(sk, , ip_reply_glue_bits, arg->iov->iov_base,
> >  len, 0, , , MSG_DONTWAIT);
> > if (unlikely(err)) {
> > diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> > index 73bc8fc68acd..2b20622a5824 100644
> > --- a/net/ipv6/tcp_ipv6.c
> > +++ b/net/ipv6/tcp_ipv6.c
> > @@ -840,6 +840,7 @@ static void tcp_v6_send_response(const struct sock *sk, 
> > struct sk_buff *skb, u32
> > dst = ip6_dst_lookup_flow(ctl_sk, , NULL);
> > if (!IS_ERR(dst)) {
> > skb_dst_set(buff, dst);
> > +   ctl_sk->sk_mark = fl6.flowi6_mark;
> > ip6_xmit(ctl_sk, buff, , NULL, tclass);
> > TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
> > if (rst)
> 
> 
> This patch is wrong.
> 
> ctl_sk is a shared socket, and tcp_v6_send_response() can be called from
> many different cpus at the same time.

Right. This is not percpu as in IPv4.

I can send a follow up patch to get this in sync with the way we do it
in IPv4, ie. add percpu socket.

Fine with this approach? Thanks!
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 02/14] tcp: fix mark propagation with fwmark_reflect enabled

2017-01-26 Thread Eric Dumazet
On Thu, 2017-01-26 at 17:37 +0100, Pablo Neira Ayuso wrote:
> From: Pau Espin Pedrol 
> 
> Otherwise, RST packets generated by the TCP stack for non-existing
> sockets always have mark 0.
> The mark from the original packet is assigned to the netns_ipv4/6
> socket used to send the response so that it can get copied into the
> response skb when the socket sends it.
> 
> Fixes: e110861f8609 ("net: add a sysctl to reflect the fwmark on replies")
> Cc: Lorenzo Colitti 
> Signed-off-by: Pau Espin Pedrol 
> Signed-off-by: Pablo Neira Ayuso 
> ---
>  net/ipv4/ip_output.c | 1 +
>  net/ipv6/tcp_ipv6.c  | 1 +
>  2 files changed, 2 insertions(+)
> 
> diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
> index fac275c48108..b67719f45953 100644
> --- a/net/ipv4/ip_output.c
> +++ b/net/ipv4/ip_output.c
> @@ -1629,6 +1629,7 @@ void ip_send_unicast_reply(struct sock *sk, struct 
> sk_buff *skb,
>   sk->sk_protocol = ip_hdr(skb)->protocol;
>   sk->sk_bound_dev_if = arg->bound_dev_if;
>   sk->sk_sndbuf = sysctl_wmem_default;
> + sk->sk_mark = fl4.flowi4_mark;
>   err = ip_append_data(sk, , ip_reply_glue_bits, arg->iov->iov_base,
>len, 0, , , MSG_DONTWAIT);
>   if (unlikely(err)) {
> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> index 73bc8fc68acd..2b20622a5824 100644
> --- a/net/ipv6/tcp_ipv6.c
> +++ b/net/ipv6/tcp_ipv6.c
> @@ -840,6 +840,7 @@ static void tcp_v6_send_response(const struct sock *sk, 
> struct sk_buff *skb, u32
>   dst = ip6_dst_lookup_flow(ctl_sk, , NULL);
>   if (!IS_ERR(dst)) {
>   skb_dst_set(buff, dst);
> + ctl_sk->sk_mark = fl6.flowi6_mark;
>   ip6_xmit(ctl_sk, buff, , NULL, tclass);
>   TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
>   if (rst)


This patch is wrong.

ctl_sk is a shared socket, and tcp_v6_send_response() can be called from
many different cpus at the same time.



--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: ip_rcv_finish() NULL pointer kernel panic

2017-01-26 Thread Eric Dumazet
On Thu, 2017-01-26 at 10:00 -0800, Eric Dumazet wrote:
> On Thu, 2017-01-26 at 17:24 +0100, Florian Westphal wrote:
> 
> > I think it makes sense to set dst->incoming
> > to a stub in br_netfilter_rtable_init() to just kfree_skb()+
> > WARN_ON_ONCE(), no need to add code to ip stack or crash kernel
> > due to brnf bug.
> 
> Just kfree_skb() would hide bugs.
> 
> Dropping packets is not uncommon in networking...
> 
> I would rather use at least a WARN_ON_ONCE() before the kfree_skb() ;)


Oh well, I obviously did not parse properly your suggestion.


--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: ip_rcv_finish() NULL pointer kernel panic

2017-01-26 Thread Eric Dumazet
On Thu, 2017-01-26 at 17:24 +0100, Florian Westphal wrote:

> I think it makes sense to set dst->incoming
> to a stub in br_netfilter_rtable_init() to just kfree_skb()+
> WARN_ON_ONCE(), no need to add code to ip stack or crash kernel
> due to brnf bug.

Just kfree_skb() would hide bugs.

Dropping packets is not uncommon in networking...

I would rather use at least a WARN_ON_ONCE() before the kfree_skb() ;)



--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: ip_rcv_finish() NULL pointer kernel panic

2017-01-26 Thread David Miller
From: Florian Westphal 
Date: Thu, 26 Jan 2017 17:24:33 +0100

> Eric Dumazet  wrote:
>> > Though possibly with different things not setting the "input" function 
>> > pointer in the "struct dst_entry".
>> > 
>> > include/net/dst.h:
>> >   496 static inline int dst_input(struct sk_buff *skb) {
>> >   498 return skb_dst(skb)->input(skb);
>> >   499 }
>> > 
>> > Is there any reason not to check to see if this pointer is NULL before 
>> > blindly calling it ?
>> 
>> Sure. It can not be NULL at this point.
>> 
>> Just look at the code in ip_rcv_finish() :
>> 
>> It first make sure to get a valid dst.
>> 
>> Something is broken, probably in bridge netfilter code.
>> 
>> I suspect the dst here points to >fake_rtable, and this is not
>> expected.
>> 
>> br_drop_fake_rtable() should have been called somewhere ...
> 
> I think it makes sense to set dst->incoming
> to a stub in br_netfilter_rtable_init() to just kfree_skb()+
> WARN_ON_ONCE(), no need to add code to ip stack or crash kernel
> due to brnf bug.

That would certainly make recovery from such bugs must better.

But I have to say that this netfilter bridging fake dst has caused
several dozen bugs over the years, it is fundamentally a serious
problem in and of itself.  It provides DST facilities by hand, in a
static object, without using any of the usual methods for creating and
facilitating dst objects.

Therefore every time someone makes an adjustment to common dst code,
this turd (and yes, it _is_ a turd) breaks.  Every single time.

So in the long term, instead of polishing this turd, let's get rid of
it.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH nft v3 4/6] src: Allow list stateful objects in a table

2017-01-26 Thread Elise Lennion
Currently, stateful objects can be listed by: listing all objects in
all tables; listing a single object in a table. Now it's allowed to
list all objects in a table.

$ nft list counters table filter
table ip filter {
counter https-traffic {
packets 14825 bytes 950063
}
counter http-traffic {
packets 117 bytes 9340
}
}

$ nft list quotas table filter
table ip filter {
quota https-quota {
25 mbytes used 2 mbytes
}
quota http-quota {
25 mbytes used 10 kbytes
}
}

Signed-off-by: Elise Lennion 
---

 v3: Created in v3.

 src/evaluate.c | 10 --
 src/parser_bison.y |  8 
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/src/evaluate.c b/src/evaluate.c
index ed41bd8..379f9d7 100644
--- a/src/evaluate.c
+++ b/src/evaluate.c
@@ -2950,10 +2950,16 @@ static int cmd_evaluate_list(struct eval_ctx *ctx, 
struct cmd *cmd)
return cmd_error(ctx, "Could not process rule: Object 
'%s' does not exist",
 cmd->handle.obj);
return 0;
-   case CMD_OBJ_CHAINS:
-   case CMD_OBJ_SETS:
case CMD_OBJ_COUNTERS:
case CMD_OBJ_QUOTAS:
+   if (cmd->handle.table == NULL)
+   return 0;
+   if (table_lookup(>handle) == NULL)
+   return cmd_error(ctx, "Could not process rule: Table 
'%s' does not exist",
+cmd->handle.table);
+   return 0;
+   case CMD_OBJ_CHAINS:
+   case CMD_OBJ_SETS:
case CMD_OBJ_RULESET:
case CMD_OBJ_FLOWTABLES:
case CMD_OBJ_MAPS:
diff --git a/src/parser_bison.y b/src/parser_bison.y
index a1b8b08..d543e3e 100644
--- a/src/parser_bison.y
+++ b/src/parser_bison.y
@@ -932,6 +932,10 @@ list_cmd   :   TABLE   table_spec
{
$$ = cmd_alloc(CMD_LIST, CMD_OBJ_COUNTERS, &$2, 
&@$, NULL);
}
+   |   COUNTERSTABLE   table_spec
+   {
+   $$ = cmd_alloc(CMD_LIST, CMD_OBJ_COUNTERS, &$3, 
&@$, NULL);
+   }
|   COUNTER obj_spec
{
$$ = cmd_alloc(CMD_LIST, CMD_OBJ_COUNTER, &$2, 
&@$, NULL);
@@ -940,6 +944,10 @@ list_cmd   :   TABLE   table_spec
{
$$ = cmd_alloc(CMD_LIST, CMD_OBJ_QUOTAS, &$2, 
&@$, NULL);
}
+   |   QUOTAS  TABLE   table_spec
+   {
+   $$ = cmd_alloc(CMD_LIST, CMD_OBJ_QUOTAS, &$3, 
&@$, NULL);
+   }
|   QUOTA   obj_spec
{
$$ = cmd_alloc(CMD_LIST, CMD_OBJ_QUOTA, &$2, 
&@$, NULL);
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH nft v3 5/6] tests: py: Add suport for stateful objects in python tests

2017-01-26 Thread Elise Lennion
This allows to write pytests using the new stateful objects.

To add an object use the symbol '%', followed by the name, type and
specifications (currently used in quota):

%cnt1 type counter;ok # Adds the counter cnt1 to all tables

%qt1 type quota over 25 mbytes;ok # Adds the quota qt1 to all tables

Signed-off-by: Elise Lennion 
---

 v3: Created in v3.

 tests/py/nft-test.py | 131 +++
 1 file changed, 131 insertions(+)

diff --git a/tests/py/nft-test.py b/tests/py/nft-test.py
index 62b7942..2500921 100755
--- a/tests/py/nft-test.py
+++ b/tests/py/nft-test.py
@@ -27,6 +27,7 @@ log_file = None
 table_list = []
 chain_list = []
 all_set = dict()
+obj_list = []
 signal_received = 0
 
 
@@ -83,6 +84,20 @@ class Set:
 return self.__dict__ == other.__dict__
 
 
+class Obj:
+"""Class that represents an object"""
+
+def __init__(self, table, family, name, type, spcf):
+self.table = table
+self.family = family
+self.name = name
+self.type = type
+self.spcf = spcf
+
+def __eq__(self, other):
+return self.__dict__ == other.__dict__
+
+
 def print_msg(reason, filename=None, lineno=None, color=None, errstr=None):
 '''
 Prints a message with nice colors, indicating file and line number.
@@ -472,6 +487,91 @@ def set_check_element(rule1, rule2):
 
 return cmp(rule1[end1:], rule2[end2:])
 
+
+def obj_add(o, test_result, filename, lineno):
+'''
+Adds an object.
+'''
+if not table_list:
+reason = "Missing table to add rule"
+print_error(reason, filename, lineno)
+return -1
+
+for table in table_list:
+o.table = table.name
+o.family = table.family
+obj_handle = o.type + " " + o.name
+if _obj_exist(o, filename, lineno):
+reason = "The " + obj_handle + " already exists in " + table.name
+print_error(reason, filename, lineno)
+return -1
+
+table_handle = " " + table.family + " " + table.name + " "
+
+cmd = NFT_BIN + " add " + o.type + table_handle + o.name + " " + o.spcf
+ret = execute_cmd(cmd, filename, lineno)
+
+if (ret == 0 and test_result == "fail") or \
+(ret != 0 and test_result == "ok"):
+reason = cmd + ": " + "I cannot add the " + obj_handle
+print_error(reason, filename, lineno)
+return -1
+
+if not _obj_exist(o, filename, lineno):
+reason = "I have just added the " + obj_handle + \
+ " to the table " + table.name + " but it does not exist"
+print_error(reason, filename, lineno)
+return -1
+
+
+def obj_delete(table, filename=None, lineno=None):
+'''
+Deletes object.
+'''
+for o in obj_list:
+obj_handle = o.type + " " + o.name
+# Check if exists the obj
+if not obj_exist(o, table, filename, lineno):
+reason = "The " + obj_handle + " does not exist, I cannot delete 
it"
+print_error(reason, filename, lineno)
+return -1
+
+# We delete the object.
+table_info = " " + table.family + " " + table.name + " "
+cmd = NFT_BIN + " delete " + o.type + table_info + " " + o.name
+ret = execute_cmd(cmd, filename, lineno)
+
+# Check if the object still exists after I deleted it.
+if ret != 0 or obj_exist(o, table, filename, lineno):
+reason = "Cannot remove the " + obj_handle
+print_error(reason, filename, lineno)
+return -1
+
+return 0
+
+
+def obj_exist(o, table, filename, lineno):
+'''
+Check if the object exists.
+'''
+table_handle = " " + table.family + " " + table.name + " "
+cmd = NFT_BIN + " list -nnn " + o.type + table_handle + o.name
+ret = execute_cmd(cmd, filename, lineno)
+
+return True if (ret == 0) else False
+
+
+def _obj_exist(o, filename, lineno):
+'''
+Check if the object exists.
+'''
+table_handle = " " + o.family + " " + o.table + " "
+cmd = NFT_BIN + " list -nnn " + o.type + table_handle + o.name
+ret = execute_cmd(cmd, filename, lineno)
+
+return True if (ret == 0) else False
+
+
 def output_clean(pre_output, chain):
 pos_chain = pre_output.find(chain.name)
 if pos_chain == -1:
@@ -684,6 +784,8 @@ def cleanup_on_exit():
 chain_delete(chain, table, "", "")
 if all_set:
 set_delete(table)
+if obj_list:
+obj_delete(table)
 table_delete(table)
 
 
@@ -775,6 +877,26 @@ def set_element_process(element_line, filename, lineno):
 return set_add_elements(set_element, set_name, rule_state, filename, 
lineno)
 
 
+def obj_process(obj_line, filename, lineno):
+test_result = obj_line[1]
+
+tokens = obj_line[0].split(" ")
+obj_name = tokens[0]
+obj_type = tokens[2]
+obj_spcf = ""
+
+if len(tokens) > 3:
+  

[PATCH nft v3 2/6] src: Allow list single stateful object

2017-01-26 Thread Elise Lennion
Currently the stateful objects can only be listed in groups. With this
patch listing a single object is allowed:

$ nft list counter filter https-traffic
table ip filter {
counter https-traffic {
packets 4014 bytes 228948
}
}

$ nft list quota filter https-quota
table ip filter {
quota https-quota {
25 mbytes used 278 kbytes
}
}

Signed-off-by: Elise Lennion 
---

 v2: No changes.
 v3: Fixed a bug, which counters with the same name in different tables
 were listed. Also, removed some code duplication in evaluate.c.

 src/evaluate.c | 14 ++
 src/rule.c | 12 +++-
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/src/evaluate.c b/src/evaluate.c
index 1d2f925..dab7cfc 100644
--- a/src/evaluate.c
+++ b/src/evaluate.c
@@ -2882,6 +2882,7 @@ static int cmd_evaluate_list(struct eval_ctx *ctx, struct 
cmd *cmd)
struct table *table;
struct set *set;
int ret;
+   uint32_t obj_type = NFT_OBJECT_UNSPEC;
 
ret = cache_update(cmd->op, ctx->msgs);
if (ret < 0)
@@ -2936,6 +2937,19 @@ static int cmd_evaluate_list(struct eval_ctx *ctx, 
struct cmd *cmd)
return cmd_error(ctx, "Could not process rule: Chain 
'%s' does not exist",
 cmd->handle.chain);
return 0;
+   case CMD_OBJ_QUOTA:
+   obj_type = NFT_OBJECT_QUOTA;
+   case CMD_OBJ_COUNTER:
+   if (obj_type == NFT_OBJECT_UNSPEC)
+   obj_type = NFT_OBJECT_COUNTER;
+   table = table_lookup(>handle);
+   if (table == NULL)
+   return cmd_error(ctx, "Could not process rule: Table 
'%s' does not exist",
+cmd->handle.table);
+   if (obj_lookup(table, cmd->handle.obj, obj_type) == NULL)
+   return cmd_error(ctx, "Could not process rule: Object 
'%s' does not exist",
+cmd->handle.obj);
+   return 0;
case CMD_OBJ_CHAINS:
case CMD_OBJ_SETS:
case CMD_OBJ_COUNTERS:
diff --git a/src/rule.c b/src/rule.c
index a9f3a49..0d58073 100644
--- a/src/rule.c
+++ b/src/rule.c
@@ -1276,8 +1276,16 @@ static int do_list_obj(struct netlink_ctx *ctx, struct 
cmd *cmd, uint32_t type)
   family2str(table->handle.family),
   table->handle.table);
 
+   if (cmd->handle.table != NULL &&
+   strcmp(cmd->handle.table, table->handle.table)) {
+   printf("}\n");
+   continue;
+   }
+
list_for_each_entry(obj, >objs, list) {
-   if (obj->type != type)
+   if (obj->type != type ||
+   (cmd->handle.obj != NULL &&
+strcmp(cmd->handle.obj, obj->handle.obj)))
continue;
 
obj_print_declaration(obj, );
@@ -1420,8 +1428,10 @@ static int do_command_list(struct netlink_ctx *ctx, 
struct cmd *cmd)
return do_list_sets(ctx, cmd);
case CMD_OBJ_MAP:
return do_list_set(ctx, cmd, table);
+   case CMD_OBJ_COUNTER:
case CMD_OBJ_COUNTERS:
return do_list_obj(ctx, cmd, NFT_OBJECT_COUNTER);
+   case CMD_OBJ_QUOTA:
case CMD_OBJ_QUOTAS:
return do_list_obj(ctx, cmd, NFT_OBJECT_QUOTA);
default:
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 10/14] netfilter: nf_tables: validate the name size when possible

2017-01-26 Thread Pablo Neira Ayuso
From: Liping Zhang 

Currently, if the user add a stateful object with the name size exceed
NFT_OBJ_MAXNAMELEN - 1 (i.e. 31), we truncate it down to 31 silently.
This is not friendly, furthermore, this will cause duplicated stateful
objects when the first 31 characters of the name is same. So limit the
stateful object's name size to NFT_OBJ_MAXNAMELEN - 1.

After apply this patch, error message will be printed out like this:
  # name_32=$(printf "%0.sQ" {1..32})
  # nft add counter filter $name_32
  :1:1-52: Error: Could not process rule: Numerical result out
  of range
  add counter filter 
  

Also this patch cleans up the codes which missing the name size limit
validation in nftables.

Fixes: e50092404c1b ("netfilter: nf_tables: add stateful objects")
Signed-off-by: Liping Zhang 
Signed-off-by: Pablo Neira Ayuso 
---
 net/netfilter/nf_tables_api.c | 21 ++---
 net/netfilter/nft_dynset.c|  3 ++-
 net/netfilter/nft_lookup.c|  3 ++-
 net/netfilter/nft_objref.c|  6 --
 4 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 091d2dcc63b2..b84c7b25219b 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -928,7 +928,8 @@ static struct nft_chain *nf_tables_chain_lookup(const 
struct nft_table *table,
 }
 
 static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = {
-   [NFTA_CHAIN_TABLE]  = { .type = NLA_STRING },
+   [NFTA_CHAIN_TABLE]  = { .type = NLA_STRING,
+   .len = NFT_TABLE_MAXNAMELEN - 1 },
[NFTA_CHAIN_HANDLE] = { .type = NLA_U64 },
[NFTA_CHAIN_NAME]   = { .type = NLA_STRING,
.len = NFT_CHAIN_MAXNAMELEN - 1 },
@@ -1854,7 +1855,8 @@ static struct nft_rule *nf_tables_rule_lookup(const 
struct nft_chain *chain,
 }
 
 static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = {
-   [NFTA_RULE_TABLE]   = { .type = NLA_STRING },
+   [NFTA_RULE_TABLE]   = { .type = NLA_STRING,
+   .len = NFT_TABLE_MAXNAMELEN - 1 },
[NFTA_RULE_CHAIN]   = { .type = NLA_STRING,
.len = NFT_CHAIN_MAXNAMELEN - 1 },
[NFTA_RULE_HANDLE]  = { .type = NLA_U64 },
@@ -2443,7 +2445,8 @@ nft_select_set_ops(const struct nlattr * const nla[],
 }
 
 static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
-   [NFTA_SET_TABLE]= { .type = NLA_STRING },
+   [NFTA_SET_TABLE]= { .type = NLA_STRING,
+   .len = NFT_TABLE_MAXNAMELEN - 1 },
[NFTA_SET_NAME] = { .type = NLA_STRING,
.len = NFT_SET_MAXNAMELEN - 1 },
[NFTA_SET_FLAGS]= { .type = NLA_U32 },
@@ -3192,8 +3195,10 @@ static const struct nla_policy 
nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = {
 };
 
 static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX 
+ 1] = {
-   [NFTA_SET_ELEM_LIST_TABLE]  = { .type = NLA_STRING },
-   [NFTA_SET_ELEM_LIST_SET]= { .type = NLA_STRING },
+   [NFTA_SET_ELEM_LIST_TABLE]  = { .type = NLA_STRING,
+   .len = NFT_TABLE_MAXNAMELEN - 1 },
+   [NFTA_SET_ELEM_LIST_SET]= { .type = NLA_STRING,
+   .len = NFT_SET_MAXNAMELEN - 1 },
[NFTA_SET_ELEM_LIST_ELEMENTS]   = { .type = NLA_NESTED },
[NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 },
 };
@@ -4032,8 +4037,10 @@ struct nft_object *nf_tables_obj_lookup(const struct 
nft_table *table,
 EXPORT_SYMBOL_GPL(nf_tables_obj_lookup);
 
 static const struct nla_policy nft_obj_policy[NFTA_OBJ_MAX + 1] = {
-   [NFTA_OBJ_TABLE]= { .type = NLA_STRING },
-   [NFTA_OBJ_NAME] = { .type = NLA_STRING },
+   [NFTA_OBJ_TABLE]= { .type = NLA_STRING,
+   .len = NFT_TABLE_MAXNAMELEN - 1 },
+   [NFTA_OBJ_NAME] = { .type = NLA_STRING,
+   .len = NFT_OBJ_MAXNAMELEN - 1 },
[NFTA_OBJ_TYPE] = { .type = NLA_U32 },
[NFTA_OBJ_DATA] = { .type = NLA_NESTED },
 };
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 7de2f46734a4..049ad2d9ee66 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -98,7 +98,8 @@ static void nft_dynset_eval(const struct nft_expr *expr,
 }
 
 static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = {
-   [NFTA_DYNSET_SET_NAME]  = { .type = NLA_STRING },
+   [NFTA_DYNSET_SET_NAME]  = { .type = NLA_STRING,
+   .len = NFT_SET_MAXNAMELEN - 

[PATCH 01/14] netfilter: use fwmark_reflect in nf_send_reset

2017-01-26 Thread Pablo Neira Ayuso
From: Pau Espin Pedrol 

Otherwise, RST packets generated by ipt_REJECT always have mark 0 when
the routing is checked later in the same code path.

Fixes: e110861f8609 ("net: add a sysctl to reflect the fwmark on replies")
Cc: Lorenzo Colitti 
Signed-off-by: Pau Espin Pedrol 
Signed-off-by: Pablo Neira Ayuso 
---
 net/ipv4/netfilter/nf_reject_ipv4.c | 2 ++
 net/ipv6/netfilter/nf_reject_ipv6.c | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c 
b/net/ipv4/netfilter/nf_reject_ipv4.c
index fd8220213afc..146d86105183 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -126,6 +126,8 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, 
int hook)
/* ip_route_me_harder expects skb->dst to be set */
skb_dst_set_noref(nskb, skb_dst(oldskb));
 
+   nskb->mark = IP4_REPLY_MARK(net, oldskb->mark);
+
skb_reserve(nskb, LL_MAX_HEADER);
niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
   ip4_dst_hoplimit(skb_dst(nskb)));
diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c 
b/net/ipv6/netfilter/nf_reject_ipv6.c
index 10090400c72f..eedee5d108d9 100644
--- a/net/ipv6/netfilter/nf_reject_ipv6.c
+++ b/net/ipv6/netfilter/nf_reject_ipv6.c
@@ -157,6 +157,7 @@ void nf_send_reset6(struct net *net, struct sk_buff 
*oldskb, int hook)
fl6.fl6_sport = otcph->dest;
fl6.fl6_dport = otcph->source;
fl6.flowi6_oif = l3mdev_master_ifindex(skb_dst(oldskb)->dev);
+   fl6.flowi6_mark = IP6_REPLY_MARK(net, oldskb->mark);
security_skb_classify_flow(oldskb, flowi6_to_flowi());
dst = ip6_route_output(net, NULL, );
if (dst->error) {
@@ -180,6 +181,8 @@ void nf_send_reset6(struct net *net, struct sk_buff 
*oldskb, int hook)
 
skb_dst_set(nskb, dst);
 
+   nskb->mark = fl6.flowi6_mark;
+
skb_reserve(nskb, hh_len + dst->header_len);
ip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP,
ip6_dst_hoplimit(dst));
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 11/14] netfilter: nft_log: restrict the log prefix length to 127

2017-01-26 Thread Pablo Neira Ayuso
From: Liping Zhang 

First, log prefix will be truncated to NF_LOG_PREFIXLEN-1, i.e. 127,
at nf_log_packet(), so the extra part is useless.

Second, after adding a log rule with a very very long prefix, we will
fail to dump the nft rules after this _special_ one, but acctually,
they do exist. For example:
  # name_65000=$(printf "%0.sQ" {1..65000})
  # nft add rule filter output log prefix "$name_65000"
  # nft add rule filter output counter
  # nft add rule filter output counter
  # nft list chain filter output
  table ip filter {
  chain output {
  type filter hook output priority 0; policy accept;
  }
  }

So now, restrict the log prefix length to NF_LOG_PREFIXLEN-1.

Fixes: 96518518cc41 ("netfilter: add nftables")
Signed-off-by: Liping Zhang 
Signed-off-by: Pablo Neira Ayuso 
---
 include/uapi/linux/netfilter/nf_log.h | 2 ++
 net/netfilter/nf_log.c| 1 -
 net/netfilter/nft_log.c   | 3 ++-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/netfilter/nf_log.h 
b/include/uapi/linux/netfilter/nf_log.h
index 8be21e02387d..d0b5fa91ff54 100644
--- a/include/uapi/linux/netfilter/nf_log.h
+++ b/include/uapi/linux/netfilter/nf_log.h
@@ -9,4 +9,6 @@
 #define NF_LOG_MACDECODE   0x20/* Decode MAC header */
 #define NF_LOG_MASK0x2f
 
+#define NF_LOG_PREFIXLEN   128
+
 #endif /* _NETFILTER_NF_LOG_H */
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 3dca90dc24ad..ffb9e8ada899 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -13,7 +13,6 @@
 /* Internal logging interface, which relies on the real
LOG target modules */
 
-#define NF_LOG_PREFIXLEN   128
 #define NFLOGGER_NAME_LEN  64
 
 static struct nf_logger __rcu *loggers[NFPROTO_NUMPROTO][NF_LOG_TYPE_MAX] 
__read_mostly;
diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
index 6271e40a3dd6..6f6e64423643 100644
--- a/net/netfilter/nft_log.c
+++ b/net/netfilter/nft_log.c
@@ -39,7 +39,8 @@ static void nft_log_eval(const struct nft_expr *expr,
 
 static const struct nla_policy nft_log_policy[NFTA_LOG_MAX + 1] = {
[NFTA_LOG_GROUP]= { .type = NLA_U16 },
-   [NFTA_LOG_PREFIX]   = { .type = NLA_STRING },
+   [NFTA_LOG_PREFIX]   = { .type = NLA_STRING,
+   .len = NF_LOG_PREFIXLEN - 1 },
[NFTA_LOG_SNAPLEN]  = { .type = NLA_U32 },
[NFTA_LOG_QTHRESHOLD]   = { .type = NLA_U16 },
[NFTA_LOG_LEVEL]= { .type = NLA_U32 },
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 12/14] netfilter: nf_tables: fix set->nelems counting with no NLM_F_EXCL

2017-01-26 Thread Pablo Neira Ayuso
If the element exists and no NLM_F_EXCL is specified, do not bump
set->nelems, otherwise we leak one set element slot. This problem
amplifies if the set is full since the abort path always decrements the
counter for the -ENFILE case too, giving one spare extra slot.

Fix this by moving set->nelems update to nft_add_set_elem() after
successful element insertion. Moreover, remove the element if the set is
full so there is no need to rely on the abort path to undo things
anymore.

Fixes: c016c7e45ddf ("netfilter: nf_tables: honor NLM_F_EXCL flag in set 
element insertion")
Signed-off-by: Pablo Neira Ayuso 
---
 net/netfilter/nf_tables_api.c | 16 +---
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index b84c7b25219b..831a9a16f563 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3745,10 +3745,18 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct 
nft_set *set,
goto err5;
}
 
+   if (set->size &&
+   !atomic_add_unless(>nelems, 1, set->size + set->ndeact)) {
+   err = -ENFILE;
+   goto err6;
+   }
+
nft_trans_elem(trans) = elem;
list_add_tail(>list, >net->nft.commit_list);
return 0;
 
+err6:
+   set->ops->remove(set, );
 err5:
kfree(trans);
 err4:
@@ -3795,15 +3803,9 @@ static int nf_tables_newsetelem(struct net *net, struct 
sock *nlsk,
return -EBUSY;
 
nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
-   if (set->size &&
-   !atomic_add_unless(>nelems, 1, set->size + 
set->ndeact))
-   return -ENFILE;
-
err = nft_add_set_elem(, set, attr, nlh->nlmsg_flags);
-   if (err < 0) {
-   atomic_dec(>nelems);
+   if (err < 0)
break;
-   }
}
return err;
 }
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 08/14] netfilter: conntrack: remove GC_MAX_EVICTS break

2017-01-26 Thread Pablo Neira Ayuso
From: Florian Westphal 

Instead of breaking loop and instant resched, don't bother checking
this in first place (the loop calls cond_resched for every bucket anyway).

Suggested-by: Nicolas Dichtel 
Signed-off-by: Florian Westphal 
Acked-by: Nicolas Dichtel 
Signed-off-by: Pablo Neira Ayuso 
---
 net/netfilter/nf_conntrack_core.c | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/net/netfilter/nf_conntrack_core.c 
b/net/netfilter/nf_conntrack_core.c
index 3a073cd9fcf4..6feb5d370319 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -88,8 +88,6 @@ static __read_mostly bool nf_conntrack_locks_all;
 #define GC_MAX_BUCKETS_DIV 64u
 /* upper bound of scan intervals */
 #define GC_INTERVAL_MAX(2 * HZ)
-/* maximum conntracks to evict per gc run */
-#define GC_MAX_EVICTS  256u
 
 static struct conntrack_gc_work conntrack_gc_work;
 
@@ -979,8 +977,7 @@ static void gc_worker(struct work_struct *work)
 */
rcu_read_unlock();
cond_resched_rcu_qs();
-   } while (++buckets < goal &&
-expired_count < GC_MAX_EVICTS);
+   } while (++buckets < goal);
 
if (gc_work->exiting)
return;
@@ -1005,7 +1002,7 @@ static void gc_worker(struct work_struct *work)
 * In case we have lots of evictions next scan is done immediately.
 */
ratio = scanned ? expired_count * 100 / scanned : 0;
-   if (ratio >= 90 || expired_count == GC_MAX_EVICTS) {
+   if (ratio >= 90) {
gc_work->next_gc_run = 0;
next_run = 0;
} else if (expired_count) {
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 03/14] netfilter: nf_tables: fix spelling mistakes

2017-01-26 Thread Pablo Neira Ayuso
From: Alexander Alemayhu 

o s/numerice/numeric
o s/opertaor/operator

Signed-off-by: Alexander Alemayhu 
Signed-off-by: Pablo Neira Ayuso 
---
 include/uapi/linux/netfilter/nf_tables.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/netfilter/nf_tables.h 
b/include/uapi/linux/netfilter/nf_tables.h
index 881d49e94569..e3f27e09eb2b 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -235,7 +235,7 @@ enum nft_rule_compat_flags {
 /**
  * enum nft_rule_compat_attributes - nf_tables rule compat attributes
  *
- * @NFTA_RULE_COMPAT_PROTO: numerice value of handled protocol (NLA_U32)
+ * @NFTA_RULE_COMPAT_PROTO: numeric value of handled protocol (NLA_U32)
  * @NFTA_RULE_COMPAT_FLAGS: bitmask of enum nft_rule_compat_flags (NLA_U32)
  */
 enum nft_rule_compat_attributes {
@@ -499,7 +499,7 @@ enum nft_bitwise_attributes {
  * enum nft_byteorder_ops - nf_tables byteorder operators
  *
  * @NFT_BYTEORDER_NTOH: network to host operator
- * @NFT_BYTEORDER_HTON: host to network opertaor
+ * @NFT_BYTEORDER_HTON: host to network operator
  */
 enum nft_byteorder_ops {
NFT_BYTEORDER_NTOH,
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 02/14] tcp: fix mark propagation with fwmark_reflect enabled

2017-01-26 Thread Pablo Neira Ayuso
From: Pau Espin Pedrol 

Otherwise, RST packets generated by the TCP stack for non-existing
sockets always have mark 0.
The mark from the original packet is assigned to the netns_ipv4/6
socket used to send the response so that it can get copied into the
response skb when the socket sends it.

Fixes: e110861f8609 ("net: add a sysctl to reflect the fwmark on replies")
Cc: Lorenzo Colitti 
Signed-off-by: Pau Espin Pedrol 
Signed-off-by: Pablo Neira Ayuso 
---
 net/ipv4/ip_output.c | 1 +
 net/ipv6/tcp_ipv6.c  | 1 +
 2 files changed, 2 insertions(+)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index fac275c48108..b67719f45953 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1629,6 +1629,7 @@ void ip_send_unicast_reply(struct sock *sk, struct 
sk_buff *skb,
sk->sk_protocol = ip_hdr(skb)->protocol;
sk->sk_bound_dev_if = arg->bound_dev_if;
sk->sk_sndbuf = sysctl_wmem_default;
+   sk->sk_mark = fl4.flowi4_mark;
err = ip_append_data(sk, , ip_reply_glue_bits, arg->iov->iov_base,
 len, 0, , , MSG_DONTWAIT);
if (unlikely(err)) {
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 73bc8fc68acd..2b20622a5824 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -840,6 +840,7 @@ static void tcp_v6_send_response(const struct sock *sk, 
struct sk_buff *skb, u32
dst = ip6_dst_lookup_flow(ctl_sk, , NULL);
if (!IS_ERR(dst)) {
skb_dst_set(buff, dst);
+   ctl_sk->sk_mark = fl6.flowi6_mark;
ip6_xmit(ctl_sk, buff, , NULL, tclass);
TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
if (rst)
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 13/14] netfilter: nf_tables: deconstify walk callback function

2017-01-26 Thread Pablo Neira Ayuso
The flush operation needs to modify set and element objects, so let's
deconstify this.

Signed-off-by: Pablo Neira Ayuso 
---
 include/net/netfilter/nf_tables.h |  6 +++---
 net/netfilter/nf_tables_api.c | 24 
 net/netfilter/nft_set_hash.c  |  2 +-
 net/netfilter/nft_set_rbtree.c|  2 +-
 4 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h 
b/include/net/netfilter/nf_tables.h
index 924325c46aab..7dfdb517f0be 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -207,9 +207,9 @@ struct nft_set_iter {
unsigned intskip;
int err;
int (*fn)(const struct nft_ctx *ctx,
- const struct nft_set *set,
+ struct nft_set *set,
  const struct nft_set_iter *iter,
- const struct nft_set_elem *elem);
+ struct nft_set_elem *elem);
 };
 
 /**
@@ -301,7 +301,7 @@ struct nft_set_ops {
void(*remove)(const struct nft_set *set,
  const struct nft_set_elem 
*elem);
void(*walk)(const struct nft_ctx *ctx,
-   const struct nft_set *set,
+   struct nft_set *set,
struct nft_set_iter *iter);
 
unsigned int(*privsize)(const struct nlattr * const 
nla[]);
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 831a9a16f563..5bd0068320fb 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3087,9 +3087,9 @@ static int nf_tables_delset(struct net *net, struct sock 
*nlsk,
 }
 
 static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx,
-   const struct nft_set *set,
+   struct nft_set *set,
const struct nft_set_iter *iter,
-   const struct nft_set_elem *elem)
+   struct nft_set_elem *elem)
 {
const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
enum nft_registers dreg;
@@ -3308,9 +3308,9 @@ struct nft_set_dump_args {
 };
 
 static int nf_tables_dump_setelem(const struct nft_ctx *ctx,
- const struct nft_set *set,
+ struct nft_set *set,
  const struct nft_set_iter *iter,
- const struct nft_set_elem *elem)
+ struct nft_set_elem *elem)
 {
struct nft_set_dump_args *args;
 
@@ -3322,7 +3322,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct 
netlink_callback *cb)
 {
struct net *net = sock_net(skb->sk);
u8 genmask = nft_genmask_cur(net);
-   const struct nft_set *set;
+   struct nft_set *set;
struct nft_set_dump_args args;
struct nft_ctx ctx;
struct nlattr *nla[NFTA_SET_ELEM_LIST_MAX + 1];
@@ -3890,9 +3890,9 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct 
nft_set *set,
 }
 
 static int nft_flush_set(const struct nft_ctx *ctx,
-const struct nft_set *set,
+struct nft_set *set,
 const struct nft_set_iter *iter,
-const struct nft_set_elem *elem)
+struct nft_set_elem *elem)
 {
struct nft_trans *trans;
int err;
@@ -3907,8 +3907,8 @@ static int nft_flush_set(const struct nft_ctx *ctx,
goto err1;
}
 
-   nft_trans_elem_set(trans) = (struct nft_set *)set;
-   nft_trans_elem(trans) = *((struct nft_set_elem *)elem);
+   nft_trans_elem_set(trans) = set;
+   nft_trans_elem(trans) = *elem;
list_add_tail(>list, >net->nft.commit_list);
 
return 0;
@@ -5019,9 +5019,9 @@ static int nf_tables_check_loops(const struct nft_ctx 
*ctx,
 const struct nft_chain *chain);
 
 static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx,
-   const struct nft_set *set,
+   struct nft_set *set,
const struct nft_set_iter *iter,
-   const struct nft_set_elem *elem)
+   struct nft_set_elem *elem)
 {
const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
const struct nft_data *data;
@@ -5045,7 +5045,7 @@ static int nf_tables_check_loops(const struct nft_ctx 
*ctx,
 {
const struct nft_rule *rule;
const struct nft_expr *expr, 

[PATCH 05/14] netfilter: nf_tables: fix possible oops when dumping stateful objects

2017-01-26 Thread Pablo Neira Ayuso
From: Liping Zhang 

When dumping nft stateful objects, if NFTA_OBJ_TABLE and NFTA_OBJ_TYPE
attributes are not specified either, filter will become NULL, so oops
will happen(actually nft utility will always set NFTA_OBJ_TABLE attr,
so I write a test program to make this happen):

  BUG: unable to handle kernel NULL pointer dereference at (null)
  IP: nf_tables_dump_obj+0x17c/0x330 [nf_tables]
  [...]
  Call Trace:
  ? nf_tables_dump_obj+0x5/0x330 [nf_tables]
  ? __kmalloc_reserve.isra.35+0x31/0x90
  ? __alloc_skb+0x5b/0x1e0
  netlink_dump+0x124/0x2a0
  __netlink_dump_start+0x161/0x190
  nf_tables_getobj+0xe8/0x280 [nf_tables]

Fixes: a9fea2a3c3cf ("netfilter: nf_tables: allow to filter stateful object 
dumps by type")
Signed-off-by: Liping Zhang 
Signed-off-by: Pablo Neira Ayuso 
---
 net/netfilter/nf_tables_api.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 0db5f9782265..091d2dcc63b2 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4262,10 +4262,11 @@ static int nf_tables_dump_obj(struct sk_buff *skb, 
struct netlink_callback *cb)
if (idx > s_idx)
memset(>args[1], 0,
   sizeof(cb->args) - 
sizeof(cb->args[0]));
-   if (filter->table[0] &&
+   if (filter && filter->table[0] &&
strcmp(filter->table, table->name))
goto cont;
-   if (filter->type != NFT_OBJECT_UNSPEC &&
+   if (filter &&
+   filter->type != NFT_OBJECT_UNSPEC &&
obj->type->type != filter->type)
goto cont;
 
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 14/14] netfilter: nf_tables: bump set->ndeact on set flush

2017-01-26 Thread Pablo Neira Ayuso
Add missing set->ndeact update on each deactivated element from the set
flush path. Otherwise, sets with fixed size break after flush since
accounting breaks.

 # nft add set x y { type ipv4_addr\; size 2\; }
 # nft add element x y { 1.1.1.1 }
 # nft add element x y { 1.1.1.2 }
 # nft flush set x y
 # nft add element x y { 1.1.1.1 }
 :1:1-28: Error: Could not process rule: Too many open files in system

Fixes: 8411b6442e59 ("netfilter: nf_tables: support for set flushing")
Reported-by: Elise Lennion 
Signed-off-by: Pablo Neira Ayuso 
---
 net/netfilter/nf_tables_api.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 5bd0068320fb..1b913760f205 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3906,6 +3906,7 @@ static int nft_flush_set(const struct nft_ctx *ctx,
err = -ENOENT;
goto err1;
}
+   set->ndeact++;
 
nft_trans_elem_set(trans) = set;
nft_trans_elem(trans) = *elem;
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 06/14] netfilter: Fix typo in NF_CONNTRACK Kconfig option description

2017-01-26 Thread Pablo Neira Ayuso
From: William Breathitt Gray 

The NF_CONNTRACK Kconfig option description makes an incorrect reference
to the "meta" expression where the "ct" expression would be correct.This
patch fixes the respective typographical error.

Fixes: d497c6352736 ("netfilter: add help information to new nf_tables Kconfig 
options")
Signed-off-by: William Breathitt Gray 
Signed-off-by: Pablo Neira Ayuso 
---
 net/netfilter/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 63729b489c2c..bbc45f8a7b2d 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -494,7 +494,7 @@ config NFT_CT
depends on NF_CONNTRACK
tristate "Netfilter nf_tables conntrack module"
help
- This option adds the "meta" expression that you can use to match
+ This option adds the "ct" expression that you can use to match
  connection tracking information such as the flow state.
 
 config NFT_SET_RBTREE
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: ip_rcv_finish() NULL pointer kernel panic

2017-01-26 Thread Florian Westphal
Eric Dumazet  wrote:
> > Though possibly with different things not setting the "input" function 
> > pointer in the "struct dst_entry".
> > 
> > include/net/dst.h:
> >   496 static inline int dst_input(struct sk_buff *skb) {
> >   498 return skb_dst(skb)->input(skb);
> >   499 }
> > 
> > Is there any reason not to check to see if this pointer is NULL before 
> > blindly calling it ?
> 
> Sure. It can not be NULL at this point.
> 
> Just look at the code in ip_rcv_finish() :
> 
> It first make sure to get a valid dst.
> 
> Something is broken, probably in bridge netfilter code.
> 
> I suspect the dst here points to >fake_rtable, and this is not
> expected.
> 
> br_drop_fake_rtable() should have been called somewhere ...

I think it makes sense to set dst->incoming
to a stub in br_netfilter_rtable_init() to just kfree_skb()+
WARN_ON_ONCE(), no need to add code to ip stack or crash kernel
due to brnf bug.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] iptables: fix the wrong appending of jump verdict after the comment.

2017-01-26 Thread Pablo Neira Ayuso
On Thu, Jan 26, 2017 at 05:06:56PM +0530, Shivani Bhardwaj wrote:
> On Thu, Jan 26, 2017 at 2:49 PM, Shyam Saini  wrote:
> > Fix wrong appending of jump verdict after the comment
> >
> > For example:
> > $ iptables-translate -A INPUT -p tcp -m tcp --sport http -s  192.168.0.0/16 
> > -d 192.168.0.0/16 -j LONGNACCEPT -m comment --comment "foobar"
> > nft add rule ip filter INPUT ip saddr 192.168.0.0/16 ip daddr 
> > 192.168.0.0/16 tcp sport 80 counter comment \"foobar\"jump LONGNACCEPT
> >
> > Note that even without comment with double-quotes (i.e. --comment
> > "foobar"), it will add quotes:
> >
> > $ iptables-translate -A FORWARD -p tcp -m tcp --sport http -s 
> > 192.168.0.0/16 -d 192.168.0.0/16 -j DROP -m comment --comment singlecomment
> > nft add rule ip filter FORWARD ip saddr 192.168.0.0/16 ip daddr 
> > 192.168.0.0/16 tcp sport 80 counter comment \"singlecomment\"drop
> >
> > Attempting to apply the translated/generated rule will result to:
> >
> > $ nft add rule ip filter INPUT ip saddr 192.168.0.0/16 ip daddr  
> > 192.168.0.0/16 tcp sport 80 counter comment \"foobar\"jump LONGNACCEPT
> > :1:111-114: Error: syntax error, unexpected jump, expecting endof 
> > file or newline or semicolon
> > add rule ip filter INPUT ip saddr 192.168.0.0/16 ip daddr 192.168.0.0/16 
> > tcp sport 80 counter comment "foobar"jump LONGNACCEPT
> >
> > After this patch
> > $ iptables-translate -A INPUT -p tcp -m tcp --sport http -s 192.168.0.0/16 
> > -d 192.168.0.0/16 -j LONGNACCEPT -m comment --comment "foobar"
> > nft add rule ip filter INPUT ip saddr 192.168.0.0/16 ip daddr 
> > 192.168.0.0/16 tcp sport 80 counter jump LONGNACCEPT comment \"foobar\"
> > which is correct translation
> >
> > Signed-off-by: Shyam Saini 
> 
> Reviewed-by: Shivani Bhardwaj 
> 
> It does get accepted by nft. Sorry about the last mail.
> You could probably send out similar patch for ip6 too.

I have mangled your patch to include the missing IPv6 chunk.

So this patch has been applied, thanks!
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: ip_rcv_finish() NULL pointer kernel panic

2017-01-26 Thread Eric Dumazet
On Thu, 2017-01-26 at 09:32 -0600, Roy Keene wrote:
> This bug appears to have existed for a long time:
> 
>   https://www.spinics.net/lists/netdev/msg222459.html
> 
>   http://www.kernelhub.org/?p=2=823752
> 
> Though possibly with different things not setting the "input" function 
> pointer in the "struct dst_entry".
> 
> include/net/dst.h:
>   496 static inline int dst_input(struct sk_buff *skb) {
>   498 return skb_dst(skb)->input(skb);
>   499 }
> 
> Is there any reason not to check to see if this pointer is NULL before 
> blindly calling it ?

Sure. It can not be NULL at this point.

Just look at the code in ip_rcv_finish() :

It first make sure to get a valid dst.

Something is broken, probably in bridge netfilter code.

I suspect the dst here points to >fake_rtable, and this is not
expected.

br_drop_fake_rtable() should have been called somewhere ...



--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 net] net: free ip_vs_dest structs when refcnt=0

2017-01-26 Thread David Windsor
Currently, the ip_vs_dest cache frees ip_vs_dest objects when their
reference count becomes < 0.  Aside from not being semantically sound,
this is problematic for the new type refcount_t, which will be introduced
shortly in a separate patch. refcount_t is the new kernel type for
holding reference counts, and provides overflow protection and a
constrained interface relative to atomic_t (the type currently being
used for kernel reference counts).

Per Julian Anastasov: "The problem is that dest_trash currently holds
deleted dests (unlinked from RCU lists) with refcnt=0."  Changing
dest_trash to hold dest with refcnt=1 will allow us to free ip_vs_dest
structs when their refcnt=0, in ip_vs_dest_put_and_free().

Signed-off-by: David Windsor 
---
 include/net/ip_vs.h| 2 +-
 net/netfilter/ipvs/ip_vs_ctl.c | 8 +++-
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index cd6018a..a3e78ad 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -1421,7 +1421,7 @@ static inline void ip_vs_dest_put(struct ip_vs_dest *dest)
 
 static inline void ip_vs_dest_put_and_free(struct ip_vs_dest *dest)
 {
-   if (atomic_dec_return(>refcnt) < 0)
+   if (atomic_dec_and_test(>refcnt))
kfree(dest);
 }
 
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 55e0169..5fc4836 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -711,7 +711,6 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af,
  dest->vport == svc->port))) {
/* HIT */
list_del(>t_list);
-   ip_vs_dest_hold(dest);
goto out;
}
}
@@ -741,7 +740,7 @@ static void ip_vs_dest_free(struct ip_vs_dest *dest)
  *  When the ip_vs_control_clearup is activated by ipvs module exit,
  *  the service tables must have been flushed and all the connections
  *  are expired, and the refcnt of each destination in the trash must
- *  be 0, so we simply release them here.
+ *  be 1, so we simply release them here.
  */
 static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs)
 {
@@ -1080,11 +1079,10 @@ static void __ip_vs_del_dest(struct netns_ipvs *ipvs, 
struct ip_vs_dest *dest,
if (list_empty(>dest_trash) && !cleanup)
mod_timer(>dest_trash_timer,
  jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
-   /* dest lives in trash without reference */
+   /* dest lives in trash with reference */
list_add(>t_list, >dest_trash);
dest->idle_start = 0;
spin_unlock_bh(>dest_trash_lock);
-   ip_vs_dest_put(dest);
 }
 
 
@@ -1160,7 +1158,7 @@ static void ip_vs_dest_trash_expire(unsigned long data)
 
spin_lock(>dest_trash_lock);
list_for_each_entry_safe(dest, next, >dest_trash, t_list) {
-   if (atomic_read(>refcnt) > 0)
+   if (atomic_read(>refcnt) > 1)
continue;
if (dest->idle_start) {
if (time_before(now, dest->idle_start +
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html