[PATCH 07/18] netlink: Pass groups pointer to .bind()

2018-07-25 Thread Dmitry Safonov
Netlink messages sent by xfrm differ in size between 64-bit native and
32-bit compatible applications. To know which UABI to use to send the
message from kernel, I'll use the type of bind() syscall.
Xfrm will have hidden from userspace kernel-only groups for compatible
applications.
So, add pointer to groups to netlink_bind().
With later patches xfrm will set a proper compat group for netlink
socket during bind().

Cc: "David S. Miller" 
Cc: Eric Paris 
Cc: Florian Westphal 
Cc: Herbert Xu 
Cc: Jozsef Kadlecsik 
Cc: Pablo Neira Ayuso 
Cc: Paul Moore 
Cc: Steffen Klassert 
Cc: coret...@netfilter.org
Cc: linux-au...@redhat.com
Cc: net...@vger.kernel.org
Cc: netfilter-devel@vger.kernel.org
Signed-off-by: Dmitry Safonov 
---
 include/linux/netlink.h   |  2 +-
 kernel/audit.c|  2 +-
 net/core/rtnetlink.c  | 14 ++
 net/core/sock_diag.c  | 25 -
 net/netfilter/nfnetlink.c | 24 ++--
 net/netlink/af_netlink.c  | 27 ++-
 net/netlink/af_netlink.h  |  4 ++--
 net/netlink/genetlink.c   | 26 ++
 8 files changed, 64 insertions(+), 60 deletions(-)

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index f3075d6c7e82..19202648e04a 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -46,7 +46,7 @@ struct netlink_kernel_cfg {
unsigned intflags;
void(*input)(struct sk_buff *skb);
struct mutex*cb_mutex;
-   int (*bind)(struct net *net, int group);
+   int (*bind)(struct net *net, unsigned long *groups);
void(*unbind)(struct net *net, int group);
bool(*compare)(struct net *net, struct sock *sk);
 };
diff --git a/kernel/audit.c b/kernel/audit.c
index e7478cb58079..87ca0214bcf2 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1523,7 +1523,7 @@ static void audit_receive(struct sk_buff  *skb)
 }
 
 /* Run custom bind function on netlink socket group connect or bind requests. 
*/
-static int audit_bind(struct net *net, int group)
+static int audit_bind(struct net *net, unsigned long *groups)
 {
if (!capable(CAP_AUDIT_READ))
return -EPERM;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index e3f743c141b3..0465e692ae32 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -4683,15 +4683,13 @@ static void rtnetlink_rcv(struct sk_buff *skb)
netlink_rcv_skb(skb, &rtnetlink_rcv_msg);
 }
 
-static int rtnetlink_bind(struct net *net, int group)
+static int rtnetlink_bind(struct net *net, unsigned long *groups)
 {
-   switch (group) {
-   case RTNLGRP_IPV4_MROUTE_R:
-   case RTNLGRP_IPV6_MROUTE_R:
-   if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
-   return -EPERM;
-   break;
-   }
+   unsigned long mroute_r;
+
+   mroute_r = 1UL << RTNLGRP_IPV4_MROUTE_R | 1UL << RTNLGRP_IPV6_MROUTE_R;
+   if ((*groups & mroute_r) && !ns_capable(net->user_ns, CAP_NET_ADMIN))
+   return -EPERM;
return 0;
 }
 
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index c37b5be7c5e4..befa6759f2ad 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -273,20 +273,19 @@ static void sock_diag_rcv(struct sk_buff *skb)
mutex_unlock(&sock_diag_mutex);
 }
 
-static int sock_diag_bind(struct net *net, int group)
+static int sock_diag_bind(struct net *net, unsigned long *groups)
 {
-   switch (group) {
-   case SKNLGRP_INET_TCP_DESTROY:
-   case SKNLGRP_INET_UDP_DESTROY:
-   if (!sock_diag_handlers[AF_INET])
-   sock_load_diag_module(AF_INET, 0);
-   break;
-   case SKNLGRP_INET6_TCP_DESTROY:
-   case SKNLGRP_INET6_UDP_DESTROY:
-   if (!sock_diag_handlers[AF_INET6])
-   sock_load_diag_module(AF_INET6, 0);
-   break;
-   }
+   unsigned long inet_mask, inet6_mask;
+
+   inet_mask   = 1UL << SKNLGRP_INET_TCP_DESTROY;
+   inet_mask  |= 1UL << SKNLGRP_INET_UDP_DESTROY;
+   inet6_mask  = 1UL << SKNLGRP_INET6_TCP_DESTROY;
+   inet6_mask |= 1UL << SKNLGRP_INET6_UDP_DESTROY;
+
+   if ((*groups & inet_mask) && !sock_diag_handlers[AF_INET])
+   sock_load_diag_module(AF_INET, 0);
+   if ((*groups & inet6_mask) && !sock_diag_handlers[AF_INET6])
+   sock_load_diag_module(AF_INET6, 0);
return 0;
 }
 
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index e1b6be29848d..6a8893df5285 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -556,21 +556,25 @@ static void nfnetlink_rcv(struct sk_buff *skb)
 }
 
 #ifdef CONFIG_MODULES
-static int nfnetlink_bind(struct net *net, int group)
+static int nfnetlink_bind(struct net *net, unsigned long *groups)
 {
const struct nfnetlink_subsystem *ss;
-   int type;
+   unsigned long _groups = *grou

[PATCH 00/18] xfrm: Add compat layer

2018-07-25 Thread Dmitry Safonov
Due to some historical mistake, xfrm User ABI differ between native and
compatible applications. The difference is in structures paddings and in
the result in the size of netlink messages.
As it's already visible ABI, it cannot be adjusted by packing structures.

Possibility for compatible application to manage xfrm tunnels was
disabled by: the commmit 19d7df69fdb2 ("xfrm: Refuse to insert 32 bit
userspace socket policies on 64 bit systems") and the commit 74005991b78a
("xfrm: Do not parse 32bits compiled xfrm netlink msg on 64bits host").

By some wonderful reasons and brilliant architecture decisions for
creating userspace, on Arista switches we still use 32-bit userspace
with 64-bit kernel. There is slow movement to full 64-bit build, but
it's not yet here. As the switches need support for ipsec tunnels, the
local kernel has reverted mentioned patches that disable xfrm for
compat apps. On the top of that there is a bunch of disgraceful hacks
in userspace to work around the size check for netlink messages
and all that jazz.

It looks like, we're not the only desirable users of compatible xfrm,
there were a couple of attempts to make it work:
https://lkml.org/lkml/2017/1/20/733
https://patchwork.ozlabs.org/patch/44600/
http://netdev.vger.kernel.narkive.com/2Gesykj6/patch-net-next-xfrm-correctly-parse-netlink-msg-from-32bits-ip-command-on-64bits-host

All the discussions end in the conclusion that xfrm should have a full
compatible layer to correctly work with 32-bit applications on 64-bit
kernels:
https://lkml.org/lkml/2017/1/23/413
https://patchwork.ozlabs.org/patch/433279/

In some recent lkml discussion, Linus said that it's worth to fix this
problem and not giving people an excuse to stay on 32-bit kernel:
https://lkml.org/lkml/2018/2/13/752

So, here I add a compatible layer to xfrm.
As xfrm uses netlink notifications, kernel should send them in ABI
format that an application will parse. The proposed solution is
to save the ABI of bind() syscall. The realization detail is
to create kernel-hidden, non visible to userspace netlink groups
for compat applications.

The first two patches simplify ifdeffery, and while I've already submitted
them a while ago, I'm resending them for completeness:
https://lore.kernel.org/lkml/20180717005004.25984-1-d...@arista.com/T/#u

There is also an exhaustive selftest for ipsec tunnels and to check
that kernel parses correctly the structures those differ in size.
It doesn't depend on any library and compat version can be easy
build with: make CFLAGS=-m32 net/ipsec

Cc: "David S. Miller" 
Cc: Herbert Xu 
Cc: Steffen Klassert 
Cc: Dmitry Safonov <0x7f454...@gmail.com>
Cc: net...@vger.kernel.org

Dmitry Safonov (18):
  x86/compat: Adjust in_compat_syscall() to generic code under !COMPAT
  compat: Cleanup in_compat_syscall() callers
  selftest/net/xfrm: Add test for ipsec tunnel
  net/xfrm: Add _packed types for compat users
  net/xfrm: Parse userspi_info{,_packed} depending on syscall
  netlink: Do not subscribe to non-existent groups
  netlink: Pass groups pointer to .bind()
  xfrm: Add in-kernel groups for compat notifications
  xfrm: Dump usersa_info in compat/native formats
  xfrm: Send state notifications in compat format too
  xfrm: Add compat support for xfrm_user_expire messages
  xfrm: Add compat support for xfrm_userpolicy_info messages
  xfrm: Add compat support for xfrm_user_acquire messages
  xfrm: Add compat support for xfrm_user_polexpire messages
  xfrm: Check compat acquire listeners in xfrm_is_alive()
  xfrm: Notify compat listeners about policy flush
  xfrm: Notify compat listeners about state flush
  xfrm: Enable compat syscalls

 MAINTAINERS|1 +
 arch/x86/include/asm/compat.h  |9 +-
 arch/x86/include/asm/ftrace.h  |4 +-
 arch/x86/kernel/process_64.c   |4 +-
 arch/x86/kernel/sys_x86_64.c   |   11 +-
 arch/x86/mm/hugetlbpage.c  |4 +-
 arch/x86/mm/mmap.c |2 +-
 drivers/firmware/efi/efivars.c |   16 +-
 include/linux/compat.h |4 +-
 include/linux/netlink.h|2 +-
 include/net/xfrm.h |   14 -
 kernel/audit.c |2 +-
 kernel/time/time.c |2 +-
 net/core/rtnetlink.c   |   14 +-
 net/core/sock_diag.c   |   25 +-
 net/netfilter/nfnetlink.c  |   24 +-
 net/netlink/af_netlink.c   |   28 +-
 net/netlink/af_netlink.h   |4 +-
 net/netlink/genetlink.c|   26 +-
 net/xfrm/xfrm_state.c  |5 -
 net/xfrm/xfrm_user.c   |  690 ---
 tools/testing/selftests/net/.gitignore |1 +
 tools/testing/selftests/net/Makefile   |1 +
 tools/testing/selftests/net/ipsec.c| 1987 
 24 files changed, 2612 insertions(+), 268 deletions(-)
 create mode 100644 tools/testing/selftests/net/ipsec.c

-- 

Re: [Bug 200651] New: cgroups iptables-restor: vmalloc: allocation failure

2018-07-25 Thread Andrew Morton
(switched to email.  Please respond via emailed reply-to-all, not via the
bugzilla web interface).

On Wed, 25 Jul 2018 11:42:57 + bugzilla-dae...@bugzilla.kernel.org wrote:

> https://bugzilla.kernel.org/show_bug.cgi?id=200651
> 
> Bug ID: 200651
>Summary: cgroups iptables-restor: vmalloc: allocation failure

Thanks.  Please do note the above request.

>Product: Memory Management
>Version: 2.5
> Kernel Version: 4.14
>   Hardware: All
> OS: Linux
>   Tree: Mainline
> Status: NEW
>   Severity: normal
>   Priority: P1
>  Component: Other
>   Assignee: a...@linux-foundation.org
>   Reporter: gniko...@icdsoft.com
> Regression: No
> 
> Created attachment 277505
>   --> https://bugzilla.kernel.org/attachment.cgi?id=277505&action=edit
> iptables save
> 
> After creating large number of cgroups and under memory pressure, iptables
> command fails with following error:
> 
> "iptables-restor: vmalloc: allocation failure, allocated 3047424 of 3465216
> bytes, mode:0x14010c0(GFP_KERNEL|__GFP_NORETRY), nodemask=(null)"

I'm not sure what the problem is here, apart from iptables being
over-optimistic about vmalloc()'s abilities.

Are cgroups having any impact on this, or is it simply vmalloc arena
fragmentation, and the iptables code should use some data structure
more sophisticated than a massive array?

Maybe all that ccgroup metadata is contributing to the arena
fragmentation, but that allocations will be small and the two systems
should be able to live alongside, by being realistic about vmalloc.

> System which is used to reproduce the bug is with 2 vcpus and 2GB of ram, but
> it happens on more powerfull systems.
> 
> Steps to reproduce:
> 
> mkdir /cgroup
> mount cgroup -t cgroup -omemory,pids,blkio,cpuacct /cgroup
> for a in `seq 1 1000`; do for b in `seq 1 4` ; do mkdir -p
> "/cgroup/user/$a/$b"; done; done
> 
> Then in separate consoles
> 
> cat /dev/vda > /dev/null
> ./test
> ./test
> i=0;while sleep 0 ; do iptables-restore < iptables.save ; i=$(($i+1)); echo 
> $i;
> done
> 
> Here is the source of "test" program and attached iptables.save. It happens
> also with smaller iptables.save file.
> 
> #include 
> #include 
> 
> int main(void) {
> 
> srand(time(NULL));
> int i = 0, j = 0, randnum=0;
> int arr[6] = { 3072, 7168, 15360 , 31744, 64512, 130048}; 
> while(1) {
> 
> for (i = 0; i < 6 ; i++) {
> 
> int *ptr = (int*) malloc(arr[i] * 93);  
> 
> for(j = 0 ; j < arr[i] * 93 / sizeof(int); j++) {
> *(ptr+j) = j+1;
> }
> 
> free(ptr);
> }
> }   
> }
> 

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH nf] netfilter: ip6t_rpfilter: set F_IFACE for linklocal addresses

2018-07-25 Thread Florian Westphal
Roman reports that DHCPv6 client no longer sees replies from server
due to

ip6tables -t raw -A PREROUTING -m rpfilter --invert -j DROP

rule.  We need to set the F_IFACE flag for linklocal addresses, they
are scoped per-device.

Fixes: 47b7e7f82802 ("netfilter: don't set F_IFACE on ipv6 fib lookups")
Reported-by: Roman Mamedov 
Tested-by: Roman Mamedov 
Signed-off-by: Florian Westphal 
---
 net/ipv6/netfilter/ip6t_rpfilter.c | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c 
b/net/ipv6/netfilter/ip6t_rpfilter.c
index 0fe61ede77c6..c3c6b09acdc4 100644
--- a/net/ipv6/netfilter/ip6t_rpfilter.c
+++ b/net/ipv6/netfilter/ip6t_rpfilter.c
@@ -26,6 +26,12 @@ static bool rpfilter_addr_unicast(const struct in6_addr 
*addr)
return addr_type & IPV6_ADDR_UNICAST;
 }
 
+static bool rpfilter_addr_linklocal(const struct in6_addr *addr)
+{
+   int addr_type = ipv6_addr_type(addr);
+   return addr_type & IPV6_ADDR_LINKLOCAL;
+}
+
 static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff 
*skb,
 const struct net_device *dev, u8 flags)
 {
@@ -48,7 +54,11 @@ static bool rpfilter_lookup_reverse6(struct net *net, const 
struct sk_buff *skb,
}
 
fl6.flowi6_mark = flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0;
-   if ((flags & XT_RPFILTER_LOOSE) == 0)
+
+   if (rpfilter_addr_linklocal(&iph->saddr)) {
+   lookup_flags |= RT6_LOOKUP_F_IFACE;
+   fl6.flowi6_oif = dev->ifindex;
+   } else if ((flags & XT_RPFILTER_LOOSE) == 0)
fl6.flowi6_oif = dev->ifindex;
 
rt = (void *)ip6_route_lookup(net, &fl6, skb, lookup_flags);
-- 
2.16.4

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC/RFT net-next 00/17] net: Convert neighbor tables to per-namespace

2018-07-25 Thread Eric W. Biederman
David Ahern  writes:

> On 7/25/18 11:38 AM, Eric W. Biederman wrote:
>> 
>> Absolutely NOT.  Global thresholds are exactly correct given the fact
>> you are running on a single kernel.
>> 
>> Memory is not free (Even though we are swimming in enough of it memory
>> rarely matters).  One of the few remaining challenges is for containers
>> is finding was to limit resources in such a way that one application
>> does not mess things up for another container during ordinary usage.
>> 
>> It looks like the neighbour tables absolutely are that kind of problem,
>> because the artificial limits are too strict.   Completely giving up on
>> limits does not seem right approach either.  We need to fix the limits
>> we have (perhaps making them go away entirely), not just apply a
>> band-aid.  Let's get to the bottom of this and make the system better.
>
> Eric: yes, they all share the global resource of memory and there should
> be limits on how many entries a remote entity can create.
>
> Network namespaces can provide a separation such that one namespace does
> not disrupt networking in another. It is absolutely appropriate to do
> so. Your rigid stance is inconsistent given the basic meaning of a
> network namespace and the parallels to this same problem -- bridges,
> vxlans, and ip fragments. Only neighbor tables are not per-device or per
> namespace; your insistence on global limits is missing the mark and wrong.

That is not what I said.  Let me rephrase and see if you understand.

The problem appears to be of lots of devices.  Fundamentally if you use
lots of network devices today unless you adjust gc_thresh3 you will run
out of neighbour table entries.

The problem has a bigger scope than what you are looking at.

If you fix the core problem you won't see the problem in the context
of network namespaces either.

Default limits should be something that will never be hit unless
something goes crazy.  We are hitting them.  Therefore by definition
there is a bug in these limits.


And yes there is absolutely a place for global limits on things like
inodes, file descriptors etc, that does not care about which part of the
kernel you are in.  However hitting those limits in normal operation is
a bug.

We have ourselves a bug.

Eric

p.s. I wrote the definition of network namespaces and it absolutely does
have room for global limits.   One of the things Linus has periodically
yelled at me about is that there are not enough of them.


--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3] ipvs: fix race between ip_vs_conn_new() and ip_vs_del_dest()

2018-07-25 Thread Julian Anastasov


Hello,

On Wed, 25 Jul 2018, Tan Hu wrote:

> We came across infinite loop in ipvs when using ipvs in docker
> env.
> 
> When ipvs receives new packets and cannot find an ipvs connection,
> it will create a new connection, then if the dest is unavailable
> (i.e. IP_VS_DEST_F_AVAILABLE), the packet will be dropped sliently.
> 
> But if the dropped packet is the first packet of this connection,
> the connection control timer never has a chance to start and the
> ipvs connection cannot be released. This will lead to memory leak, or
> infinite loop in cleanup_net() when net namespace is released like
> this:
> 
> ip_vs_conn_net_cleanup at a0a9f31a [ip_vs]
> __ip_vs_cleanup at a0a9f60a [ip_vs]
> ops_exit_list at 81567a49
> cleanup_net at 81568b40
> process_one_work at 810a851b
> worker_thread at 810a9356
> kthread at 810b0b6f
> ret_from_fork at 81697a18
> 
> race condition:
> CPU1   CPU2
> ip_vs_in()
>   ip_vs_conn_new()
>ip_vs_del_dest()
>  __ip_vs_unlink_dest()
>~IP_VS_DEST_F_AVAILABLE
>   cp->dest && !IP_VS_DEST_F_AVAILABLE
>   __ip_vs_conn_put
> ...
> cleanup_net  ---> infinite looping
> 
> Fix this by checking whether the timer already started.
> 
> Signed-off-by: Tan Hu 
> Reviewed-by: Jiang Biao 

v3 looks good to me,

Acked-by: Julian Anastasov 

Simon and Pablo, this can be applied to ipvs/nf tree...

> ---
> v2: fix use-after-free in CONN_ONE_PACKET case suggested by Julian Anastasov
> v3: remove trailing whitespace for patch checking 
> 
>  net/netfilter/ipvs/ip_vs_core.c | 15 +++
>  1 file changed, 11 insertions(+), 4 deletions(-)
> 
> diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
> index 0679dd1..a17104f 100644
> --- a/net/netfilter/ipvs/ip_vs_core.c
> +++ b/net/netfilter/ipvs/ip_vs_core.c
> @@ -1972,13 +1972,20 @@ static int ip_vs_in_icmp_v6(struct netns_ipvs *ipvs, 
> struct sk_buff *skb,
>   if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
>   /* the destination server is not available */
> 
> - if (sysctl_expire_nodest_conn(ipvs)) {
> + __u32 flags = cp->flags;
> +
> + /* when timer already started, silently drop the packet.*/
> + if (timer_pending(&cp->timer))
> + __ip_vs_conn_put(cp);
> + else
> + ip_vs_conn_put(cp);
> +
> + if (sysctl_expire_nodest_conn(ipvs) &&
> + !(flags & IP_VS_CONN_F_ONE_PACKET)) {
>   /* try to expire the connection immediately */
>   ip_vs_conn_expire_now(cp);
>   }
> - /* don't restart its timer, and silently
> -drop the packet. */
> - __ip_vs_conn_put(cp);
> +
>   return NF_DROP;
>   }
> 
> --
> 1.8.3.1

Regards

--
Julian Anastasov 
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 4.14.54 regression: rpfilter and DHCPv6

2018-07-25 Thread Roman Mamedov
On Wed, 25 Jul 2018 19:35:20 +0200
Florian Westphal  wrote:

> 
> Does this patch fix the problem for you?

Yes it does. Thanks! 

> 
> diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c 
> b/net/ipv6/netfilter/ip6t_rpfilter.c
> --- a/net/ipv6/netfilter/ip6t_rpfilter.c
> +++ b/net/ipv6/netfilter/ip6t_rpfilter.c
> @@ -26,6 +26,12 @@ static bool rpfilter_addr_unicast(const struct in6_addr 
> *addr)
>   return addr_type & IPV6_ADDR_UNICAST;
>  }
>  
> +static bool rpfilter_addr_linklocal(const struct in6_addr *addr)
> +{
> + int addr_type = ipv6_addr_type(addr);
> + return addr_type & IPV6_ADDR_LINKLOCAL;
> +}
> +
>  static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff 
> *skb,
>const struct net_device *dev, u8 flags)
>  {
> @@ -48,7 +54,11 @@ static bool rpfilter_lookup_reverse6(struct net *net, 
> const struct sk_buff *skb,
>   }
>  
>   fl6.flowi6_mark = flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0;
> - if ((flags & XT_RPFILTER_LOOSE) == 0)
> +
> + if (rpfilter_addr_linklocal(&iph->saddr)) {
> + lookup_flags |= RT6_LOOKUP_F_IFACE;
> + fl6.flowi6_oif = dev->ifindex;
> + } else if ((flags & XT_RPFILTER_LOOSE) == 0)
>   fl6.flowi6_oif = dev->ifindex;
>  
>   rt = (void *) ip6_route_lookup(net, &fl6, lookup_flags);

-- 
With respect,
Roman
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC/RFT net-next 00/17] net: Convert neighbor tables to per-namespace

2018-07-25 Thread David Ahern
On 7/24/18 11:14 AM, David Miller wrote:
> From: David Ahern 
> Date: Tue, 24 Jul 2018 09:14:01 -0600
> 
>> I get the impression there is no longer a strong resistance against
>> moving the tables to per namespace, but deciding what is the right
>> approach to handle backwards compatibility. Correct? Changing the
>> accounting is inevitably going to be noticeable to some use case(s), but
>> with sysctl settings it is a simple runtime update once the user knows
>> to make the change.
>>
>> neighbor entries round up to 512 byte allocations, so with the current
>> gc_thresh defaults (128/512/1024) 512k can be consumed. Using those
>> limits per namespace seems high which is why I suggested a per-namespace
>> default of (16/32/64) which amounts to 32k per namespace limit by
>> default. Open to other suggestions as well.
> 
> No objection from me about going to per-ns neigh tables.
> 
> About the defaults, I wonder if we can scale them to the amount of
> memory given to the ns or something like that?  I bet this will better
> match the intended use of the ns.
> 

Not sure how to do that. I am not aware of memory allocations to a
network namespace. As I understand it containers use cgroups to control
memory use, but I am not aware of any direct ties to namespace.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC/RFT net-next 00/17] net: Convert neighbor tables to per-namespace

2018-07-25 Thread David Ahern
On 7/25/18 11:38 AM, Eric W. Biederman wrote:
> 
> Absolutely NOT.  Global thresholds are exactly correct given the fact
> you are running on a single kernel.
> 
> Memory is not free (Even though we are swimming in enough of it memory
> rarely matters).  One of the few remaining challenges is for containers
> is finding was to limit resources in such a way that one application
> does not mess things up for another container during ordinary usage.
> 
> It looks like the neighbour tables absolutely are that kind of problem,
> because the artificial limits are too strict.   Completely giving up on
> limits does not seem right approach either.  We need to fix the limits
> we have (perhaps making them go away entirely), not just apply a
> band-aid.  Let's get to the bottom of this and make the system better.

Eric: yes, they all share the global resource of memory and there should
be limits on how many entries a remote entity can create.

Network namespaces can provide a separation such that one namespace does
not disrupt networking in another. It is absolutely appropriate to do
so. Your rigid stance is inconsistent given the basic meaning of a
network namespace and the parallels to this same problem -- bridges,
vxlans, and ip fragments. Only neighbor tables are not per-device or per
namespace; your insistence on global limits is missing the mark and wrong.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC/RFT net-next 00/17] net: Convert neighbor tables to per-namespace

2018-07-25 Thread Eric W. Biederman
David Ahern  writes:

> On 7/25/18 6:33 AM, Eric W. Biederman wrote:
>> Cong Wang  writes:
>> 
>>> On Tue, Jul 24, 2018 at 8:14 AM David Ahern  wrote:

 On 7/19/18 11:12 AM, Cong Wang wrote:
> On Thu, Jul 19, 2018 at 9:16 AM David Ahern  wrote:
>>
>> Chatting with Nikolay about this and he brought up a good corollary - ip
>> fragmentation. It really is a similar problem in that memory is consumed
>> as a result of packets received from an external entity. The ipfrag
>> sysctls are per namespace with a limit that non-init_net namespaces can
>> not set high_thresh > the current value of init_net. Potential memory
>> consumed by fragments scales with the number of namespaces which is the
>> primary concern with making neighbor tables per namespace.
>
> Nothing new, already discussed:
> https://marc.info/?l=linux-netdev&m=140391416215988&w=2
>
> :)
>

 Neighbor tables, bridge fdbs, vxlan fdbs and ip fragments all consume
 local memory resources due to received packets. bridge and vxlan fdb's
 are fairly straightforward analogs to neighbor entries; they are per
 device with no limits on the number of entries. Fragments have memory
 limits per namespace. So neighbor tables are the only ones with this
 strict limitation and concern on memory consumption.

 I get the impression there is no longer a strong resistance against
 moving the tables to per namespace, but deciding what is the right
 approach to handle backwards compatibility. Correct? Changing the
 accounting is inevitably going to be noticeable to some use case(s), but
 with sysctl settings it is a simple runtime update once the user knows
 to make the change.
>>>
>>> This question definitely should go to Eric Biederman who was against
>>> my proposal.
>>>
>>> Let's add Eric into CC.
>> 
>> Given that the entries are per device and the devices are per-namespace,
>> semantically neighbours are already kept in a per-namespace manner.  So
>> this is all about making the code not honoring global resource limits.
>> Making the code not honor gc_thresh3.
>> 
>> Skimming through the code today the default for gc_thresh3 is 1024.
>> Which means that we limit the neighbour tables to 1024 entries per
>> protocol type.
>> 
>> There are some pretty compelling reasons especially with ipv4 to keep
>> the subnet size down.  Arp storms are a real thing.
>> 
>> I don't know off the top of my head what the reasons for limiting the
>> neighbour table sizes.  I would be much more comfortable with a patchset
>> like this if we did some research and figured out the reasons why
>> we have a global limit.  Then changed the code to remove those limits.
>> 
>> When the limits are gone.  When the code can support large subnets
>> without tuning.  We we don't have to worry about someone scanning an all
>> addresses in an ipv6 subnet and causing a DOS on working machines.
>> I think it is completely appropriate to look to see if something per
>> network namespace needs to happen.
>> 
>> So please let's address the limits, not the fact that some specific
>> corner case ran into them.
>> 
>> If we are going to neuter gc_thresh3 let's go as far as removing it
>> entirely.  If we are going to make the neighbour table per something
>> let's make it per network device.  If we can afford the multiple hash
>> tables then a hash table per device is better.   Perhaps we want to move
>> to rhash tables while we look at this, instead of an old hand grown
>> version of resizable hash table.
>
> Given the uses cases with increasing number of devices (> 10,000),
> per-device tables will have more problems than per namespace - in
> reference to your concern in the last paragraph below.
>
>> 
>> Unless I misread something all your patchset did is reshuffle code and
>> data structures so that gc_thresh3 does not apply accross namespaces.
>> That does not feel like it really fixes anything.  That just lies to
>> people.
>
> This patch set fixes the lie that network namespaces provide complete
> isolation when in fact one namespace can evict neighbor entries from
> another. An arp storm you are concerned about in one namespace impacts
> all containers.

Network namespaces can not provide complete isolation.  They share the
same kernel and they do not dedicate resources to each other.
Namespaces in general are about the names.  They are about sharing a
machine efficiently.

I humbly suggest that anyone who wants ``complete'' isolation to use vm
at the very least.

I do think the limits on the neighbour table are quite likely too
strict.  We should be able to relax them and continue to have a
networking stack that works for everyone.

> It starts by removing the proliferation of open coded references to
> arp_tbl and nd_tbl, moving them behind the existing neigh_find_table.
> From there (patches 14-16) it makes the tables per-namespace and hence
> makes the gc_thresh parameters whic

Re: 4.14.54 regression: rpfilter and DHCPv6

2018-07-25 Thread Florian Westphal
Roman Mamedov  wrote:
> I have a machine which is a DHCPv6 client on a PPPoE connection. It also has:
> 
>   sysctl -w net.netfilter.nf_conntrack_tcp_loose=0
>   ip6tables -t raw -A PREROUTING ! -i lo -m rpfilter --invert -j DROP
> 
> After commits:
> 
>   netfilter: don't set F_IFACE on ipv6 fib lookups
>   http://patchwork.ozlabs.org/patch/873574/
> 
>   netfilter: ip6t_rpfilter: provide input interface for route lookup
>   https://patchwork.ozlabs.org/patch/919290/
> 
> ...the DHCPv6 client no longer sees any replies from the server. They are now
> filtered out by rpfilter. Removing the ip6tables rule shown above, or rolling
> back both of these commits, makes it all work fine again.
> 
> From commit messages it doesn't appear like this would be a "by design"
> behavior of these changes.
> 
> I did not test if other kernel branches (4.17 et al) are affected, but if they
> also have both of these, I guess they likely are.

Safe bet.

Does this patch fix the problem for you?

diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c 
b/net/ipv6/netfilter/ip6t_rpfilter.c
--- a/net/ipv6/netfilter/ip6t_rpfilter.c
+++ b/net/ipv6/netfilter/ip6t_rpfilter.c
@@ -26,6 +26,12 @@ static bool rpfilter_addr_unicast(const struct in6_addr 
*addr)
return addr_type & IPV6_ADDR_UNICAST;
 }
 
+static bool rpfilter_addr_linklocal(const struct in6_addr *addr)
+{
+   int addr_type = ipv6_addr_type(addr);
+   return addr_type & IPV6_ADDR_LINKLOCAL;
+}
+
 static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff 
*skb,
 const struct net_device *dev, u8 flags)
 {
@@ -48,7 +54,11 @@ static bool rpfilter_lookup_reverse6(struct net *net, const 
struct sk_buff *skb,
}
 
fl6.flowi6_mark = flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0;
-   if ((flags & XT_RPFILTER_LOOSE) == 0)
+
+   if (rpfilter_addr_linklocal(&iph->saddr)) {
+   lookup_flags |= RT6_LOOKUP_F_IFACE;
+   fl6.flowi6_oif = dev->ifindex;
+   } else if ((flags & XT_RPFILTER_LOOSE) == 0)
fl6.flowi6_oif = dev->ifindex;
 
rt = (void *) ip6_route_lookup(net, &fl6, lookup_flags);
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: linux-next: Tree for Jul 25 (netfilter)

2018-07-25 Thread Randy Dunlap
On 07/24/2018 11:44 PM, Stephen Rothwell wrote:
> Hi all,
> 
> Changes since 20180724:
> 

on i386:

net/ipv4/netfilter/nft_chain_nat_ipv4.o: In function `nft_nat_do_chain':
nft_chain_nat_ipv4.c:(.text+0x67): undefined reference to `nft_do_chain'
net/ipv4/netfilter/nft_chain_nat_ipv4.o: In function `nft_chain_nat_exit':
nft_chain_nat_ipv4.c:(.exit.text+0x9): undefined reference to 
`nft_unregister_chain_type'
net/ipv4/netfilter/nft_chain_nat_ipv4.o: In function `nft_chain_nat_init':
nft_chain_nat_ipv4.c:(.init.text+0x9): undefined reference to 
`nft_register_chain_type'


Reported-by: Randy Dunlap 

Full randconfig file is attached.

-- 
~Randy
#
# Automatically generated file; DO NOT EDIT.
# Linux/i386 4.18.0-rc6 Kernel Configuration
#

#
# Compiler: gcc (SUSE Linux) 4.8.5
#
CONFIG_X86_32=y
CONFIG_X86=y
CONFIG_INSTRUCTION_DECODER=y
CONFIG_OUTPUT_FORMAT="elf32-i386"
CONFIG_ARCH_DEFCONFIG="arch/x86/configs/i386_defconfig"
CONFIG_LOCKDEP_SUPPORT=y
CONFIG_STACKTRACE_SUPPORT=y
CONFIG_MMU=y
CONFIG_ARCH_MMAP_RND_BITS_MIN=8
CONFIG_ARCH_MMAP_RND_BITS_MAX=16
CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN=8
CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX=16
CONFIG_GENERIC_ISA_DMA=y
CONFIG_GENERIC_HWEIGHT=y
CONFIG_ARCH_MAY_HAVE_PC_FDC=y
CONFIG_RWSEM_XCHGADD_ALGORITHM=y
CONFIG_GENERIC_CALIBRATE_DELAY=y
CONFIG_ARCH_HAS_CPU_RELAX=y
CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
CONFIG_ARCH_HAS_FILTER_PGPROT=y
CONFIG_HAVE_SETUP_PER_CPU_AREA=y
CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y
CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y
CONFIG_ARCH_HIBERNATION_POSSIBLE=y
CONFIG_ARCH_SUSPEND_POSSIBLE=y
CONFIG_ARCH_WANT_HUGE_PMD_SHARE=y
CONFIG_ARCH_WANT_GENERAL_HUGETLB=y
CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y
CONFIG_X86_32_SMP=y
CONFIG_X86_32_LAZY_GS=y
CONFIG_ARCH_SUPPORTS_UPROBES=y
CONFIG_FIX_EARLYCON_MEM=y
CONFIG_PGTABLE_LEVELS=3
CONFIG_CC_IS_GCC=y
CONFIG_GCC_VERSION=40805
CONFIG_CLANG_VERSION=0
CONFIG_IRQ_WORK=y
CONFIG_BUILDTIME_EXTABLE_SORT=y
CONFIG_THREAD_INFO_IN_TASK=y

#
# General setup
#
CONFIG_INIT_ENV_ARG_LIMIT=32
# CONFIG_COMPILE_TEST is not set
CONFIG_LOCALVERSION=""
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_BUILD_SALT=""
CONFIG_HAVE_KERNEL_GZIP=y
CONFIG_HAVE_KERNEL_BZIP2=y
CONFIG_HAVE_KERNEL_LZMA=y
CONFIG_HAVE_KERNEL_XZ=y
CONFIG_HAVE_KERNEL_LZO=y
CONFIG_HAVE_KERNEL_LZ4=y
# CONFIG_KERNEL_GZIP is not set
# CONFIG_KERNEL_BZIP2 is not set
# CONFIG_KERNEL_LZMA is not set
CONFIG_KERNEL_XZ=y
# CONFIG_KERNEL_LZO is not set
# CONFIG_KERNEL_LZ4 is not set
CONFIG_DEFAULT_HOSTNAME="(none)"
CONFIG_SWAP=y
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
CONFIG_CROSS_MEMORY_ATTACH=y
CONFIG_USELIB=y
CONFIG_AUDIT=y
CONFIG_HAVE_ARCH_AUDITSYSCALL=y
CONFIG_AUDITSYSCALL=y
CONFIG_AUDIT_WATCH=y
CONFIG_AUDIT_TREE=y

#
# IRQ subsystem
#
CONFIG_GENERIC_IRQ_PROBE=y
CONFIG_GENERIC_IRQ_SHOW=y
CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK=y
CONFIG_GENERIC_PENDING_IRQ=y
CONFIG_GENERIC_IRQ_MIGRATION=y
CONFIG_GENERIC_IRQ_CHIP=y
CONFIG_IRQ_DOMAIN=y
CONFIG_IRQ_SIM=y
CONFIG_IRQ_DOMAIN_HIERARCHY=y
CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR=y
CONFIG_GENERIC_IRQ_RESERVATION_MODE=y
CONFIG_IRQ_FORCED_THREADING=y
CONFIG_SPARSE_IRQ=y
# CONFIG_GENERIC_IRQ_DEBUGFS is not set
CONFIG_CLOCKSOURCE_WATCHDOG=y
CONFIG_ARCH_CLOCKSOURCE_DATA=y
CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE=y
CONFIG_GENERIC_TIME_VSYSCALL=y
CONFIG_GENERIC_CLOCKEVENTS=y
CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y
CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST=y
CONFIG_GENERIC_CMOS_UPDATE=y

#
# Timers subsystem
#
CONFIG_HZ_PERIODIC=y
# CONFIG_NO_HZ_IDLE is not set
CONFIG_NO_HZ=y
# CONFIG_HIGH_RES_TIMERS is not set

#
# CPU/Task time and stats accounting
#
CONFIG_TICK_CPU_ACCOUNTING=y
CONFIG_IRQ_TIME_ACCOUNTING=y
# CONFIG_CPU_ISOLATION is not set

#
# RCU Subsystem
#
CONFIG_TREE_RCU=y
CONFIG_RCU_EXPERT=y
CONFIG_SRCU=y
CONFIG_TREE_SRCU=y
CONFIG_TASKS_RCU=y
CONFIG_RCU_STALL_COMMON=y
CONFIG_RCU_NEED_SEGCBLIST=y
CONFIG_RCU_FANOUT=32
CONFIG_RCU_FANOUT_LEAF=16
# CONFIG_RCU_NOCB_CPU is not set
CONFIG_BUILD_BIN2C=y
CONFIG_IKCONFIG=y
CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y
CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH=y
# CONFIG_CGROUPS is not set
# CONFIG_CHECKPOINT_RESTORE is not set
# CONFIG_SCHED_AUTOGROUP is not set
CONFIG_SYSFS_DEPRECATED=y
CONFIG_SYSFS_DEPRECATED_V2=y
# CONFIG_RELAY is not set
# CONFIG_BLK_DEV_INITRD is not set
CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y
# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
CONFIG_HAVE_UID16=y
CONFIG_SYSCTL_EXCEPTION_TRACE=y
CONFIG_HAVE_PCSPKR_PLATFORM=y
CONFIG_BPF=y
CONFIG_EXPERT=y
# CONFIG_MULTIUSER is not set
# CONFIG_SGETMASK_SYSCALL is not set
CONFIG_SYSFS_SYSCALL=y
CONFIG_FHANDLE=y
# CONFIG_POSIX_TIMERS is not set
# CONFIG_PRINTK is not set
# CONFIG_BUG is not set
CONFIG_PCSPKR_PLATFORM=y
CONFIG_BASE_FULL=y
CONFIG_FUTEX=y
CONFIG_FUTEX_PI=y
CONFIG_EPOLL=y
# CONFIG_SIGNALFD is not set
# CONFIG_TIMERFD is not set
# CONFIG_EVENTFD is not set
# CONFIG_SHMEM is not set
CONFIG_AIO=y
CONFIG_ADVISE_SYSCALLS=y
# CONFIG_MEMBARRIER is not set
CONFIG_KALLSYMS=y
CONFIG_KALLSYMS_ALL=y
CONFIG_KALLSYMS_BASE_RELATIVE=y
# CONFIG_

Re: 4.14.54 regression: rpfilter and DHCPv6

2018-07-25 Thread Roman Mamedov
On Wed, 25 Jul 2018 11:19:39 -0400
Eric Garver  wrote:

> On Wed, Jul 25, 2018 at 07:33:17PM +0500, Roman Mamedov wrote:
> > Hello,
> > 
> > I have a machine which is a DHCPv6 client on a PPPoE connection. It also 
> > has:
> > 
> >   sysctl -w net.netfilter.nf_conntrack_tcp_loose=0
> >   ip6tables -t raw -A PREROUTING ! -i lo -m rpfilter --invert -j DROP
> > 
> > After commits:
> > 
> >   netfilter: don't set F_IFACE on ipv6 fib lookups
> >   http://patchwork.ozlabs.org/patch/873574/
> > 
> >   netfilter: ip6t_rpfilter: provide input interface for route lookup
> >   https://patchwork.ozlabs.org/patch/919290/
> > 
> > ...the DHCPv6 client no longer sees any replies from the server. They are 
> > now
> > filtered out by rpfilter. Removing the ip6tables rule shown above, or 
> > rolling
> > back both of these commits, makes it all work fine again.
> > 
> > From commit messages it doesn't appear like this would be a "by design"
> > behavior of these changes.
> > 
> > I did not test if other kernel branches (4.17 et al) are affected, but if 
> > they
> > also have both of these, I guess they likely are.
> 
> I believe this was fixed by:
> 
>   cede24d1b21d ("netfilter: ip6t_rpfilter: provide input interface for route 
> lookup")
> 
> which landed in 4.18.

It's also in the same 4.14.54 and as you can see I listed it in my message
(sorry for not including commit hashes).

It does not help. Only rolling back both, fixes the issue. From a non-expert
guess, perhaps it misses also restoring the

  lookup_flags |= RT6_LOOKUP_F_IFACE;

line?

-- 
With respect,
Roman
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH nf] netfilter: nft_set: fix allocation size overflow in privsize callback.

2018-07-25 Thread Taehee Yoo
In order to determine allocation size of set, ->privsize is invoked.
At this point, both desc->size and size of each data structure of set
are used. desc->size means number of element that is given by user.
desc->size is u32 type. so that upperlimit of set element is 4294967295.
but return type of ->privsize is also u32. hence overflow can occurred.

test commands:
   %nft add table ip filter
   %nft add set ip filter hash1 { type ipv4_addr \; size 4294967295 \; }
   %nft list ruleset

splat looks like:
[ 1239.202910] kasan: CONFIG_KASAN_INLINE enabled
[ 1239.208788] kasan: GPF could be caused by NULL-ptr deref or user memory 
access
[ 1239.217625] general protection fault:  [#1] SMP DEBUG_PAGEALLOC KASAN PTI
[ 1239.219329] CPU: 0 PID: 1603 Comm: nft Not tainted 4.18.0-rc5+ #7
[ 1239.229091] RIP: 0010:nft_hash_walk+0x1d2/0x310 [nf_tables_set]
[ 1239.229091] Code: 84 d2 7f 10 4c 89 e7 89 44 24 38 e8 d8 5a 17 e0 8b 44 24 
38 48 8d 7b 10 41 0f b6 0c 24 48 89 fa 48 89 fe 48 c1 ea 03 83 e6 07 <42> 0f b6 
14 3a 40 38 f2 7f 1a 84 d2 74 16
[ 1239.229091] RSP: 0018:8801118cf358 EFLAGS: 00010246
[ 1239.229091] RAX:  RBX: 00020400 RCX: 0001
[ 1239.229091] RDX: 4082 RSI:  RDI: 00020410
[ 1239.229091] RBP: 880114d5a988 R08: 7e94 R09: 880114dd8030
[ 1239.229091] R10: 880114d5a988 R11: ed00229bb006 R12: 8801118cf4d0
[ 1239.229091] R13: 8801118cf4d8 R14:  R15: dc00
[ 1239.229091] FS:  7f5a8fe0b700() GS:88011b60() 
knlGS:
[ 1239.229091] CS:  0010 DS:  ES:  CR0: 80050033
[ 1239.229091] CR2: 7f5a8ecc27b0 CR3: 00010608e000 CR4: 001006f0
[ 1239.229091] Call Trace:
[ 1239.229091]  ? nft_hash_remove+0xf0/0xf0 [nf_tables_set]
[ 1239.229091]  ? memset+0x1f/0x40
[ 1239.229091]  ? __nla_reserve+0x9f/0xb0
[ 1239.229091]  ? memcpy+0x34/0x50
[ 1239.229091]  nf_tables_dump_set+0x9a1/0xda0 [nf_tables]
[ 1239.229091]  ? __kmalloc_reserve.isra.29+0x2e/0xa0
[ 1239.229091]  ? nft_chain_hash_obj+0x630/0x630 [nf_tables]
[ 1239.229091]  ? nf_tables_commit+0x2c60/0x2c60 [nf_tables]
[ 1239.229091]  netlink_dump+0x470/0xa20
[ 1239.229091]  __netlink_dump_start+0x5ae/0x690
[ 1239.229091]  nft_netlink_dump_start_rcu+0xd1/0x160 [nf_tables]
[ 1239.229091]  nf_tables_getsetelem+0x2e5/0x4b0 [nf_tables]
[ 1239.229091]  ? nft_get_set_elem+0x440/0x440 [nf_tables]
[ 1239.229091]  ? nft_chain_hash_obj+0x630/0x630 [nf_tables]
[ 1239.229091]  ? nf_tables_dump_obj_done+0x70/0x70 [nf_tables]
[ 1239.229091]  ? nla_parse+0xab/0x230
[ 1239.229091]  ? nft_get_set_elem+0x440/0x440 [nf_tables]
[ 1239.229091]  nfnetlink_rcv_msg+0x7f0/0xab0 [nfnetlink]
[ 1239.229091]  ? nfnetlink_bind+0x1d0/0x1d0 [nfnetlink]
[ 1239.229091]  ? debug_show_all_locks+0x290/0x290
[ 1239.229091]  ? sched_clock_cpu+0x132/0x170
[ 1239.229091]  ? find_held_lock+0x39/0x1b0
[ 1239.229091]  ? sched_clock_local+0x10d/0x130
[ 1239.229091]  netlink_rcv_skb+0x211/0x320
[ 1239.229091]  ? nfnetlink_bind+0x1d0/0x1d0 [nfnetlink]
[ 1239.229091]  ? netlink_ack+0x7b0/0x7b0
[ 1239.229091]  ? ns_capable_common+0x6e/0x110
[ 1239.229091]  nfnetlink_rcv+0x2d1/0x310 [nfnetlink]
[ 1239.229091]  ? nfnetlink_rcv_batch+0x10f0/0x10f0 [nfnetlink]
[ 1239.229091]  ? netlink_deliver_tap+0x829/0x930
[ 1239.229091]  ? lock_acquire+0x265/0x2e0
[ 1239.229091]  netlink_unicast+0x406/0x520
[ 1239.509725]  ? netlink_attachskb+0x5b0/0x5b0
[ 1239.509725]  ? find_held_lock+0x39/0x1b0
[ 1239.509725]  netlink_sendmsg+0x987/0xa20
[ 1239.509725]  ? netlink_unicast+0x520/0x520
[ 1239.509725]  ? _copy_from_user+0xa9/0xc0
[ 1239.509725]  __sys_sendto+0x21a/0x2c0
[ 1239.509725]  ? __ia32_sys_getpeername+0xa0/0xa0
[ 1239.509725]  ? retint_kernel+0x10/0x10
[ 1239.509725]  ? sched_clock_cpu+0x132/0x170
[ 1239.509725]  ? find_held_lock+0x39/0x1b0
[ 1239.509725]  ? lock_downgrade+0x540/0x540
[ 1239.509725]  ? up_read+0x1c/0x100
[ 1239.509725]  ? __do_page_fault+0x763/0x970
[ 1239.509725]  ? retint_user+0x18/0x18
[ 1239.509725]  __x64_sys_sendto+0x177/0x180
[ 1239.509725]  do_syscall_64+0xaa/0x360
[ 1239.509725]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 1239.509725] RIP: 0033:0x7f5a8f468e03
[ 1239.509725] Code: 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb d0 0f 1f 84 00 
00 00 00 00 83 3d 49 c9 2b 00 00 75 13 49 89 ca b8 2c 00 00 00 0f 05 <48> 3d 01 
f0 ff ff 73 34 c3 48 83 ec 08 e8
[ 1239.509725] RSP: 002b:7ffd78d0b778 EFLAGS: 0246 ORIG_RAX: 
002c
[ 1239.509725] RAX: ffda RBX: 7ffd78d0c890 RCX: 7f5a8f468e03
[ 1239.509725] RDX: 0034 RSI: 7ffd78d0b7e0 RDI: 0003
[ 1239.509725] RBP: 7ffd78d0b7d0 R08: 7f5a8f15c160 R09: 000c
[ 1239.509725] R10:  R11: 0246 R12: 7ffd78d0b7e0
[ 1239.509725] R13: 0034 R14: 7f5a8f9aff60 R15: 5648040094b0
[ 1239.509725] Modules linked in: nf_tables_set nf_tables nfnetlink ip_table

Re: [PATCH 1/4] treewide: convert ISO_8859-1 text comments to utf-8

2018-07-25 Thread Joe Perches
On Wed, 2018-07-25 at 15:12 +0200, Arnd Bergmann wrote:
> tools/perf/tests/.gitignore:
> LLVM byte-codes, uncompressed
> On Wed, Jul 25, 2018 at 2:55 AM, Andrew Morton
>  wrote:
> > On Tue, 24 Jul 2018 17:13:20 -0700 Joe Perches  wrote:
> > 
> > > On Tue, 2018-07-24 at 14:00 -0700, Andrew Morton wrote:
> > > > On Tue, 24 Jul 2018 13:13:25 +0200 Arnd Bergmann  wrote:
> > > > > Almost all files in the kernel are either plain text or UTF-8
> > > > > encoded. A couple however are ISO_8859-1, usually just a few
> > > > > characters in a C comments, for historic reasons.
> > > > > This converts them all to UTF-8 for consistency.
> > > 
> > > []
> > > > Will we be getting a checkpatch rule to keep things this way?
> > > 
> > > How would that be done?
> > 
> > I'm using this, seems to work.
> > 
> > if ! file $p | grep -q -P ", ASCII text|, UTF-8 Unicode text"
> > then
> > echo $p: weird charset
> > fi
> 
> There are a couple of files that my version of 'find' incorrectly identified 
> as
> something completely different, like:
> 
> Documentation/devicetree/bindings/pinctrl/pinctrl-sx150x.txt:
> SemOne archive data
> Documentation/devicetree/bindings/rtc/epson,rtc7301.txt:
> Microsoft Document Imaging Format
> Documentation/filesystems/nfs/pnfs-block-server.txt:
> PPMN archive data
> arch/arm/boot/dts/bcm283x-rpi-usb-host.dtsi:
> Sendmail frozen configuration  - version = "host";
> Documentation/networking/segmentation-offloads.txt:
> StuffIt Deluxe Segment (data) : gmentation Offloads in the
> Linux Networking Stack
> arch/sparc/include/asm/visasm.h:  SAS 7+
> arch/xtensa/kernel/setup.c: ,
> init=0x454c, stat=0x090a, dev=0x2009, bas=0x2020
> drivers/cpufreq/powernow-k8.c:
> TI-XX Graphing Calculator (FLASH)
> tools/testing/selftests/net/forwarding/tc_shblocks.sh:
> Minix filesystem, V2 (big endian)
> tools/perf/tests/.gitignore:
> LLVM byte-codes, uncompressed
> 
> All of the above seem to be valid ASCII or UTF-8 files, so the check
> above will lead
> to false-positives, but it may be good enough as they are the
> exception, and may be
> bugs in 'file'.
> 
> Not sure if we need to worry about 'file' not being installed.

checkpatch works on patches so I think the test isn't
really relevant.  It has to use the appropriate email
header that sets the charset.

perhaps:
---
 scripts/checkpatch.pl | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 34e4683de7a3..57355fbd2d28 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -2765,9 +2765,13 @@ sub process {
 # Check if there is UTF-8 in a commit log when a mail header has explicitly
 # declined it, i.e defined some charset where it is missing.
if ($in_header_lines &&
-   $rawline =~ /^Content-Type:.+charset="(.+)".*$/ &&
-   $1 !~ /utf-8/i) {
-   $non_utf8_charset = 1;
+   $rawline =~ /^Content-Type:.+charset="?([^\s;"]+)/) {
+   my $charset = $1;
+   $non_utf8_charset = 1 if ($charset !~ /^utf-8$/i);
+   if ($charset !~ /^(?:us-ascii|utf-8|iso-8859-1)$/) {
+   WARN("PATCH_CHARSET",
+"Unpreferred email header charset 
'$charset'\n" . $herecurr);
+   }
}
 
if ($in_commit_log && $non_utf8_charset && $realfile =~ /^$/ &&



--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 4.14.54 regression: rpfilter and DHCPv6

2018-07-25 Thread Eric Garver
On Wed, Jul 25, 2018 at 07:33:17PM +0500, Roman Mamedov wrote:
> Hello,
> 
> I have a machine which is a DHCPv6 client on a PPPoE connection. It also has:
> 
>   sysctl -w net.netfilter.nf_conntrack_tcp_loose=0
>   ip6tables -t raw -A PREROUTING ! -i lo -m rpfilter --invert -j DROP
> 
> After commits:
> 
>   netfilter: don't set F_IFACE on ipv6 fib lookups
>   http://patchwork.ozlabs.org/patch/873574/
> 
>   netfilter: ip6t_rpfilter: provide input interface for route lookup
>   https://patchwork.ozlabs.org/patch/919290/
> 
> ...the DHCPv6 client no longer sees any replies from the server. They are now
> filtered out by rpfilter. Removing the ip6tables rule shown above, or rolling
> back both of these commits, makes it all work fine again.
> 
> From commit messages it doesn't appear like this would be a "by design"
> behavior of these changes.
> 
> I did not test if other kernel branches (4.17 et al) are affected, but if they
> also have both of these, I guess they likely are.

I believe this was fixed by:

  cede24d1b21d ("netfilter: ip6t_rpfilter: provide input interface for route 
lookup")

which landed in 4.18.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


4.14.54 regression: rpfilter and DHCPv6

2018-07-25 Thread Roman Mamedov
Hello,

I have a machine which is a DHCPv6 client on a PPPoE connection. It also has:

  sysctl -w net.netfilter.nf_conntrack_tcp_loose=0
  ip6tables -t raw -A PREROUTING ! -i lo -m rpfilter --invert -j DROP

After commits:

  netfilter: don't set F_IFACE on ipv6 fib lookups
  http://patchwork.ozlabs.org/patch/873574/

  netfilter: ip6t_rpfilter: provide input interface for route lookup
  https://patchwork.ozlabs.org/patch/919290/

...the DHCPv6 client no longer sees any replies from the server. They are now
filtered out by rpfilter. Removing the ip6tables rule shown above, or rolling
back both of these commits, makes it all work fine again.

>From commit messages it doesn't appear like this would be a "by design"
behavior of these changes.

I did not test if other kernel branches (4.17 et al) are affected, but if they
also have both of these, I guess they likely are.

-- 
With respect,
Roman
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC/RFT net-next 00/17] net: Convert neighbor tables to per-namespace

2018-07-25 Thread David Ahern
On 7/25/18 6:33 AM, Eric W. Biederman wrote:
> Cong Wang  writes:
> 
>> On Tue, Jul 24, 2018 at 8:14 AM David Ahern  wrote:
>>>
>>> On 7/19/18 11:12 AM, Cong Wang wrote:
 On Thu, Jul 19, 2018 at 9:16 AM David Ahern  wrote:
>
> Chatting with Nikolay about this and he brought up a good corollary - ip
> fragmentation. It really is a similar problem in that memory is consumed
> as a result of packets received from an external entity. The ipfrag
> sysctls are per namespace with a limit that non-init_net namespaces can
> not set high_thresh > the current value of init_net. Potential memory
> consumed by fragments scales with the number of namespaces which is the
> primary concern with making neighbor tables per namespace.

 Nothing new, already discussed:
 https://marc.info/?l=linux-netdev&m=140391416215988&w=2

 :)

>>>
>>> Neighbor tables, bridge fdbs, vxlan fdbs and ip fragments all consume
>>> local memory resources due to received packets. bridge and vxlan fdb's
>>> are fairly straightforward analogs to neighbor entries; they are per
>>> device with no limits on the number of entries. Fragments have memory
>>> limits per namespace. So neighbor tables are the only ones with this
>>> strict limitation and concern on memory consumption.
>>>
>>> I get the impression there is no longer a strong resistance against
>>> moving the tables to per namespace, but deciding what is the right
>>> approach to handle backwards compatibility. Correct? Changing the
>>> accounting is inevitably going to be noticeable to some use case(s), but
>>> with sysctl settings it is a simple runtime update once the user knows
>>> to make the change.
>>
>> This question definitely should go to Eric Biederman who was against
>> my proposal.
>>
>> Let's add Eric into CC.
> 
> Given that the entries are per device and the devices are per-namespace,
> semantically neighbours are already kept in a per-namespace manner.  So
> this is all about making the code not honoring global resource limits.
> Making the code not honor gc_thresh3.
> 
> Skimming through the code today the default for gc_thresh3 is 1024.
> Which means that we limit the neighbour tables to 1024 entries per
> protocol type.
> 
> There are some pretty compelling reasons especially with ipv4 to keep
> the subnet size down.  Arp storms are a real thing.
> 
> I don't know off the top of my head what the reasons for limiting the
> neighbour table sizes.  I would be much more comfortable with a patchset
> like this if we did some research and figured out the reasons why
> we have a global limit.  Then changed the code to remove those limits.
> 
> When the limits are gone.  When the code can support large subnets
> without tuning.  We we don't have to worry about someone scanning an all
> addresses in an ipv6 subnet and causing a DOS on working machines.
> I think it is completely appropriate to look to see if something per
> network namespace needs to happen.
> 
> So please let's address the limits, not the fact that some specific
> corner case ran into them.
> 
> If we are going to neuter gc_thresh3 let's go as far as removing it
> entirely.  If we are going to make the neighbour table per something
> let's make it per network device.  If we can afford the multiple hash
> tables then a hash table per device is better.   Perhaps we want to move
> to rhash tables while we look at this, instead of an old hand grown
> version of resizable hash table.

Given the uses cases with increasing number of devices (> 10,000),
per-device tables will have more problems than per namespace - in
reference to your concern in the last paragraph below.

> 
> Unless I misread something all your patchset did is reshuffle code and
> data structures so that gc_thresh3 does not apply accross namespaces.
> That does not feel like it really fixes anything.  That just lies to
> people.

This patch set fixes the lie that network namespaces provide complete
isolation when in fact one namespace can evict neighbor entries from
another. An arp storm you are concerned about in one namespace impacts
all containers.

It starts by removing the proliferation of open coded references to
arp_tbl and nd_tbl, moving them behind the existing neigh_find_table.
>From there (patches 14-16) it makes the tables per-namespace and hence
makes the gc_thresh parameters which are per-table now
per-table-per-namespace.

So it removes the global thresholds because the global ones are just
wrong given the meaning of a network namespace and provides the more
appropriate per-namespace limits.

> 
> Further unless I misread something you are increasing the number of
> timers to 3 per namespace.  If I create create a thousand network
> namespaces that feels like it will hurt system performance overall.

It seems to me the timers are per neighbor entry not table. The per
table ones are for proxies.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the bo

Re: [PATCH 1/4] treewide: convert ISO_8859-1 text comments to utf-8

2018-07-25 Thread Arnd Bergmann
tools/perf/tests/.gitignore:
LLVM byte-codes, uncompressed
On Wed, Jul 25, 2018 at 2:55 AM, Andrew Morton
 wrote:
> On Tue, 24 Jul 2018 17:13:20 -0700 Joe Perches  wrote:
>
>> On Tue, 2018-07-24 at 14:00 -0700, Andrew Morton wrote:
>> > On Tue, 24 Jul 2018 13:13:25 +0200 Arnd Bergmann  wrote:
>> > > Almost all files in the kernel are either plain text or UTF-8
>> > > encoded. A couple however are ISO_8859-1, usually just a few
>> > > characters in a C comments, for historic reasons.
>> > > This converts them all to UTF-8 for consistency.
>> []
>> > Will we be getting a checkpatch rule to keep things this way?
>>
>> How would that be done?
>
> I'm using this, seems to work.
>
> if ! file $p | grep -q -P ", ASCII text|, UTF-8 Unicode text"
> then
> echo $p: weird charset
> fi

There are a couple of files that my version of 'find' incorrectly identified as
something completely different, like:

Documentation/devicetree/bindings/pinctrl/pinctrl-sx150x.txt:
SemOne archive data
Documentation/devicetree/bindings/rtc/epson,rtc7301.txt:
Microsoft Document Imaging Format
Documentation/filesystems/nfs/pnfs-block-server.txt:
PPMN archive data
arch/arm/boot/dts/bcm283x-rpi-usb-host.dtsi:
Sendmail frozen configuration  - version = "host";
Documentation/networking/segmentation-offloads.txt:
StuffIt Deluxe Segment (data) : gmentation Offloads in the
Linux Networking Stack
arch/sparc/include/asm/visasm.h:  SAS 7+
arch/xtensa/kernel/setup.c: ,
init=0x454c, stat=0x090a, dev=0x2009, bas=0x2020
drivers/cpufreq/powernow-k8.c:
TI-XX Graphing Calculator (FLASH)
tools/testing/selftests/net/forwarding/tc_shblocks.sh:
Minix filesystem, V2 (big endian)
tools/perf/tests/.gitignore:
LLVM byte-codes, uncompressed

All of the above seem to be valid ASCII or UTF-8 files, so the check
above will lead
to false-positives, but it may be good enough as they are the
exception, and may be
bugs in 'file'.

Not sure if we need to worry about 'file' not being installed.

   Arnd
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH PATCH net-next 08/18] sctp: whitespace fixes

2018-07-25 Thread Neil Horman
On Tue, Jul 24, 2018 at 12:29:08PM -0700, Stephen Hemminger wrote:
> Remove blank line at EOF and trailing whitespace.
> 
> Signed-off-by: Stephen Hemminger 
> ---
>  net/sctp/Kconfig | 4 ++--
>  net/sctp/sm_sideeffect.c | 1 -
>  2 files changed, 2 insertions(+), 3 deletions(-)
> 
> diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig
> index c740b189d4ba..950ecf6e7439 100644
> --- a/net/sctp/Kconfig
> +++ b/net/sctp/Kconfig
> @@ -41,8 +41,8 @@ config SCTP_DBG_OBJCNT
>   bool "SCTP: Debug object counts"
>   depends on PROC_FS
>   help
> -   If you say Y, this will enable debugging support for counting the 
> -   type of objects that are currently allocated.  This is useful for 
> +   If you say Y, this will enable debugging support for counting the
> +   type of objects that are currently allocated.  This is useful for
> identifying memory leaks. This debug information can be viewed by
> 'cat /proc/net/sctp/sctp_dbg_objcnt'
>  
> diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
> index 298112ca8c06..85d393090238 100644
> --- a/net/sctp/sm_sideeffect.c
> +++ b/net/sctp/sm_sideeffect.c
> @@ -1827,4 +1827,3 @@ static int sctp_cmd_interpreter(enum sctp_event 
> event_type,
>   error = -ENOMEM;
>   goto out;
>  }
> -
> -- 
> 2.18.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-sctp" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
Acked-by: Neil Horman 

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH PATCH net-next 07/18] ceph: fix whitespace

2018-07-25 Thread Ilya Dryomov
On Tue, Jul 24, 2018 at 9:31 PM Stephen Hemminger
 wrote:
>
> Remove blank lines at end of file and trailing whitespace.
>
> Signed-off-by: Stephen Hemminger 
> ---
>  net/ceph/Kconfig  | 1 -
>  net/ceph/Makefile | 1 -
>  net/ceph/auth_none.c  | 1 -
>  net/ceph/auth_none.h  | 1 -
>  net/ceph/auth_x.c | 2 --
>  net/ceph/auth_x.h | 1 -
>  net/ceph/mon_client.c | 2 +-
>  net/ceph/pagevec.c| 1 -
>  8 files changed, 1 insertion(+), 9 deletions(-)
>
> diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig
> index f8cceb99e732..cd2d5b9301a1 100644
> --- a/net/ceph/Kconfig
> +++ b/net/ceph/Kconfig
> @@ -41,4 +41,3 @@ config CEPH_LIB_USE_DNS_RESOLVER
>   Documentation/networking/dns_resolver.txt
>
>   If unsure, say N.
> -
> diff --git a/net/ceph/Makefile b/net/ceph/Makefile
> index 12bf49772d24..db09defe27d0 100644
> --- a/net/ceph/Makefile
> +++ b/net/ceph/Makefile
> @@ -15,4 +15,3 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o 
> pagelist.o \
> auth_x.o \
> ceph_fs.o ceph_strings.o ceph_hash.o \
> pagevec.o snapshot.o string_table.o
> -
> diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c
> index 41d2a0c72236..edb7042479ed 100644
> --- a/net/ceph/auth_none.c
> +++ b/net/ceph/auth_none.c
> @@ -142,4 +142,3 @@ int ceph_auth_none_init(struct ceph_auth_client *ac)
> ac->ops = &ceph_auth_none_ops;
> return 0;
>  }
> -
> diff --git a/net/ceph/auth_none.h b/net/ceph/auth_none.h
> index 860ed9875791..4158f064302e 100644
> --- a/net/ceph/auth_none.h
> +++ b/net/ceph/auth_none.h
> @@ -26,4 +26,3 @@ struct ceph_auth_none_info {
>  int ceph_auth_none_init(struct ceph_auth_client *ac);
>
>  #endif
> -
> diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
> index 2f4a1baf5f52..32c7f5c4b1a6 100644
> --- a/net/ceph/auth_x.c
> +++ b/net/ceph/auth_x.c
> @@ -823,5 +823,3 @@ int ceph_x_init(struct ceph_auth_client *ac)
>  out:
> return ret;
>  }
> -
> -
> diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h
> index 454cb54568af..a71c4c282b57 100644
> --- a/net/ceph/auth_x.h
> +++ b/net/ceph/auth_x.h
> @@ -52,4 +52,3 @@ struct ceph_x_info {
>  int ceph_x_init(struct ceph_auth_client *ac);
>
>  #endif
> -
> diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
> index d7a7a2330ef7..18deb3d889c4 100644
> --- a/net/ceph/mon_client.c
> +++ b/net/ceph/mon_client.c
> @@ -1249,7 +1249,7 @@ static void dispatch(struct ceph_connection *con, 
> struct ceph_msg *msg)
> if (monc->client->extra_mon_dispatch &&
> monc->client->extra_mon_dispatch(monc->client, msg) == 0)
> break;
> -
> +
> pr_err("received unknown message type %d %s\n", type,
>ceph_msg_type_name(type));
> }
> diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
> index e560d3975f41..d3736f5bffec 100644
> --- a/net/ceph/pagevec.c
> +++ b/net/ceph/pagevec.c
> @@ -197,4 +197,3 @@ void ceph_zero_page_vector_range(int off, int len, struct 
> page **pages)
> }
>  }
>  EXPORT_SYMBOL(ceph_zero_page_vector_range);
> -

Applied.

Thanks,

Ilya
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC/RFT net-next 00/17] net: Convert neighbor tables to per-namespace

2018-07-25 Thread Eric W. Biederman
Cong Wang  writes:

> On Tue, Jul 24, 2018 at 8:14 AM David Ahern  wrote:
>>
>> On 7/19/18 11:12 AM, Cong Wang wrote:
>> > On Thu, Jul 19, 2018 at 9:16 AM David Ahern  wrote:
>> >>
>> >> Chatting with Nikolay about this and he brought up a good corollary - ip
>> >> fragmentation. It really is a similar problem in that memory is consumed
>> >> as a result of packets received from an external entity. The ipfrag
>> >> sysctls are per namespace with a limit that non-init_net namespaces can
>> >> not set high_thresh > the current value of init_net. Potential memory
>> >> consumed by fragments scales with the number of namespaces which is the
>> >> primary concern with making neighbor tables per namespace.
>> >
>> > Nothing new, already discussed:
>> > https://marc.info/?l=linux-netdev&m=140391416215988&w=2
>> >
>> > :)
>> >
>>
>> Neighbor tables, bridge fdbs, vxlan fdbs and ip fragments all consume
>> local memory resources due to received packets. bridge and vxlan fdb's
>> are fairly straightforward analogs to neighbor entries; they are per
>> device with no limits on the number of entries. Fragments have memory
>> limits per namespace. So neighbor tables are the only ones with this
>> strict limitation and concern on memory consumption.
>>
>> I get the impression there is no longer a strong resistance against
>> moving the tables to per namespace, but deciding what is the right
>> approach to handle backwards compatibility. Correct? Changing the
>> accounting is inevitably going to be noticeable to some use case(s), but
>> with sysctl settings it is a simple runtime update once the user knows
>> to make the change.
>
> This question definitely should go to Eric Biederman who was against
> my proposal.
>
> Let's add Eric into CC.

Given that the entries are per device and the devices are per-namespace,
semantically neighbours are already kept in a per-namespace manner.  So
this is all about making the code not honoring global resource limits.
Making the code not honor gc_thresh3.

Skimming through the code today the default for gc_thresh3 is 1024.
Which means that we limit the neighbour tables to 1024 entries per
protocol type.

There are some pretty compelling reasons especially with ipv4 to keep
the subnet size down.  Arp storms are a real thing.

I don't know off the top of my head what the reasons for limiting the
neighbour table sizes.  I would be much more comfortable with a patchset
like this if we did some research and figured out the reasons why
we have a global limit.  Then changed the code to remove those limits.

When the limits are gone.  When the code can support large subnets
without tuning.  We we don't have to worry about someone scanning an all
addresses in an ipv6 subnet and causing a DOS on working machines.
I think it is completely appropriate to look to see if something per
network namespace needs to happen.

So please let's address the limits, not the fact that some specific
corner case ran into them.

If we are going to neuter gc_thresh3 let's go as far as removing it
entirely.  If we are going to make the neighbour table per something
let's make it per network device.  If we can afford the multiple hash
tables then a hash table per device is better.   Perhaps we want to move
to rhash tables while we look at this, instead of an old hand grown
version of resizable hash table.

Unless I misread something all your patchset did is reshuffle code and
data structures so that gc_thresh3 does not apply accross namespaces.
That does not feel like it really fixes anything.  That just lies to
people.

Further unless I misread something you are increasing the number of
timers to 3 per namespace.  If I create create a thousand network
namespaces that feels like it will hurt system performance overall.

Eric


--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] netfilter: ipset: export indexes via netlink

2018-07-25 Thread Florent Fourcot

Hi Jozsef,

Thanks a lot, I will test it on my side this week.

Just a small comment after a short code review: what about adding 
IPSET_ATTR_INDEX in list command when proto is greater than 6? I agree 
that specific commands are a good idea, but i still think that adding it 
in list is a good idea (it's probably less relevant in ip_set_header 
with your patch).


Best regards,

Florent
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: netfilter xt_alloc_table_info regression

2018-07-25 Thread Michal Hocko
On Wed 25-07-18 14:34:36, Georgi Nikolov wrote:
> Hello,
> 
> I made tests with patch reverted and seems that old version is masking
> errors.
> 
> I will write to cgroup developers, because this only happens when
> cgroups are enabled and there are large number of cgroups created.

It seems somebody is depleting the vmalloc space. Feel free to CC me on
that report. I am definitely interested.
-- 
Michal Hocko
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH nf-next] netfilter: conntrack: avoid use-after free on rmmod

2018-07-25 Thread Florian Westphal
When the conntrack module is removed, we call nf_ct_iterate_destroy via
nf_ct_l4proto_unregister().

Problem is that nf_conntrack_proto_fini() gets called after the
conntrack hash table has already been freed.

Just remove the l4proto unregister call, its unecessary as the
nf_ct_protos[] array gets free'd right after anyway.

Fixes: a0ae2562c6c4b2 ("netfilter: conntrack: remove l3proto abstraction")
Signed-off-by: Florian Westphal 
---
 net/netfilter/nf_conntrack_proto.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/net/netfilter/nf_conntrack_proto.c 
b/net/netfilter/nf_conntrack_proto.c
index 803607a90102..a7bbbefd8ec9 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -940,8 +940,6 @@ void nf_conntrack_proto_fini(void)
 {
unsigned int i;
 
-   nf_ct_l4proto_unregister(builtin_l4proto,
-ARRAY_SIZE(builtin_l4proto));
nf_unregister_sockopt(&so_getorigdst);
 #if IS_ENABLED(CONFIG_IPV6)
nf_unregister_sockopt(&so_getorigdst6);
-- 
2.16.4

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: netfilter xt_alloc_table_info regression

2018-07-25 Thread Georgi Nikolov
Hello,

I made tests with patch reverted and seems that old version is masking
errors.

I will write to cgroup developers, because this only happens when
cgroups are enabled and there are large number of cgroups created.

Thank you for your time.

Regards,

-- Georgi Nikolov



On 07/24/2018 11:39 AM, Michal Hocko wrote:
> On Tue 24-07-18 10:36:38, Georgi Nikolov wrote:
>> Hello,
>>
>> I posted a kernel bug https://bugzilla.kernel.org/show_bug.cgi?id=200639 and
>> i hope this is the correct place to discuss this.
> Let me quote your report for the full context
>
> : Folowing commit leads to "vmalloc: allocation failure" when cgroups memory 
> controller is enabled:
> : 
> : 
> https://github.com/torvalds/linux/commit/eacd86ca3b036e55e172b7279f101cef4a6ff3a4
> : 
> : After creating large number of cgroups and under memory pressure, iptables 
> command fails with following error:
> : 
> : "iptables-restor: vmalloc: allocation failure, allocated 3047424 of 3465216 
> bytes, mode:0x14010c0(GFP_KERNEL|__GFP_NORETRY), nodemask=(null)"
> : 
> : System which is used to reproduce the bug is with 2 vcpus and 2GB of ram, 
> but it happens on more powerfull systems.
> : 
> : Steps to reproduce:
> : 
> : mkdir /cgroup
> : mount cgroup -t cgroup -omemory,pids,blkio,cpuacct /cgroup
> : for a in `seq 1 1000`; do for b in `seq 1 4` ; do mkdir -p 
> "/cgroup/user/$a/$b"; done; done
> : 
> : Then in separate consoles
> : 
> : cat /dev/vda > /dev/null
> : ./test
> : ./test
> : i=0;while sleep 0 ; do iptables-restore < iptables.save ; i=$(($i+1)); echo 
> $i; done
> : 
> : Here is the source of "test" program and attached iptables.save. It happens 
> also with smaller iptables.save file.
> : 
> : #include 
> : #include 
> : 
> : int main(void) {
> : 
> : srand(time(NULL));
> : int i = 0, j = 0, randnum=0;
> : int arr[6] = { 3072, 7168, 15360 , 31744, 64512, 130048}; 
> : while(1) {
> : 
> : for (i = 0; i < 6 ; i++) {
> : 
> : int *ptr = (int*) malloc(arr[i] * 93);  
> : 
> : for(j = 0 ; j < arr[i] * 93 / sizeof(int); j++) {
> : *(ptr+j) = j+1;
> : }
> : 
> : free(ptr);
> : }
> : }   
> : }
>
> Have you confirmed that revering eacd86ca3b03
> ("net/netfilter/x_tables.c: use kvmalloc() in xt_alloc_table_info()")
> fixes the allocation failure you are seeing?
>
> There are only two differences the patch has introduced. It has
> introduced vmalloc fallback for all but order-0 sizes and it dropped
> __GFP_NOWARN from the vmalloc call.
>
> The later would allow to print the error message. Just to make it clear,
> the regression you are seeing is not just the error message. It is
> iptables-restore that fails and hasn't before, right?



[PATCH][v3] netfilter: use kvmalloc_array to allocate memory for hashtable

2018-07-25 Thread Li RongQing
nf_ct_alloc_hashtable is used to allocate memory for conntrack,
NAT bysrc and expectation hashtable. Assuming 64k bucket size,
which means 7th order page allocation, __get_free_pages, called
by nf_ct_alloc_hashtable, will trigger the direct memory reclaim
and stall for a long time, when system has lots of memory stress

so replace combination of __get_free_pages and vzalloc with
kvmalloc_array, which provides a overflow check and a fallback
if no high order memory is available, and do not retry to reclaim
memory, reduce stall

and remove nf_ct_free_hashtable, since it is just a kvfree

Signed-off-by: Zhang Yu 
Signed-off-by: Wang Li 
Signed-off-by: Li RongQing 
---
 include/net/netfilter/nf_conntrack.h |  2 --
 net/netfilter/nf_conntrack_core.c| 29 ++---
 net/netfilter/nf_conntrack_expect.c  |  2 +-
 net/netfilter/nf_conntrack_helper.c  |  4 ++--
 net/netfilter/nf_nat_core.c  |  4 ++--
 5 files changed, 11 insertions(+), 30 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack.h 
b/include/net/netfilter/nf_conntrack.h
index a2b0ed025908..7e012312cd61 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -176,8 +176,6 @@ void nf_ct_netns_put(struct net *net, u8 nfproto);
  */
 void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls);
 
-void nf_ct_free_hashtable(void *hash, unsigned int size);
-
 int nf_conntrack_hash_check_insert(struct nf_conn *ct);
 bool nf_ct_delete(struct nf_conn *ct, u32 pid, int report);
 
diff --git a/net/netfilter/nf_conntrack_core.c 
b/net/netfilter/nf_conntrack_core.c
index 8a113ca1eea2..429151b4991a 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -2022,16 +2022,6 @@ static int kill_all(struct nf_conn *i, void *data)
return net_eq(nf_ct_net(i), data);
 }
 
-void nf_ct_free_hashtable(void *hash, unsigned int size)
-{
-   if (is_vmalloc_addr(hash))
-   vfree(hash);
-   else
-   free_pages((unsigned long)hash,
-  get_order(sizeof(struct hlist_head) * size));
-}
-EXPORT_SYMBOL_GPL(nf_ct_free_hashtable);
-
 void nf_conntrack_cleanup_start(void)
 {
conntrack_gc_work.exiting = true;
@@ -2042,7 +2032,7 @@ void nf_conntrack_cleanup_end(void)
 {
RCU_INIT_POINTER(nf_ct_hook, NULL);
cancel_delayed_work_sync(&conntrack_gc_work.dwork);
-   nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
+   kvfree(nf_conntrack_hash);
 
nf_conntrack_proto_fini();
nf_conntrack_seqadj_fini();
@@ -2108,7 +2098,6 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int 
nulls)
 {
struct hlist_nulls_head *hash;
unsigned int nr_slots, i;
-   size_t sz;
 
if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head)))
return NULL;
@@ -2116,14 +2105,8 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int 
nulls)
BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct 
hlist_head));
nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct 
hlist_nulls_head));
 
-   if (nr_slots > (UINT_MAX / sizeof(struct hlist_nulls_head)))
-   return NULL;
-
-   sz = nr_slots * sizeof(struct hlist_nulls_head);
-   hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
-   get_order(sz));
-   if (!hash)
-   hash = vzalloc(sz);
+   hash = kvmalloc_array(nr_slots, sizeof(struct hlist_nulls_head),
+   GFP_KERNEL | __GFP_ZERO);
 
if (hash && nulls)
for (i = 0; i < nr_slots; i++)
@@ -2150,7 +2133,7 @@ int nf_conntrack_hash_resize(unsigned int hashsize)
 
old_size = nf_conntrack_htable_size;
if (old_size == hashsize) {
-   nf_ct_free_hashtable(hash, hashsize);
+   kvfree(hash);
return 0;
}
 
@@ -2186,7 +2169,7 @@ int nf_conntrack_hash_resize(unsigned int hashsize)
local_bh_enable();
 
synchronize_net();
-   nf_ct_free_hashtable(old_hash, old_size);
+   kvfree(old_hash);
return 0;
 }
 
@@ -2350,7 +2333,7 @@ int nf_conntrack_init_start(void)
 err_expect:
kmem_cache_destroy(nf_conntrack_cachep);
 err_cachep:
-   nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
+   kvfree(nf_conntrack_hash);
return ret;
 }
 
diff --git a/net/netfilter/nf_conntrack_expect.c 
b/net/netfilter/nf_conntrack_expect.c
index 3f586ba23d92..27b84231db10 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -712,5 +712,5 @@ void nf_conntrack_expect_fini(void)
 {
rcu_barrier(); /* Wait for call_rcu() before destroy */
kmem_cache_destroy(nf_ct_expect_cachep);
-   nf_ct_free_hashtable(nf_ct_expect_hash, nf_ct_expect_hsize);
+   kvfree(nf_ct_expect_hash);
 }
diff --git a/net/netfilter/nf_conntr

Re: Re: [PATCH v2] ipvs: fix race between ip_vs_conn_new() andip_vs_del_dest()

2018-07-25 Thread tan.hu
Thanks, patch-v3 has been sent. please check it again.

> Hello,
> 
> On Wed, 25 Jul 2018, Tan Hu wrote:
> 
> > We came across infinite loop in ipvs when using ipvs in docker
> > env.
> >
> > When ipvs receives new packets and cannot find an ipvs connection,
> > it will create a new connection, then if the dest is unavailable
> > (i.e. IP_VS_DEST_F_AVAILABLE), the packet will be dropped sliently.
> >
> > But if the dropped packet is the first packet of this connection,
> > the connection control timer never has a chance to start and the
> > ipvs connection cannot be released. This will lead to memory leak, or
> > infinite loop in cleanup_net() when net namespace is released like
> > this:
> >
> > ip_vs_conn_net_cleanup at a0a9f31a [ip_vs]
> > __ip_vs_cleanup at a0a9f60a [ip_vs]
> > ops_exit_list at 81567a49
> > cleanup_net at 81568b40
> > process_one_work at 810a851b
> > worker_thread at 810a9356
> > kthread at 810b0b6f
> > ret_from_fork at 81697a18
> >
> > race condition:
> > CPU1 CPU2
> > ip_vs_in()
> > ip_vs_conn_new()
> > ip_vs_del_dest()
> > __ip_vs_unlink_dest()
> > ~IP_VS_DEST_F_AVAILABLE
> > cp->dest && !IP_VS_DEST_F_AVAILABLE
> > __ip_vs_conn_put
> > ...
> > cleanup_net ---> infinite looping
> >
> > Fix this by checking whether the timer already started.
> >
> > Signed-off-by: Tan Hu 
> > Reviewed-by: Jiang Biao 
> > ---
> > v2: fix use-after-free in CONN_ONE_PACKET case suggested by Julian Anastasov
> >
> > net/netfilter/ipvs/ip_vs_core.c | 15 +++
> > 1 file changed, 11 insertions(+), 4 deletions(-)
> >
> > diff --git a/net/netfilter/ipvs/ip_vs_core.c 
> > b/net/netfilter/ipvs/ip_vs_core.c
> > index 0679dd1..a17104f 100644
> > --- a/net/netfilter/ipvs/ip_vs_core.c
> > +++ b/net/netfilter/ipvs/ip_vs_core.c
> > @@ -1972,13 +1972,20 @@ static int ip_vs_in_icmp_v6(struct netns_ipvs 
> > *ipvs, struct sk_buff *skb,
> > if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
> > /* the destination server is not available */
> >
> > - if (sysctl_expire_nodest_conn(ipvs)) {
> > + __u32 flags = cp->flags;
> 
> Ops, now scripts/checkpatch.pl --strict /tmp/file.patch
> is complaining about extra trailing space in above line.
> You can also remove the empty line above the new code...
> 
> Regards

[PATCH v3] ipvs: fix race between ip_vs_conn_new() and ip_vs_del_dest()

2018-07-25 Thread Tan Hu
We came across infinite loop in ipvs when using ipvs in docker
env.

When ipvs receives new packets and cannot find an ipvs connection,
it will create a new connection, then if the dest is unavailable
(i.e. IP_VS_DEST_F_AVAILABLE), the packet will be dropped sliently.

But if the dropped packet is the first packet of this connection,
the connection control timer never has a chance to start and the
ipvs connection cannot be released. This will lead to memory leak, or
infinite loop in cleanup_net() when net namespace is released like
this:

ip_vs_conn_net_cleanup at a0a9f31a [ip_vs]
__ip_vs_cleanup at a0a9f60a [ip_vs]
ops_exit_list at 81567a49
cleanup_net at 81568b40
process_one_work at 810a851b
worker_thread at 810a9356
kthread at 810b0b6f
ret_from_fork at 81697a18

race condition:
CPU1   CPU2
ip_vs_in()
  ip_vs_conn_new()
   ip_vs_del_dest()
 __ip_vs_unlink_dest()
   ~IP_VS_DEST_F_AVAILABLE
  cp->dest && !IP_VS_DEST_F_AVAILABLE
  __ip_vs_conn_put
...
cleanup_net  ---> infinite looping

Fix this by checking whether the timer already started.

Signed-off-by: Tan Hu 
Reviewed-by: Jiang Biao 
---
v2: fix use-after-free in CONN_ONE_PACKET case suggested by Julian Anastasov
v3: remove trailing whitespace for patch checking 

 net/netfilter/ipvs/ip_vs_core.c | 15 +++
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 0679dd1..a17104f 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1972,13 +1972,20 @@ static int ip_vs_in_icmp_v6(struct netns_ipvs *ipvs, 
struct sk_buff *skb,
if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
/* the destination server is not available */

-   if (sysctl_expire_nodest_conn(ipvs)) {
+   __u32 flags = cp->flags;
+
+   /* when timer already started, silently drop the packet.*/
+   if (timer_pending(&cp->timer))
+   __ip_vs_conn_put(cp);
+   else
+   ip_vs_conn_put(cp);
+
+   if (sysctl_expire_nodest_conn(ipvs) &&
+   !(flags & IP_VS_CONN_F_ONE_PACKET)) {
/* try to expire the connection immediately */
ip_vs_conn_expire_now(cp);
}
-   /* don't restart its timer, and silently
-  drop the packet. */
-   __ip_vs_conn_put(cp);
+
return NF_DROP;
}

--
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html