Re: [PATCH v4 02/27] net: datagram: fix some kernel-doc markups
On 16.11.2020 13:17, Mauro Carvalho Chehab wrote: > Some identifiers have different names between their prototypes > and the kernel-doc markup. > > Signed-off-by: Mauro Carvalho Chehab Reviewed-by: Kirill Tkhai > --- > net/core/datagram.c | 2 +- > net/core/dev.c| 4 ++-- > net/core/skbuff.c | 2 +- > net/ethernet/eth.c| 6 +++--- > net/sunrpc/rpc_pipe.c | 3 ++- > 5 files changed, 9 insertions(+), 8 deletions(-) > > diff --git a/net/core/datagram.c b/net/core/datagram.c > index 9fcaa544f11a..81809fa735a7 100644 > --- a/net/core/datagram.c > +++ b/net/core/datagram.c > @@ -692,41 +692,41 @@ EXPORT_SYMBOL(__zerocopy_sg_from_iter); > * @from: the source to copy from > * > * The function will first copy up to headlen, and then pin the userspace > * pages and build frags through them. > * > * Returns 0, -EFAULT or -EMSGSIZE. > */ > int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) > { > int copy = min_t(int, skb_headlen(skb), iov_iter_count(from)); > > /* copy up to skb headlen */ > if (skb_copy_datagram_from_iter(skb, 0, from, copy)) > return -EFAULT; > > return __zerocopy_sg_from_iter(NULL, skb, from, ~0U); > } > EXPORT_SYMBOL(zerocopy_sg_from_iter); > > /** > - * skb_copy_and_csum_datagram_iter - Copy datagram to an iovec iterator > + * skb_copy_and_csum_datagram - Copy datagram to an iovec iterator > * and update a checksum. > * @skb: buffer to copy > * @offset: offset in the buffer to start copying from > * @to: iovec iterator to copy to > * @len: amount of data to copy from buffer to iovec > * @csump: checksum pointer > */ > static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, > struct iov_iter *to, int len, > __wsum *csump) > { > return __skb_datagram_iter(skb, offset, to, len, true, > csum_and_copy_to_iter, csump); > } > > /** > * skb_copy_and_csum_datagram_msg - Copy and checksum skb to user iovec. 
> * @skb: skbuff > * @hlen: hardware length > * @msg: destination > diff --git a/net/core/dev.c b/net/core/dev.c > index 60d325bda0d7..4bfdcd6b20e8 100644 > --- a/net/core/dev.c > +++ b/net/core/dev.c > @@ -6902,41 +6902,41 @@ static int netdev_has_upper_dev(struct net_device > *upper_dev, > * > * Find out if a device is linked to specified upper device and return true > * in case it is. Note that this checks only immediate upper device, > * not through a complete stack of devices. The caller must hold the RTNL > lock. > */ > bool netdev_has_upper_dev(struct net_device *dev, > struct net_device *upper_dev) > { > struct netdev_nested_priv priv = { > .data = (void *)upper_dev, > }; > > ASSERT_RTNL(); > > return netdev_walk_all_upper_dev_rcu(dev, netdev_has_upper_dev, >&priv); > } > EXPORT_SYMBOL(netdev_has_upper_dev); > > /** > - * netdev_has_upper_dev_all - Check if device is linked to an upper device > + * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper > device > * @dev: device > * @upper_dev: upper device to check > * > * Find out if a device is linked to specified upper device and return true > * in case it is. Note that this checks the entire upper device chain. > * The caller must hold rcu lock. 
> */ > > bool netdev_has_upper_dev_all_rcu(struct net_device *dev, > struct net_device *upper_dev) > { > struct netdev_nested_priv priv = { > .data = (void *)upper_dev, > }; > > return !!netdev_walk_all_upper_dev_rcu(dev, netdev_has_upper_dev, > &priv); > } > EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu); > > @@ -8140,41 +8140,41 @@ void netdev_adjacent_rename_links(struct net_device > *dev, char *oldname) > } > } > > void *netdev_lower_dev_get_private(struct net_device *dev, > struct net_device *lower_dev) > { > struct netdev_adjacent *lower; > > if (!lower_dev) > return NULL; > lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower); > if (!lower) > return NULL; > > return lower->private; > } > EXPORT_SYMBOL(netdev_lower_dev_get_private); > > > /** > - * netdev_lower_change - Dispatch event about lower
[PATCH net-next] tun: Do SIOCGSKNS out of rtnl_lock()
Since net ns of tun device is assigned on the device creation, and it never changes, we do not need to use any lock to get it from alive tun. Signed-off-by: Kirill Tkhai --- drivers/net/tun.c | 18 +++--- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index d3c04ab9752a..44d4f3d25350 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2850,10 +2850,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg, int ifreq_len) { struct tun_file *tfile = file->private_data; + struct net *net = sock_net(&tfile->sk); struct tun_struct *tun; void __user* argp = (void __user*)arg; struct ifreq ifr; - struct net *net; kuid_t owner; kgid_t group; int sndbuf; @@ -2877,14 +2877,18 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, */ return put_user(IFF_TUN | IFF_TAP | TUN_FEATURES, (unsigned int __user*)argp); - } else if (cmd == TUNSETQUEUE) + } else if (cmd == TUNSETQUEUE) { return tun_set_queue(file, &ifr); + } else if (cmd == SIOCGSKNS) { + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + return open_related_ns(&net->ns, get_net_ns); + } ret = 0; rtnl_lock(); tun = tun_get(tfile); - net = sock_net(&tfile->sk); if (cmd == TUNSETIFF) { ret = -EEXIST; if (tun) @@ -2914,14 +2918,6 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, tfile->ifindex = ifindex; goto unlock; } - if (cmd == SIOCGSKNS) { - ret = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - goto unlock; - - ret = open_related_ns(&net->ns, get_net_ns); - goto unlock; - } ret = -EBADFD; if (!tun)
Re: [PATCH net-next] tun: Do SIOCGSKNS out of rtnl_lock()
Hi, Jason, On 09.05.2018 10:18, Jason Wang wrote: > > > On 2018年05月09日 00:21, Kirill Tkhai wrote: >> Since net ns of tun device is assigned on the device creation, >> and it never changes, we do not need to use any lock to get it >> from alive tun. >> >> Signed-off-by: Kirill Tkhai >> --- >> drivers/net/tun.c | 18 +++--- >> 1 file changed, 7 insertions(+), 11 deletions(-) >> >> diff --git a/drivers/net/tun.c b/drivers/net/tun.c >> index d3c04ab9752a..44d4f3d25350 100644 >> --- a/drivers/net/tun.c >> +++ b/drivers/net/tun.c >> @@ -2850,10 +2850,10 @@ static long __tun_chr_ioctl(struct file *file, >> unsigned int cmd, >> unsigned long arg, int ifreq_len) >> { >> struct tun_file *tfile = file->private_data; >> + struct net *net = sock_net(&tfile->sk); >> struct tun_struct *tun; >> void __user* argp = (void __user*)arg; >> struct ifreq ifr; >> - struct net *net; >> kuid_t owner; >> kgid_t group; >> int sndbuf; >> @@ -2877,14 +2877,18 @@ static long __tun_chr_ioctl(struct file *file, >> unsigned int cmd, >> */ >> return put_user(IFF_TUN | IFF_TAP | TUN_FEATURES, >> (unsigned int __user*)argp); >> - } else if (cmd == TUNSETQUEUE) >> + } else if (cmd == TUNSETQUEUE) { >> return tun_set_queue(file, &ifr); >> + } else if (cmd == SIOCGSKNS) { > > Not for this patch, reusing socket ioctl cmd is probably not good though they > were probably not intersected (see ioctl-number.txt). We probably need to > introduce TUN specific ioctls for SIOCGSKNS and SIOCGIFHWADDR and warn for > socket ones. Most of the socket ioctl cmds use the 0x89 type: #define SOCK_IOC_TYPE 0x89 while the tun cmds use type 0x54 ('T'). They should not intersect. The only exceptions are #define SIOCINQ FIONREAD #define SIOCOUTQ TIOCOUTQ #define TIOCOUTQ 0x5411 #define FIONREAD 0x541B But they can't intersect even with the exceptions, since tun nr starts from 200: #define TUNSETNOCSUM _IOW('T', 200, int) and 200 > 0x1b (== 27). I implemented the SIOCGSKNS cmd in the same style as the older socket cmds. 
I'm not sure we can remove the existing SIOCGIFHWADDR handling, since it is already in use. If we add a warning, when would we be able to remove it? Some old software may still use it, and if such a program is no longer developed, nobody can fix the warnings even if he/she sees them. Kirill > >> + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) >> + return -EPERM; >> + return open_related_ns(&net->ns, get_net_ns); >> + } >> ret = 0; >> rtnl_lock(); >> tun = tun_get(tfile); >> - net = sock_net(&tfile->sk); >> if (cmd == TUNSETIFF) { >> ret = -EEXIST; >> if (tun) >> @@ -2914,14 +2918,6 @@ static long __tun_chr_ioctl(struct file *file, >> unsigned int cmd, >> tfile->ifindex = ifindex; >> goto unlock; >> } >> - if (cmd == SIOCGSKNS) { >> - ret = -EPERM; >> - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) >> - goto unlock; >> - >> - ret = open_related_ns(&net->ns, get_net_ns); >> - goto unlock; >> - } >> ret = -EBADFD; >> if (!tun) >> >
Re: net namespaces kernel stack overflow
Hi, Alexander! On 18.04.2018 22:45, Alexander Aring wrote: > I currently can crash my net/master kernel by execute the following script: > > --- snip > > modprobe dummy > > #mkdir /var/run/netns > #touch /var/run/netns/init_net > #mount --bind /proc/1/ns/net /var/run/netns/init_net > > while true > do > mkdir /var/run/netns > touch /var/run/netns/init_net > mount --bind /proc/1/ns/net /var/run/netns/init_net > > ip netns add foo > ip netns exec foo ip link add dummy0 type dummy > ip netns delete foo > done A fast answer is best, so I tried your test on my not-for-work computer. It runs an old kernel without asynchronous pernet operations: $uname -a Linux localhost.localdomain 4.15.0-2-amd64 #1 SMP Debian 4.15.11-1 (2018-03-20) x86_64 GNU/Linux After approximately 15 seconds of running your test it died :( (Fortunately, I executed it in "init 1" with all partitions RO, as usual). There is no serial console, so I can't say whether the first stack is exactly the same as yours. But it crashed. So it seems the problem has existed for a long time. Have you tried reproducing it on older kernels, or bisecting for the problem commit? Or maybe it doesn't reproduce on old kernels in your environment? > --- snap > > After max ~1 minute the kernel will crash. > Doing my hack of saving init_net outside the loop it will run fine... > So the mount bind is necessary. 
> > The last message which I see is: > > BUG: stack guard page was hit at f0751759 (stack is > 69363195..73ddc474) > kernel stack overflow (double-fault): [#1] SMP PTI > Modules linked in: > CPU: 0 PID: 13917 Comm: ip Not tainted 4.16.0-11878-gef9d066f6808 #32 > Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 > 04/01/2014 > RIP: 0010:validate_chain.isra.23+0x44/0xc40 > RSP: 0018:c92cbff8 EFLAGS: 00010002 > RAX: 0004 RBX: 0e58b88e1d4d15da RCX: 0e58b88e1d4d15da > RDX: RSI: 8802b25ee2a0 RDI: 8802b25edb00 > RBP: 0e58b88e1d4d15da R08: R09: 0004 > R10: c92cc050 R11: 8802b1054be8 R12: 0001 > R13: 8802b25ee268 R14: 8802b25edb00 R15: > FS: () GS:8802bfc0() knlGS: > CS: 0010 DS: ES: CR0: 80050033 > CR2: c92cbfe8 CR3: 02024000 CR4: 06f0 > Call Trace: > ? get_max_files+0x10/0x10 > __lock_acquire+0x332/0x710 > lock_acquire+0x67/0xb0 > ? lockref_put_or_lock+0x9/0x30 > ? dput.part.7+0x17/0x2d0 > _raw_spin_lock+0x2b/0x60 > ? lockref_put_or_lock+0x9/0x30 > lockref_put_or_lock+0x9/0x30 > dput.part.7+0x1ec/0x2d0 > drop_mountpoint+0x10/0x40 > pin_kill+0x9b/0x3a0 > ? wait_woken+0x90/0x90 > ? mnt_pin_kill+0x2d/0x100 > mnt_pin_kill+0x2d/0x100 > cleanup_mnt+0x66/0x70 > pin_kill+0x9b/0x3a0 > ? wait_woken+0x90/0x90 > ? mnt_pin_kill+0x2d/0x100 > mnt_pin_kill+0x2d/0x100 > cleanup_mnt+0x66/0x70 > ... > > I guess maybe it has something to do with recently switching to > migrate per-net ops to async. > > - Alex Kirill
[bisected] Stack overflow after fs: "switch the IO-triggering parts of umount to fs_pin" (was net namespaces kernel stack overflow)
Hi, Al, commit 87b95ce0964c016ede92763be9c164e49f1019e9 is the first after which the below test crashes the kernel: Author: Al Viro Date: Sat Jan 10 19:01:08 2015 -0500 switch the IO-triggering parts of umount to fs_pin Signed-off-by: Al Viro $modprobe dummy $while true do mkdir /var/run/netns touch /var/run/netns/init_net mount --bind /proc/1/ns/net /var/run/netns/init_net ip netns add foo ip netns exec foo ip link add dummy0 type dummy ip netns delete foo done [ 22.058349] ip (3249) used greatest stack depth: 8 bytes left [ 22.182195] BUG: unable to handle kernel paging request at 00035bb1f080 [ 22.183065] IP: [] kick_process+0x34/0x80 [ 22.183065] PGD 0 [ 22.183065] Thread overran stack, or stack corrupted [ 22.183065] Oops: [#1] PREEMPT SMP [ 22.183065] CPU: 1 PID: 3255 Comm: ip Not tainted 3.19.0-rc5+ #111 [ 22.183065] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.1-1 04/01/2014 [ 22.183065] task: 88007c475100 ti: 88007b3cc000 task.ti: 88007b3cc000 [ 22.183065] RIP: 0010:[] [] kick_process+0x34/0x80 [ 22.183065] RSP: 0018:88007b3cfcf8 EFLAGS: 00010293 [ 22.183065] RAX: 00012900 RBX: 88007c475100 RCX: 88007b20e7b8 [ 22.183065] RDX: 7b3cc028 RSI: 819b05f8 RDI: 819cb999 [ 22.183065] RBP: 88007b3cfd08 R08: 81cbf688 R09: 88007d3d0810 [ 22.183065] R10: 88007fc933c8 R11: R12: 7b3cc028 [ 22.183065] R13: 88007c475100 R14: R15: 7fff7793a448 [ 22.183065] FS: 7fc987546700() GS:88007fc8() knlGS: [ 22.183065] CS: 0010 DS: ES: CR0: 8005003b [ 22.183065] CR2: 00035bb1f080 CR3: 01c11000 CR4: 06e0 [ 22.183065] Stack: [ 22.183065] 88007c3b67b8 88007b3cfd98 88007b3cfd18 81066b05 [ 22.183065] 88007b3cfd38 81176f4c 88007b3cfd48 88007c3b68a0 [ 22.183065] 88007b3cfd48 811f 88007b3cfd68 81177a49 [ 22.183065] Call Trace: [ 22.183065] [] task_work_add+0x45/0x60 [ 22.183065] [] mntput_no_expire+0xdc/0x150 [ 22.183065] [] mntput+0x1f/0x30 [ 22.183065] [] drop_mountpoint+0x29/0x30 [ 22.183065] [] pin_kill+0x66/0xf0 [ 22.183065] [] ? 
__wake_up_common+0x90/0x90 [ 22.183065] [] group_pin_kill+0x19/0x40 [ 22.183065] [] namespace_unlock+0x58/0x60 [ 22.183065] [] drop_collected_mounts+0x4e/0x60 [ 22.183065] [] put_mnt_ns+0x2d/0x50 [ 22.183065] [] free_nsproxy+0x1a/0x80 [ 22.183065] [] switch_task_namespaces+0x58/0x70 [ 22.183065] [] exit_task_namespaces+0xb/0x10 [ 22.183065] [] do_exit+0x2c7/0xc00 [ 22.183065] [] do_group_exit+0x3a/0xa0 [ 22.183065] [] SyS_exit_group+0xf/0x10 [ 22.183065] [] system_call_fastpath+0x12/0x17 Kirill On 19.04.2018 01:08, Kirill Tkhai wrote: > Hi, Alexander! > > On 18.04.2018 22:45, Alexander Aring wrote: >> I currently can crash my net/master kernel by execute the following script: >> >> --- snip >> >> modprobe dummy >> >> #mkdir /var/run/netns >> #touch /var/run/netns/init_net >> #mount --bind /proc/1/ns/net /var/run/netns/init_net >> >> while true >> do >> mkdir /var/run/netns >> touch /var/run/netns/init_net >> mount --bind /proc/1/ns/net /var/run/netns/init_net >> >> ip netns add foo >> ip netns exec foo ip link add dummy0 type dummy >> ip netns delete foo >> done > > Fast answer is the best, so I tried your test on my not-for-work computer. > There is old kernel without asynchronous pernet operations: > > $uname -a > Linux localhost.localdomain 4.15.0-2-amd64 #1 SMP Debian 4.15.11-1 > (2018-03-20) x86_64 GNU/Linux > > After approximately 15 seconds of your test execution it died :( > (Hopefully, I executed it in "init 1" with all partitions RO as usual). > > There is no serial console, so I can't say that the first stack is exactly > the same as you see. But it crashed. So, it seems, the problem have been > existing long ago. > > Have you tried to reproduce it in older kernels or to bisect the problem > commit? > Or maybe it doesn't reproduce on old kernels in your environment? > >> --- snap >> >> After max ~1 minute the kernel will crash. >> Doing my hack of saving init_net outside the loop it will run fine... >> So the mount bind is necessary. 
>> >> The last message which I see is: >> >> BUG: stack guard page was hit at f0751759 (stack is >> 69363195..73ddc474) >> kernel stack overflow (double-fault): [#1] SMP PTI >> Modules linked in: >> CPU: 0 PID: 13917 Comm: ip Not tainted 4.1
Re: [bisected] Stack overflow after fs: "switch the IO-triggering parts of umount to fs_pin" (was net namespaces kernel stack overflow)
On 19.04.2018 19:44, Al Viro wrote: > On Thu, Apr 19, 2018 at 04:34:48PM +0100, Al Viro wrote: > >> IOW, we only get there if our vfsmount was an MNT_INTERNAL one. >> So we have mnt->mnt_umount of some MNT_INTERNAL mount found in >> ->mnt_pins of some other mount. Which, AFAICS, means that >> it used to be mounted on that other mount. How the hell can >> that happen? >> >> It looks like you somehow get a long chain of MNT_INTERNAL mounts >> stacked on top of each other, which ought to be prevented by >> mnt_flags &= ~MNT_INTERNAL_FLAGS; >> in do_add_mount(). Nuts... > > Argh... Nuts is right - clone_mnt() preserves the sodding > MNT_INTERNAL, with obvious results. > > netns is related to the problem, by exposing MNT_INTERNAL mounts > (in /proc/*/ns/*) for mount --bind to copy and attach to the > tree. AFAICS, the minimal reproducer is > > touch /tmp/a > unshare -m sh -c 'for i in `seq 1`; do mount --bind /proc/1/ns/net > /tmp/a; done' > > (and it can be anything in /proc/*/ns/*, really) > > I think the fix should be along the lines of the following: > > Don't leak MNT_INTERNAL away from internal mounts > > We want it only for the stuff created by SB_KERNMOUNT mounts, *not* for > their copies. > > Cc: sta...@kernel.org > Signed-off-by: Al Viro Flawless victory! Thanks. Tested-by: Kirill Tkhai > --- > diff --git a/fs/namespace.c b/fs/namespace.c > --- a/fs/namespace.c > +++ b/fs/namespace.c > @@ -1089,7 +1089,8 @@ static struct mount *clone_mnt(struct mount *old, > struct dentry *root, > goto out_free; > } > > - mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED); > + mnt->mnt.mnt_flags = old->mnt.mnt_flags; > + mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL); > /* Don't allow unprivileged users to change mount flags */ > if (flag & CL_UNPRIVILEGED) { > mnt->mnt.mnt_flags |= MNT_LOCK_ATIME; >
Re: [PATCH RFC iptables] iptables: Per-net ns lock
Pablo, Florian, could you please provide comments on this? On 09.04.2018 19:55, Kirill Tkhai wrote: > In CRIU and LXC-restore we met the situation, > when iptables in container can't be restored > because of permission denied: > > https://github.com/checkpoint-restore/criu/issues/469 > > Containers want to restore their own net ns, > while they may have no their own mnt ns. > This case they share host's /run/xtables.lock > file, but they may not have permission to open > it. > > Patch makes /run/xtables.lock to be per-namespace, > i.e., to refer to the caller task's net ns. > > What you think? > > Thanks, > Kirill > > Signed-off-by: Kirill Tkhai > --- > iptables/xshared.c |7 ++- > 1 file changed, 6 insertions(+), 1 deletion(-) > > diff --git a/iptables/xshared.c b/iptables/xshared.c > index 06db72d4..b6dbe4e7 100644 > --- a/iptables/xshared.c > +++ b/iptables/xshared.c > @@ -254,7 +254,12 @@ static int xtables_lock(int wait, struct timeval > *wait_interval) > time_left.tv_sec = wait; > time_left.tv_usec = 0; > > - fd = open(XT_LOCK_NAME, O_CREAT, 0600); > + if (symlink("/proc/self/ns/net", XT_LOCK_NAME) != 0 && > + errno != EEXIST) { > + fprintf(stderr, "Fatal: can't create lock file\n"); > + return XT_LOCK_FAILED; > + } > + fd = open(XT_LOCK_NAME, O_RDONLY); > if (fd < 0) { > fprintf(stderr, "Fatal: can't open lock file %s: %s\n", > XT_LOCK_NAME, strerror(errno)); >
Re: [PATCH RFC iptables] iptables: Per-net ns lock
Hi, Florian, On 20.04.2018 13:50, Florian Westphal wrote: > Kirill Tkhai wrote: >> Pablo, Florian, could you please provide comments on this? >> >> On 09.04.2018 19:55, Kirill Tkhai wrote: >>> In CRIU and LXC-restore we met the situation, >>> when iptables in container can't be restored >>> because of permission denied: >>> >>> https://github.com/checkpoint-restore/criu/issues/469 >>> >>> Containers want to restore their own net ns, >>> while they may have no their own mnt ns. >>> This case they share host's /run/xtables.lock >>> file, but they may not have permission to open >>> it. >>> >>> Patch makes /run/xtables.lock to be per-namespace, >>> i.e., to refer to the caller task's net ns. > > It looks ok to me but then again the entire userspace > lock thing is a ugly band aid :-/ I agree, but I'm not sure there is any way to get away from it in classic iptables... Kirill
[PATCH] iptables: Per-net ns lock
Containers want to restore their own net ns, while they may not have their own mnt ns. In this case they share the host's /run/xtables.lock file, but may not have permission to open it. Make /run/xtables.lock per-namespace, i.e., refer to the caller task's net ns. Signed-off-by: Kirill Tkhai --- iptables/xshared.c |7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/iptables/xshared.c b/iptables/xshared.c index 06db72d4..b6dbe4e7 100644 --- a/iptables/xshared.c +++ b/iptables/xshared.c @@ -254,7 +254,12 @@ static int xtables_lock(int wait, struct timeval *wait_interval) time_left.tv_sec = wait; time_left.tv_usec = 0; - fd = open(XT_LOCK_NAME, O_CREAT, 0600); + if (symlink("/proc/self/ns/net", XT_LOCK_NAME) != 0 && + errno != EEXIST) { + fprintf(stderr, "Fatal: can't create lock file\n"); + return XT_LOCK_FAILED; + } + fd = open(XT_LOCK_NAME, O_RDONLY); if (fd < 0) { fprintf(stderr, "Fatal: can't open lock file %s: %s\n", XT_LOCK_NAME, strerror(errno));
Re: [PATCH] iptables: Per-net ns lock
On 21.04.2018 02:06, Andrei Vagin wrote: > On Fri, Apr 20, 2018 at 04:42:47PM +0300, Kirill Tkhai wrote: >> Containers want to restore their own net ns, >> while they may have no their own mnt ns. >> This case they share host's /run/xtables.lock >> file, but they may not have permission to open >> it. >> >> Patch makes /run/xtables.lock to be per-namespace, >> i.e., to refer to the caller task's net ns. >> >> Signed-off-by: Kirill Tkhai >> --- >> iptables/xshared.c |7 ++- >> 1 file changed, 6 insertions(+), 1 deletion(-) >> >> diff --git a/iptables/xshared.c b/iptables/xshared.c >> index 06db72d4..b6dbe4e7 100644 >> --- a/iptables/xshared.c >> +++ b/iptables/xshared.c >> @@ -254,7 +254,12 @@ static int xtables_lock(int wait, struct timeval >> *wait_interval) >> time_left.tv_sec = wait; >> time_left.tv_usec = 0; >> >> -fd = open(XT_LOCK_NAME, O_CREAT, 0600); >> +if (symlink("/proc/self/ns/net", XT_LOCK_NAME) != 0 && > > Any user can open this file and take the lock. Before this patch, the > lock file could be opened only by the root user. It means that any user > will be able to block all iptables operations. Do I miss something? Yes, this is the idea. It looks like the only way to keep compatibility with old iptables while allowing rules to be set from nested net namespaces. It also allows synchronizing with containers that have their own mount namespace. There is a comparable precedent among existing kernel interfaces: an ordinary user can open a file read-only on a partition, and this prevents root from unmounting it. Yet this is never considered a problem, and nobody makes partitions accessible only to root in 0600 mode to prevent it. There is lsof, and it's easy to find the lock owner. The same applies to iptables. The lock is not a critical protection; it's just a way for different users to synchronize with each other. The real protection happens in the setsockopt() path. 
> [root@fc24 ~]# ln -s /proc/self/ns/net /run/xtables.lock2 > [root@fc24 ~]# ls -l /run/xtables.lock2 > lrwxrwxrwx 1 root root 17 Apr 21 01:52 /run/xtables.lock2 -> > /proc/self/ns/net > [root@fc24 ~]# ls -l /proc/self/ns/net > lrwxrwxrwx 1 root root 0 Apr 21 01:52 /proc/self/ns/net -> > net:[4026531993] > > Thanks, > Andrei > >> +errno != EEXIST) { >> +fprintf(stderr, "Fatal: can't create lock file\n"); > > fprintf(stderr, "Fatal: can't create lock file %s: %s\n", > XT_LOCK_NAME, strerror(errno)); > >> +return XT_LOCK_FAILED; >> +} >> +fd = open(XT_LOCK_NAME, O_RDONLY); >> if (fd < 0) { >> fprintf(stderr, "Fatal: can't open lock file %s: %s\n", >> XT_LOCK_NAME, strerror(errno)); Kirill
[PATCH net-next] net: Fix coccinelle warning
kbuild test robot says: >coccinelle warnings: (new ones prefixed by >>) >>> net/core/dev.c:1588:2-3: Unneeded semicolon So, let's remove it. Reported-by: kbuild test robot Signed-off-by: Kirill Tkhai --- net/core/dev.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index c624a04dad1f..72e9299cd1e6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1587,7 +1587,7 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO) N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) - }; + } #undef N return "UNKNOWN_NETDEV_EVENT"; }
Re: [PATCH net-next v2 2/5] net: Revert "ipv4: fix a deadlock in ip_ra_control"
Hi, David, thanks for the review! On 20.03.2018 19:23, David Miller wrote: > From: Kirill Tkhai > Date: Mon, 19 Mar 2018 12:14:54 +0300 > >> This reverts commit 1215e51edad1. >> Since raw_close() is used on every RAW socket destruction, >> the changes made by 1215e51edad1 scale sadly. This clearly >> seen on endless unshare(CLONE_NEWNET) test, and cleanup_net() >> kwork spends a lot of time waiting for rtnl_lock() introduced >> by this commit. >> >> Next patches in series will rework this in another way, >> so now we revert 1215e51edad1. Also, it doesn't seen >> mrtsock_destruct() takes sk_lock, and the comment to the commit >> does not show the actual stack dump. So, there is a question >> did we really need in it. >> >> Signed-off-by: Kirill Tkhai > > Kirill, I think the commit you are reverting is legitimate. > > The IP_RAW_CONTROL path has an ABBA deadlock with other paths once > you revert this, so you are reintroducing a bug. The talk is about IP_ROUTER_ALERT, I assume there is just an erratum. > All code paths that must take both RTNL and the socket lock must > do them in the same order. And that order is RTNL then socket > lock. The place I change in this patch is IP_ROUTER_ALERT. There is only a call of ip_ra_control(), while this function does not need socket lock. Please, see next patch. It moves this ip_ra_control() out of socket lock. And it fixes the problem pointed in reverted patch in another way. So, if there is ABBA, after next patch it becomes solved. Does this mean I have to merge [2/5] and [3/5] together? > But you are breaking that here by getting us back into a state > where IP_RAW_CONTROL setsockopt will take the socket lock and > then RTNL. > > Again, we can't take, or retake, RTNL if we have the socket lock > currently. > > The only valid locking order is socket lock then RTNL. Thanks, Kirill
Re: [PATCH net-next v2 2/5] net: Revert "ipv4: fix a deadlock in ip_ra_control"
On 20.03.2018 22:25, Kirill Tkhai wrote: > Hi, David, > > thanks for the review! > > On 20.03.2018 19:23, David Miller wrote: >> From: Kirill Tkhai >> Date: Mon, 19 Mar 2018 12:14:54 +0300 >> >>> This reverts commit 1215e51edad1. >>> Since raw_close() is used on every RAW socket destruction, >>> the changes made by 1215e51edad1 scale sadly. This clearly >>> seen on endless unshare(CLONE_NEWNET) test, and cleanup_net() >>> kwork spends a lot of time waiting for rtnl_lock() introduced >>> by this commit. >>> >>> Next patches in series will rework this in another way, >>> so now we revert 1215e51edad1. Also, it doesn't seen >>> mrtsock_destruct() takes sk_lock, and the comment to the commit >>> does not show the actual stack dump. So, there is a question >>> did we really need in it. >>> >>> Signed-off-by: Kirill Tkhai >> >> Kirill, I think the commit you are reverting is legitimate. >> >> The IP_RAW_CONTROL path has an ABBA deadlock with other paths once >> you revert this, so you are reintroducing a bug. > > The talk is about IP_ROUTER_ALERT, I assume there is just an erratum. > >> All code paths that must take both RTNL and the socket lock must >> do them in the same order. And that order is RTNL then socket >> lock. > > The place I change in this patch is IP_ROUTER_ALERT. There is only > a call of ip_ra_control(), while this function does not need socket > lock. Please, see next patch. It moves this ip_ra_control() out > of socket lock. And it fixes the problem pointed in reverted patch > in another way. So, if there is ABBA, after next patch it becomes > solved. Does this mean I have to merge [2/5] and [3/5] together? We also can just change the order of patches, and make [3/5] go before [2/5]. Then, the kernel still remains bisectable. How do you think about this? Thanks, Kirill >> But you are breaking that here by getting us back into a state >> where IP_RAW_CONTROL setsockopt will take the socket lock and >> then RTNL. 
>> >> Again, we can't take, or retake, RTNL if we have the socket lock >> currently. >> >> The only valid locking order is socket lock then RTNL. > > Thanks, > Kirill >
Re: [PATCH net-next 06/14] net/tls: Add generic NIC offload infrastructure
On 20.03.2018 05:45, Saeed Mahameed wrote: > From: Ilya Lesokhin > > This patch adds a generic infrastructure to offload TLS crypto to a > network devices. It enables the kernel TLS socket to skip encryption > and authentication operations on the transmit side of the data path. > Leaving those computationally expensive operations to the NIC. > > The NIC offload infrastructure builds TLS records and pushes them to > the TCP layer just like the SW KTLS implementation and using the same API. > TCP segmentation is mostly unaffected. Currently the only exception is > that we prevent mixed SKBs where only part of the payload requires > offload. In the future we are likely to add a similar restriction > following a change cipher spec record. > > The notable differences between SW KTLS and NIC offloaded TLS > implementations are as follows: > 1. The offloaded implementation builds "plaintext TLS record", those > records contain plaintext instead of ciphertext and place holder bytes > instead of authentication tags. > 2. The offloaded implementation maintains a mapping from TCP sequence > number to TLS records. Thus given a TCP SKB sent from a NIC offloaded > TLS socket, we can use the tls NIC offload infrastructure to obtain > enough context to encrypt the payload of the SKB. > A TLS record is released when the last byte of the record is ack'ed, > this is done through the new icsk_clean_acked callback. > > The infrastructure should be extendable to support various NIC offload > implementations. However it is currently written with the > implementation below in mind: > The NIC assumes that packets from each offloaded stream are sent as > plaintext and in-order. It keeps track of the TLS records in the TCP > stream. When a packet marked for offload is transmitted, the NIC > encrypts the payload in-place and puts authentication tags in the > relevant place holders. > > The responsibility for handling out-of-order packets (i.e. 
TCP > retransmission, qdisc drops) falls on the netdev driver. > > The netdev driver keeps track of the expected TCP SN from the NIC's > perspective. If the next packet to transmit matches the expected TCP > SN, the driver advances the expected TCP SN, and transmits the packet > with TLS offload indication. > > If the next packet to transmit does not match the expected TCP SN. The > driver calls the TLS layer to obtain the TLS record that includes the > TCP of the packet for transmission. Using this TLS record, the driver > posts a work entry on the transmit queue to reconstruct the NIC TLS > state required for the offload of the out-of-order packet. It updates > the expected TCP SN accordingly and transmit the now in-order packet. > The same queue is used for packet transmission and TLS context > reconstruction to avoid the need for flushing the transmit queue before > issuing the context reconstruction request. > > Signed-off-by: Ilya Lesokhin > Signed-off-by: Boris Pismenny > Signed-off-by: Aviad Yehezkel > Signed-off-by: Saeed Mahameed > --- > include/net/tls.h | 70 +++- > net/tls/Kconfig | 10 + > net/tls/Makefile | 2 + > net/tls/tls_device.c | 804 > ++ > net/tls/tls_device_fallback.c | 419 ++ > net/tls/tls_main.c| 33 +- > 6 files changed, 1331 insertions(+), 7 deletions(-) > create mode 100644 net/tls/tls_device.c > create mode 100644 net/tls/tls_device_fallback.c > > diff --git a/include/net/tls.h b/include/net/tls.h > index 4913430ab807..ab98a6dc4929 100644 > --- a/include/net/tls.h > +++ b/include/net/tls.h > @@ -77,6 +77,37 @@ struct tls_sw_context { > struct scatterlist sg_aead_out[2]; > }; > > +struct tls_record_info { > + struct list_head list; > + u32 end_seq; > + int len; > + int num_frags; > + skb_frag_t frags[MAX_SKB_FRAGS]; > +}; > + > +struct tls_offload_context { > + struct crypto_aead *aead_send; > + spinlock_t lock;/* protects records list */ > + struct list_head records_list; > + struct tls_record_info *open_record; > + struct tls_record_info 
*retransmit_hint; > + u64 hint_record_sn; > + u64 unacked_record_sn; > + > + struct scatterlist sg_tx_data[MAX_SKB_FRAGS]; > + void (*sk_destruct)(struct sock *sk); > + u8 driver_state[]; > + /* The TLS layer reserves room for driver specific state > + * Currently the belief is that there is not enough > + * driver specific state to justify another layer of indirection > + */ > +#define TLS_DRIVER_STATE_SIZE (max_t(size_t, 8, sizeof(void *))) > +}; > + > +#define TLS_OFFLOAD_CONTEXT_SIZE > \ > + (ALIGN(sizeof(struct tls_offload_context), sizeof(void *)) + \ > + TLS_DRIVER_STATE_SIZE) > + > enum { > TLS_PENDING_CLOSED_RECORD > }; > @@ -87,6 +118,10 @@ struct tls_context { > struct tls12_crypto_info_aes_gcm_
Re: [PATCH net-next 06/14] net/tls: Add generic NIC offload infrastructure
On 21.03.2018 18:53, Boris Pismenny wrote: > ... >> >> Other patches have two licenses in header. Can I distribute this file under >> GPL license terms? >> > > Sure, I'll update the license to match other files under net/tls. > >>> +#include >>> +#include >>> +#include >>> +#include >>> +#include >>> + >>> +#include >>> +#include >>> + >>> +/* device_offload_lock is used to synchronize tls_dev_add >>> + * against NETDEV_DOWN notifications. >>> + */ >>> +DEFINE_STATIC_PERCPU_RWSEM(device_offload_lock); >>> + >>> +static void tls_device_gc_task(struct work_struct *work); >>> + >>> +static DECLARE_WORK(tls_device_gc_work, tls_device_gc_task); >>> +static LIST_HEAD(tls_device_gc_list); >>> +static LIST_HEAD(tls_device_list); >>> +static DEFINE_SPINLOCK(tls_device_lock); >>> + >>> +static void tls_device_free_ctx(struct tls_context *ctx) >>> +{ >>> + struct tls_offload_context *offlad_ctx = tls_offload_ctx(ctx); >>> + >>> + kfree(offlad_ctx); >>> + kfree(ctx); >>> +} >>> + >>> +static void tls_device_gc_task(struct work_struct *work) >>> +{ >>> + struct tls_context *ctx, *tmp; >>> + struct list_head gc_list; >>> + unsigned long flags; >>> + >>> + spin_lock_irqsave(&tls_device_lock, flags); >>> + INIT_LIST_HEAD(&gc_list); >> >> This is stack variable, and it should be initialized outside of global >> spinlock. >> There is LIST_HEAD() primitive for that in kernel. >> There is one more similar place below. >> > > Sure. > >>> + list_splice_init(&tls_device_gc_list, &gc_list); >>> + spin_unlock_irqrestore(&tls_device_lock, flags); >>> + >>> + list_for_each_entry_safe(ctx, tmp, &gc_list, list) { >>> + struct net_device *netdev = ctx->netdev; >>> + >>> + if (netdev) { >>> + netdev->tlsdev_ops->tls_dev_del(netdev, ctx, >>> + TLS_OFFLOAD_CTX_DIR_TX); >>> + dev_put(netdev); >>> + } >> >> How is possible the situation we meet NULL netdev here > > > This can happen in tls_device_down. 
tls_device_down is called whenever a > netdev that is used for TLS inline crypto offload goes down. It gets called > via the NETDEV_DOWN event of the netdevice notifier. > > This flow is somewhat similar to the xfrm_device netdev notifier. However, we > do not destroy the socket (as in destroying the xfrm_state in xfrm_device). > Instead, we clean up the netdev state and allow software fallback to handle > the rest of the traffic. > >>> + >>> + list_del(&ctx->list); >>> + tls_device_free_ctx(ctx); >>> + } >>> +} >>> + >>> +static void tls_device_queue_ctx_destruction(struct tls_context *ctx) >>> +{ >>> + unsigned long flags; >>> + >>> + spin_lock_irqsave(&tls_device_lock, flags); >>> + list_move_tail(&ctx->list, &tls_device_gc_list); >>> + >>> + /* schedule_work inside the spinlock >>> + * to make sure tls_device_down waits for that work. >>> + */ >>> + schedule_work(&tls_device_gc_work); >>> + >>> + spin_unlock_irqrestore(&tls_device_lock, flags); >>> +} >>> + >>> +/* We assume that the socket is already connected */ >>> +static struct net_device *get_netdev_for_sock(struct sock *sk) >>> +{ >>> + struct inet_sock *inet = inet_sk(sk); >>> + struct net_device *netdev = NULL; >>> + >>> + netdev = dev_get_by_index(sock_net(sk), inet->cork.fl.flowi_oif); >>> + >>> + return netdev; >>> +} >>> + >>> +static int attach_sock_to_netdev(struct sock *sk, struct net_device >>> *netdev, >>> + struct tls_context *ctx) >>> +{ >>> + int rc; >>> + >>> + rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, >>> TLS_OFFLOAD_CTX_DIR_TX, >>> + &ctx->crypto_send, >>> + tcp_sk(sk)->write_seq); >>> + if (rc) { >>> + pr_err_ratelimited("The netdev has refused to offload this >>> socket\n"); >>> + goto out; >>> + } >>> + >>> + rc = 0; >>> +out: >>> + return rc; >>> +} >>> + >>> +static void destroy_record(struct tls_record_info *record) >>> +{ >>> + skb_frag_t *frag; >>> + int nr_frags = record->num_frags; >>> + >>> + while (nr_frags > 0) { >>> + frag = &record->frags[nr_frags - 1]; >>> +
__skb_frag_unref(frag); >>> + --nr_frags; >>> + } >>> + kfree(record); >>> +} >>> + >>> +static void delete_all_records(struct tls_offload_context *offload_ctx) >>> +{ >>> + struct tls_record_info *info, *temp; >>> + >>> + list_for_each_entry_safe(info, temp, &offload_ctx->records_list, list) >>> { >>> + list_del(&info->list); >>> + destroy_record(info); >>> + } >>> + >>> + offload_ctx->retransmit_hint = NULL; >>> +} >>> + >>> +static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq) >>> +{ >>> + struct tls_context *tls_ctx = tls_get_ctx(sk); >>> + struct tls_offload_context *ctx; >>> + struct tls_record_info *info, *temp; >>> + unsigned long flags; >>> + u64 deleted_records = 0; >>> + >>> + if (!tls_ctx) >>> + return; >>>
Re: [PATCH V2 net-next 06/14] net/tls: Add generic NIC offload infrastructure
Hi, Saeed, thanks for fixing some of my remarks, but I've dived into the code more deeply, and found with sadness that the patch lacks readability. It is too big and does not fit the kernel coding style. Please see some comments below. Can we do something about the patch length? Is there a way to split it into several small patches? It's difficult to review the logic of the changes. On 22.03.2018 00:01, Saeed Mahameed wrote: > From: Ilya Lesokhin > > This patch adds a generic infrastructure to offload TLS crypto to a > network devices. It enables the kernel TLS socket to skip encryption > and authentication operations on the transmit side of the data path. > Leaving those computationally expensive operations to the NIC. > > The NIC offload infrastructure builds TLS records and pushes them to > the TCP layer just like the SW KTLS implementation and using the same API. > TCP segmentation is mostly unaffected. Currently the only exception is > that we prevent mixed SKBs where only part of the payload requires > offload. In the future we are likely to add a similar restriction > following a change cipher spec record. > > The notable differences between SW KTLS and NIC offloaded TLS > implementations are as follows: > 1. The offloaded implementation builds "plaintext TLS record", those > records contain plaintext instead of ciphertext and place holder bytes > instead of authentication tags. > 2. The offloaded implementation maintains a mapping from TCP sequence > number to TLS records. Thus given a TCP SKB sent from a NIC offloaded > TLS socket, we can use the tls NIC offload infrastructure to obtain > enough context to encrypt the payload of the SKB. > A TLS record is released when the last byte of the record is ack'ed, > this is done through the new icsk_clean_acked callback. > > The infrastructure should be extendable to support various NIC offload > implementations. 
However it is currently written with the > implementation below in mind: > The NIC assumes that packets from each offloaded stream are sent as > plaintext and in-order. It keeps track of the TLS records in the TCP > stream. When a packet marked for offload is transmitted, the NIC > encrypts the payload in-place and puts authentication tags in the > relevant place holders. > > The responsibility for handling out-of-order packets (i.e. TCP > retransmission, qdisc drops) falls on the netdev driver. > > The netdev driver keeps track of the expected TCP SN from the NIC's > perspective. If the next packet to transmit matches the expected TCP > SN, the driver advances the expected TCP SN, and transmits the packet > with TLS offload indication. > > If the next packet to transmit does not match the expected TCP SN, the > driver calls the TLS layer to obtain the TLS record that includes the > TCP of the packet for transmission. Using this TLS record, the driver > posts a work entry on the transmit queue to reconstruct the NIC TLS > state required for the offload of the out-of-order packet. It updates > the expected TCP SN accordingly and transmits the now in-order packet. > The same queue is used for packet transmission and TLS context > reconstruction to avoid the need for flushing the transmit queue before > issuing the context reconstruction request. 
> > Signed-off-by: Ilya Lesokhin > Signed-off-by: Boris Pismenny > Signed-off-by: Aviad Yehezkel > Signed-off-by: Saeed Mahameed > --- > include/net/tls.h | 74 +++- > net/tls/Kconfig | 10 + > net/tls/Makefile | 2 + > net/tls/tls_device.c | 793 > ++ > net/tls/tls_device_fallback.c | 415 ++ > net/tls/tls_main.c| 33 +- > 6 files changed, 1320 insertions(+), 7 deletions(-) > create mode 100644 net/tls/tls_device.c > create mode 100644 net/tls/tls_device_fallback.c > > diff --git a/include/net/tls.h b/include/net/tls.h > index 4913430ab807..0bfb1b0a156a 100644 > --- a/include/net/tls.h > +++ b/include/net/tls.h > @@ -77,6 +77,37 @@ struct tls_sw_context { > struct scatterlist sg_aead_out[2]; > }; > > +struct tls_record_info { > + struct list_head list; > + u32 end_seq; > + int len; > + int num_frags; > + skb_frag_t frags[MAX_SKB_FRAGS]; > +}; > + > +struct tls_offload_context { > + struct crypto_aead *aead_send; > + spinlock_t lock;/* protects records list */ > + struct list_head records_list; > + struct tls_record_info *open_record; > + struct tls_record_info *retransmit_hint; > + u64 hint_record_sn; > + u64 unacked_record_sn; > + > + struct scatterlist sg_tx_data[MAX_SKB_FRAGS]; > + void (*sk_destruct)(struct sock *sk); > + u8 driver_state[]; > + /* The TLS layer reserves room for driver specific state > + * Currently the belief is that there is not enough > + * driver specific state to justify another layer of indirection > + */ > +#define TLS_DRIVER_STATE_SIZE (max_t(size_t, 8, siz
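The expected-SN bookkeeping described in the commit message above can be sketched as a small userspace C model. All names here are illustrative stand-ins for the driver-side logic, not the actual tls/netdev API:

```c
#include <stdint.h>
#include <stdbool.h>

/* Userspace sketch of the driver-side bookkeeping described above.
 * Names are illustrative, not the actual tls/netdev driver API.
 */
struct tx_tls_state {
	uint32_t expected_tcp_sn;	/* next TCP SN the NIC expects */
	unsigned int resyncs;		/* NIC-state reconstructions so far */
};

/* Called per packet of an offloaded stream before transmission.
 * Returns true when the packet went out in order; false when the
 * NIC TLS state had to be reconstructed first (out-of-order case).
 */
static bool tx_tls_prepare(struct tx_tls_state *st, uint32_t seq, uint32_t len)
{
	bool in_order = (seq == st->expected_tcp_sn);

	if (!in_order) {
		/* Retransmission or qdisc drop: the real driver would look
		 * up the TLS record covering 'seq' and post a context
		 * reconstruction work entry on the same send queue here.
		 */
		st->resyncs++;
		st->expected_tcp_sn = seq;
	}
	st->expected_tcp_sn += len;	/* advance past this packet */
	return in_order;
}
```

An in-order packet just advances the counter; a retransmitted one triggers the (elided) context-reconstruction step before the counter is re-synced.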
[PATCH net-next v3 1/5] net: Revert "ipv4: get rid of ip_ra_lock"
This reverts commit ba3f571d5dde. The commit was made after 1215e51edad1 "ipv4: fix a deadlock in ip_ra_control", and killed ip_ra_lock, which became useless once rtnl_lock() started being used to destroy every raw ipv4 socket. This scales very badly, and the next patch in the series reverts 1215e51edad1. ip_ra_lock will be used again. Signed-off-by: Kirill Tkhai --- net/ipv4/ip_sockglue.c | 12 ++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 74c962b9b09c..be7c3b71914d 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -334,6 +334,7 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, sent to multicast group to reach destination designated router. */ struct ip_ra_chain __rcu *ip_ra_chain; +static DEFINE_SPINLOCK(ip_ra_lock); static void ip_ra_destroy_rcu(struct rcu_head *head) @@ -355,17 +356,21 @@ int ip_ra_control(struct sock *sk, unsigned char on, new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; + spin_lock_bh(&ip_ra_lock); for (rap = &ip_ra_chain; -(ra = rtnl_dereference(*rap)) != NULL; +(ra = rcu_dereference_protected(*rap, + lockdep_is_held(&ip_ra_lock))) != NULL; rap = &ra->next) { if (ra->sk == sk) { if (on) { + spin_unlock_bh(&ip_ra_lock); kfree(new_ra); return -EADDRINUSE; } /* dont let ip_call_ra_chain() use sk again */ ra->sk = NULL; RCU_INIT_POINTER(*rap, ra->next); + spin_unlock_bh(&ip_ra_lock); if (ra->destructor) ra->destructor(sk); @@ -379,14 +384,17 @@ int ip_ra_control(struct sock *sk, unsigned char on, return 0; } } - if (!new_ra) + if (!new_ra) { + spin_unlock_bh(&ip_ra_lock); return -ENOBUFS; + } new_ra->sk = sk; new_ra->destructor = destructor; RCU_INIT_POINTER(new_ra->next, ra); rcu_assign_pointer(*rap, new_ra); sock_hold(sk); + spin_unlock_bh(&ip_ra_lock); return 0; }
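The for-loop restored above walks the chain through a pointer to the previous link (`rap`), so a node can be unlinked with a single pointer store and no `prev` field — the same shape as `RCU_INIT_POINTER(*rap, ra->next)` in ip_ra_control(). A minimal userspace sketch of that pattern, without the RCU/locking (names are illustrative):

```c
#include <stddef.h>

/* Simplified userspace model of the ip_ra_chain walk: 'rap' always
 * points at the link that references the current node, so unlinking
 * is one pointer store, with no need for a 'prev' pointer.
 */
struct ra_node {
	struct ra_node *next;
	int sk;			/* stands in for the struct sock pointer */
};

static struct ra_node *chain;

static int ra_unlink(int sk)
{
	struct ra_node **rap, *ra;

	for (rap = &chain; (ra = *rap) != NULL; rap = &ra->next) {
		if (ra->sk == sk) {
			*rap = ra->next;	/* unlink in place */
			return 0;
		}
	}
	return -1;			/* not found */
}
```

The same walk also handles removal of the head element, since `rap` then points at `chain` itself.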
[PATCH net-next v3 2/5] net: Move IP_ROUTER_ALERT out of lock_sock(sk)
ip_ra_control() does not need sk_lock. Who are the another users of ip_ra_chain? ip_mroute_setsockopt() doesn't take sk_lock, while parallel IP_ROUTER_ALERT syscalls are synchronized by ip_ra_lock. So, we may move this command out of sk_lock. Signed-off-by: Kirill Tkhai --- net/ipv4/ip_sockglue.c |5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index be7c3b71914d..dcbf6afe27e7 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -647,6 +647,8 @@ static int do_ip_setsockopt(struct sock *sk, int level, /* If optlen==0, it is equivalent to val == 0 */ + if (optname == IP_ROUTER_ALERT) + return ip_ra_control(sk, val ? 1 : 0, NULL); if (ip_mroute_opt(optname)) return ip_mroute_setsockopt(sk, optname, optval, optlen); @@ -1157,9 +1159,6 @@ static int do_ip_setsockopt(struct sock *sk, int level, goto e_inval; inet->mc_all = val; break; - case IP_ROUTER_ALERT: - err = ip_ra_control(sk, val ? 1 : 0, NULL); - break; case IP_FREEBIND: if (optlen < 1)
[PATCH net-next v3 3/5] net: Revert "ipv4: fix a deadlock in ip_ra_control"
This reverts commit 1215e51edad1. Since raw_close() is used on every RAW socket destruction, the changes made by 1215e51edad1 scale badly. This is clearly seen on an endless unshare(CLONE_NEWNET) test, where the cleanup_net() kwork spends a lot of time waiting for the rtnl_lock() introduced by this commit. The previous patch moved IP_ROUTER_ALERT out of rtnl_lock(), so we can revert this commit. Signed-off-by: Kirill Tkhai --- net/ipv4/ip_sockglue.c |1 - net/ipv4/ipmr.c| 11 +-- net/ipv4/raw.c |2 -- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index dcbf6afe27e7..bf5f44b27b7e 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -594,7 +594,6 @@ static bool setsockopt_needs_rtnl(int optname) case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_UNBLOCK_SOURCE: - case IP_ROUTER_ALERT: return true; } return false; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index d752a70855d8..f6be5db16da2 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1399,7 +1399,7 @@ static void mrtsock_destruct(struct sock *sk) struct net *net = sock_net(sk); struct mr_table *mrt; - ASSERT_RTNL(); + rtnl_lock(); ipmr_for_each_table(mrt, net) { if (sk == rtnl_dereference(mrt->mroute_sk)) { IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; @@ -1411,6 +1411,7 @@ static void mrtsock_destruct(struct sock *sk) mroute_clean_tables(mrt, false); } } + rtnl_unlock(); } /* Socket options and virtual interface manipulation. The whole @@ -1475,8 +1476,13 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, if (sk != rcu_access_pointer(mrt->mroute_sk)) { ret = -EACCES; } else { + /* We need to unlock here because mrtsock_destruct takes +* care of rtnl itself and we can't change that due to +* the IP_ROUTER_ALERT setsockopt which runs without it. 
+*/ + rtnl_unlock(); ret = ip_ra_control(sk, 0, NULL); - goto out_unlock; + goto out; } break; case MRT_ADD_VIF: @@ -1588,6 +1594,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, } out_unlock: rtnl_unlock(); +out: return ret; } diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 54648d20bf0f..720bef7da2f6 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -711,9 +711,7 @@ static void raw_close(struct sock *sk, long timeout) /* * Raw sockets may have direct kernel references. Kill them. */ - rtnl_lock(); ip_ra_control(sk, 0, NULL); - rtnl_unlock(); sk_common_release(sk); }
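The comment added in the hunk above documents a classic lock-ordering rule: since mrtsock_destruct() now takes rtnl itself, the caller must drop the lock before invoking it — with a non-recursive lock, calling it while still holding the lock would self-deadlock. A tiny userspace model of that rule (all names are hypothetical stand-ins, not the kernel API):

```c
#include <pthread.h>

/* Model of the MRT_DONE path above: the destruct callee takes the lock
 * itself, so the caller has to release it first. With a non-recursive
 * lock, calling the callee while still holding it would deadlock.
 * All names are illustrative stand-ins, not the kernel API.
 */
static pthread_mutex_t rtnl_model = PTHREAD_MUTEX_INITIALIZER;
static int destruct_calls;

static void mrtsock_destruct_model(void)
{
	pthread_mutex_lock(&rtnl_model);	/* rtnl_lock() analogue */
	destruct_calls++;
	pthread_mutex_unlock(&rtnl_model);	/* rtnl_unlock() analogue */
}

static void mrt_done_model(void)
{
	pthread_mutex_lock(&rtnl_model);
	/* ... option processing under the lock ... */
	pthread_mutex_unlock(&rtnl_model);	/* drop before the callee re-takes it */
	mrtsock_destruct_model();
}
```

Running mrt_done_model() completes without deadlock precisely because the unlock happens before the nested lock acquisition.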
[PATCH net-next v3 0/5] Rework ip_ra_chain protection
Commit 1215e51edad1 "ipv4: fix a deadlock in ip_ra_control" made rtnl_lock() be used in raw_close(). This function is called on every RAW socket destruction, so that rtnl_mutex is taken every time. This scales very sadly. I observe cleanup_net() spending a lot of time in rtnl_lock() and raw_close() is one of the biggest rtnl user (since we have percpu net->ipv4.icmp_sk). This patchset reworks the locking: reverts the problem commit and its descendant, and introduces rtnl-independent locking. This may have a continuation, and someone may work on killing rtnl_lock() in mrtsock_destruct() in the future. Thanks, Kirill --- v3: Change patches order: [2/5] and [3/5]. v2: Fix sparse warning [4/5], as reported by kbuild test robot. --- Kirill Tkhai (5): net: Revert "ipv4: get rid of ip_ra_lock" net: Move IP_ROUTER_ALERT out of lock_sock(sk) net: Revert "ipv4: fix a deadlock in ip_ra_control" net: Make ip_ra_chain per struct net net: Replace ip_ra_lock with per-net mutex include/net/ip.h | 13 +++-- include/net/netns/ipv4.h |2 ++ net/core/net_namespace.c |1 + net/ipv4/ip_input.c |5 ++--- net/ipv4/ip_sockglue.c | 34 +- net/ipv4/ipmr.c | 11 +-- net/ipv4/raw.c |2 -- 7 files changed, 38 insertions(+), 30 deletions(-) -- Signed-off-by: Kirill Tkhai
[PATCH net-next v3 5/5] net: Replace ip_ra_lock with per-net mutex
Since ra_chain is per-net, we may use per-net mutexes to protect them in ip_ra_control(). This improves scalability. Signed-off-by: Kirill Tkhai --- include/net/netns/ipv4.h |1 + net/core/net_namespace.c |1 + net/ipv4/ip_sockglue.c | 15 ++- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 97d7ee6667c7..8491bc9c86b1 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -50,6 +50,7 @@ struct netns_ipv4 { struct ipv4_devconf *devconf_all; struct ipv4_devconf *devconf_dflt; struct ip_ra_chain __rcu *ra_chain; + struct mutexra_mutex; #ifdef CONFIG_IP_MULTIPLE_TABLES struct fib_rules_ops*rules_ops; boolfib_has_custom_rules; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index c340d5cfbdec..95ba2c53bd9a 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -301,6 +301,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) net->user_ns = user_ns; idr_init(&net->netns_ids); spin_lock_init(&net->nsid_lock); + mutex_init(&net->ipv4.ra_mutex); list_for_each_entry(ops, &pernet_list, list) { error = ops_init(ops, net); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index f36d35fe924b..5ad2d8ed3a3f 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -322,9 +322,6 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, return 0; } -static DEFINE_SPINLOCK(ip_ra_lock); - - static void ip_ra_destroy_rcu(struct rcu_head *head) { struct ip_ra_chain *ra = container_of(head, struct ip_ra_chain, rcu); @@ -345,21 +342,21 @@ int ip_ra_control(struct sock *sk, unsigned char on, new_ra = on ? 
kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; - spin_lock_bh(&ip_ra_lock); + mutex_lock(&net->ipv4.ra_mutex); for (rap = &net->ipv4.ra_chain; (ra = rcu_dereference_protected(*rap, - lockdep_is_held(&ip_ra_lock))) != NULL; + lockdep_is_held(&net->ipv4.ra_mutex))) != NULL; rap = &ra->next) { if (ra->sk == sk) { if (on) { - spin_unlock_bh(&ip_ra_lock); + mutex_unlock(&net->ipv4.ra_mutex); kfree(new_ra); return -EADDRINUSE; } /* dont let ip_call_ra_chain() use sk again */ ra->sk = NULL; RCU_INIT_POINTER(*rap, ra->next); - spin_unlock_bh(&ip_ra_lock); + mutex_unlock(&net->ipv4.ra_mutex); if (ra->destructor) ra->destructor(sk); @@ -374,7 +371,7 @@ int ip_ra_control(struct sock *sk, unsigned char on, } } if (!new_ra) { - spin_unlock_bh(&ip_ra_lock); + mutex_unlock(&net->ipv4.ra_mutex); return -ENOBUFS; } new_ra->sk = sk; @@ -383,7 +380,7 @@ int ip_ra_control(struct sock *sk, unsigned char on, RCU_INIT_POINTER(new_ra->next, ra); rcu_assign_pointer(*rap, new_ra); sock_hold(sk); - spin_unlock_bh(&ip_ra_lock); + mutex_unlock(&net->ipv4.ra_mutex); return 0; }
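Moving the lock into struct net means ip_ra_control() calls in different namespaces no longer serialize on one global lock; and since setsockopt() runs in process context, a sleeping mutex is fine where the old global spinlock sat. A minimal userspace model of the per-instance lock (illustrative names only, not the kernel API):

```c
#include <pthread.h>

/* Userspace model of the per-net mutex above: the lock lives inside
 * each 'net' instance, so operations on different namespaces do not
 * contend on a single global lock. Names are illustrative only.
 */
struct netns_model {
	pthread_mutex_t ra_mutex;
	int ra_count;		/* stands in for the ra_chain list */
};

static void netns_model_init(struct netns_model *net)
{
	pthread_mutex_init(&net->ra_mutex, NULL);	/* mutex_init() analogue */
	net->ra_count = 0;
}

static void ra_control_model(struct netns_model *net, int on)
{
	pthread_mutex_lock(&net->ra_mutex);	/* per-net, not global */
	net->ra_count += on ? 1 : -1;
	pthread_mutex_unlock(&net->ra_mutex);
}
```

Each instance carries its own lock state, mirroring how mutex_init(&net->ipv4.ra_mutex) is done once per namespace in setup_net().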
[PATCH net-next v3 4/5] net: Make ip_ra_chain per struct net
This is optimization, which makes ip_call_ra_chain() iterate less sockets to find the sockets it's looking for. Signed-off-by: Kirill Tkhai --- include/net/ip.h | 13 +++-- include/net/netns/ipv4.h |1 + net/ipv4/ip_input.c |5 ++--- net/ipv4/ip_sockglue.c | 15 ++- 4 files changed, 16 insertions(+), 18 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index fe63ba95d12b..d53b5a9eae34 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -91,6 +91,17 @@ static inline int inet_sdif(struct sk_buff *skb) return 0; } +/* Special input handler for packets caught by router alert option. + They are selected only by protocol field, and then processed likely + local ones; but only if someone wants them! Otherwise, router + not running rsvpd will kill RSVP. + + It is user level problem, what it will make with them. + I have no idea, how it will masquearde or NAT them (it is joke, joke :-)), + but receiver should be enough clever f.e. to forward mtrace requests, + sent to multicast group to reach destination designated router. + */ + struct ip_ra_chain { struct ip_ra_chain __rcu *next; struct sock *sk; @@ -101,8 +112,6 @@ struct ip_ra_chain { struct rcu_head rcu; }; -extern struct ip_ra_chain __rcu *ip_ra_chain; - /* IP flags. 
*/ #define IP_CE 0x8000 /* Flag: "Congestion" */ #define IP_DF 0x4000 /* Flag: "Don't Fragment" */ diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 382bfd7583cf..97d7ee6667c7 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -49,6 +49,7 @@ struct netns_ipv4 { #endif struct ipv4_devconf *devconf_all; struct ipv4_devconf *devconf_dflt; + struct ip_ra_chain __rcu *ra_chain; #ifdef CONFIG_IP_MULTIPLE_TABLES struct fib_rules_ops*rules_ops; boolfib_has_custom_rules; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 57fc13c6ab2b..7582713dd18f 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -159,7 +159,7 @@ bool ip_call_ra_chain(struct sk_buff *skb) struct net_device *dev = skb->dev; struct net *net = dev_net(dev); - for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) { + for (ra = rcu_dereference(net->ipv4.ra_chain); ra; ra = rcu_dereference(ra->next)) { struct sock *sk = ra->sk; /* If socket is bound to an interface, only report @@ -167,8 +167,7 @@ bool ip_call_ra_chain(struct sk_buff *skb) */ if (sk && inet_sk(sk)->inet_num == protocol && (!sk->sk_bound_dev_if || -sk->sk_bound_dev_if == dev->ifindex) && - net_eq(sock_net(sk), net)) { +sk->sk_bound_dev_if == dev->ifindex)) { if (ip_is_fragment(ip_hdr(skb))) { if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN)) return true; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index bf5f44b27b7e..f36d35fe924b 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -322,18 +322,6 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, return 0; } - -/* Special input handler for packets caught by router alert option. - They are selected only by protocol field, and then processed likely - local ones; but only if someone wants them! Otherwise, router - not running rsvpd will kill RSVP. - - It is user level problem, what it will make with them. 
- I have no idea, how it will masquearde or NAT them (it is joke, joke :-)), - but receiver should be enough clever f.e. to forward mtrace requests, - sent to multicast group to reach destination designated router. - */ -struct ip_ra_chain __rcu *ip_ra_chain; static DEFINE_SPINLOCK(ip_ra_lock); @@ -350,6 +338,7 @@ int ip_ra_control(struct sock *sk, unsigned char on, { struct ip_ra_chain *ra, *new_ra; struct ip_ra_chain __rcu **rap; + struct net *net = sock_net(sk); if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW) return -EINVAL; @@ -357,7 +346,7 @@ int ip_ra_control(struct sock *sk, unsigned char on, new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; spin_lock_bh(&ip_ra_lock); - for (rap = &ip_ra_chain; + for (rap = &net->ipv4.ra_chain; (ra = rcu_dereference_protected(*rap, lockdep_is_held(&ip_ra_lock))) != NULL; rap = &ra->next) {
Re: [PATCH net-next v3 0/5] Rework ip_ra_chain protection
On 22.03.2018 12:44, Kirill Tkhai wrote: > Commit 1215e51edad1 "ipv4: fix a deadlock in ip_ra_control" > made rtnl_lock() be used in raw_close(). This function is called > on every RAW socket destruction, so that rtnl_mutex is taken > every time. This scales very sadly. I observe cleanup_net() > spending a lot of time in rtnl_lock() and raw_close() is one > of the biggest rtnl user (since we have percpu net->ipv4.icmp_sk). > > This patchset reworks the locking: reverts the problem commit > and its descendant, and introduces rtnl-independent locking. > This may have a continuation, and someone may work on killing > rtnl_lock() in mrtsock_destruct() in the future. > > Thanks, > Kirill > > --- > v3: Change patches order: [2/5] and [3/5]. > v2: Fix sparse warning [4/5], as reported by kbuild test robot. > > --- > > Kirill Tkhai (5): > net: Revert "ipv4: get rid of ip_ra_lock" > net: Move IP_ROUTER_ALERT out of lock_sock(sk) > net: Revert "ipv4: fix a deadlock in ip_ra_control" > net: Make ip_ra_chain per struct net > net: Replace ip_ra_lock with per-net mutex > > > include/net/ip.h | 13 +++-- > include/net/netns/ipv4.h |2 ++ > net/core/net_namespace.c |1 + > net/ipv4/ip_input.c |5 ++--- > net/ipv4/ip_sockglue.c | 34 +- > net/ipv4/ipmr.c | 11 +++++-- > net/ipv4/raw.c |2 -- > 7 files changed, 38 insertions(+), 30 deletions(-) > > -- > Signed-off-by: Kirill Tkhai

JFI: I used the below program to test:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/mroute.h>
#include <unistd.h>

int main()
{
	int sk, v, i = 0;

	if (unshare(CLONE_NEWNET)) {
		perror("unshare");
		return 1;
	}

	sk = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
	if (sk < 0) {
		perror("socket");
		return 1;
	}

	for (i = 0; i < 3; i++)
		fork();

	while (1) {
		setsockopt(sk, IPPROTO_IP, MRT_INIT, (void *)&v, sizeof(v));
		setsockopt(sk, IPPROTO_IP, MRT_DONE, (void *)&v, sizeof(v));
		v = (i++) % 2;
		setsockopt(sk, IPPROTO_IP, IP_ROUTER_ALERT, (void *)&v, sizeof(v));
	}
	return 0;
}
Re: [PATCH net-next 06/14] net/tls: Add generic NIC offload infrastructure
On 22.03.2018 15:38, Boris Pismenny wrote: > ... Can't we move this check in tls_dev_event() and use it for all types of events? Then we avoid duplicate code. >>> >>> No. Not all events require this check. Also, the result is different for >>> different events. >> >> No. You always return NOTIFY_DONE, in case of !(netdev->features & >> NETIF_F_HW_TLS_TX). >> See below: >> >> static int tls_check_dev_ops(struct net_device *dev) >> { >> if (!dev->tlsdev_ops) >> return NOTIFY_BAD; >> >> return NOTIFY_DONE; >> } >> >> static int tls_device_down(struct net_device *netdev) >> { >> struct tls_context *ctx, *tmp; >> struct list_head list; >> unsigned long flags; >> >> ... >> return NOTIFY_DONE; >> } >> >> static int tls_dev_event(struct notifier_block *this, unsigned long event, >> void *ptr) >> { >> struct net_device *dev = netdev_notifier_info_to_dev(ptr); >> >> if (!(netdev->features & NETIF_F_HW_TLS_TX)) >> return NOTIFY_DONE; >> switch (event) { >> case NETDEV_REGISTER: >> case NETDEV_FEAT_CHANGE: >> return tls_check_dev_ops(dev); >> case NETDEV_DOWN: >> return tls_device_down(dev); >> } >> return NOTIFY_DONE; >> } >> > > Sure, will fix in V3. > > + > + /* Request a write lock to block new offload attempts > + */ > + percpu_down_write(&device_offload_lock); What is the reason percpu_rwsem is chosen here? It looks like this primitive gives more advantages readers, then plain rwsem does. But it also gives disadvantages to writers. It would be good, unless tls_device_down() is called with rtnl_lock() held from netdevice notifier. But since netdevice notifier are called with rtnl_lock() held, percpu_rwsem will increase the time rtnl_lock() is locked. >>> We use the a rwsem to allow multiple (readers) invocations of >>> tls_set_device_offload, which is triggered by the user (persumably) during >>> the TLS handshake. This might be considered a fast-path. 
>>> >>> However, we must block all calls to tls_set_device_offload while we are >>> processing NETDEV_DOWN events (writer). >>> >>> As you've mentioned, the percpu rwsem is more efficient for readers, >>> especially on NUMA systems, where cache-line bouncing occurs during reader >>> acquire and reduces performance. >> >> Hm, and who are the readers? It's used from do_tls_setsockopt_tx(), but it >> doesn't >> seem to be performance critical. Who else? >> > > It depends on whether you consider the TLS handshake code as critical. > The readers are TCP connections processing the CCS message of the TLS > handshake. They are providing key material to the kernel to start using > Kernel TLS. The thing is rtnl_lock() is critical for the rest of the system, while TLS handshake is small subset of actions the system makes. rtnl_lock() is used just almost everywhere, from netlink messages to netdev ioctls. Currently, you even just can't close raw socket without rtnl lock. So, all of this is big reason to avoid doing rcu waitings under it. Kirill Can't we use plain rwsem here instead? >>> >>> Its a performance tradeoff. I'm not certain that the percpu rwsem write >>> side acquire is significantly worse than using the global rwsem. >>> >>> For now, while all of this is experimental, can we agree to focus on the >>> performance of readers? We can change it later if it becomes a problem. >> >> Same as above. >> > > Replaced with rwsem from V2.
[PATCH net-next 2/2] net: Convert rxrpc_net_ops
These pernet_operations modify the per-net entities pointed to by rxrpc_net_id. There is an external link to AF_RXRPC in fs/afs/Kconfig, but it seems there are no other pernet_operations interested in those per-net entities. Signed-off-by: Kirill Tkhai Acked-by: David Howells --- net/rxrpc/net_ns.c |1 + 1 file changed, 1 insertion(+) diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c index f18c9248e0d4..5fd939dabf41 100644 --- a/net/rxrpc/net_ns.c +++ b/net/rxrpc/net_ns.c @@ -106,4 +106,5 @@ struct pernet_operations rxrpc_net_ops = { .exit = rxrpc_exit_net, .id = &rxrpc_net_id, .size = sizeof(struct rxrpc_net), + .async = true, };
[PATCH net-next 0/2] Converting pernet_operations (part #11)
Hi, this series continues the review and conversion of pernet_operations, to make it possible to execute them in parallel for several net namespaces at the same time. I thought the last series was the final one, but one new pernet_operations has come into the kernel. This is udp_sysctl_ops, and here we convert it. Also, David Howells acked rxrpc_net_ops, so I resend the patch in case it should be queued by patchwork: https://www.spinics.net/lists/netdev/msg490678.html Thanks, Kirill --- Kirill Tkhai (2): net: Convert udp_sysctl_ops net: Convert rxrpc_net_ops net/ipv4/udp.c |3 ++- net/rxrpc/net_ns.c |1 + 2 files changed, 3 insertions(+), 1 deletion(-) -- Signed-off-by: Kirill Tkhai
[PATCH net-next 1/2] net: Convert udp_sysctl_ops
These pernet_operations just initialize udp4 defaults. Signed-off-by: Kirill Tkhai --- net/ipv4/udp.c |3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 908fc02fb4f8..c6dc019bc64b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2842,7 +2842,8 @@ static int __net_init udp_sysctl_init(struct net *net) } static struct pernet_operations __net_initdata udp_sysctl_ops = { - .init = udp_sysctl_init, + .init = udp_sysctl_init, + .async = true, }; void __init udp_init(void)
[PATCH net-next 2/2] net: Drop NETDEV_UNREGISTER_FINAL
The last user is gone after bdf5bd7f2132 "rds: tcp: remove register_netdevice_notifier infrastructure.", so we can remove this netdevice command. This allows deleting rtnl_lock() in netdev_run_todo(), which is a hot path for net namespace unregistration. dev_change_net_namespace() and netdev_wait_allrefs() have an rcu_barrier() before the NETDEV_UNREGISTER_FINAL call, and the source commits say they were introduced to delimit the call from NETDEV_UNREGISTER, but this patch leaves them in place, since they require additional analysis of whether we need them for something else. Signed-off-by: Kirill Tkhai --- drivers/infiniband/hw/qedr/main.c |4 ++-- include/linux/netdevice.h |1 - include/rdma/ib_verbs.h |4 ++-- net/core/dev.c|6 -- 4 files changed, 4 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index db4bf97c0e15..eb32abb0099a 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -90,8 +90,8 @@ static struct net_device *qedr_get_netdev(struct ib_device *dev, u8 port_num) dev_hold(qdev->ndev); /* The HW vendor's device driver must guarantee -* that this function returns NULL before the net device reaches -* NETDEV_UNREGISTER_FINAL state. +* that this function returns NULL before the net device has finished +* NETDEV_UNREGISTER state. 
*/ return qdev->ndev; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index dd5a04c971d5..2a2d9cf50aa2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2336,7 +2336,6 @@ enum netdev_cmd { NETDEV_PRE_TYPE_CHANGE, NETDEV_POST_TYPE_CHANGE, NETDEV_POST_INIT, - NETDEV_UNREGISTER_FINAL, NETDEV_RELEASE, NETDEV_NOTIFY_PEERS, NETDEV_JOIN, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 73b2387e3f74..a1c5e8320f86 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2126,8 +2126,8 @@ struct ib_device { * net device of device @device at port @port_num or NULL if such * a net device doesn't exist. The vendor driver should call dev_hold * on this net device. The HW vendor's device driver must guarantee -* that this function returns NULL before the net device reaches -* NETDEV_UNREGISTER_FINAL state. +* that this function returns NULL before the net device has finished +* NETDEV_UNREGISTER state. */ struct net_device *(*get_netdev)(struct ib_device *device, u8 port_num); diff --git a/net/core/dev.c b/net/core/dev.c index bcc9fe68240b..c444da3e4b82 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8086,7 +8086,6 @@ static void netdev_wait_allrefs(struct net_device *dev) rcu_barrier(); rtnl_lock(); - call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); if (test_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { /* We must not have linkwatch events @@ -8158,10 +8157,6 @@ void netdev_run_todo(void) = list_first_entry(&list, struct net_device, todo_list); list_del(&dev->todo_list); - rtnl_lock(); - call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); - __rtnl_unlock(); - if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { pr_err("network todo '%s' but state %d\n", dev->name, dev->reg_state); @@ -8603,7 +8598,6 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); rcu_barrier(); - 
call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); new_nsid = peernet2id_alloc(dev_net(dev), net); /* If there is an ifindex conflict assign a new one */
[PATCH net-next v2 0/3] Drop NETDEV_UNREGISTER_FINAL
This series drops the unused NETDEV_UNREGISTER_FINAL after some preparations.

v2: New patch [2/3]. Use switch() in [1/3].

The first version was acked by Jason Gunthorpe, and [1/3] was acked by David Ahern. Since there are differences from v1, I haven't carried over people's Acked-by tags. It would be nice if you feel OK to ack v2 too.

Thanks,
Kirill
---
Kirill Tkhai (3):
      net: Make NETDEV_XXX commands enum { }
      infiniband: Replace usnic_ib_netdev_event_to_string() with netdev_cmd_to_name()
      net: Drop NETDEV_UNREGISTER_FINAL

 drivers/infiniband/hw/qedr/main.c           |  4 +-
 drivers/infiniband/hw/usnic/usnic_ib_main.c | 28 ++-
 include/linux/netdevice.h                   | 68 ++-
 include/rdma/ib_verbs.h                     |  4 +-
 net/core/dev.c                              | 25 --
 5 files changed, 63 insertions(+), 66 deletions(-)

--
Signed-off-by: Kirill Tkhai
[PATCH net-next v2 3/3] net: Drop NETDEV_UNREGISTER_FINAL
The last user is gone after bdf5bd7f2132 ("rds: tcp: remove register_netdevice_notifier infrastructure."), so we can remove this netdevice command. This allows us to delete the rtnl_lock() in netdev_run_todo(), which is on the hot path of net namespace unregistration.

dev_change_net_namespace() and netdev_wait_allrefs() have an rcu_barrier() before the NETDEV_UNREGISTER_FINAL call, and the source commits say they were introduced to delimit the call from NETDEV_UNREGISTER. This patch leaves them in place, since they require additional analysis of whether we need them for something else.

Signed-off-by: Kirill Tkhai
---
 drivers/infiniband/hw/qedr/main.c | 4 ++--
 include/linux/netdevice.h         | 1 -
 include/rdma/ib_verbs.h           | 4 ++--
 net/core/dev.c                    | 7 ---
 4 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c
index db4bf97c0e15..eb32abb0099a 100644
--- a/drivers/infiniband/hw/qedr/main.c
+++ b/drivers/infiniband/hw/qedr/main.c
@@ -90,8 +90,8 @@ static struct net_device *qedr_get_netdev(struct ib_device *dev, u8 port_num)
 	dev_hold(qdev->ndev);

 	/* The HW vendor's device driver must guarantee
-	 * that this function returns NULL before the net device reaches
-	 * NETDEV_UNREGISTER_FINAL state.
+	 * that this function returns NULL before the net device has finished
+	 * NETDEV_UNREGISTER state.
 	 */
 	return qdev->ndev;
 }
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index dd5a04c971d5..2a2d9cf50aa2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2336,7 +2336,6 @@ enum netdev_cmd {
 	NETDEV_PRE_TYPE_CHANGE,
 	NETDEV_POST_TYPE_CHANGE,
 	NETDEV_POST_INIT,
-	NETDEV_UNREGISTER_FINAL,
 	NETDEV_RELEASE,
 	NETDEV_NOTIFY_PEERS,
 	NETDEV_JOIN,
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 73b2387e3f74..a1c5e8320f86 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2126,8 +2126,8 @@ struct ib_device {
 	 * net device of device @device at port @port_num or NULL if such
 	 * a net device doesn't exist. The vendor driver should call dev_hold
 	 * on this net device. The HW vendor's device driver must guarantee
-	 * that this function returns NULL before the net device reaches
-	 * NETDEV_UNREGISTER_FINAL state.
+	 * that this function returns NULL before the net device has finished
+	 * NETDEV_UNREGISTER state.
 	 */
 	struct net_device *(*get_netdev)(struct ib_device *device, u8 port_num);
diff --git a/net/core/dev.c b/net/core/dev.c
index eb8ff44f46d6..cb4d2ceb36fb 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1584,7 +1584,6 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd)
 	N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
 	N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
 	N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
-	N(UNREGISTER_FINAL)
 	};
 #undef N
 	return "UNKNOWN_NETDEV_EVENT";
@@ -8089,7 +8088,6 @@ static void netdev_wait_allrefs(struct net_device *dev)
 		rcu_barrier();
 		rtnl_lock();

-		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
 		if (test_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) {
 			/* We must not have linkwatch events
@@ -8161,10 +8159,6 @@ void netdev_run_todo(void)
 			= list_first_entry(&list, struct net_device, todo_list);
 		list_del(&dev->todo_list);

-		rtnl_lock();
-		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
-		__rtnl_unlock();
-
 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
 			pr_err("network todo '%s' but state %d\n",
 			       dev->name, dev->reg_state);
@@ -8606,7 +8600,6 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 	 */
 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 	rcu_barrier();
-	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);

 	new_nsid = peernet2id_alloc(dev_net(dev), net);
 	/* If there is an ifindex conflict assign a new one */
[PATCH net-next v2 2/3] infiniband: Replace usnic_ib_netdev_event_to_string() with netdev_cmd_to_name()
This function just calls netdev_cmd_to_name().

Signed-off-by: Kirill Tkhai
---
 drivers/infiniband/hw/usnic/usnic_ib_main.c | 15 +--
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c
index 5bf3b20eba25..ca5638091b55 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_main.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c
@@ -95,11 +95,6 @@ void usnic_ib_log_vf(struct usnic_ib_vf *vf)
 }

 /* Start of netdev section */
-static inline const char *usnic_ib_netdev_event_to_string(unsigned long event)
-{
-	return netdev_cmd_to_name(event);
-}
-
 static void usnic_ib_qp_grp_modify_active_to_err(struct usnic_ib_dev *us_ibdev)
 {
 	struct usnic_ib_ucontext *ctx;
@@ -172,7 +167,7 @@ static void usnic_ib_handle_usdev_event(struct usnic_ib_dev *us_ibdev,
 			ib_dispatch_event(&ib_event);
 		} else {
 			usnic_dbg("Ignoring %s on %s\n",
-					usnic_ib_netdev_event_to_string(event),
+					netdev_cmd_to_name(event),
 					us_ibdev->ib_dev.name);
 		}
 		break;
@@ -209,7 +204,7 @@ static void usnic_ib_handle_usdev_event(struct usnic_ib_dev *us_ibdev,
 		break;
 	default:
 		usnic_dbg("Ignoring event %s on %s",
-				usnic_ib_netdev_event_to_string(event),
+				netdev_cmd_to_name(event),
 				us_ibdev->ib_dev.name);
 	}
 	mutex_unlock(&us_ibdev->usdev_lock);
@@ -251,7 +246,7 @@ static int usnic_ib_handle_inet_event(struct usnic_ib_dev *us_ibdev,
 	switch (event) {
 	case NETDEV_DOWN:
 		usnic_info("%s via ip notifiers",
-				usnic_ib_netdev_event_to_string(event));
+				netdev_cmd_to_name(event));
 		usnic_fwd_del_ipaddr(us_ibdev->ufdev);
 		usnic_ib_qp_grp_modify_active_to_err(us_ibdev);
 		ib_event.event = IB_EVENT_GID_CHANGE;
@@ -262,7 +257,7 @@ static int usnic_ib_handle_inet_event(struct usnic_ib_dev *us_ibdev,
 	case NETDEV_UP:
 		usnic_fwd_add_ipaddr(us_ibdev->ufdev, ifa->ifa_address);
 		usnic_info("%s via ip notifiers: ip %pI4",
-				usnic_ib_netdev_event_to_string(event),
+				netdev_cmd_to_name(event),
 				&us_ibdev->ufdev->inaddr);
 		ib_event.event = IB_EVENT_GID_CHANGE;
 		ib_event.device = &us_ibdev->ib_dev;
@@ -271,7 +266,7 @@ static int usnic_ib_handle_inet_event(struct usnic_ib_dev *us_ibdev,
 		break;
 	default:
 		usnic_info("Ignoring event %s on %s",
-				usnic_ib_netdev_event_to_string(event),
+				netdev_cmd_to_name(event),
 				us_ibdev->ib_dev.name);
 	}
 	mutex_unlock(&us_ibdev->usdev_lock);
[PATCH net-next v1 0/4] Converting pernet_operations (part #7.1)
Hi, this is a resend of the 4 patches from part #7. Anna kindly reviewed them and suggested taking the patches through the net tree, since pernet_operations::async only exists in net-next.git. Anna's acks are on every patch header; the rest of the patches have no changes.

Thanks,
Kirill
---
Kirill Tkhai (4):
      net: Convert rpcsec_gss_net_ops
      net: Convert sunrpc_net_ops
      net: Convert nfs4_dns_resolver_ops
      net: Convert nfs4blocklayout_net_ops

 fs/nfs/blocklayout/rpc_pipefs.c | 1 +
 fs/nfs/dns_resolve.c            | 1 +
 net/sunrpc/auth_gss/auth_gss.c  | 1 +
 net/sunrpc/sunrpc_syms.c        | 1 +
 4 files changed, 4 insertions(+)

--
Signed-off-by: Kirill Tkhai
Acked-by: Anna Schumaker
[PATCH net-next v1 1/4] net: Convert rpcsec_gss_net_ops
These pernet_operations initialize and destroy per-net items referred to by sunrpc_net_id. The only global list used is cache_list, and accesses to it are already serialized.

sunrpc_destroy_cache_detail() checks for list_empty() without cache_list_lock, but when it's called from unregister_pernet_subsys(), there can't be callers in parallel, so we won't miss list_empty() in this case.

Signed-off-by: Kirill Tkhai
Acked-by: Anna Schumaker
---
 net/sunrpc/auth_gss/auth_gss.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 9463af4b32e8..44f939cb6bc8 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -2063,6 +2063,7 @@ static __net_exit void rpcsec_gss_exit_net(struct net *net)
 static struct pernet_operations rpcsec_gss_net_ops = {
 	.init = rpcsec_gss_init_net,
 	.exit = rpcsec_gss_exit_net,
+	.async = true,
 };

 /*
[PATCH net-next v1 2/4] net: Convert sunrpc_net_ops
These pernet_operations look similar to rpcsec_gss_net_ops; they just create and destroy other caches. So, they also can be async.

Signed-off-by: Kirill Tkhai
Acked-by: Anna Schumaker
---
 net/sunrpc/sunrpc_syms.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 56f9eff74150..68287e921847 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -79,6 +79,7 @@ static struct pernet_operations sunrpc_net_ops = {
 	.exit = sunrpc_exit_net,
 	.id = &sunrpc_net_id,
 	.size = sizeof(struct sunrpc_net),
+	.async = true,
 };

 static int __init
[PATCH net-next v1 3/4] net: Convert nfs4_dns_resolver_ops
These pernet_operations look similar to rpcsec_gss_net_ops; they just create and destroy another cache. They also create and destroy a directory. So, they also look safe to be async.

Signed-off-by: Kirill Tkhai
Acked-by: Anna Schumaker
---
 fs/nfs/dns_resolve.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 060c658eab66..e90bd69ab653 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -410,6 +410,7 @@ static void nfs4_dns_net_exit(struct net *net)
 static struct pernet_operations nfs4_dns_resolver_ops = {
 	.init = nfs4_dns_net_init,
 	.exit = nfs4_dns_net_exit,
+	.async = true,
 };

 static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
[PATCH net-next v1 4/4] net: Convert nfs4blocklayout_net_ops
These pernet_operations create and destroy a per-net pipe and dentry, and they seem safe to be marked as async.

Signed-off-by: Kirill Tkhai
Acked-by: Anna Schumaker
---
 fs/nfs/blocklayout/rpc_pipefs.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
index 9fb067a6f7e0..ef9fa111b009 100644
--- a/fs/nfs/blocklayout/rpc_pipefs.c
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -261,6 +261,7 @@ static void nfs4blocklayout_net_exit(struct net *net)
 static struct pernet_operations nfs4blocklayout_net_ops = {
 	.init = nfs4blocklayout_net_init,
 	.exit = nfs4blocklayout_net_exit,
+	.async = true,
 };

 int __init bl_init_pipefs(void)
Re: [PATCH net-next nfs 1/6] net: Convert rpcsec_gss_net_ops
On 23.03.2018 21:53, Anna Schumaker wrote: > > > On 03/13/2018 06:49 AM, Kirill Tkhai wrote: >> These pernet_operations initialize and destroy sunrpc_net_id >> refered per-net items. Only used global list is cache_list, >> and accesses already serialized. >> >> sunrpc_destroy_cache_detail() check for list_empty() without >> cache_list_lock, but when it's called from unregister_pernet_subsys(), >> there can't be callers in parallel, so we won't miss list_empty() >> in this case. >> >> Signed-off-by: Kirill Tkhai > > It might make sense to take these and the other NFS patches through the net > tree, since the pernet_operations don't yet have the async field in my tree > (and I therefore can't compile once these are applied). > > Acked-by: Anna Schumaker Thanks a lot, Anna! Kirill >> --- >> net/sunrpc/auth_gss/auth_gss.c |1 + >> 1 file changed, 1 insertion(+) >> >> diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c >> index 9463af4b32e8..44f939cb6bc8 100644 >> --- a/net/sunrpc/auth_gss/auth_gss.c >> +++ b/net/sunrpc/auth_gss/auth_gss.c >> @@ -2063,6 +2063,7 @@ static __net_exit void rpcsec_gss_exit_net(struct net >> *net) >> static struct pernet_operations rpcsec_gss_net_ops = { >> .init = rpcsec_gss_init_net, >> .exit = rpcsec_gss_exit_net, >> +.async = true, >> }; >> >> /* >>
Re: [PATCH net-next nfs 1/6] net: Convert rpcsec_gss_net_ops
On 26.03.2018 21:36, J. Bruce Fields wrote: > On Fri, Mar 23, 2018 at 02:53:34PM -0400, Anna Schumaker wrote: >> >> >> On 03/13/2018 06:49 AM, Kirill Tkhai wrote: >>> These pernet_operations initialize and destroy sunrpc_net_id refered >>> per-net items. Only used global list is cache_list, and accesses >>> already serialized. >>> >>> sunrpc_destroy_cache_detail() check for list_empty() without >>> cache_list_lock, but when it's called from >>> unregister_pernet_subsys(), there can't be callers in parallel, so >>> we won't miss list_empty() in this case. >>> >>> Signed-off-by: Kirill Tkhai >> >> It might make sense to take these and the other NFS patches through >> the net tree, since the pernet_operations don't yet have the async >> field in my tree (and I therefore can't compile once these are >> applied). > > Ditto for the nfsd patch, so, for what it's worth: > > Acked-by: J. Bruce Fields > > for that patch.--b. Thanks, Bruce. Kirill
[PATCH net-next] net: Export net->ipv6.sysctl.ip_nonlocal_bind to /proc
Currently, this parameter can be configured via sysctl only. But sysctl is considered a deprecated interface (man 2 sysctl), and it can only be applied to the current net namespace (changing it in a non-current net ns requires doing setns()).

So, let's export the parameter to /proc in the standard way; this allows accessing another process's namespace via /proc/[pid]/net/ip6_nonlocal_bind.

Signed-off-by: Kirill Tkhai
---
 net/ipv6/proc.c | 48 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 6e57028d2e91..2d0aa59c2d0d 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -312,6 +312,47 @@ int snmp6_unregister_dev(struct inet6_dev *idev)
 	return 0;
 }

+static int nonlocal_bind_show(struct seq_file *seq, void *v)
+{
+	struct net *net = seq->private;
+
+	seq_printf(seq, "%d\n", !!net->ipv6.sysctl.ip_nonlocal_bind);
+	return 0;
+}
+
+static int open_nonlocal_bind(struct inode *inode, struct file *file)
+{
+	return single_open_net(inode, file, nonlocal_bind_show);
+}
+
+static ssize_t write_nonlocal_bind(struct file *file, const char __user *ubuf,
+				   size_t count, loff_t *ppos)
+{
+	struct net *net = ((struct seq_file *)file->private_data)->private;
+	char buf[3];
+
+	if (*ppos || count <= 0 || count > sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(buf, ubuf, count))
+		return -EFAULT;
+	buf[0] -= '0';
+	if ((count == 3 && buf[2] != '\0') ||
+	    (count >= 2 && buf[1] != '\n') ||
+	    (buf[0] != 0 && buf[0] != 1))
+		return -EINVAL;
+
+	net->ipv6.sysctl.ip_nonlocal_bind = buf[0];
+	return count;
+}
+
+static const struct file_operations nonlocal_bind_ops = {
+	.open		= open_nonlocal_bind,
+	.read		= seq_read,
+	.write		= write_nonlocal_bind,
+	.release	= single_release_net,
+};
+
 static int __net_init ipv6_proc_init_net(struct net *net)
 {
 	if (!proc_create("sockstat6", 0444, net->proc_net,
@@ -321,12 +362,18 @@ static int __net_init ipv6_proc_init_net(struct net *net)
 	if (!proc_create("snmp6", 0444, net->proc_net, &snmp6_seq_fops))
 		goto proc_snmp6_fail;

+	if (!proc_create_data("ip6_nonlocal_bind", 0644,
+			      net->proc_net, &nonlocal_bind_ops, net))
+		goto proc_bind_fail;
+
 	net->mib.proc_net_devsnmp6 = proc_mkdir("dev_snmp6", net->proc_net);
 	if (!net->mib.proc_net_devsnmp6)
 		goto proc_dev_snmp6_fail;
 	return 0;

 proc_dev_snmp6_fail:
+	remove_proc_entry("ip6_nonlocal_bind", net->proc_net);
+proc_bind_fail:
 	remove_proc_entry("snmp6", net->proc_net);
 proc_snmp6_fail:
 	remove_proc_entry("sockstat6", net->proc_net);
@@ -337,6 +384,7 @@ static void __net_exit ipv6_proc_exit_net(struct net *net)
 {
 	remove_proc_entry("sockstat6", net->proc_net);
 	remove_proc_entry("dev_snmp6", net->proc_net);
+	remove_proc_entry("ip6_nonlocal_bind", net->proc_net);
 	remove_proc_entry("snmp6", net->proc_net);
 }
Re: [PATCH net-next] net: Export net->ipv6.sysctl.ip_nonlocal_bind to /proc
Please, ignore this. Thanks, Kirill On 27.03.2018 14:24, Kirill Tkhai wrote: > Currenly, this parameter can be configured via sysctl > only. But sysctl is considered as depricated interface > (man 2 sysctl), and it only can applied to current's net > namespace (this requires to do setns() to change it in > not current's net ns). > > So, let's export the parameter to /proc in standard way, > and this allows to access another process namespace > via /proc/[pid]/net/ip6_nonlocal_bind. > > Signed-off-by: Kirill Tkhai > --- > net/ipv6/proc.c | 48 > 1 file changed, 48 insertions(+) > > diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c > index 6e57028d2e91..2d0aa59c2d0d 100644 > --- a/net/ipv6/proc.c > +++ b/net/ipv6/proc.c > @@ -312,6 +312,47 @@ int snmp6_unregister_dev(struct inet6_dev *idev) > return 0; > } > > +static int nonlocal_bind_show(struct seq_file *seq, void *v) > +{ > + struct net *net = seq->private; > + > + seq_printf(seq, "%d\n", !!net->ipv6.sysctl.ip_nonlocal_bind); > + return 0; > +} > + > +static int open_nonlocal_bind(struct inode *inode, struct file *file) > +{ > + return single_open_net(inode, file, nonlocal_bind_show); > +} > + > +static ssize_t write_nonlocal_bind(struct file *file, const char __user > *ubuf, > +size_t count, loff_t *ppos) > +{ > + struct net *net = ((struct seq_file *)file->private_data)->private; > + char buf[3]; > + > + if (*ppos || count <= 0 || count > sizeof(buf)) > + return -EINVAL; > + > + if (copy_from_user(buf, ubuf, count)) > + return -EFAULT; > + buf[0] -= '0'; > + if ((count == 3 && buf[2] != '\0') || > + (count >= 2 && buf[1] != '\n') || > + (buf[0] != 0 && buf[0] != 1)) > + return -EINVAL; > + > + net->ipv6.sysctl.ip_nonlocal_bind = buf[0]; > + return count; > +} > + > +static const struct file_operations nonlocal_bind_ops = { > + .open = open_nonlocal_bind, > + .read = seq_read, > + .write = write_nonlocal_bind, > + .release = single_release_net, > +}; > + > static int __net_init ipv6_proc_init_net(struct net *net) > 
{ > if (!proc_create("sockstat6", 0444, net->proc_net, > @@ -321,12 +362,18 @@ static int __net_init ipv6_proc_init_net(struct net > *net) > if (!proc_create("snmp6", 0444, net->proc_net, &snmp6_seq_fops)) > goto proc_snmp6_fail; > > +if (!proc_create_data("ip6_nonlocal_bind", 0644, > +net->proc_net, &nonlocal_bind_ops, net)) > + goto proc_bind_fail; > + > net->mib.proc_net_devsnmp6 = proc_mkdir("dev_snmp6", net->proc_net); > if (!net->mib.proc_net_devsnmp6) > goto proc_dev_snmp6_fail; > return 0; > > proc_dev_snmp6_fail: > + remove_proc_entry("ip6_nonlocal_bind", net->proc_net); > +proc_bind_fail: > remove_proc_entry("snmp6", net->proc_net); > proc_snmp6_fail: > remove_proc_entry("sockstat6", net->proc_net); > @@ -337,6 +384,7 @@ static void __net_exit ipv6_proc_exit_net(struct net *net) > { > remove_proc_entry("sockstat6", net->proc_net); > remove_proc_entry("dev_snmp6", net->proc_net); > + remove_proc_entry("ip6_nonlocal_bind", net->proc_net); > remove_proc_entry("snmp6", net->proc_net); > } > >
[PATCH net-next 0/5] Make pernet_operations always read locked
All the pernet_operations are converted, and the last one is in this patchset (nfsd_net_ops, acked by J. Bruce Fields). So, it's time to kill the pernet_operations::async field and make setup_net() and cleanup_net() always require the rwsem only read locked. All further pernet_operations have to be developed to fit this rule. Some of the previous patches added a comment about that to struct pernet_operations.

Also, this patchset renames net_sem to pernet_ops_rwsem to make the area the rwsem protects more clearly visible, and adds more comments.

Thanks,
Kirill
---
Kirill Tkhai (5):
      net: Convert nfsd_net_ops
      net: Reflect all pernet_operations are converted
      net: Drop pernet_operations::async
      net: Rename net_sem to pernet_ops_rwsem
      net: Add more comments

 drivers/infiniband/core/cma.c                  |  1
 drivers/net/bonding/bond_main.c                |  1
 drivers/net/geneve.c                           |  1
 drivers/net/gtp.c                              |  1
 drivers/net/ipvlan/ipvlan_main.c               |  1
 drivers/net/loopback.c                         |  1
 drivers/net/ppp/ppp_generic.c                  |  1
 drivers/net/ppp/pppoe.c                        |  1
 drivers/net/vrf.c                              |  1
 drivers/net/vxlan.c                            |  1
 drivers/net/wireless/mac80211_hwsim.c          |  1
 fs/lockd/svc.c                                 |  1
 fs/nfs/blocklayout/rpc_pipefs.c                |  1
 fs/nfs/dns_resolve.c                           |  1
 fs/nfs/inode.c                                 |  1
 fs/nfs_common/grace.c                          |  1
 fs/proc/proc_net.c                             |  1
 include/linux/rtnetlink.h                      |  2 -
 include/net/net_namespace.h                    | 22 +++
 kernel/audit.c                                 |  1
 lib/kobject_uevent.c                           |  1
 net/8021q/vlan.c                               |  1
 net/bridge/br.c                                |  1
 net/bridge/br_netfilter_hooks.c                |  1
 net/bridge/netfilter/ebtable_broute.c          |  1
 net/bridge/netfilter/ebtable_filter.c          |  1
 net/bridge/netfilter/ebtable_nat.c             |  1
 net/bridge/netfilter/nf_log_bridge.c           |  1
 net/caif/caif_dev.c                            |  1
 net/can/af_can.c                               |  1
 net/can/bcm.c                                  |  1
 net/can/gw.c                                   |  1
 net/core/dev.c                                 |  2 -
 net/core/fib_notifier.c                        |  1
 net/core/fib_rules.c                           |  1
 net/core/net-procfs.c                          |  2 -
 net/core/net_namespace.c                       | 77 +++-
 net/core/pktgen.c                              |  1
 net/core/rtnetlink.c                           |  7 +-
 net/core/sock.c                                |  2 -
 net/core/sock_diag.c                           |  1
 net/core/sysctl_net_core.c                     |  1
 net/dccp/ipv4.c                                |  1
 net/dccp/ipv6.c                                |  1
 net/ieee802154/6lowpan/reassembly.c            |  1
 net/ieee802154/core.c                          |  1
 net/ipv4/af_inet.c                             |  2 -
 net/ipv4/arp.c                                 |  1
 net/ipv4/devinet.c                             |  1
 net/ipv4/fib_frontend.c                        |  1
 net/ipv4/fou.c                                 |  1
 net/ipv4/icmp.c                                |  1
 net/ipv4/igmp.c                                |  1
 net/ipv4/ip_fragment.c                         |  1
 net/ipv4/ip_gre.c                              |  3 -
 net/ipv4/ip_vti.c                              |  1
 net/ipv4/ipip.c                                |  1
 net/ipv4/ipmr.c                                |  1
 net/ipv4/netfilter/arp_tables.c                |  1
 net/ipv4/netfilter/arptable_filter.c           |  1
 net/ipv4/netfilter/ip_tables.c                 |  1
 net/ipv4/netfilter/ipt_CLUSTERIP.c             |  1
 net/ipv4/netfilter/iptable_filter.c            |  1
 net/ipv4/netfilter/iptable_mangle.c            |  1
 net/ipv4/netfilter/iptable_nat.c               |  1
 net/ipv4/netfilter/iptable_raw.c               |  1
 net/ipv4/netfilter/iptable_security.c          |  1
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c |  1
 net/ipv4/netfilter/nf_defrag_ipv4.c            |  1
 net/ipv4/netfilter/nf_log_arp.c                |  1
 net/ipv4/netfilter/nf_log_ipv4.c               |  1
 net/ipv4/ping.c                                |  1
 net/ipv4/proc.c                                |  1
 net/ipv4/raw.c                                 |  1
 net/ipv4/ro
[PATCH net-next 1/5] net: Convert nfsd_net_ops
These pernet_operations look similar to rpcsec_gss_net_ops; they just create and destroy other caches. So, they also can be async.

Signed-off-by: Kirill Tkhai
Acked-by: J. Bruce Fields
---
 fs/nfsd/nfsctl.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index d107b4426f7e..1e3824e6cce0 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1263,6 +1263,7 @@ static struct pernet_operations nfsd_net_ops = {
 	.exit = nfsd_exit_net,
 	.id = &nfsd_net_id,
 	.size = sizeof(struct nfsd_net),
+	.async = true,
 };

 static int __init init_nfsd(void)
[PATCH net-next 3/5] net: Drop pernet_operations::async
Synchronous pernet_operations are not allowed anymore. All are asynchronous. So, drop the structure member. Signed-off-by: Kirill Tkhai --- drivers/infiniband/core/cma.c |1 - drivers/net/bonding/bond_main.c|1 - drivers/net/geneve.c |1 - drivers/net/gtp.c |1 - drivers/net/ipvlan/ipvlan_main.c |1 - drivers/net/loopback.c |1 - drivers/net/ppp/ppp_generic.c |1 - drivers/net/ppp/pppoe.c|1 - drivers/net/vrf.c |1 - drivers/net/vxlan.c|1 - drivers/net/wireless/mac80211_hwsim.c |1 - fs/lockd/svc.c |1 - fs/nfs/blocklayout/rpc_pipefs.c|1 - fs/nfs/dns_resolve.c |1 - fs/nfs/inode.c |1 - fs/nfs_common/grace.c |1 - fs/nfsd/nfsctl.c |1 - fs/proc/proc_net.c |1 - include/net/net_namespace.h|6 -- kernel/audit.c |1 - lib/kobject_uevent.c |1 - net/8021q/vlan.c |1 - net/bridge/br.c|1 - net/bridge/br_netfilter_hooks.c|1 - net/bridge/netfilter/ebtable_broute.c |1 - net/bridge/netfilter/ebtable_filter.c |1 - net/bridge/netfilter/ebtable_nat.c |1 - net/bridge/netfilter/nf_log_bridge.c |1 - net/caif/caif_dev.c|1 - net/can/af_can.c |1 - net/can/bcm.c |1 - net/can/gw.c |1 - net/core/dev.c |2 -- net/core/fib_notifier.c|1 - net/core/fib_rules.c |1 - net/core/net-procfs.c |2 -- net/core/net_namespace.c |2 -- net/core/pktgen.c |1 - net/core/rtnetlink.c |1 - net/core/sock.c|2 -- net/core/sock_diag.c |1 - net/core/sysctl_net_core.c |1 - net/dccp/ipv4.c|1 - net/dccp/ipv6.c|1 - net/ieee802154/6lowpan/reassembly.c|1 - net/ieee802154/core.c |1 - net/ipv4/af_inet.c |2 -- net/ipv4/arp.c |1 - net/ipv4/devinet.c |1 - net/ipv4/fib_frontend.c|1 - net/ipv4/fou.c |1 - net/ipv4/icmp.c|1 - net/ipv4/igmp.c|1 - net/ipv4/ip_fragment.c |1 - net/ipv4/ip_gre.c |3 --- net/ipv4/ip_vti.c |1 - net/ipv4/ipip.c|1 - net/ipv4/ipmr.c|1 - net/ipv4/netfilter/arp_tables.c|1 - net/ipv4/netfilter/arptable_filter.c |1 - net/ipv4/netfilter/ip_tables.c |1 - net/ipv4/netfilter/ipt_CLUSTERIP.c |1 - net/ipv4/netfilter/iptable_filter.c|1 - net/ipv4/netfilter/iptable_mangle.c|1 - net/ipv4/netfilter/iptable_nat.c |1 - 
net/ipv4/netfilter/iptable_raw.c |1 - net/ipv4/netfilter/iptable_security.c |1 - net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c |1 - net/ipv4/netfilter/nf_defrag_ipv4.c|1 - net/ipv4/netfilter/nf_log_arp.c|1 - net/ipv4/netfilter/nf_log_ipv4.c |1 - net/ipv4/ping.c|1 - net/ipv4/proc.c|1 - net/ipv4/raw.c |1 - net/ipv4/route.c |4 net/ipv4/sysctl_net_ipv4.c |1 - net/ipv4/tcp_ipv4.c|2 -- net/ipv4/tcp_metrics.c |1 - net/ipv4/udp.c |2 -- net/ipv4/udplite.c |1 - net/ipv4/xfrm4_policy.c|1 - net/ipv6/addrconf.c|2 -- net/ipv6/addrlabel.c |1 - net/ipv6/af_inet6.c|1 - net/ipv6/fib6_rules.c
[PATCH net-next 2/5] net: Reflect all pernet_operations are converted
All pernet_operations are reviewed and converted, hooray! Reflect this in core code: setup_net() and cleanup_net() will always take down_read().

Signed-off-by: Kirill Tkhai
---
 net/core/net_namespace.c | 43 ++-
 1 file changed, 6 insertions(+), 37 deletions(-)

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 95ba2c53bd9a..0f614523a13f 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -40,9 +40,8 @@ struct net init_net = {
 EXPORT_SYMBOL(init_net);

 static bool init_net_initialized;
-static unsigned nr_sync_pernet_ops;
 /*
- * net_sem: protects: pernet_list, net_generic_ids, nr_sync_pernet_ops,
+ * net_sem: protects: pernet_list, net_generic_ids,
  * init_net_initialized and first_device pointer.
  */
 DECLARE_RWSEM(net_sem);
@@ -406,7 +405,6 @@ struct net *copy_net_ns(unsigned long flags,
 {
 	struct ucounts *ucounts;
 	struct net *net;
-	unsigned write;
 	int rv;

 	if (!(flags & CLONE_NEWNET))
@@ -424,25 +422,14 @@ struct net *copy_net_ns(unsigned long flags,
 	refcount_set(&net->passive, 1);
 	net->ucounts = ucounts;
 	get_user_ns(user_ns);
-again:
-	write = READ_ONCE(nr_sync_pernet_ops);
-	if (write)
-		rv = down_write_killable(&net_sem);
-	else
-		rv = down_read_killable(&net_sem);
+
+	rv = down_read_killable(&net_sem);
 	if (rv < 0)
 		goto put_userns;

-	if (!write && unlikely(READ_ONCE(nr_sync_pernet_ops))) {
-		up_read(&net_sem);
-		goto again;
-	}
 	rv = setup_net(net, user_ns);

-	if (write)
-		up_write(&net_sem);
-	else
-		up_read(&net_sem);
+	up_read(&net_sem);

 	if (rv < 0) {
 put_userns:
@@ -490,21 +477,11 @@ static void cleanup_net(struct work_struct *work)
 	struct net *net, *tmp, *last;
 	struct llist_node *net_kill_list;
 	LIST_HEAD(net_exit_list);
-	unsigned write;

 	/* Atomically snapshot the list of namespaces to cleanup */
 	net_kill_list = llist_del_all(&cleanup_list);
-again:
-	write = READ_ONCE(nr_sync_pernet_ops);
-	if (write)
-		down_write(&net_sem);
-	else
-		down_read(&net_sem);
-	if (!write && unlikely(READ_ONCE(nr_sync_pernet_ops))) {
-		up_read(&net_sem);
-		goto again;
-	}
+	down_read(&net_sem);

 	/* Don't let anyone else find us. */
 	rtnl_lock();
@@ -543,10 +520,7 @@ static void cleanup_net(struct work_struct *work)
 	list_for_each_entry_reverse(ops, &pernet_list, list)
 		ops_free_list(ops, &net_exit_list);

-	if (write)
-		up_write(&net_sem);
-	else
-		up_read(&net_sem);
+	up_read(&net_sem);

 	/* Ensure there are no outstanding rcu callbacks using this
 	 * network namespace.
@@ -1006,9 +980,6 @@ static int register_pernet_operations(struct list_head *list,
 		rcu_barrier();
 		if (ops->id)
 			ida_remove(&net_generic_ids, *ops->id);
-	} else if (!ops->async) {
-		pr_info_once("Pernet operations %ps are sync.\n", ops);
-		nr_sync_pernet_ops++;
 	}

 	return error;
@@ -1016,8 +987,6 @@ static int register_pernet_operations(struct list_head *list,

 static void unregister_pernet_operations(struct pernet_operations *ops)
 {
-	if (!ops->async)
-		BUG_ON(nr_sync_pernet_ops-- == 0);
 	__unregister_pernet_operations(ops);
 	rcu_barrier();
 	if (ops->id)
[PATCH net-next 5/5] net: Add more comments
This adds comments in different places to improve readability.

Signed-off-by: Kirill Tkhai
---
 include/net/net_namespace.h | 4 ++++
 net/core/net_namespace.c    | 2 ++
 net/core/rtnetlink.c        | 2 +-
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 922e8b6fb422..1ab4f920f109 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -323,6 +323,10 @@ struct pernet_operations {
 	 * have to keep in mind all other pernet_operations and
 	 * to introduce a locking, if they share common resources.
 	 *
+	 * The only time they are called with exclusive lock is
+	 * from register_pernet_subsys(), unregister_pernet_subsys()
+	 * register_pernet_device() and unregister_pernet_device().
+	 *
 	 * Exit methods using blocking RCU primitives, such as
 	 * synchronize_rcu(), should be implemented via exit_batch.
 	 * Then, destruction of a group of net requires single
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 9e8ee4640451..b5796d17a302 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -43,6 +43,8 @@ static bool init_net_initialized;
 /*
  * pernet_ops_rwsem: protects: pernet_list, net_generic_ids,
  * init_net_initialized and first_device pointer.
+ * This is internal net namespace object. Please, don't use it
+ * outside.
  */
 DECLARE_RWSEM(pernet_ops_rwsem);

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 73011a60434c..2d3949789cef 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -459,7 +459,7 @@ static void rtnl_lock_unregistering_all(void)
  */
 void rtnl_link_unregister(struct rtnl_link_ops *ops)
 {
-	/* Close the race with cleanup_net() */
+	/* Close the race with setup_net() and cleanup_net() */
 	down_write(&pernet_ops_rwsem);
 	rtnl_lock_unregistering_all();
 	__rtnl_link_unregister(ops);
[PATCH net-next 4/5] net: Rename net_sem to pernet_ops_rwsem
net_sem is a rather vague name that says nothing about the area the semaphore covers, so rename it to pernet_ops_rwsem for better readability and intelligibility. Signed-off-by: Kirill Tkhai --- include/linux/rtnetlink.h |2 +- include/net/net_namespace.h | 12 +++- net/core/net_namespace.c| 40 net/core/rtnetlink.c|4 ++-- 4 files changed, 30 insertions(+), 28 deletions(-) diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 562a175c35a9..c7d1e4689325 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -36,7 +36,7 @@ extern int rtnl_is_locked(void); extern int rtnl_lock_killable(void); extern wait_queue_head_t netdev_unregistering_wq; -extern struct rw_semaphore net_sem; +extern struct rw_semaphore pernet_ops_rwsem; #ifdef CONFIG_PROVE_LOCKING extern bool lockdep_rtnl_is_held(void); diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 37bcf8382b61..922e8b6fb422 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -60,9 +60,10 @@ struct net { struct list_headlist; /* list of network namespaces */ struct list_headexit_list; /* To linked to call pernet exit -* methods on dead net (net_sem -* read locked), or to unregister -* pernet ops (net_sem wr locked). +* methods on dead net ( +* pernet_ops_rwsem read locked), +* or to unregister pernet ops +* (pernet_ops_rwsem write locked).
*/ struct llist_node cleanup_list; /* namespaces on death row */ @@ -95,8 +96,9 @@ struct net { /* core fib_rules */ struct list_headrules_ops; - struct list_headfib_notifier_ops; /* protected by net_sem */ - + struct list_headfib_notifier_ops; /* Populated by + * register_pernet_subsys() + */ struct net_device *loopback_dev; /* The loopback */ struct netns_core core; struct netns_mibmib; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index eef17ad29dea..9e8ee4640451 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -41,10 +41,10 @@ EXPORT_SYMBOL(init_net); static bool init_net_initialized; /* - * net_sem: protects: pernet_list, net_generic_ids, + * pernet_ops_rwsem: protects: pernet_list, net_generic_ids, * init_net_initialized and first_device pointer. */ -DECLARE_RWSEM(net_sem); +DECLARE_RWSEM(pernet_ops_rwsem); #define MIN_PERNET_OPS_ID \ ((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *)) @@ -72,7 +72,7 @@ static int net_assign_generic(struct net *net, unsigned int id, void *data) BUG_ON(id < MIN_PERNET_OPS_ID); old_ng = rcu_dereference_protected(net->gen, - lockdep_is_held(&net_sem)); + lockdep_is_held(&pernet_ops_rwsem)); if (old_ng->s.len > id) { old_ng->ptr[id] = data; return 0; @@ -289,7 +289,7 @@ struct net *get_net_ns_by_id(struct net *net, int id) */ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) { - /* Must be called with net_sem held */ + /* Must be called with pernet_ops_rwsem held */ const struct pernet_operations *ops, *saved_ops; int error = 0; LIST_HEAD(net_exit_list); @@ -422,13 +422,13 @@ struct net *copy_net_ns(unsigned long flags, net->ucounts = ucounts; get_user_ns(user_ns); - rv = down_read_killable(&net_sem); + rv = down_read_killable(&pernet_ops_rwsem); if (rv < 0) goto put_userns; rv = setup_net(net, user_ns); - up_read(&net_sem); + up_read(&pernet_ops_rwsem); if (rv < 0) { put_userns: @@ -480,7 +480,7 @@ static void cleanup_net(struct work_struct 
*work) /* Atomically snapshot the list of namespaces to cleanup */ net_kill_list = llist_del_all(&cleanup_list); - down_read(&net_sem); + down_read(&pernet_ops_rwsem); /* Don't let anyone else find us. */ rtnl_lock(); @@ -519,7 +519,7 @@ static void cleanup_net(struct work_struct *work) list_for_each_entry_reverse(ops, &pernet_list, list) ops_free_list(ops, &net_exit_list); - up_read(&n
Re: [PATCH net-next 0/5] Make pernet_operations always read locked
On 27.03.2018 20:18, David Miller wrote: > From: Kirill Tkhai > Date: Tue, 27 Mar 2018 18:01:42 +0300 > >> All the pernet_operations are converted, and the last one >> is in this patchset (nfsd_net_ops acked by J. Bruce Fields). >> So, it's the time to kill pernet_operations::async field, >> and make setup_net() and cleanup_net() always require >> the rwsem only read locked. >> >> All further pernet_operations have to be developed to fit >> this rule. Some of previous patches added a comment to >> struct pernet_operations about that. >> >> Also, this patchset renames net_sem to pernet_ops_rwsem >> to make the target area of the rwsem is more clear visible, >> and adds more comments. > > Amazing work, series applied, thanks! Thanks a lot, David! Kirill
Re: [PATCH V5 net-next 06/14] net/tls: Add generic NIC offload infrastructure
On 28.03.2018 02:56, Saeed Mahameed wrote: > From: Ilya Lesokhin > > This patch adds a generic infrastructure to offload TLS crypto to a > network device. It enables the kernel TLS socket to skip encryption > and authentication operations on the transmit side of the data path. > Leaving those computationally expensive operations to the NIC. > > The NIC offload infrastructure builds TLS records and pushes them to > the TCP layer just like the SW KTLS implementation and using the same API. > TCP segmentation is mostly unaffected. Currently the only exception is > that we prevent mixed SKBs where only part of the payload requires > offload. In the future we are likely to add a similar restriction > following a change cipher spec record. > > The notable differences between SW KTLS and NIC offloaded TLS > implementations are as follows: > 1. The offloaded implementation builds "plaintext TLS record", those > records contain plaintext instead of ciphertext and place holder bytes > instead of authentication tags. > 2. The offloaded implementation maintains a mapping from TCP sequence > number to TLS records. Thus given a TCP SKB sent from a NIC offloaded > TLS socket, we can use the tls NIC offload infrastructure to obtain > enough context to encrypt the payload of the SKB. > A TLS record is released when the last byte of the record is ack'ed, > this is done through the new icsk_clean_acked callback. > > The infrastructure should be extendable to support various NIC offload > implementations. However it is currently written with the > implementation below in mind: > The NIC assumes that packets from each offloaded stream are sent as > plaintext and in-order. It keeps track of the TLS records in the TCP > stream. When a packet marked for offload is transmitted, the NIC > encrypts the payload in-place and puts authentication tags in the > relevant place holders. > > The responsibility for handling out-of-order packets (i.e. 
TCP > retransmission, qdisc drops) falls on the netdev driver. > > The netdev driver keeps track of the expected TCP SN from the NIC's > perspective. If the next packet to transmit matches the expected TCP > SN, the driver advances the expected TCP SN, and transmits the packet > with TLS offload indication. > > If the next packet to transmit does not match the expected TCP SN. The > driver calls the TLS layer to obtain the TLS record that includes the > TCP of the packet for transmission. Using this TLS record, the driver > posts a work entry on the transmit queue to reconstruct the NIC TLS > state required for the offload of the out-of-order packet. It updates > the expected TCP SN accordingly and transmits the now in-order packet. > The same queue is used for packet transmission and TLS context > reconstruction to avoid the need for flushing the transmit queue before > issuing the context reconstruction request. > > Signed-off-by: Ilya Lesokhin > Signed-off-by: Boris Pismenny > Signed-off-by: Aviad Yehezkel > Signed-off-by: Saeed Mahameed > --- > include/net/tls.h | 120 +-- > net/tls/Kconfig | 10 + > net/tls/Makefile | 2 + > net/tls/tls_device.c | 759 > ++ > net/tls/tls_device_fallback.c | 454 + > net/tls/tls_main.c| 120 --- > net/tls/tls_sw.c | 132 > 7 files changed, 1476 insertions(+), 121 deletions(-) > create mode 100644 net/tls/tls_device.c > create mode 100644 net/tls/tls_device_fallback.c > > diff --git a/include/net/tls.h b/include/net/tls.h > index 437a746300bf..0a8529e9ec21 100644 > --- a/include/net/tls.h > +++ b/include/net/tls.h > @@ -57,21 +57,10 @@ > > #define TLS_AAD_SPACE_SIZE 13 > > -struct tls_sw_context { > +struct tls_sw_context_tx { Is there a reason the split of this into tx and rx contexts does not go in a separate patch?
> struct crypto_aead *aead_send; > - struct crypto_aead *aead_recv; > struct crypto_wait async_wait; > > - /* Receive context */ > - struct strparser strp; > - void (*saved_data_ready)(struct sock *sk); > - unsigned int (*sk_poll)(struct file *file, struct socket *sock, > - struct poll_table_struct *wait); > - struct sk_buff *recv_pkt; > - u8 control; > - bool decrypted; > - > - /* Sending context */ > char aad_space[TLS_AAD_SPACE_SIZE]; > > unsigned int sg_plaintext_size; > @@ -88,6 +77,50 @@ struct tls_sw_context { > struct scatterlist sg_aead_out[2]; > }; > > +struct tls_sw_context_rx { > + struct crypto_aead *aead_recv; > + struct crypto_wait async_wait; > + > + struct strparser strp; > + void (*saved_data_ready)(struct sock *sk); > + unsigned int (*sk_poll)(struct file *file, struct socket *sock, > + struct poll_table_struct *wait); > + struct sk_buff *recv_pkt; > + u8 control; > + bool decrypted; > +}; > + >
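The record mapping the quoted commit message describes — "a mapping from TCP sequence number to TLS records", used by the driver to recover crypto context for out-of-order segments — can be illustrated with a small lookup helper. This is a sketch of the idea only; the structure and function names are hypothetical, not the actual tls_device.c ones:

```c
#include <assert.h>
#include <stdint.h>

/* A TLS record as the offload layer tracks it: the TCP sequence
 * number of its first byte plus its on-the-wire length. */
struct tls_record {
    uint32_t start_seq;
    uint32_t len;
};

/* Sequence-number comparison that survives 32-bit wraparound, like
 * the kernel's before() macro. */
static int before(uint32_t a, uint32_t b)
{
    return (int32_t)(a - b) < 0;
}

/* Given the TCP sequence of an outgoing segment, find the record
 * whose [start_seq, start_seq + len) range covers it; returns the
 * record index, or -1 if the sequence falls outside every record.
 * Linear scan for clarity. */
static int tls_find_record(const struct tls_record *recs, int n, uint32_t seq)
{
    int i;

    for (i = 0; i < n; i++) {
        if (!before(seq, recs[i].start_seq) &&
            before(seq, recs[i].start_seq + recs[i].len))
            return i;
    }
    return -1;
}
```

With this mapping, a retransmitted segment's sequence number is enough to re-derive which record (and therefore which crypto state) the NIC must reconstruct.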
[PATCH net-next 0/3] Close race between {un, }register_netdevice_notifier and pernet_operations
Hi, the problem is that {,un}register_netdevice_notifier() does not take pernet_ops_rwsem, so it does not see network namespaces that are being initialized in setup_net() or destroyed in cleanup_net(), since at that time the net is not hashed to net_namespace_list. This may lead to an imbalance, where a notifier is called at setup_net() time, while the net is alive, but is not called at cleanup_net() time for the devices hashed to that net, and vice versa. See (3/3) for the scheme of the imbalance. This patchset fixes the problem by acquiring pernet_ops_rwsem in {,un}register_netdevice_notifier() (3/3). (1-2/3) are preparations in the xfrm and netfilter subsystems. The problem was introduced long ago, but backporting won't be easy, since every previous kernel version may have changes in netdevice notifiers, and they all need review and testing. Also, there may be more pernet_operations which register or unregister netdevice notifiers, and that leads to deadlock (this was fixed in 1-2/3). This patchset is for net-next. Thanks, Kirill --- Kirill Tkhai (3): xfrm: Register xfrm_dev_notifier in appropriate place netfilter: Rework xt_TEE netdevice notifier net: Close race between {un,}register_netdevice_notifier() and setup_net()/cleanup_net() include/net/xfrm.h |2 + net/core/dev.c |6 net/netfilter/xt_TEE.c | 73 ++-- net/xfrm/xfrm_device.c |2 + net/xfrm/xfrm_policy.c |3 +- 5 files changed, 55 insertions(+), 31 deletions(-) -- Signed-off-by: Kirill Tkhai
[PATCH net-next 1/3] xfrm: Register xfrm_dev_notifier in appropriate place
Currently, the driver registers it from its pernet_operations::init method, and this breaks modularity, because initialization of a net namespace and registration of netdevice notifiers are orthogonal actions. We don't have per-namespace netdevice notifiers; all of them are global for all devices in all namespaces. Signed-off-by: Kirill Tkhai --- include/net/xfrm.h |2 +- net/xfrm/xfrm_device.c |2 +- net/xfrm/xfrm_policy.c |3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index aa027ba1d032..a872379b69da 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1894,7 +1894,7 @@ static inline struct xfrm_offload *xfrm_offload(struct sk_buff *skb) #endif } -void __net_init xfrm_dev_init(void); +void __init xfrm_dev_init(void); #ifdef CONFIG_XFRM_OFFLOAD void xfrm_dev_resume(struct sk_buff *skb); diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index e87d6c4dd5b6..175941e15a6e 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -350,7 +350,7 @@ static struct notifier_block xfrm_dev_notifier = { .notifier_call = xfrm_dev_event, }; -void __net_init xfrm_dev_init(void) +void __init xfrm_dev_init(void) { register_netdevice_notifier(&xfrm_dev_notifier); } diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 625b3fca5704..f29c8d588116 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2895,8 +2895,6 @@ static int __net_init xfrm_policy_init(struct net *net) INIT_LIST_HEAD(&net->xfrm.policy_all); INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize); INIT_WORK(&net->xfrm.policy_hthresh.work, xfrm_hash_rebuild); - if (net_eq(net, &init_net)) - xfrm_dev_init(); return 0; out_bydst: @@ -2999,6 +2997,7 @@ void __init xfrm_init(void) INIT_WORK(&xfrm_pcpu_work[i], xfrm_pcpu_work_fn); register_pernet_subsys(&xfrm_net_ops); + xfrm_dev_init(); seqcount_init(&xfrm_policy_hash_generation); xfrm_input_init(); }
[PATCH net-next 2/3] netfilter: Rework xt_TEE netdevice notifier
Registering a netdevice notifier for every iptables entry is not good, since this breaks modularity, and the hidden synchronization is based on rtnl_lock(). This patch reworks the synchronization via a new lock, while the rest of the logic remains as it was before. This is required for the next patch. Tested via: while :; do unshare -n iptables -t mangle -A OUTPUT -j TEE --gateway 1.1.1.2 --oif lo; done Signed-off-by: Kirill Tkhai --- net/netfilter/xt_TEE.c | 73 ++-- 1 file changed, 46 insertions(+), 27 deletions(-) diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index 86b0580b2216..475957cfcf50 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -20,7 +20,7 @@ #include struct xt_tee_priv { - struct notifier_block notifier; + struct list_headlist; struct xt_tee_tginfo*tginfo; int oif; }; @@ -51,29 +51,35 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) } #endif +static DEFINE_MUTEX(priv_list_mutex); +static LIST_HEAD(priv_list); + static int tee_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct xt_tee_priv *priv; - priv = container_of(this, struct xt_tee_priv, notifier); - switch (event) { - case NETDEV_REGISTER: - if (!strcmp(dev->name, priv->tginfo->oif)) - priv->oif = dev->ifindex; - break; - case NETDEV_UNREGISTER: - if (dev->ifindex == priv->oif) - priv->oif = -1; - break; - case NETDEV_CHANGENAME: - if (!strcmp(dev->name, priv->tginfo->oif)) - priv->oif = dev->ifindex; - else if (dev->ifindex == priv->oif) - priv->oif = -1; - break; + mutex_lock(&priv_list_mutex); + list_for_each_entry(priv, &priv_list, list) { + switch (event) { + case NETDEV_REGISTER: + if (!strcmp(dev->name, priv->tginfo->oif)) + priv->oif = dev->ifindex; + break; + case NETDEV_UNREGISTER: + if (dev->ifindex == priv->oif) + priv->oif = -1; + break; + case NETDEV_CHANGENAME: + if (!strcmp(dev->name, priv->tginfo->oif)) + priv->oif = dev->ifindex; + else if 
(dev->ifindex == priv->oif) + priv->oif = -1; + break; + } } + mutex_unlock(&priv_list_mutex); return NOTIFY_DONE; } @@ -89,8 +95,6 @@ static int tee_tg_check(const struct xt_tgchk_param *par) return -EINVAL; if (info->oif[0]) { - int ret; - if (info->oif[sizeof(info->oif)-1] != '\0') return -EINVAL; @@ -100,14 +104,11 @@ static int tee_tg_check(const struct xt_tgchk_param *par) priv->tginfo = info; priv->oif = -1; - priv->notifier.notifier_call = tee_netdev_event; info->priv= priv; - ret = register_netdevice_notifier(&priv->notifier); - if (ret) { - kfree(priv); - return ret; - } + mutex_lock(&priv_list_mutex); + list_add(&priv->list, &priv_list); + mutex_unlock(&priv_list_mutex); } else info->priv = NULL; @@ -120,7 +121,9 @@ static void tee_tg_destroy(const struct xt_tgdtor_param *par) struct xt_tee_tginfo *info = par->targinfo; if (info->priv) { - unregister_netdevice_notifier(&info->priv->notifier); + mutex_lock(&priv_list_mutex); + list_del(&info->priv->list); + mutex_unlock(&priv_list_mutex); kfree(info->priv); } static_key_slow_dec(&xt_tee_enabled); @@ -153,13 +156,29 @@ static struct xt_target tee_tg_reg[] __read_mostly = { #endif }; +static struct notifier_block tee_netdev_notifier = { + .notifier_call = tee_netdev_event, +}; + static int __init tee_tg_init(void) { - return xt_register_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); + int ret; + + ret = xt_register_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); + if (ret) + return ret; + ret = register_netdevice_notifier(&te
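The shape of the rework is easy to model in userspace: instead of one notifier per rule, a single global notifier walks a mutex-protected list of per-rule private data. A simplified sketch with stand-in types (plain pointers instead of list_head, string/int events instead of the netdev notifier info — not the kernel API):

```c
#include <assert.h>
#include <pthread.h>
#include <string.h>

#define NETDEV_REGISTER   1
#define NETDEV_UNREGISTER 2
#define NETDEV_CHANGENAME 3

/* Per-rule private data, now linked on a global list instead of
 * carrying its own notifier_block. */
struct xt_tee_priv {
    struct xt_tee_priv *next;
    const char *oif_name;   /* interface name the rule is bound to */
    int oif;                /* cached ifindex, -1 when unresolved */
};

static pthread_mutex_t priv_list_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct xt_tee_priv *priv_list;

/* The one global event handler: walk every rule under the mutex. */
static void tee_netdev_event(unsigned long event,
                             const char *dev_name, int dev_ifindex)
{
    struct xt_tee_priv *priv;

    pthread_mutex_lock(&priv_list_mutex);
    for (priv = priv_list; priv; priv = priv->next) {
        switch (event) {
        case NETDEV_REGISTER:
            if (!strcmp(dev_name, priv->oif_name))
                priv->oif = dev_ifindex;
            break;
        case NETDEV_UNREGISTER:
            if (dev_ifindex == priv->oif)
                priv->oif = -1;
            break;
        case NETDEV_CHANGENAME:
            if (!strcmp(dev_name, priv->oif_name))
                priv->oif = dev_ifindex;
            else if (dev_ifindex == priv->oif)
                priv->oif = -1;
            break;
        }
    }
    pthread_mutex_unlock(&priv_list_mutex);
}

/* Rule creation just links the priv entry; no notifier registration. */
static void tee_tg_check(struct xt_tee_priv *priv, const char *oif_name)
{
    priv->oif_name = oif_name;
    priv->oif = -1;
    pthread_mutex_lock(&priv_list_mutex);
    priv->next = priv_list;
    priv_list = priv;
    pthread_mutex_unlock(&priv_list_mutex);
}
```

This is exactly why the rework matters for the next patch: rule creation and destruction no longer call {un,}register_netdevice_notifier(), so they cannot deadlock against a lock taken there.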
[PATCH net-next 3/3] net: Close race between {un, }register_netdevice_notifier() and setup_net()/cleanup_net()
{un,}register_netdevice_notifier() iterate over all net namespaces hashed to net_namespace_list. But pernet_operations register and unregister netdevices in unhashed net namespaces, and these are not seen by netdevice notifiers. This results in asymmetry: 1) Race with register_netdevice_notifier() pernet_operations::init(net) ... register_netdevice() ... call_netdevice_notifiers() ... ... nb is not called ... ... register_netdevice_notifier(nb) -> net skipped ... ... list_add_tail(&net->list, ..) ... Then, userspace stops using net, and it's destructed: pernet_operations::exit(net) unregister_netdevice() call_netdevice_notifiers() ... nb is called ... This always happens with net::loopback_dev, but it may not be the only device. 2) Race with unregister_netdevice_notifier() pernet_operations::init(net) register_netdevice() call_netdevice_notifiers() ... nb is called ... Then, userspace stops using net, and it's destructed: list_del_rcu(&net->list) ... pernet_operations::exit(net) unregister_netdevice_notifier(nb) -> net skipped dev_change_net_namespace() ... call_netdevice_notifiers() ... nb is not called ... unregister_netdevice() call_netdevice_notifiers() ... nb is not called ... This race is more dangerous, since dev_change_net_namespace() moves real network devices, which use non-trivial netdevice notifiers, and if this happens, the system will be left in an unpredictable state. The patch closes the race. During testing I found two places where register_netdevice_notifier() is called from pernet init/exit methods (which led to deadlock) and fixed them (see the previous patches). The review led me to one more unusual registration place: raw_init() (CAN driver). It may be a source of problems if someone creates in-kernel CAN_RAW sockets, since they will be destroyed in an exit method and raw_release() will call unregister_netdevice_notifier(). But grep over the kernel tree does not show anyone creating such sockets from kernel space. 
Theoretically, there can be more places like this, hidden from review, but we will find them on the first bump into one (since there is no race involved, the deadlock is 100% reproducible). Signed-off-by: Kirill Tkhai --- net/core/dev.c |6 ++ 1 file changed, 6 insertions(+) diff --git a/net/core/dev.c b/net/core/dev.c index e13807b5c84d..43abc5785a85 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1623,6 +1623,8 @@ int register_netdevice_notifier(struct notifier_block *nb) struct net *net; int err; + /* Close race with setup_net() and cleanup_net() */ + down_write(&pernet_ops_rwsem); rtnl_lock(); err = raw_notifier_chain_register(&netdev_chain, nb); if (err) @@ -1645,6 +1647,7 @@ int register_netdevice_notifier(struct notifier_block *nb) unlock: rtnl_unlock(); + up_write(&pernet_ops_rwsem); return err; rollback: @@ -1689,6 +1692,8 @@ int unregister_netdevice_notifier(struct notifier_block *nb) struct net *net; int err; + /* Close race with setup_net() and cleanup_net() */ + down_write(&pernet_ops_rwsem); rtnl_lock(); err = raw_notifier_chain_unregister(&netdev_chain, nb); if (err) @@ -1706,6 +1711,7 @@ int unregister_netdevice_notifier(struct notifier_block *nb) } unlock: rtnl_unlock(); + up_write(&pernet_ops_rwsem); return err; } EXPORT_SYMBOL(unregister_netdevice_notifier);
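The replay behaviour at the heart of race (1) can be modelled very simply: registering a notifier replays NETDEV_REGISTER for every device of every net hashed to net_namespace_list, so a net still inside setup_net() (not yet hashed) is invisible, and the notifier later receives an UNREGISTER it never saw a REGISTER for. A toy model (all names hypothetical, not the kernel API):

```c
#include <assert.h>

#define MAX_NETS 8
#define MAX_DEVS 8

/* A namespace: whether it is hashed to net_namespace_list, and how
 * many devices it currently holds. */
struct net {
    int hashed;
    int ndevs;
};

static struct net nets[MAX_NETS];

/* Model of the replay in register_netdevice_notifier(): walk every
 * *hashed* net and emit NETDEV_REGISTER for each of its devices.
 * Returns how many events the freshly registered notifier received;
 * devices in an unhashed net (mid-setup_net()) are silently skipped,
 * which is precisely the imbalance this patch closes. */
static int register_notifier_replay(void)
{
    int i, j, events = 0;

    for (i = 0; i < MAX_NETS; i++) {
        if (!nets[i].hashed)
            continue;
        for (j = 0; j < nets[i].ndevs; j++)
            events++;
    }
    return events;
}
```

Taking pernet_ops_rwsem for write in the notifier (un)registration path serializes this walk against setup_net()/cleanup_net(), so a net is either fully visible or not created yet — never half-initialized.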
[PATCH net-next 0/5] Introduce net_rwsem to protect net_namespace_list
The series introduces a fine-grained rw_semaphore, which will be used instead of rtnl_lock() to protect net_namespace_list. This improves scalability and allows non-exclusive sleepable iteration with for_each_net(), which is enough for most cases. scripts/get_maintainer.pl gives an enormous list of people, and I added them all to CC. Note that this patch is independent of "Close race between {un, }register_netdevice_notifier and pernet_operations": https://patchwork.ozlabs.org/project/netdev/list/?series=36495 Signed-off-by: Kirill Tkhai --- Kirill Tkhai (5): net: Introduce net_rwsem to protect net_namespace_list net: Don't take rtnl_lock() in wireless_nlevent_flush() security: Remove rtnl_lock() in selinux_xfrm_notify_policyload() ovs: Remove rtnl_lock() from ovs_exit_net() net: Remove rtnl_lock() in nf_ct_iterate_destroy() drivers/infiniband/core/roce_gid_mgmt.c |2 ++ include/linux/rtnetlink.h |1 + include/net/net_namespace.h |1 + net/core/dev.c |5 + net/core/fib_notifier.c |2 ++ net/core/net_namespace.c| 18 +- net/core/rtnetlink.c|5 + net/netfilter/nf_conntrack_core.c |4 ++-- net/openvswitch/datapath.c |4 ++-- net/wireless/wext-core.c|6 ++ security/selinux/include/xfrm.h |4 ++-- 11 files changed, 37 insertions(+), 15 deletions(-) -- Signed-off-by: Kirill Tkhai
[PATCH net-next 1/5] net: Introduce net_rwsem to protect net_namespace_list
rtnl_lock() is used everywhere, and contention is very high. When someone wants to iterate over alive net namespaces, there is no way to do that without taking the exclusive lock. But the exclusive rtnl_lock() in such places is overkill, and it just increases the contention. Yes, there is already for_each_net_rcu() in the kernel, but it requires rcu_read_lock(), so the iteration can't sleep. Also, sometimes it is really necessary to prevent net_namespace_list growth, so for_each_net_rcu() does not fit there. This patch introduces a new rw_semaphore, which will be used instead of rtnl_mutex to protect net_namespace_list. It is sleepable and allows non-exclusive iteration over the net namespaces list. It lets us stop using rtnl_lock() in several places (done in the next patches) and shortens the time we hold rtnl_mutex. Here we just add the new lock; the explanation of why rtnl_lock() can be removed in each place is in the next patches. Fine-grained locks are generally better than one big lock, so let's do that with net_namespace_list while the situation allows it. 
Signed-off-by: Kirill Tkhai --- drivers/infiniband/core/roce_gid_mgmt.c |2 ++ include/linux/rtnetlink.h |1 + include/net/net_namespace.h |1 + net/core/dev.c |5 + net/core/fib_notifier.c |2 ++ net/core/net_namespace.c| 18 +- net/core/rtnetlink.c|5 + net/netfilter/nf_conntrack_core.c |2 ++ net/openvswitch/datapath.c |2 ++ net/wireless/wext-core.c|2 ++ security/selinux/include/xfrm.h |2 ++ 11 files changed, 37 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c index 5a52ec77940a..cc2966380c0c 100644 --- a/drivers/infiniband/core/roce_gid_mgmt.c +++ b/drivers/infiniband/core/roce_gid_mgmt.c @@ -403,10 +403,12 @@ static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev, * our feet */ rtnl_lock(); + down_read(&net_rwsem); for_each_net(net) for_each_netdev(net, ndev) if (is_eth_port_of_netdev(ib_dev, port, rdma_ndev, ndev)) add_netdev_ips(ib_dev, port, rdma_ndev, ndev); + up_read(&net_rwsem); rtnl_unlock(); } diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index c7d1e4689325..5225832bd6ff 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -37,6 +37,7 @@ extern int rtnl_lock_killable(void); extern wait_queue_head_t netdev_unregistering_wq; extern struct rw_semaphore pernet_ops_rwsem; +extern struct rw_semaphore net_rwsem; #ifdef CONFIG_PROVE_LOCKING extern bool lockdep_rtnl_is_held(void); diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 1ab4f920f109..47e35cce3b64 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -291,6 +291,7 @@ static inline struct net *read_pnet(const possible_net_t *pnet) #endif } +/* Protected by net_rwsem */ #define for_each_net(VAR) \ list_for_each_entry(VAR, &net_namespace_list, list) diff --git a/net/core/dev.c b/net/core/dev.c index e13807b5c84d..eca5458b2753 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1629,6 +1629,7 @@ int register_netdevice_notifier(struct 
notifier_block *nb) goto unlock; if (dev_boot_phase) goto unlock; + down_read(&net_rwsem); for_each_net(net) { for_each_netdev(net, dev) { err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); @@ -1642,6 +1643,7 @@ int register_netdevice_notifier(struct notifier_block *nb) call_netdevice_notifier(nb, NETDEV_UP, dev); } } + up_read(&net_rwsem); unlock: rtnl_unlock(); @@ -1664,6 +1666,7 @@ int register_netdevice_notifier(struct notifier_block *nb) } outroll: + up_read(&net_rwsem); raw_notifier_chain_unregister(&netdev_chain, nb); goto unlock; } @@ -1694,6 +1697,7 @@ int unregister_netdevice_notifier(struct notifier_block *nb) if (err) goto unlock; + down_read(&net_rwsem); for_each_net(net) { for_each_netdev(net, dev) { if (dev->flags & IFF_UP) { @@ -1704,6 +1708,7 @@ int unregister_netdevice_notifier(struct notifier_block *nb) call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); } } + up_read(&net_rwsem); unlock: rtnl_unlock(); return err; diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c index 0c048bdeb016..614b985c92a4 100644 --- a/net/core/fib_notifier.c +++ b/net/core/fib_notifier.c @
[PATCH net-next 4/5] ovs: Remove rtnl_lock() from ovs_exit_net()
Here we iterate with for_each_net() and collect from the alive nets the vports that belong to the exiting net. ovs_net::dps is protected by ovs_mutex(), and the others who change it (ovs_dp_cmd_new(), __dp_destroy()) also take it. The same is true for the datapath::ports list. So, we can remove rtnl_lock() here. Signed-off-by: Kirill Tkhai --- net/openvswitch/datapath.c |2 -- 1 file changed, 2 deletions(-) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 9746ee30a99b..015e24e08909 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -2363,12 +2363,10 @@ static void __net_exit ovs_exit_net(struct net *dnet) list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node) __dp_destroy(dp); - rtnl_lock(); down_read(&net_rwsem); for_each_net(net) list_vports_from_net(net, dnet, &head); up_read(&net_rwsem); - rtnl_unlock(); /* Detach all vports from given namespace. */ list_for_each_entry_safe(vport, vport_next, &head, detach_list) {
[PATCH net-next 3/5] security: Remove rtnl_lock() in selinux_xfrm_notify_policyload()
rt_genid_bump_all() consists of an ipv4 and an ipv6 part. The ipv4 part increments net::ipv4::rt_genid, and I see many places where it is read without rtnl_lock(). The ipv6 part calls __fib6_clean_all(), which is also called without rtnl_lock() in other places. So, rtnl_lock() here was only used to iterate net_namespace_list, and we can remove it. Signed-off-by: Kirill Tkhai --- security/selinux/include/xfrm.h |2 -- 1 file changed, 2 deletions(-) diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h index 31d66431be1e..a0b465316292 100644 --- a/security/selinux/include/xfrm.h +++ b/security/selinux/include/xfrm.h @@ -47,12 +47,10 @@ static inline void selinux_xfrm_notify_policyload(void) { struct net *net; - rtnl_lock(); down_read(&net_rwsem); for_each_net(net) rt_genid_bump_all(net); up_read(&net_rwsem); - rtnl_unlock(); } #else static inline int selinux_xfrm_enabled(void)
[PATCH net-next 5/5] net: Remove rtnl_lock() in nf_ct_iterate_destroy()
rtnl_lock() doesn't protect net::ct::count, and it's not needed for __nf_ct_unconfirmed_destroy() or for nf_queue_nf_hook_drop(). Signed-off-by: Kirill Tkhai --- net/netfilter/nf_conntrack_core.c |2 -- 1 file changed, 2 deletions(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 370f9b7f051b..41ff04ee2554 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1763,7 +1763,6 @@ nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data) { struct net *net; - rtnl_lock(); down_read(&net_rwsem); for_each_net(net) { if (atomic_read(&net->ct.count) == 0) @@ -1772,7 +1771,6 @@ nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data) nf_queue_nf_hook_drop(net); } up_read(&net_rwsem); - rtnl_unlock(); /* Need to wait for netns cleanup worker to finish, if its * running -- it might have deleted a net namespace from
[PATCH net-next 2/5] net: Don't take rtnl_lock() in wireless_nlevent_flush()
This function iterates over net_namespace_list and flushes the queue of every namespace. What does this rtnl_lock() protect?! Since we may add skbs to net::wext_nlevents without rtnl_lock(), it does not protect us against queuers. It guarantees that two threads can't flush the queue in parallel, which could change the order, but since skbs can be queued in any order anyway, it doesn't matter how many threads do this in parallel. With several threads, this will even be faster. So, we can remove rtnl_lock() here, as it was only used for the iteration over net_namespace_list. Signed-off-by: Kirill Tkhai --- net/wireless/wext-core.c |4 1 file changed, 4 deletions(-) diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index 544d7b62d7ca..5e677dac2a0c 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -347,8 +347,6 @@ void wireless_nlevent_flush(void) struct sk_buff *skb; struct net *net; - ASSERT_RTNL(); - down_read(&net_rwsem); for_each_net(net) { while ((skb = skb_dequeue(&net->wext_nlevents))) @@ -412,9 +410,7 @@ subsys_initcall(wireless_nlevent_init); /* Process events generated by the wireless layer or the driver. */ static void wireless_nlevent_process(struct work_struct *work) { - rtnl_lock(); wireless_nlevent_flush(); - rtnl_unlock(); } static DECLARE_WORK(wireless_nlevent_work, wireless_nlevent_process);
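The argument that concurrent flushers are harmless can be checked with a toy locked queue: however many threads drain it, each item is dequeued exactly once — only the interleaving differs. A sketch (not the kernel's skb queue API; names are ours):

```c
#include <assert.h>
#include <pthread.h>

#define QLEN 1000

/* A queue with a per-queue lock, like sk_buff_head: enqueue and
 * dequeue are individually atomic. */
static pthread_mutex_t qlock = PTHREAD_MUTEX_INITIALIZER;
static int queue[QLEN];
static int head, tail;

static void enqueue(int v)
{
    pthread_mutex_lock(&qlock);
    queue[tail++] = v;
    pthread_mutex_unlock(&qlock);
}

static int dequeue(int *v)   /* returns 0 once the queue is empty */
{
    int ok = 0;

    pthread_mutex_lock(&qlock);
    if (head < tail) {
        *v = queue[head++];
        ok = 1;
    }
    pthread_mutex_unlock(&qlock);
    return ok;
}

static int drained[2];       /* items flushed by each thread */

static void *flush_thread(void *arg)
{
    int idx = *(int *)arg, v;

    while (dequeue(&v))
        drained[idx]++;
    return NULL;
}

/* Fill the queue, then flush it from two threads in parallel and
 * report the total number of items drained. */
static int run_flush_demo(void)
{
    pthread_t t0, t1;
    int i, id0 = 0, id1 = 1;

    for (i = 0; i < QLEN; i++)
        enqueue(i);
    pthread_create(&t0, NULL, flush_thread, &id0);
    pthread_create(&t1, NULL, flush_thread, &id1);
    pthread_join(t0, NULL);
    pthread_join(t1, NULL);
    return drained[0] + drained[1];
}
```

No outer lock is needed for correctness, which is exactly why the serializing rtnl_lock() around wireless_nlevent_flush() could go.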
[PATCH 2/2] tun: Remove unused first parameter of tun_get_iff()
Signed-off-by: Kirill Tkhai --- drivers/net/tun.c |7 +++ 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index b7137edff624..b834b0d168f9 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2873,8 +2873,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) return err; } -static void tun_get_iff(struct net *net, struct tun_struct *tun, - struct ifreq *ifr) +static void tun_get_iff(struct tun_struct *tun, struct ifreq *ifr) { tun_debug(KERN_INFO, tun, "tun_get_iff\n"); @@ -3107,7 +3106,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, ret = 0; switch (cmd) { case TUNGETIFF: - tun_get_iff(current->nsproxy->net_ns, tun, &ifr); + tun_get_iff(tun, &ifr); if (tfile->detached) ifr.ifr_flags |= IFF_DETACH_QUEUE; @@ -3465,7 +3464,7 @@ static void tun_chr_show_fdinfo(struct seq_file *m, struct file *file) rtnl_lock(); tun = tun_get(tfile); if (tun) - tun_get_iff(current->nsproxy->net_ns, tun, &ifr); + tun_get_iff(tun, &ifr); rtnl_unlock(); if (tun)
[PATCH 1/2] tun: Add ioctl() TUNGETDEVNETNS cmd to allow obtaining real net ns of tun device
In commit f2780d6d7475 "tun: Add ioctl() SIOCGSKNS cmd to allow obtaining net ns of tun device" it was missed that a tun device may change its net ns, while the net ns of its socket remains the same as at creation time. SIOCGSKNS returns the net ns of the socket, so it is not suitable for obtaining the net ns of the device. We may have two tun devices with the same name in two net namespaces, and in this case it's not possible to determine which of them an fd refers to (TUNGETIFF will return the same name). This patch adds a new ioctl() cmd for obtaining the net ns of a device. Reported-by: Harald Albrecht Signed-off-by: Kirill Tkhai --- drivers/net/tun.c |8 include/uapi/linux/if_tun.h |1 + 2 files changed, 9 insertions(+) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index e9ca1c088d0b..b7137edff624 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -3103,6 +3103,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, tun_debug(KERN_INFO, tun, "tun_chr_ioctl cmd %u\n", cmd); + net = dev_net(tun->dev); ret = 0; switch (cmd) { case TUNGETIFF: @@ -3328,6 +3329,13 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, ret = tun_net_change_carrier(tun->dev, (bool)carrier); break; + case TUNGETDEVNETNS: + ret = -EPERM; + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + goto unlock; + ret = open_related_ns(&net->ns, get_net_ns); + break; + default: ret = -EINVAL; break; diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index 23a6753b37df..454ae31b93c7 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -60,6 +60,7 @@ #define TUNSETSTEERINGEBPF _IOR('T', 224, int) #define TUNSETFILTEREBPF _IOR('T', 225, int) #define TUNSETCARRIER _IOW('T', 226, int) +#define TUNGETDEVNETNS _IO('T', 227) /* TUNSETIFF ifr flags */ #define IFF_TUN0x0001
[PATCH] net: Fix device name resolving crash in default_device_exit()
From: Kirill Tkhai

The following script makes the kernel crash, since it can't obtain a
name for a device when the name is occupied by another device:

#!/bin/bash
ifconfig eth0 down
ifconfig eth1 down
index=`cat /sys/class/net/eth1/ifindex`
ip link set eth1 name dev$index
unshare -n sleep 1h &
pid=$!
while [[ "`readlink /proc/self/ns/net`" == "`readlink /proc/$pid/ns/net`" ]]; do continue; done
ip link set dev$index netns $pid
ip link set eth0 name dev$index
kill -9 $pid

Kernel messages:

virtio_net virtio1 dev3: renamed from eth1
virtio_net virtio0 dev3: renamed from eth0
default_device_exit: failed to move dev3 to init_net: -17
[ cut here ]
kernel BUG at net/core/dev.c:8978!
invalid opcode: [#1] PREEMPT SMP
CPU: 1 PID: 276 Comm: kworker/u8:3 Not tainted 4.17.0+ #292
Workqueue: netns cleanup_net
RIP: 0010:default_device_exit+0x9c/0xb0
[stack trace snipped]

This patch gives more variability in choosing a new name for the
device and fixes the problem.

Signed-off-by: Kirill Tkhai
---

Since there are no suggestions how to fix this in another way, I'm
resending the patch.

 net/core/dev.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 6e18242a1cae..6c9b9303ded6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -8959,7 +8959,6 @@ static void __net_exit default_device_exit(struct net *net)
 	rtnl_lock();
 	for_each_netdev_safe(net, dev, aux) {
 		int err;
-		char fb_name[IFNAMSIZ];
 
 		/* Ignore unmoveable devices (i.e. loopback) */
 		if (dev->features & NETIF_F_NETNS_LOCAL)
@@ -8970,8 +8969,7 @@ static void __net_exit default_device_exit(struct net *net)
 			continue;
 
 		/* Push remaining network devices to init_net */
-		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
-		err = dev_change_net_namespace(dev, &init_net, fb_name);
+		err = dev_change_net_namespace(dev, &init_net, "dev%d");
 		if (err) {
 			pr_emerg("%s: failed to move %s to init_net: %d\n",
				 __func__, dev->name, err);
Re: [PATCH] net: Fix device name resolving crash in default_device_exit()
On 20.06.2018 20:15, David Ahern wrote:
> On 6/20/18 2:57 AM, Kirill Tkhai wrote:
>> From: Kirill Tkhai
>>
>> The following script makes the kernel crash, since it can't obtain
>> a name for a device when the name is occupied by another device:
>>
>> [script and kernel messages snipped]
>>
>> This patch gives more variability in choosing a new name
>> for the device and fixes the problem.
>>
>> Signed-off-by: Kirill Tkhai
>> ---
>>
>> Since there are no suggestions how to fix this in another way, I'm
>> resending the patch.
>
> This patch does not remove the BUG, so does not really solve the
> problem. ie., it is fairly trivial to write a script (32k dev%d named
> devices in init_net) that triggers it again, so your commit subject and
> commit log are not correct with the references to 'fixing the problem'.

1) I don't agree with you, and I don't think removing the BUG() is a
good idea. This function is called from a place where it must not fail.
But it can fail, and the problem with the name is not the only reason
this happens. We can't continue further pernet_operations in case of a
problem in default_device_exit(), and we can't remove the BUG() before
this function becomes of void type.
But we are not going to make it of void type, so we can't remove the
BUG().

2) In case the script is trivial, can't you just post it here to show
what type of devices you mean? Is there a real problem, or is this
theoretical thinking? All virtual devices I see have rtnl_link_ops, so
they are just destroyed in default_device_exit_batch(). As for physical
devices, it's difficult to imagine a node with 32k physical devices,
and anyone who tried to deploy them would meet problems not only in
this place.

> The change does provide more variability in naming and reduces the
> likelihood of not being able to push a device back to init_net.

It does more than that. With the patch one may move a real device to a
container and allow anything to be done with the device, including
changing the device index. Then, the destruction of the container does
not result in a kernel panic just because two devices have the same
index.

Kirill
Re: [PATCH] net: Fix device name resolving crash in default_device_exit()
On 21.06.2018 18:28, David Ahern wrote:
> On 6/21/18 4:03 AM, Kirill Tkhai wrote:
>>> This patch does not remove the BUG, so does not really solve the
>>> problem. ie., it is fairly trivial to write a script (32k dev%d named
>>> devices in init_net) that triggers it again, so your commit subject and
>>> commit log are not correct with the references to 'fixing the problem'.
>>
>> 1) I don't agree with you, and I don't think removing the BUG() is a
>> good idea. This function is called from a place where it must not fail.
>> But it can fail, and the problem with the name is not the only reason
>> this happens. We can't continue further pernet_operations in case of a
>> problem in default_device_exit(), and we can't remove the BUG() before
>> this function becomes of void type. But we are not going to make it of
>> void type. So we can't remove the BUG().
>
> You missed my point: that the function can still fail means you are not
> "fixing" the problem, only delaying it.

While the function is of int type and can fail, we can't remove the
BUG(). And this is not connected with name resolution.

>>
>> 2) In case the script is trivial, can't you just post it here to show
>> what type of devices you mean? Is there a real problem, or is this
>> theoretical thinking?
>
> Current code:
>
> # ip li sh dev eth2
> 4: eth2: mtu 1500 qdisc mq state UP mode DEFAULT group default qlen 1000
>     link/ether 02:e0:f9:46:64:80 brd ff:ff:ff:ff:ff:ff
> # ip netns add fubar
> # ip li set eth2 netns fubar
> # ip li add eth2 type dummy
> # ip li add dev4 type dummy
> # ip netns del fubar
> --> BUG
> kernel:[78079.127748] default_device_exit: failed to move eth2 to
> init_net: -17
>
>
> With your patch:
>
> # ip li sh dev eth2
> 4: eth2: mtu 1500 qdisc mq state UP mode DEFAULT group default qlen 1000
>     link/ether 02:e0:f9:46:64:80 brd ff:ff:ff:ff:ff:ff
> # ip netns add fubar
> # ip li set eth2 netns fubar
> # ip li add eth2 type dummy
> # for n in $(seq 0 $((32*1024))); do
>     echo "li add dev${n} type dummy"
>   done > ip.batch
> # ip -batch ip.batch
> # ip netns del fubar
> --> BUG
> kernel:[   25.800024] default_device_exit: failed to move eth2 to
> init_net: -17

Yeah, this makes sense.

>>
>> All virtual devices I see have rtnl_link_ops, so they are just
>> destroyed in default_device_exit_batch(). As for physical devices,
>> it's difficult to imagine a node with 32k physical devices, and anyone
>> who tried to deploy them would meet problems not only in this place.
>
> Nothing says it has to be a physical device. It is only checking for a name.
>
>>
>>> The change does provide more variability in naming and reduces the
>>> likelihood of not being able to push a device back to init_net.
>>
>> It does more than that. With the patch one may move a real device to a
>> container and allow anything to be done with the device, including
>> changing the device index. Then, the destruction of the container does
>> not result in a kernel panic just because two devices have the same
>> index.
[PATCH net-next 2/4] net: Convert sctp_ctrlsock_ops
These pernet_operations create and destroy net::sctp::ctl_sock. Since
pernet_operations do not send sctp packets to each other, they look
safe to be marked as async.

Signed-off-by: Kirill Tkhai
---
 net/sctp/protocol.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 32be52304f98..606361ee9e4a 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1354,6 +1354,7 @@ static void __net_init sctp_ctrlsock_exit(struct net *net)
 static struct pernet_operations sctp_ctrlsock_ops = {
 	.init = sctp_ctrlsock_init,
 	.exit = sctp_ctrlsock_exit,
+	.async = true,
 };
 
 /* Initialize the universe into something sensible. */
[PATCH net-next 0/4] Converting pernet_operations (part #6)
Hi,

this series continues to review and to convert pernet_operations to
make it possible to execute them in parallel for several net namespaces
at the same time. There are sctp, tipc and rds in this series.

Thanks,
Kirill
---

Kirill Tkhai (4):
      net: Convert sctp_defaults_ops
      net: Convert sctp_ctrlsock_ops
      net: Convert tipc_net_ops
      net: Convert rds_tcp_net_ops

 net/rds/tcp.c       | 1 +
 net/sctp/protocol.c | 2 ++
 net/tipc/core.c     | 1 +
 3 files changed, 4 insertions(+)

--
Signed-off-by: Kirill Tkhai
[PATCH net-next 1/4] net: Convert sctp_defaults_ops
These pernet_operations deal with sysctl, /proc entries and statistics.
Also, the net::sctp::addr_waitq queue and net::sctp::local_addr_list
are freed in the exit method. All of them look pernet-divided, and it
seems these items are only interesting for sctp_defaults_ops, which are
safe to be executed in parallel.

Signed-off-by: Kirill Tkhai
---
 net/sctp/protocol.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 91813e686c67..32be52304f98 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1330,6 +1330,7 @@ static void __net_exit sctp_defaults_exit(struct net *net)
 static struct pernet_operations sctp_defaults_ops = {
 	.init = sctp_defaults_init,
 	.exit = sctp_defaults_exit,
+	.async = true,
 };
 
 static int __net_init sctp_ctrlsock_init(struct net *net)
[PATCH net-next 3/4] net: Convert tipc_net_ops
TIPC looks concentrated in itself, and other pernet_operations seem not
to touch its entities. tipc_net_ops look pernet-divided, and they should
be safe to be executed in parallel for several nets at the same time.

Signed-off-by: Kirill Tkhai
---
 net/tipc/core.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/tipc/core.c b/net/tipc/core.c
index 0b982d048fb9..04fd91bb11d7 100644
--- a/net/tipc/core.c
+++ b/net/tipc/core.c
@@ -105,6 +105,7 @@ static struct pernet_operations tipc_net_ops = {
 	.exit = tipc_exit_net,
 	.id = &tipc_net_id,
 	.size = sizeof(struct tipc_net),
+	.async = true,
 };
 
 static int __init tipc_init(void)
[PATCH net-next 4/4] net: Convert rds_tcp_net_ops
These pernet_operations create and destroy a sysctl table and a listen
socket. Also, the exit method flushes a global workqueue and work.
Everything looks per-net safe, so we can mark them async.

Signed-off-by: Kirill Tkhai
---
 net/rds/tcp.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 08230a145042..eb04e7fa2467 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -515,6 +515,7 @@ static struct pernet_operations rds_tcp_net_ops = {
 	.exit = rds_tcp_exit_net,
 	.id = &rds_tcp_netid,
 	.size = sizeof(struct rds_tcp_net),
+	.async = true,
 };
 
 static void rds_tcp_kill_sock(struct net *net)
[PATCH net-next nfs 0/6] Converting pernet_operations (part #7)
Hi,

this series continues to review and to convert pernet_operations to
make it possible to execute them in parallel for several net namespaces
at the same time. There are nfs pernet_operations in this series. All
of them look similar to each other; they mostly create and destroy
caches, with small exceptions. Also, there is rxrpc_net_ops, which is
used in AFS.

Thanks,
Kirill
---

Kirill Tkhai (6):
      net: Convert rpcsec_gss_net_ops
      net: Convert sunrpc_net_ops
      net: Convert nfsd_net_ops
      net: Convert nfs4_dns_resolver_ops
      net: Convert nfs4blocklayout_net_ops
      net: Convert rxrpc_net_ops

 fs/nfs/blocklayout/rpc_pipefs.c | 1 +
 fs/nfs/dns_resolve.c            | 1 +
 fs/nfsd/nfsctl.c                | 1 +
 net/rxrpc/net_ns.c              | 1 +
 net/sunrpc/auth_gss/auth_gss.c  | 1 +
 net/sunrpc/sunrpc_syms.c        | 1 +
 6 files changed, 6 insertions(+)

--
Signed-off-by: Kirill Tkhai
[PATCH net-next nfs 1/6] net: Convert rpcsec_gss_net_ops
These pernet_operations initialize and destroy the sunrpc_net_id-referred
per-net items. The only global list used is cache_list, and accesses to
it are already serialized. sunrpc_destroy_cache_detail() checks for
list_empty() without cache_list_lock, but when it's called from
unregister_pernet_subsys(), there can't be callers in parallel, so we
won't miss list_empty() in this case.

Signed-off-by: Kirill Tkhai
---
 net/sunrpc/auth_gss/auth_gss.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 9463af4b32e8..44f939cb6bc8 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -2063,6 +2063,7 @@ static __net_exit void rpcsec_gss_exit_net(struct net *net)
 static struct pernet_operations rpcsec_gss_net_ops = {
 	.init = rpcsec_gss_init_net,
 	.exit = rpcsec_gss_exit_net,
+	.async = true,
 };
 
 /*
[PATCH net-next nfs 3/6] net: Convert nfsd_net_ops
These pernet_operations look similar to rpcsec_gss_net_ops; they just
create and destroy other caches. So, they also can be async.

Signed-off-by: Kirill Tkhai
---
 fs/nfsd/nfsctl.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index d107b4426f7e..1e3824e6cce0 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1263,6 +1263,7 @@ static struct pernet_operations nfsd_net_ops = {
 	.exit = nfsd_exit_net,
 	.id = &nfsd_net_id,
 	.size = sizeof(struct nfsd_net),
+	.async = true,
 };
 
 static int __init init_nfsd(void)
[PATCH net-next nfs 4/6] net: Convert nfs4_dns_resolver_ops
These pernet_operations look similar to rpcsec_gss_net_ops; they just
create and destroy another cache. They also create and destroy a
directory. So, they also look safe to be async.

Signed-off-by: Kirill Tkhai
---
 fs/nfs/dns_resolve.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 060c658eab66..e90bd69ab653 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -410,6 +410,7 @@ static void nfs4_dns_net_exit(struct net *net)
 static struct pernet_operations nfs4_dns_resolver_ops = {
 	.init = nfs4_dns_net_init,
 	.exit = nfs4_dns_net_exit,
+	.async = true,
 };
 
 static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
[PATCH net-next nfs 2/6] net: Convert sunrpc_net_ops
These pernet_operations look similar to rpcsec_gss_net_ops; they just
create and destroy other caches. So, they also can be async.

Signed-off-by: Kirill Tkhai
---
 net/sunrpc/sunrpc_syms.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 56f9eff74150..68287e921847 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -79,6 +79,7 @@ static struct pernet_operations sunrpc_net_ops = {
 	.exit = sunrpc_exit_net,
 	.id = &sunrpc_net_id,
 	.size = sizeof(struct sunrpc_net),
+	.async = true,
 };
 
 static int __init
[PATCH net-next nfs 5/6] net: Convert nfs4blocklayout_net_ops
These pernet_operations create and destroy a per-net pipe and dentry,
and they seem safe to be marked as async.

Signed-off-by: Kirill Tkhai
---
 fs/nfs/blocklayout/rpc_pipefs.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
index 9fb067a6f7e0..ef9fa111b009 100644
--- a/fs/nfs/blocklayout/rpc_pipefs.c
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -261,6 +261,7 @@ static void nfs4blocklayout_net_exit(struct net *net)
 static struct pernet_operations nfs4blocklayout_net_ops = {
 	.init = nfs4blocklayout_net_init,
 	.exit = nfs4blocklayout_net_exit,
+	.async = true,
 };
 
 int __init bl_init_pipefs(void)
[PATCH net-next nfs 6/6] net: Convert rxrpc_net_ops
These pernet_operations modify rxrpc_net_id-pointed per-net entities.
There is an external link to AF_RXRPC in fs/afs/Kconfig, but it seems
no other pernet_operations are interested in those per-net entities.

Signed-off-by: Kirill Tkhai
---
 net/rxrpc/net_ns.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c
index f18c9248e0d4..5fd939dabf41 100644
--- a/net/rxrpc/net_ns.c
+++ b/net/rxrpc/net_ns.c
@@ -106,4 +106,5 @@ struct pernet_operations rxrpc_net_ops = {
 	.exit = rxrpc_exit_net,
 	.id = &rxrpc_net_id,
 	.size = sizeof(struct rxrpc_net),
+	.async = true,
 };
[PATCH net-next] net: Add comment about pernet_operations methods and synchronization
Make the locking scheme visible to users, and provide a comment
explaining why we need exit_batch() methods, and when they should be
used.

Signed-off-by: Kirill Tkhai
---
 include/net/net_namespace.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index d4417495773a..71abc8d79178 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -312,6 +312,20 @@ struct net *get_net_ns_by_id(struct net *net, int id);
 
 struct pernet_operations {
 	struct list_head list;
+	/*
+	 * Below methods are called without any exclusive locks.
+	 * More than one net may be constructed and destructed
+	 * in parallel on several cpus. Every pernet_operations
+	 * have to keep in mind all other pernet_operations and
+	 * to introduce a locking, if they share common resources.
+	 *
+	 * Exit methods using blocking RCU primitives, such as
+	 * synchronize_rcu(), should be implemented via exit_batch.
+	 * Then, destruction of a group of net requires single
+	 * synchronize_rcu() related to these pernet_operations,
+	 * instead of separate synchronize_rcu() for every net.
+	 * Please, avoid synchronize_rcu() at all, where it's possible.
+	 */
 	int (*init)(struct net *net);
 	void (*exit)(struct net *net);
 	void (*exit_batch)(struct list_head *net_exit_list);
[PATCH net-next 1/2] net: Add rtnl_lock_killable()
rtnl_lock() is a widely used mutex in the kernel. Some kernel code does
memory allocations under it. In case of memory deficit this may invoke
the OOM killer, but the problem is that a killed task can't exit if
it's waiting for the mutex. This can lead to deadlock and panic.

This patch adds a new primitive, which responds to SIGKILL, and allows
using it in the places where we don't want to sleep forever.

Signed-off-by: Kirill Tkhai
---
 include/linux/rtnetlink.h | 1 +
 net/core/rtnetlink.c      | 6 ++++++
 2 files changed, 7 insertions(+)

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 3573b4bf2fdf..562a175c35a9 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -33,6 +33,7 @@ extern void rtnl_lock(void);
 extern void rtnl_unlock(void);
 extern int rtnl_trylock(void);
 extern int rtnl_is_locked(void);
+extern int rtnl_lock_killable(void);
 
 extern wait_queue_head_t netdev_unregistering_wq;
 extern struct rw_semaphore net_sem;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 67f375cfb982..87079eaa871b 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -75,6 +75,12 @@ void rtnl_lock(void)
 }
 EXPORT_SYMBOL(rtnl_lock);
 
+int rtnl_lock_killable(void)
+{
+	return mutex_lock_killable(&rtnl_mutex);
+}
+EXPORT_SYMBOL(rtnl_lock_killable);
+
 static struct sk_buff *defer_kfree_skb_list;
 
 void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail)
 {
[PATCH net-next 0/2] Introduce rtnl_lock_killable()
rtnl_lock() is a widely used mutex in the kernel. Some kernel code does
memory allocations under it. In case of memory deficit this may invoke
the OOM killer, but the problem is that a killed task can't exit if
it's waiting for the mutex. This can lead to deadlock and panic.

This patchset adds a new primitive, which responds to SIGKILL, and
allows using it in the places where we don't want to sleep forever.
Also, the first place is made to use it.

---

Kirill Tkhai (2):
      net: Add rtnl_lock_killable()
      net: Use rtnl_lock_killable() in register_netdev()

 include/linux/rtnetlink.h | 1 +
 net/core/dev.c            | 3 ++-
 net/core/rtnetlink.c      | 6 ++++++
 3 files changed, 9 insertions(+), 1 deletion(-)

--
Signed-off-by: Kirill Tkhai
[PATCH net-next 2/2] net: Use rtnl_lock_killable() in register_netdev()
This patch adds rtnl_lock_killable() to one of the hot paths using
rtnl_lock().

Signed-off-by: Kirill Tkhai
---
 net/core/dev.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 12a9aad0b057..d8887cc38e7b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -8018,7 +8018,8 @@ int register_netdev(struct net_device *dev)
 {
 	int err;
 
-	rtnl_lock();
+	if (rtnl_lock_killable())
+		return -EINTR;
 	err = register_netdevice(dev);
 	rtnl_unlock();
 	return err;
[PATCH net-next 0/6] Converting pernet_operations (part #8)
Hi,

this series continues to review and to convert pernet_operations to
make it possible to execute them in parallel for several net namespaces
at the same time. There are different operations over the tree; most of
them are ipvs.

Thanks,
Kirill
---

Kirill Tkhai (6):
      net: Convert l2tp_net_ops
      net: Convert mpls_net_ops
      net: Convert ovs_net_ops
      net: Convert ipvs_core_ops
      net: Convert ipvs_core_dev_ops
      net: Convert ip_vs_ftp_ops

 net/l2tp/l2tp_core.c            | 1 +
 net/mpls/af_mpls.c              | 1 +
 net/netfilter/ipvs/ip_vs_core.c | 2 ++
 net/netfilter/ipvs/ip_vs_ftp.c  | 1 +
 net/openvswitch/datapath.c      | 1 +
 5 files changed, 6 insertions(+)

--
Signed-off-by: Kirill Tkhai
[PATCH net-next 3/6] net: Convert ovs_net_ops
These pernet_operations initialize and destroy the net_generic() data
pointed to by ovs_net_id. The exit method destroys vports of alive nets
that are directed to the exiting net. Since these are the only
pernet_operations interested in this data, and the exit method is
executed under an exclusive global lock (ovs_mutex), they are safe to
be executed in parallel.

Signed-off-by: Kirill Tkhai
---
 net/openvswitch/datapath.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index ef38e5aecd28..100191df0371 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -2384,6 +2384,7 @@ static struct pernet_operations ovs_net_ops = {
 	.exit = ovs_exit_net,
 	.id = &ovs_net_id,
 	.size = sizeof(struct ovs_net),
+	.async = true,
 };
 
 static int __init dp_init(void)
[PATCH net-next 1/6] net: Convert l2tp_net_ops
The init method is rather simple. The exit method queues del_work for
every tunnel from the per-net list. This seems to be safe to be marked
async.

Signed-off-by: Kirill Tkhai
---
 net/l2tp/l2tp_core.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 83421c6f0bef..189a12a5e4ac 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -1787,6 +1787,7 @@ static struct pernet_operations l2tp_net_ops = {
 	.exit = l2tp_exit_net,
 	.id = &l2tp_net_id,
 	.size = sizeof(struct l2tp_net),
+	.async = true,
 };
 
 static int __init l2tp_init(void)
[PATCH net-next 4/6] net: Convert ipvs_core_ops
These pernet_operations register and unregister nf hooks, /proc
entries, sysctl and percpu statistics. There are several global lists,
and the only list modified without exclusive locks is ip_vs_conn_tab in
ip_vs_conn_flush(). We iterate the list and force the timers to expire
at that moment. Since several timer expirations were already possible
before this patch, and since they are safe, the patch does not
introduce new parallelism into their destruction. These
pernet_operations look safe to be converted.

Signed-off-by: Kirill Tkhai
---
 net/netfilter/ipvs/ip_vs_core.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 5f6f73cf2174..c5d16e2bc8e2 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -2289,6 +2289,7 @@ static struct pernet_operations ipvs_core_ops = {
 	.exit = __ip_vs_cleanup,
 	.id = &ip_vs_net_id,
 	.size = sizeof(struct netns_ipvs),
+	.async = true,
 };
 
 static struct pernet_operations ipvs_core_dev_ops = {
[PATCH net-next 2/6] net: Convert mpls_net_ops
These pernet_operations register and unregister a sysctl table. The
exit method frees platform_labels from net::mpls::platform_label.
Everything is per-net, and they look safe to be marked async.

Signed-off-by: Kirill Tkhai
---
 net/mpls/af_mpls.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 7a4de6d618b1..d4a89a8be013 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -2488,6 +2488,7 @@ static void mpls_net_exit(struct net *net)
 static struct pernet_operations mpls_net_ops = {
 	.init = mpls_net_init,
 	.exit = mpls_net_exit,
+	.async = true,
 };
 
 static struct rtnl_af_ops mpls_af_ops __read_mostly = {
[PATCH net-next 5/6] net: Convert ipvs_core_dev_ops
The exit method stops two per-net threads and cancels delayed work.
Everything looks nicely per-net divided.

Signed-off-by: Kirill Tkhai
---
 net/netfilter/ipvs/ip_vs_core.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index c5d16e2bc8e2..6a6cb9db030b 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -2294,6 +2294,7 @@ static struct pernet_operations ipvs_core_ops = {
 
 static struct pernet_operations ipvs_core_dev_ops = {
 	.exit = __ip_vs_dev_cleanup,
+	.async = true,
 };
 
 /*
[PATCH net-next 6/6] net: Convert ip_vs_ftp_ops
These pernet_operations register and unregister an ipvs app.
register_ip_vs_app(), unregister_ip_vs_app() and
register_ip_vs_app_inc() modify per-net structures, and no global
structures are touched. So, this looks safe to be marked as async.

Signed-off-by: Kirill Tkhai
---
 net/netfilter/ipvs/ip_vs_ftp.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 58d5d05aec24..8b25aab41928 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -479,6 +479,7 @@ static void __ip_vs_ftp_exit(struct net *net)
 static struct pernet_operations ip_vs_ftp_ops = {
 	.init = __ip_vs_ftp_init,
 	.exit = __ip_vs_ftp_exit,
+	.async = true,
 };
 
 static int __init ip_vs_ftp_init(void)
Re: netns: send uevent messages
CC Andrey Vagin

On 15.03.2018 03:12, Christian Brauner wrote:
> This patch adds a receive method to NETLINK_KOBJECT_UEVENT netlink sockets
> to allow sending uevent messages into the network namespace the socket
> belongs to.
>
> Currently non-initial network namespaces are already isolated and don't
> receive uevents. There are a number of cases where it is beneficial for a
> sufficiently privileged userspace process to send a uevent into a network
> namespace.
>
> One such use case would be debugging and fuzzing of a piece of software
> which listens and reacts to uevents. By running a copy of that software
> inside a network namespace, specific uevents could then be presented to it.
> More concretely, this would allow for easy testing of udevd/ueventd.
>
> This will also allow some piece of software to run components inside a
> separate network namespace and then effectively filter what that software
> can receive. Some examples of software that do directly listen to uevents
> and that we have in the past attempted to run inside a network namespace
> are rbd (CEPH client) or the X server.
>
> Implementation:
> The implementation has been kept as simple as possible from the kernel's
> perspective. Specifically, a simple input method uevent_net_rcv() is added
> to NETLINK_KOBJECT_UEVENT sockets which completely reuses existing
> af_netlink infrastructure and does neither add an additional netlink family
> nor requires any user-visible changes.
>
> For example, by using netlink_rcv_skb() we can make use of existing netlink
> infrastructure to report back informative error messages to userspace.
>
> Furthermore, this implementation does not introduce any overhead for
> existing uevent generating codepaths. The struct netns gets a new uevent
> socket member that records the uevent socket associated with that network
> namespace. Since we record the uevent socket for each network namespace in
> struct net we don't have to walk the whole uevent socket list.
> Instead we can directly retrieve the relevant uevent socket and send the
> message. This keeps the codepath very performant without introducing
> needless overhead.
>
> Uevent sequence numbers are kept global. When a uevent message is sent to
> another network namespace the implementation will simply increment the
> global uevent sequence number and append it to the received uevent. This
> has the advantage that the kernel will never need to parse the received
> uevent message to replace any existing uevent sequence numbers. Instead it
> is up to the userspace process to remove any existing uevent sequence
> numbers in case the uevent message to be sent contains any.
>
> Security:
> In order for a caller to send uevent messages to a target network namespace
> the caller must have CAP_SYS_ADMIN in the owning user namespace of the
> target network namespace. Additionally, any received uevent message is
> verified to not exceed size UEVENT_BUFFER_SIZE. This includes the space
> needed to append the uevent sequence number.
>
> Testing:
> This patch has been tested and verified to work with the following udev
> implementations:
> 1. CentOS 6 with udevd version 147
> 2. Debian Sid with systemd-udevd version 237
> 3. Android 7.1.1 with ueventd
>
> Signed-off-by: Christian Brauner
> ---
>  include/net/net_namespace.h |  1 +
>  lib/kobject_uevent.c        | 88 ++++++++++++++++++++++++++++++++++-
>  2 files changed, 88 insertions(+), 1 deletion(-)
>
> diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
> index f306b2aa15a4..467bde763a9b 100644
> --- a/include/net/net_namespace.h
> +++ b/include/net/net_namespace.h
> @@ -78,6 +78,7 @@ struct net {
> 
>  	struct sock		*rtnl;	/* rtnetlink socket */
>  	struct sock		*genl_sock;
> +	struct sock		*uevent_sock;	/* uevent socket */

Since you add this per-net uevent_sock pointer, the currently existing
iterations in uevent_net_exit() start to look confusing.
There are:

	mutex_lock(&uevent_sock_mutex);
	list_for_each_entry(ue_sk, &uevent_sock_list, list) {
		if (sock_net(ue_sk->sk) == net)
			goto found;
	}

Can't we make a small cleanup in lib/kobject_uevent.c after this
change, before the main part of the patch goes in?

>
>  	struct list_head	dev_base_head;
>  	struct hlist_head	*dev_name_head;
> diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
> index 9fe6ec8fda28..10b2144b9fc3 100644
> --- a/lib/kobject_uevent.c
> +++ b/lib/kobject_uevent.c
> @@ -25,6 +25,7 @@
>  #include
>  #include
>  #include
> +#include
>  #include
> 
> 
> @@ -602,12 +603,94 @@ int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...)
>  EXPORT_SYMBOL_GPL(add_uevent_var);
> 
>  #if defined(CONFIG_NET)
> +static int uevent_net_send(struct sock *usk, struct sk_buff *skb,
> +			   struct netlink_ext_ack *ext
[PATCH net-next 2/5] net: Revert "ipv4: fix a deadlock in ip_ra_control"
[PATCH net-next 2/5] net: Revert "ipv4: fix a deadlock in ip_ra_control"
This reverts commit 1215e51edad1. Since raw_close() is used on every RAW socket destruction, the changes made by 1215e51edad1 scale badly. This is clearly seen in an endless unshare(CLONE_NEWNET) test, where the cleanup_net() kwork spends a lot of time waiting for the rtnl_lock() introduced by that commit. The next patches in the series rework this in another way, so revert 1215e51edad1 now. Also, it is not apparent that mrtsock_destruct() actually takes the sk lock, and the commit message does not show the actual stack dump, so it is questionable whether the change was needed at all.

Signed-off-by: Kirill Tkhai
---
 net/ipv4/ip_sockglue.c |  1 -
 net/ipv4/ipmr.c        | 11 +--
 net/ipv4/raw.c         |  2 --
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index be7c3b71914d..b7bac7351409 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -594,7 +594,6 @@ static bool setsockopt_needs_rtnl(int optname)
 	case MCAST_LEAVE_GROUP:
 	case MCAST_LEAVE_SOURCE_GROUP:
 	case MCAST_UNBLOCK_SOURCE:
-	case IP_ROUTER_ALERT:
 		return true;
 	}
 	return false;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index d752a70855d8..f6be5db16da2 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1399,7 +1399,7 @@ static void mrtsock_destruct(struct sock *sk)
 	struct net *net = sock_net(sk);
 	struct mr_table *mrt;
 
-	ASSERT_RTNL();
+	rtnl_lock();
 	ipmr_for_each_table(mrt, net) {
 		if (sk == rtnl_dereference(mrt->mroute_sk)) {
 			IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
@@ -1411,6 +1411,7 @@ static void mrtsock_destruct(struct sock *sk)
 			mroute_clean_tables(mrt, false);
 		}
 	}
+	rtnl_unlock();
 }
 
 /* Socket options and virtual interface manipulation. The whole
@@ -1475,8 +1476,13 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
 		if (sk != rcu_access_pointer(mrt->mroute_sk)) {
 			ret = -EACCES;
 		} else {
+			/* We need to unlock here because mrtsock_destruct takes
+			 * care of rtnl itself and we can't change that due to
+			 * the IP_ROUTER_ALERT setsockopt which runs without it.
+			 */
+			rtnl_unlock();
 			ret = ip_ra_control(sk, 0, NULL);
-			goto out_unlock;
+			goto out;
 		}
 		break;
 	case MRT_ADD_VIF:
@@ -1588,6 +1594,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
 	}
 out_unlock:
 	rtnl_unlock();
+out:
 	return ret;
 }
 
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 54648d20bf0f..720bef7da2f6 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -711,9 +711,7 @@ static void raw_close(struct sock *sk, long timeout)
 	/*
 	 * Raw sockets may have direct kernel references. Kill them.
 	 */
-	rtnl_lock();
 	ip_ra_control(sk, 0, NULL);
-	rtnl_unlock();
 
 	sk_common_release(sk);
 }
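The unlock-before-destructor dance in the ipmr.c hunk above can be modeled in plain userspace C. This is an illustrative sketch, not kernel code: a non-recursive pthread_mutex stands in for rtnl_mutex, and the function names only echo their kernel counterparts. It shows why ip_mroute_setsockopt() must drop the lock before the destructor runs once mrtsock_destruct() takes rtnl itself.

```c
#include <assert.h>
#include <pthread.h>

/* Userspace model (names illustrative): pthread_mutex stands in for
 * rtnl_mutex. After the revert, the destructor takes the lock itself,
 * so the caller has to drop it first or the process deadlocks. */
static pthread_mutex_t rtnl = PTHREAD_MUTEX_INITIALIZER;
static int destruct_ran;

static void mrtsock_destruct_model(void)
{
	pthread_mutex_lock(&rtnl);	/* was ASSERT_RTNL() before the revert */
	destruct_ran = 1;
	pthread_mutex_unlock(&rtnl);
}

static void mroute_done_model(void)
{
	pthread_mutex_lock(&rtnl);
	/* ... MRT_DONE work that genuinely needs the lock ... */
	pthread_mutex_unlock(&rtnl);	/* must unlock: destructor relocks */
	mrtsock_destruct_model();	/* would deadlock if still held */
}
```

Removing the `pthread_mutex_unlock()` before the destructor call reproduces the self-deadlock the in-diff comment is guarding against.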
[PATCH net-next 4/5] net: Make ip_ra_chain per struct net
This is an optimization which makes ip_call_ra_chain() iterate fewer sockets while searching for the ones it's looking for.

Signed-off-by: Kirill Tkhai
---
 include/net/ip.h         | 13 +++--
 include/net/netns/ipv4.h |  1 +
 net/ipv4/ip_input.c      |  5 ++---
 net/ipv4/ip_sockglue.c   | 15 ++-
 4 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index fe63ba95d12b..d53b5a9eae34 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -91,6 +91,17 @@ static inline int inet_sdif(struct sk_buff *skb)
 	return 0;
 }
 
+/* Special input handler for packets caught by router alert option.
+   They are selected only by protocol field, and then processed likely
+   local ones; but only if someone wants them! Otherwise, router
+   not running rsvpd will kill RSVP.
+
+   It is user level problem, what it will make with them.
+   I have no idea, how it will masquearde or NAT them (it is joke, joke :-)),
+   but receiver should be enough clever f.e. to forward mtrace requests,
+   sent to multicast group to reach destination designated router.
+ */
+
 struct ip_ra_chain {
 	struct ip_ra_chain __rcu *next;
 	struct sock *sk;
@@ -101,8 +112,6 @@ struct ip_ra_chain {
 	struct rcu_head rcu;
 };
 
-extern struct ip_ra_chain __rcu *ip_ra_chain;
-
 /* IP flags. */
 #define IP_CE		0x8000	/* Flag: "Congestion" */
 #define IP_DF		0x4000	/* Flag: "Don't Fragment" */
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 3a970e429ab6..23c208fcf1a0 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -49,6 +49,7 @@ struct netns_ipv4 {
 #endif
 	struct ipv4_devconf	*devconf_all;
 	struct ipv4_devconf	*devconf_dflt;
+	struct ip_ra_chain	*ra_chain;
 #ifdef CONFIG_IP_MULTIPLE_TABLES
 	struct fib_rules_ops	*rules_ops;
 	bool			fib_has_custom_rules;
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 57fc13c6ab2b..7582713dd18f 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -159,7 +159,7 @@ bool ip_call_ra_chain(struct sk_buff *skb)
 	struct net_device *dev = skb->dev;
 	struct net *net = dev_net(dev);
 
-	for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) {
+	for (ra = rcu_dereference(net->ipv4.ra_chain); ra; ra = rcu_dereference(ra->next)) {
 		struct sock *sk = ra->sk;
 
 		/* If socket is bound to an interface, only report
@@ -167,8 +167,7 @@ bool ip_call_ra_chain(struct sk_buff *skb)
 		 */
 		if (sk && inet_sk(sk)->inet_num == protocol &&
 		    (!sk->sk_bound_dev_if ||
-		     sk->sk_bound_dev_if == dev->ifindex) &&
-		    net_eq(sock_net(sk), net)) {
+		     sk->sk_bound_dev_if == dev->ifindex)) {
 			if (ip_is_fragment(ip_hdr(skb))) {
 				if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN))
 					return true;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index bf5f44b27b7e..f36d35fe924b 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -322,18 +322,6 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
 	return 0;
 }
 
-
-/* Special input handler for packets caught by router alert option.
-   They are selected only by protocol field, and then processed likely
-   local ones; but only if someone wants them! Otherwise, router
-   not running rsvpd will kill RSVP.
-
-   It is user level problem, what it will make with them.
-   I have no idea, how it will masquearde or NAT them (it is joke, joke :-)),
-   but receiver should be enough clever f.e. to forward mtrace requests,
-   sent to multicast group to reach destination designated router.
- */
-struct ip_ra_chain __rcu *ip_ra_chain;
 static DEFINE_SPINLOCK(ip_ra_lock);
 
 
@@ -350,6 +338,7 @@ int ip_ra_control(struct sock *sk, unsigned char on,
 {
 	struct ip_ra_chain *ra, *new_ra;
 	struct ip_ra_chain __rcu **rap;
+	struct net *net = sock_net(sk);
 
 	if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW)
 		return -EINVAL;
@@ -357,7 +346,7 @@ int ip_ra_control(struct sock *sk, unsigned char on,
 	new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
 
 	spin_lock_bh(&ip_ra_lock);
-	for (rap = &ip_ra_chain;
+	for (rap = &net->ipv4.ra_chain;
 	     (ra = rcu_dereference_protected(*rap,
 			lockdep_is_held(&ip_ra_lock))) != NULL;
 	     rap = &ra->next) {
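The gain from the patch above is that the per-packet walk no longer needs to filter foreign-namespace sockets with net_eq(). A minimal userspace sketch of the new shape (not kernel code; struct names and fields are illustrative stand-ins for net->ipv4.ra_chain):

```c
#include <assert.h>
#include <stddef.h>

/* Userspace model: each namespace owns its own chain, so the delivery
 * walk only ever sees sockets that can actually belong to this net. */
struct ra_entry {
	struct ra_entry *next;
	int proto;			/* stands in for inet_num */
};

struct netns {
	struct ra_entry *ra_chain;	/* like net->ipv4.ra_chain */
};

/* Count matching entries; with a per-net chain there is no net_eq()
 * check left in the loop, unlike the old global-list version. */
static int call_ra_chain(struct netns *net, int proto)
{
	int delivered = 0;
	struct ra_entry *ra;

	for (ra = net->ra_chain; ra; ra = ra->next)
		if (ra->proto == proto)
			delivered++;
	return delivered;
}
```

With thousands of namespaces, the old global list made every namespace's packets walk every other namespace's router-alert sockets; the per-net list bounds the walk to the local chain.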
[PATCH net-next 1/5] net: Revert "ipv4: get rid of ip_ra_lock"
This reverts commit ba3f571d5dde. That commit followed 1215e51edad1 ("ipv4: fix a deadlock in ip_ra_control") and killed ip_ra_lock, which had become useless once rtnl_lock() was used to destroy every raw ipv4 socket. That scales very badly, and the next patch in the series reverts 1215e51edad1, so ip_ra_lock will be needed again.

Signed-off-by: Kirill Tkhai
---
 net/ipv4/ip_sockglue.c | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 74c962b9b09c..be7c3b71914d 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -334,6 +334,7 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
    sent to multicast group to reach destination designated router.
  */
 struct ip_ra_chain __rcu *ip_ra_chain;
+static DEFINE_SPINLOCK(ip_ra_lock);
 
 
 static void ip_ra_destroy_rcu(struct rcu_head *head)
@@ -355,17 +356,21 @@ int ip_ra_control(struct sock *sk, unsigned char on,
 
 	new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
 
+	spin_lock_bh(&ip_ra_lock);
 	for (rap = &ip_ra_chain;
-	     (ra = rtnl_dereference(*rap)) != NULL;
+	     (ra = rcu_dereference_protected(*rap,
+			lockdep_is_held(&ip_ra_lock))) != NULL;
 	     rap = &ra->next) {
 		if (ra->sk == sk) {
 			if (on) {
+				spin_unlock_bh(&ip_ra_lock);
 				kfree(new_ra);
 				return -EADDRINUSE;
 			}
 			/* dont let ip_call_ra_chain() use sk again */
 			ra->sk = NULL;
 			RCU_INIT_POINTER(*rap, ra->next);
+			spin_unlock_bh(&ip_ra_lock);
 
 			if (ra->destructor)
 				ra->destructor(sk);
@@ -379,14 +384,17 @@ int ip_ra_control(struct sock *sk, unsigned char on,
 			return 0;
 		}
 	}
-	if (!new_ra)
+	if (!new_ra) {
+		spin_unlock_bh(&ip_ra_lock);
 		return -ENOBUFS;
+	}
 	new_ra->sk = sk;
 	new_ra->destructor = destructor;
 
 	RCU_INIT_POINTER(new_ra->next, ra);
 	rcu_assign_pointer(*rap, new_ra);
 	sock_hold(sk);
+	spin_unlock_bh(&ip_ra_lock);
 
 	return 0;
 }
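The ip_ra_control() pattern restored above — walking a singly linked list through a pointer-to-pointer cursor under a lock, with distinct "already registered", "unlink", and "append" outcomes — can be sketched in userspace. This is an illustrative model, not kernel code: pthread_mutex stands in for spin_lock_bh(&ip_ra_lock), RCU deferral and the socket-type checks are omitted, and -1/-2 stand in for -EADDRINUSE/-ENOBUFS.

```c
#include <assert.h>
#include <pthread.h>
#include <stdlib.h>

/* Userspace model of the ip_ra_control() list walk (names illustrative). */
struct ra_entry {
	struct ra_entry *next;
	int sk;			/* stands in for the struct sock pointer */
};

static struct ra_entry *ra_chain;
static pthread_mutex_t ra_lock = PTHREAD_MUTEX_INITIALIZER;

/* on != 0: register sk; on == 0: unregister sk. */
static int ra_control(int sk, int on)
{
	struct ra_entry *ra, **rap;
	struct ra_entry *new_ra = on ? malloc(sizeof(*new_ra)) : NULL;

	pthread_mutex_lock(&ra_lock);
	for (rap = &ra_chain; (ra = *rap) != NULL; rap = &ra->next) {
		if (ra->sk == sk) {
			if (on) {		/* already registered */
				pthread_mutex_unlock(&ra_lock);
				free(new_ra);
				return -1;	/* -EADDRINUSE in the kernel */
			}
			*rap = ra->next;	/* unlink in place */
			pthread_mutex_unlock(&ra_lock);
			free(ra);		/* kernel defers this via RCU */
			return 0;
		}
	}
	if (!new_ra) {
		pthread_mutex_unlock(&ra_lock);
		return -2;			/* -ENOBUFS path */
	}
	new_ra->sk = sk;
	new_ra->next = ra;			/* ra == NULL here: append */
	*rap = new_ra;
	pthread_mutex_unlock(&ra_lock);
	return 0;
}
```

The `**rap` cursor is what lets a single loop handle both "remove from the middle" and "append at the tail" without special-casing the list head.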
[PATCH net-next 0/5] Rework ip_ra_chain protection
Commit 1215e51edad1 ("ipv4: fix a deadlock in ip_ra_control") made rtnl_lock() be used in raw_close(). This function is called on every RAW socket destruction, so rtnl_mutex is taken every time. This scales very badly. I observe cleanup_net() spending a lot of time in rtnl_lock(), and raw_close() is one of the biggest rtnl users (since we have percpu net->ipv4.icmp_sk).

Another problem is that the commit does not explain the actual call path on which mrtsock_destruct() takes the sk lock, or the way the deadlock occurs. No sk lock taking is visible in mrtsock_destruct(), so it is a question whether we need the change at all.

This patchset reworks the locking: it reverts the problem commit and its descendant, and introduces rtnl-independent locking. This may have a continuation, and someone may work on killing rtnl_lock() in mrtsock_destruct() in the future.

Thanks,
Kirill
---

Kirill Tkhai (5):
      net: Revert "ipv4: get rid of ip_ra_lock"
      net: Revert "ipv4: fix a deadlock in ip_ra_control"
      net: Move IP_ROUTER_ALERT out of lock_sock(sk)
      net: Make ip_ra_chain per struct net
      net: Replace ip_ra_lock with per-net mutex

 include/net/ip.h         | 13 +++--
 include/net/netns/ipv4.h |  2 ++
 net/core/net_namespace.c |  1 +
 net/ipv4/ip_input.c      |  5 ++---
 net/ipv4/ip_sockglue.c   | 34 +-
 net/ipv4/ipmr.c          | 11 +--
 net/ipv4/raw.c           |  2 --
 7 files changed, 38 insertions(+), 30 deletions(-)

--
Signed-off-by: Kirill Tkhai
[PATCH net-next 3/5] net: Move IP_ROUTER_ALERT out of lock_sock(sk)
ip_ra_control() does not need the sk lock. As for the other users of ip_ra_chain: ip_mroute_setsockopt() doesn't take the sk lock either, while parallel IP_ROUTER_ALERT setsockopt() calls are synchronized by ip_ra_lock. So, we may move this command out of lock_sock(sk).

Signed-off-by: Kirill Tkhai
---
 net/ipv4/ip_sockglue.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index b7bac7351409..bf5f44b27b7e 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -646,6 +646,8 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 
 	/* If optlen==0, it is equivalent to val == 0 */
 
+	if (optname == IP_ROUTER_ALERT)
+		return ip_ra_control(sk, val ? 1 : 0, NULL);
 	if (ip_mroute_opt(optname))
 		return ip_mroute_setsockopt(sk, optname, optval, optlen);
 
@@ -1156,9 +1158,6 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 			goto e_inval;
 		inet->mc_all = val;
 		break;
-	case IP_ROUTER_ALERT:
-		err = ip_ra_control(sk, val ? 1 : 0, NULL);
-		break;
-	case IP_FREEBIND:
 		if (optlen < 1)
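The structural change in the patch above is an early-dispatch: one option is handled before the socket lock is ever taken, because it has its own serialization. A minimal userspace sketch of that control flow (illustrative names and constants, not the real kernel values):

```c
#include <assert.h>

/* Userspace model of the do_ip_setsockopt() dispatch change: the
 * constants and helpers below are illustrative stand-ins only. */
enum { IP_ROUTER_ALERT = 1, IP_FREEBIND = 2 };

static int ra_calls;		/* how often the lock-free path ran */
static int locked_calls;	/* how often the lock_sock() path ran */

static int ip_ra_control_model(int on)
{
	(void)on;		/* serialized by its own lock in the kernel */
	ra_calls++;
	return 0;
}

static int do_setsockopt_model(int optname, int val)
{
	if (optname == IP_ROUTER_ALERT)		/* handled without sk lock */
		return ip_ra_control_model(val ? 1 : 0);

	/* lock_sock(sk); */
	locked_calls++;				/* all other options */
	/* release_sock(sk); */
	return 0;
}
```

The point of the early return is exactly what the commit message states: IP_ROUTER_ALERT never enters the lock_sock() section, so ip_ra_control() can later take its own (per-net) lock without nesting under the socket lock.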
[PATCH net-next 5/5] net: Replace ip_ra_lock with per-net mutex
Since ra_chain is per-net, we may use a per-net mutex to protect it in ip_ra_control(). This improves scalability.

Signed-off-by: Kirill Tkhai
---
 include/net/netns/ipv4.h |  1 +
 net/core/net_namespace.c |  1 +
 net/ipv4/ip_sockglue.c   | 15 ++-
 3 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 23c208fcf1a0..7295429732c6 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -50,6 +50,7 @@ struct netns_ipv4 {
 	struct ipv4_devconf	*devconf_all;
 	struct ipv4_devconf	*devconf_dflt;
 	struct ip_ra_chain	*ra_chain;
+	struct mutex		ra_mutex;
 #ifdef CONFIG_IP_MULTIPLE_TABLES
 	struct fib_rules_ops	*rules_ops;
 	bool			fib_has_custom_rules;
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index c340d5cfbdec..95ba2c53bd9a 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -301,6 +301,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
 	net->user_ns = user_ns;
 	idr_init(&net->netns_ids);
 	spin_lock_init(&net->nsid_lock);
+	mutex_init(&net->ipv4.ra_mutex);
 
 	list_for_each_entry(ops, &pernet_list, list) {
 		error = ops_init(ops, net);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index f36d35fe924b..5ad2d8ed3a3f 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -322,9 +322,6 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
 	return 0;
 }
 
-static DEFINE_SPINLOCK(ip_ra_lock);
-
-
 static void ip_ra_destroy_rcu(struct rcu_head *head)
 {
 	struct ip_ra_chain *ra = container_of(head, struct ip_ra_chain, rcu);
@@ -345,21 +342,21 @@ int ip_ra_control(struct sock *sk, unsigned char on,
 
 	new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
 
-	spin_lock_bh(&ip_ra_lock);
+	mutex_lock(&net->ipv4.ra_mutex);
 	for (rap = &net->ipv4.ra_chain;
 	     (ra = rcu_dereference_protected(*rap,
-			lockdep_is_held(&ip_ra_lock))) != NULL;
+			lockdep_is_held(&net->ipv4.ra_mutex))) != NULL;
 	     rap = &ra->next) {
 		if (ra->sk == sk) {
 			if (on) {
-				spin_unlock_bh(&ip_ra_lock);
+				mutex_unlock(&net->ipv4.ra_mutex);
 				kfree(new_ra);
 				return -EADDRINUSE;
 			}
 			/* dont let ip_call_ra_chain() use sk again */
 			ra->sk = NULL;
 			RCU_INIT_POINTER(*rap, ra->next);
-			spin_unlock_bh(&ip_ra_lock);
+			mutex_unlock(&net->ipv4.ra_mutex);
 
 			if (ra->destructor)
 				ra->destructor(sk);
@@ -374,7 +371,7 @@ int ip_ra_control(struct sock *sk, unsigned char on,
 		}
 	}
 	if (!new_ra) {
-		spin_unlock_bh(&ip_ra_lock);
+		mutex_unlock(&net->ipv4.ra_mutex);
 		return -ENOBUFS;
 	}
 	new_ra->sk = sk;
@@ -383,7 +380,7 @@ int ip_ra_control(struct sock *sk, unsigned char on,
 	RCU_INIT_POINTER(new_ra->next, ra);
 	rcu_assign_pointer(*rap, new_ra);
 	sock_hold(sk);
-	spin_unlock_bh(&ip_ra_lock);
+	mutex_unlock(&net->ipv4.ra_mutex);
 
 	return 0;
 }
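The end state of the series — a lock embedded in the per-namespace struct and initialized at namespace setup, so operations in different namespaces never contend — can be sketched in userspace. This is an illustrative model only: pthread_mutex stands in for the kernel mutex, and the struct and function names merely echo setup_net() and net->ipv4.ra_mutex.

```c
#include <assert.h>
#include <pthread.h>

/* Userspace model (names illustrative) of the per-net locking scheme. */
struct netns_model {
	pthread_mutex_t ra_mutex;	/* stands in for net->ipv4.ra_mutex */
	int chain_len;			/* stands in for the ra_chain list */
};

static void setup_net_model(struct netns_model *net)
{
	/* Initialized once per namespace, as in setup_net(). */
	pthread_mutex_init(&net->ra_mutex, NULL);
	net->chain_len = 0;
}

static void ra_add_model(struct netns_model *net)
{
	pthread_mutex_lock(&net->ra_mutex);	/* per-net, not global */
	net->chain_len++;
	pthread_mutex_unlock(&net->ra_mutex);
}
```

With the old global ip_ra_lock, every namespace serialized on one lock; here each struct netns_model carries its own, which is the scalability win the commit message claims.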
Re: [PATCH net-next nfs 0/6] Converting pernet_operations (part #7)
Hi, Trond, Anna, Bruce, Jeff, David and other NFS and RXRPC people,
could you please provide your vision on these patches?

Thanks,
Kirill

On 13.03.2018 13:49, Kirill Tkhai wrote:
> Hi,
>
> this series continues to review and to convert pernet_operations
> to make them possible to be executed in parallel for several
> net namespaces in the same time. There are nfs pernet_operations
> in this series. All of them look similar each other, they mostly
> create and destroy caches with small exceptions.
>
> Also, there is rxrpc_net_ops, which is used in AFS.
>
> Thanks,
> Kirill
> ---
>
> Kirill Tkhai (6):
>       net: Convert rpcsec_gss_net_ops
>       net: Convert sunrpc_net_ops
>       net: Convert nfsd_net_ops
>       net: Convert nfs4_dns_resolver_ops
>       net: Convert nfs4blocklayout_net_ops
>       net: Convert rxrpc_net_ops
>
>
>  fs/nfs/blocklayout/rpc_pipefs.c | 1 +
>  fs/nfs/dns_resolve.c            | 1 +
>  fs/nfsd/nfsctl.c                | 1 +
>  net/rxrpc/net_ns.c              | 1 +
>  net/sunrpc/auth_gss/auth_gss.c  | 1 +
>  net/sunrpc/sunrpc_syms.c        | 1 +
>  6 files changed, 6 insertions(+)
>
> --
> Signed-off-by: Kirill Tkhai
>
Re: netns: send uevent messages
On 15.03.2018 16:39, Christian Brauner wrote: > On Thu, Mar 15, 2018 at 12:47:30PM +0300, Kirill Tkhai wrote: >> CC Andrey Vagin > > Hey Kirill, > > Thanks for CCing Andrey. > >> >> On 15.03.2018 03:12, Christian Brauner wrote: >>> This patch adds a receive method to NETLINK_KOBJECT_UEVENT netlink sockets >>> to allow sending uevent messages into the network namespace the socket >>> belongs to. >>> >>> Currently non-initial network namespaces are already isolated and don't >>> receive uevents. There are a number of cases where it is beneficial for a >>> sufficiently privileged userspace process to send a uevent into a network >>> namespace. >>> >>> One such use case would be debugging and fuzzing of a piece of software >>> which listens and reacts to uevents. By running a copy of that software >>> inside a network namespace, specific uevents could then be presented to it. >>> More concretely, this would allow for easy testing of udevd/ueventd. >>> >>> This will also allow some piece of software to run components inside a >>> separate network namespace and then effectively filter what that software >>> can receive. Some examples of software that do directly listen to uevents >>> and that we have in the past attempted to run inside a network namespace >>> are rbd (CEPH client) or the X server. >>> >>> Implementation: >>> The implementation has been kept as simple as possible from the kernel's >>> perspective. Specifically, a simple input method uevent_net_rcv() is added >>> to NETLINK_KOBJECT_UEVENT sockets which completely reuses existing >>> af_netlink infrastructure and does neither add an additional netlink family >>> nor requires any user-visible changes. >>> >>> For example, by using netlink_rcv_skb() we can make use of existing netlink >>> infrastructure to report back informative error messages to userspace. >>> >>> Furthermore, this implementation does not introduce any overhead for >>> existing uevent generating codepaths. 
The struct netns gets a new uevent >>> socket member that records the uevent socket associated with that network >>> namespace. Since we record the uevent socket for each network namespace in >>> struct net we don't have to walk the whole uevent socket list. >>> Instead we can directly retrieve the relevant uevent socket and send the >>> message. This keeps the codepath very performant without introducing >>> needless overhead. >>> >>> Uevent sequence numbers are kept global. When a uevent message is sent to >>> another network namespace the implementation will simply increment the >>> global uevent sequence number and append it to the received uevent. This >>> has the advantage that the kernel will never need to parse the received >>> uevent message to replace any existing uevent sequence numbers. Instead it >>> is up to the userspace process to remove any existing uevent sequence >>> numbers in case the uevent message to be sent contains any. >>> >>> Security: >>> In order for a caller to send uevent messages to a target network namespace >>> the caller must have CAP_SYS_ADMIN in the owning user namespace of the >>> target network namespace. Additionally, any received uevent message is >>> verified to not exceed size UEVENT_BUFFER_SIZE. This includes the space >>> needed to append the uevent sequence number. >>> >>> Testing: >>> This patch has been tested and verified to work with the following udev >>> implementations: >>> 1. CentOS 6 with udevd version 147 >>> 2. Debian Sid with systemd-udevd version 237 >>> 3. 
Android 7.1.1 with ueventd >>> >>> Signed-off-by: Christian Brauner >>> --- >>> include/net/net_namespace.h | 1 + >>> lib/kobject_uevent.c| 88 >>> - >>> 2 files changed, 88 insertions(+), 1 deletion(-) >>> >>> diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h >>> index f306b2aa15a4..467bde763a9b 100644 >>> --- a/include/net/net_namespace.h >>> +++ b/include/net/net_namespace.h >>> @@ -78,6 +78,7 @@ struct net { >>> >>> struct sock *rtnl; /* rtnetlink socket */ >>> struct sock *genl_sock; >>> + struct sock *uevent_sock; /* uevent socket */ >> >> Since you add this per-
Re: [PATCH net-next nfs 0/6] Converting pernet_operations (part #7)
On 15.03.2018 17:24, J. Bruce Fields wrote: > On Thu, Mar 15, 2018 at 04:32:30PM +0300, Kirill Tkhai wrote: >> Trond, Anna, Bruce, Jeff, David and other NFS and RXRPC people, >> could you please provide your vision on this patches? > > Whoops, sorry, I haven't been paying attention. Do you have a pointer > to documentation? I'm unclear what the actual concurrency change > is--sounds like it becomes possible that e.g. multiple ->init methods > (from the same pernet_operations but for different namespaces) could run > in parallel? There is a commentary near struct pernet_operations in fresh net-next.git: struct pernet_operations { struct list_head list; /* * Below methods are called without any exclusive locks. * More than one net may be constructed and destructed * in parallel on several cpus. Every pernet_operations * have to keep in mind all other pernet_operations and * to introduce a locking, if they share common resources. * * Exit methods using blocking RCU primitives, such as * synchronize_rcu(), should be implemented via exit_batch. * Then, destruction of a group of net requires single * synchronize_rcu() related to these pernet_operations, * instead of separate synchronize_rcu() for every net. * Please, avoid synchronize_rcu() at all, where it's possible. */ I hope this is enough. Please tell me, if there is unclear thing, I'll extend it. > Sounds likely to be safe, and I don't actually care too much who merges > them as they look very unlikely to conflict with anything pending. But > unless anyone tells me otherwise I'll take the one nfsd_net_ops patch > and leave the rest to Anna or Trond. Sounds great. Thanks! Kirill >> Thanks, >> Kirill >> >> On 13.03.2018 13:49, Kirill Tkhai wrote: >>> Hi, >>> >>> this series continues to review and to convert pernet_operations >>> to make them possible to be executed in parallel for several >>> net namespaces in the same time. There are nfs pernet_operations >>> in this series. 
All of them look similar each other, they mostly >>> create and destroy caches with small exceptions. >>> >>> Also, there is rxrpc_net_ops, which is used in AFS. >>> >>> Thanks, >>> Kirill >>> --- >>> >>> Kirill Tkhai (6): >>> net: Convert rpcsec_gss_net_ops >>> net: Convert sunrpc_net_ops >>> net: Convert nfsd_net_ops >>> net: Convert nfs4_dns_resolver_ops >>> net: Convert nfs4blocklayout_net_ops >>> net: Convert rxrpc_net_ops >>> >>> >>> fs/nfs/blocklayout/rpc_pipefs.c |1 + >>> fs/nfs/dns_resolve.c|1 + >>> fs/nfsd/nfsctl.c|1 + >>> net/rxrpc/net_ns.c |1 + >>> net/sunrpc/auth_gss/auth_gss.c |1 + >>> net/sunrpc/sunrpc_syms.c|1 + >>> 6 files changed, 6 insertions(+) >>> >>> -- >>> Signed-off-by: Kirill Tkhai >>>
Re: netns: send uevent messages
On 16.03.2018 02:46, Christian Brauner wrote: > On Thu, Mar 15, 2018 at 05:14:13PM +0300, Kirill Tkhai wrote: >> On 15.03.2018 16:39, Christian Brauner wrote: >>> On Thu, Mar 15, 2018 at 12:47:30PM +0300, Kirill Tkhai wrote: >>>> CC Andrey Vagin >>> >>> Hey Kirill, >>> >>> Thanks for CCing Andrey. >>> >>>> >>>> On 15.03.2018 03:12, Christian Brauner wrote: >>>>> This patch adds a receive method to NETLINK_KOBJECT_UEVENT netlink sockets >>>>> to allow sending uevent messages into the network namespace the socket >>>>> belongs to. >>>>> >>>>> Currently non-initial network namespaces are already isolated and don't >>>>> receive uevents. There are a number of cases where it is beneficial for a >>>>> sufficiently privileged userspace process to send a uevent into a network >>>>> namespace. >>>>> >>>>> One such use case would be debugging and fuzzing of a piece of software >>>>> which listens and reacts to uevents. By running a copy of that software >>>>> inside a network namespace, specific uevents could then be presented to >>>>> it. >>>>> More concretely, this would allow for easy testing of udevd/ueventd. >>>>> >>>>> This will also allow some piece of software to run components inside a >>>>> separate network namespace and then effectively filter what that software >>>>> can receive. Some examples of software that do directly listen to uevents >>>>> and that we have in the past attempted to run inside a network namespace >>>>> are rbd (CEPH client) or the X server. >>>>> >>>>> Implementation: >>>>> The implementation has been kept as simple as possible from the kernel's >>>>> perspective. Specifically, a simple input method uevent_net_rcv() is added >>>>> to NETLINK_KOBJECT_UEVENT sockets which completely reuses existing >>>>> af_netlink infrastructure and does neither add an additional netlink >>>>> family >>>>> nor requires any user-visible changes. 
>>>>> >>>>> For example, by using netlink_rcv_skb() we can make use of existing >>>>> netlink >>>>> infrastructure to report back informative error messages to userspace. >>>>> >>>>> Furthermore, this implementation does not introduce any overhead for >>>>> existing uevent generating codepaths. The struct netns gets a new uevent >>>>> socket member that records the uevent socket associated with that network >>>>> namespace. Since we record the uevent socket for each network namespace in >>>>> struct net we don't have to walk the whole uevent socket list. >>>>> Instead we can directly retrieve the relevant uevent socket and send the >>>>> message. This keeps the codepath very performant without introducing >>>>> needless overhead. >>>>> >>>>> Uevent sequence numbers are kept global. When a uevent message is sent to >>>>> another network namespace the implementation will simply increment the >>>>> global uevent sequence number and append it to the received uevent. This >>>>> has the advantage that the kernel will never need to parse the received >>>>> uevent message to replace any existing uevent sequence numbers. Instead it >>>>> is up to the userspace process to remove any existing uevent sequence >>>>> numbers in case the uevent message to be sent contains any. >>>>> >>>>> Security: >>>>> In order for a caller to send uevent messages to a target network >>>>> namespace >>>>> the caller must have CAP_SYS_ADMIN in the owning user namespace of the >>>>> target network namespace. Additionally, any received uevent message is >>>>> verified to not exceed size UEVENT_BUFFER_SIZE. This includes the space >>>>> needed to append the uevent sequence number. >>>>> >>>>> Testing: >>>>> This patch has been tested and verified to work with the following udev >>>>> implementations: >>>>> 1. CentOS 6 with udevd version 147 >>>>> 2. Debian Sid with systemd-udevd version 237 >>>>> 3. Android 7.1.1 with ueventd >>>>> >>>>> Signed-off-by: Christian Brauner >>>>> --- >>&
Re: [BUG/Q] can_pernet_exit() leaves devices on dead net
On 06.03.2018 13:26, Oliver Hartkopp wrote: > - DaveM > > Hi Kirill, > > On 03/05/2018 04:22 PM, Kirill Tkhai wrote: > >> Thanks for the explanation, and module unloading should be nice. Just to >> clarify, >> I worry not about rules, but about netdevices. >> >> unshare -n ip link add type vcan >> >> This command creates net ns, adds vcan there and exits. Then net ns is >> destroyed. >> Since vcan has rtnl_link_ops, it unregistered by default_device_exit_batch(). >> Real can devices are moved to init_net in default_device_exit(), as they >> don't >> have rtnl_link_ops set. > > In fact most of the real CAN drivers have rtnl_link_ops: > > https://elixir.bootlin.com/linux/latest/source/drivers/net/can/dev.c#L1162 > https://elixir.bootlin.com/linux/latest/source/drivers/net/can/dev.c#L1225 > > Just slcan.c which is something like slip.c is missing these feature. > > AFAIK real CAN netdevices are created in init_net at system startup, and you > can move them to a netns later. > > When we already have rtnl_link_ops in the real CAN drivers - what happens to > them when the namespace is destroyed? Are they still moved to init_net, or do > we need to add some default handler in the current rtnl_link_ops setup? They are unregistered in default_device_exit_batch(): list_for_each_entry(net, net_list, exit_list) { for_each_netdev_reverse(net, dev) { if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) dev->rtnl_link_ops->dellink(dev, &dev_kill_list); else unregister_netdevice_queue(dev, &dev_kill_list); } } unregister_netdevice_many(&dev_kill_list); So that, my question in the start of the topic is about that. If we unregister them, what devices we're going to meet in can_pernet_exit()? 
Also, can devices not having rtnl_link_ops are moved into init_net in default_device_exit(): for_each_netdev_safe(net, dev, aux) { if (dev->features & NETIF_F_NETNS_LOCAL) continue; /* Leave virtual devices for the generic cleanup */ if (dev->rtnl_link_ops) continue; err = dev_change_net_namespace(dev, &init_net, fb_name); } >> So, for_each_netdev_rcu() cycle in can_pernet_exit() should be useless >> (there are >> no can devices in the list of net's devices). This looks so for me, please >> say >> what devices are there if my assumption is wrong. > > See above? > >>>> Since can_pernet_ops is pernet subsys, it's executed after >>>> default_device_exit() >>>> from default_device_ops pernet device, as devices exit methods are >>>> executed first >>>> (see net/core/net_namespace.c). >>> >>> Hm - a device exit fires the NETDEV_UNREGISTER notifier which removes the >>> user-generated filters (e.g. in raw_notifier() in net/can/raw.c). >>> Finally the can_dev_rcv_lists structure is free'd in af_can.c. >>> >>> Marc Kleine-Budde recently proposed a patch to create the can_dev_rcv_lists >>> at >>> netdevice creation time (-> the space is allocated by alloc_netdev() and >>> removed by free_netdev()). This would remove the handling (allocate & free) >>> of >>> ml_priv by af_can.c. Would this rework fix the described issue? >> >> Could you please give me a link to the patches? I can't find them in >> patchwork. > > There was a patchset of 14 patches from Marc where some of the refactoring & > renaming already made it into mainline - but the patches to move the > can_dev_rcv_lists data structure into the network device space have not been > pushed: > > https://marc.info/?l=linux-can&m=150169588319315&w=2 > https://marc.info/?l=linux-can&r=1&b=201708&w=2 > > This patch & documentation describes Marc's proposed idea best: > https://marc.info/?l=linux-can&m=150169589619340&w=2 Yes, this one looks good: https://marc.info/?l=linux-can&m=150169589119335&w=2 Regards, Kirill
[PATCH RFC RFC] rds: Use NETDEV_UNREGISTER in rds_tcp_dev_event() (then kill NETDEV_UNREGISTER_FINAL)
Hi,

rds_tcp_dev_event() is the last user of the NETDEV_UNREGISTER_FINAL stage. If we rework it, we'll be able to kill that stage, which will decrease the number of rtnl_lock() calls made during generic net device unregistration. This is a very hot path for namespaces.

Commit 467fa15356acf by Sowmini Varadhan added the NETDEV_UNREGISTER_FINAL dependence with the commentary:

	/* rds-tcp registers as a pernet subys, so the ->exit will only
	 * get invoked after network acitivity has quiesced. We need to
	 * clean up all sockets to quiesce network activity, and use
	 * the unregistration of the per-net loopback device as a trigger
	 * to start that cleanup.
	 */

It seems all the protocols' pernet subsystems meet this situation, but they solve it in a generic way. What is so specific about rds? That commit makes the event handler iterate rds_tcp_conn_list and kill the connections. If we change the stage to NETDEV_UNREGISTER, what will change? Can an unregistered loopback device on a dead net add new items to rds_tcp_conn_list? How is that possible? What is the problem with moving rds_tcp_kill_sock() into the pernet subsys exit method?

Kirill
---
 net/rds/tcp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index eb04e7fa2467..4c6db9cb6261 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -567,7 +567,7 @@ static int rds_tcp_dev_event(struct notifier_block *this,
 	 * the unregistration of the per-net loopback device as a trigger
 	 * to start that cleanup.
 	 */
-	if (event == NETDEV_UNREGISTER_FINAL &&
+	if (event == NETDEV_UNREGISTER &&
 	    dev->ifindex == LOOPBACK_IFINDEX)
 		rds_tcp_kill_sock(dev_net(dev));
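The one-line change in the RFC patch above only moves the cleanup trigger to an earlier notifier stage. A userspace sketch of the notifier dispatch (illustrative constants and names — the real NETDEV_* values and LOOPBACK_IFINDEX come from kernel headers):

```c
#include <assert.h>

/* Userspace model of the rds_tcp_dev_event() trigger condition.
 * Constants below are illustrative stand-ins for the kernel's values. */
enum { NETDEV_UNREGISTER = 1, NETDEV_UNREGISTER_FINAL = 2 };
#define LOOPBACK_IFINDEX 1

static int socks_killed;

static void rds_tcp_kill_sock_model(void)
{
	socks_killed++;		/* stands in for tearing down conn_list */
}

/* After the RFC patch, the handler fires at NETDEV_UNREGISTER for the
 * per-net loopback device, one stage earlier than before. */
static int dev_event_model(int event, int ifindex)
{
	if (event == NETDEV_UNREGISTER && ifindex == LOOPBACK_IFINDEX)
		rds_tcp_kill_sock_model();
	return 0;
}
```

The question in the mail is precisely whether anything can still grow rds_tcp_conn_list between these two stages; if nothing can, firing at the earlier stage (or in the pernet ->exit method) is equivalent.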
Re: [PATCH RFC RFC] rds: Use NETDEV_UNREGISTER in rds_tcp_dev_event() (then kill NETDEV_UNREGISTER_FINAL)
On 16.03.2018 16:00, Sowmini Varadhan wrote:
> On (03/16/18 15:38), Kirill Tkhai wrote:
>>
>> 467fa15356acf by Sowmini Varadhan added NETDEV_UNREGISTER_FINAL dependence
>> with the commentary:
>>
>> /* rds-tcp registers as a pernet subys, so the ->exit will only
>>  * get invoked after network acitivity has quiesced. We need to
>>  * clean up all sockets to quiesce network activity, and use
>>  * the unregistration of the per-net loopback device as a trigger
>>  * to start that cleanup.
>>  */
>>
>> It seems all the protocols pernet subsystems meet this situation, but they
>> solve it in generic way. What does rds so specific have?
>
> The difference with rds is this: most consumers of netns associate
> a net_device with the netns, and cleanup of the netns state
> happens as part of the net_device teardown without the constraint
> above. rds-tcp does has a netns tied to listening socket, not
> to a specific network interface (net_device) so it registers
> as a pernet-subsys. But this means that cleanup has to be
> cone carefully (see comments in net_namespace.h before
> register_pernet_subsys)

This is not a problem, and rds-tcp is not the only pernet_subsys registering a socket. It's OK to close it from the .exit method. There are many examples; let me point you to icmp_sk_ops as one of them. But it's not the only one.

> For rds-tcp, we need to be able to do the right thing in both of these
> cases
> 1. modprobe -r rds-tcp (cleanup of rds-tcp state should happen in
>    every namespace, including init_net)
> 2. netns delete (rds_tcp.ko should remain loaded for other namespaces)

The same as above: every pernet_subsys does this, and it's not a problem. The exit and exit_batch methods are called in both of the cases. Please see __unregister_pernet_operations()->ops_exit_list for the details.

>> This commit makes event handler to iterate rds_tcp_conn_list and
>> kill them. If we change the stage to NETDEV_UNREGISTER, what will change?
>
> The above two cases need to work correctly.
Yeah, but let's find another way to achieve the same.

>> Can unregistered loopback device on dead net add new items to
>> rds_tcp_conn_list?
>> How it's possible?
>
> I dont understand the question- no unregistered loopback devices
> cannot add items.

If we replace NETDEV_UNREGISTER_FINAL with NETDEV_UNREGISTER, the only change is that we call rds_tcp_kill_sock() earlier. So, it could cause problems only if someone changed the list in the window between NETDEV_UNREGISTER and NETDEV_UNREGISTER_FINAL being called for the loopback device. But since during this window no one related to this net can extend the list, there is no problem in doing that.

> fwiw, I had asked questions about this (netns per net_device
> vs netns for module) on the netdev list a few years ago, I can
> try to hunt down that thread for you later (nobody replied to
> it, but maybe it will help answer your questions).

After your words, it looks like we may simply do everything in the exit method.

Kirill