Re: [PATCH 21/34] kernfs, sysfs, cgroup, intel_rdt: Support fs_context [ver #12]

2018-12-06 Thread Andrei Vagin
On Sun, Nov 18, 2018 at 08:23:42PM -0800, Andrei Vagin wrote:
> On Fri, Sep 21, 2018 at 05:33:01PM +0100, David Howells wrote:
> > @@ -1993,57 +2009,53 @@ int cgroup_setup_root(struct cgroup_root *root, u16 
> > ss_mask, int ref_flags)
> > return ret;
> >  }
> >  
> > -struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
> > -  struct cgroup_root *root, unsigned long magic,
> > -  struct cgroup_namespace *ns)
> > +int cgroup_do_get_tree(struct fs_context *fc)
> >  {
> > -   struct dentry *dentry;
> > -   bool new_sb;
> > +   struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
> > +   int ret;
> >  
> > -   dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, _sb);
> > +   ctx->kfc.root = ctx->root->kf_root;
> > +
> > +   ret = kernfs_get_tree(fc);
> > +   if (ret < 0)
> > +   goto out_cgrp;
> >  
> > /*
> >  * In non-init cgroup namespace, instead of root cgroup's dentry,
> >  * we return the dentry corresponding to the cgroupns->root_cgrp.
> >  */
> > -   if (!IS_ERR(dentry) && ns != _cgroup_ns) {
> > +   if (ctx->ns != _cgroup_ns) {
> > struct dentry *nsdentry;
> > struct cgroup *cgrp;
> >  
> > mutex_lock(_mutex);
> > spin_lock_irq(_set_lock);
> >  
> > -   cgrp = cset_cgroup_from_root(ns->root_cset, root);
> > +   cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root);
> >  
> > spin_unlock_irq(_set_lock);
> > mutex_unlock(_mutex);
> >  
> > -   nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb);
> > -   dput(dentry);
> > -   dentry = nsdentry;
> > +   nsdentry = kernfs_node_dentry(cgrp->kn, fc->root->d_sb);
> > +   if (IS_ERR(nsdentry))
> > +   return PTR_ERR(nsdentry);
> > +   dput(fc->root);
> > +   fc->root = nsdentry;
> > }
> >  
> > -   if (IS_ERR(dentry) || !new_sb)
> > -   cgroup_put(>cgrp);
> 
> I don't see where this cgroup_put() has been moved.

David, have you looked at this problem? It isn't fixed in linux-next
yet.

https://travis-ci.org/avagin/linux/jobs/463960763

Thanks,
Andrei

> 
> With this patch, the following script works only once; on the second
> attempt it hangs while mounting a cgroup file system.
> 
> This is the only suspicious place in this patch that I have found.
> 
> [root@fc24 ~]# cat fs-vs-cg 
> d=$(mktemp -d /tmp/cg.XX)
> mkdir $d/a
> mkdir $d/b
> mount -t cgroup -o none,name= xxx $d/a
> mount -t cgroup -o none,name= xxx $d/b
> umount $d/a
> umount $d/b
> 
> [root@fc24 ~]# unshare -m --propagation private bash -x fs-vs-cg
> ++ mktemp -d /tmp/cg.XX
> + d=/tmp/cg.yUfagS
> + mkdir /tmp/cg.yUfagS/a
> + mkdir /tmp/cg.yUfagS/b
> + mount -t cgroup -o none,name= xxx /tmp/cg.yUfagS/a
> + mount -t cgroup -o none,name= xxx /tmp/cg.yUfagS/b
> + umount /tmp/cg.yUfagS/a
> + umount /tmp/cg.yUfagS/b
> [root@fc24 ~]# unshare -m --propagation private bash -x fs-vs-cg
> ++ mktemp -d /tmp/cg.XX
> + d=/tmp/cg.ippWUn
> + mkdir /tmp/cg.ippWUn/a
> + mkdir /tmp/cg.ippWUn/b
> + mount -t cgroup -o none,name= xxx /tmp/cg.ippWUn/a
> ^Z
> [1]+  Stopped unshare -m --propagation private bash -x 
> fs-vs-cg
> 
> [root@fc24 ~]# ps
>   PID TTY  TIME CMD
>   556 pts/000:00:00 bash
>   591 pts/000:00:00 bash
>   595 pts/000:00:00 mount
>   596 pts/000:00:00 ps
> 
> [root@fc24 ~]# bg
> [1]+ unshare -m --propagation private bash -x fs-vs-cg &
> 
> [root@fc24 ~]# cat /proc/595/stack 
> [<0>] msleep+0x38/0x40
> [<0>] cgroup1_get_tree+0x4e1/0x72c
> [<0>] vfs_get_tree+0x5e/0x140
> [<0>] do_mount+0x326/0xc70
> [<0>] ksys_mount+0xba/0xd0
> [<0>] __x64_sys_mount+0x21/0x30
> [<0>] do_syscall_64+0x60/0x210
> [<0>] entry_SYSCALL_64_after_hwframe+0x49/0xbe
> [<0>] 0x
> 


[PATCH] include: replace tsk to task in linux/sched/signal.h

2018-11-29 Thread Andrei Vagin
This file uses "task" 85 times and "tsk" 25 times. It would be better to
choose one of these variants.

Signed-off-by: Andrei Vagin 
---
 include/linux/sched/signal.h | 51 ++--
 1 file changed, 26 insertions(+), 25 deletions(-)

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 76b8399b17f6..0c3e396dca04 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -270,17 +270,18 @@ static inline int signal_group_exit(const struct 
signal_struct *sig)
 extern void flush_signals(struct task_struct *);
 extern void ignore_signals(struct task_struct *);
 extern void flush_signal_handlers(struct task_struct *, int force_default);
-extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, 
kernel_siginfo_t *info);
+extern int dequeue_signal(struct task_struct *task,
+ sigset_t *mask, kernel_siginfo_t *info);
 
 static inline int kernel_dequeue_signal(void)
 {
-   struct task_struct *tsk = current;
+   struct task_struct *task = current;
kernel_siginfo_t __info;
int ret;
 
-   spin_lock_irq(>sighand->siglock);
-   ret = dequeue_signal(tsk, >blocked, &__info);
-   spin_unlock_irq(>sighand->siglock);
+   spin_lock_irq(>sighand->siglock);
+   ret = dequeue_signal(task, >blocked, &__info);
+   spin_unlock_irq(>sighand->siglock);
 
return ret;
 }
@@ -418,18 +419,18 @@ static inline void set_restore_sigmask(void)
WARN_ON(!test_thread_flag(TIF_SIGPENDING));
 }
 
-static inline void clear_tsk_restore_sigmask(struct task_struct *tsk)
+static inline void clear_tsk_restore_sigmask(struct task_struct *task)
 {
-   clear_tsk_thread_flag(tsk, TIF_RESTORE_SIGMASK);
+   clear_tsk_thread_flag(task, TIF_RESTORE_SIGMASK);
 }
 
 static inline void clear_restore_sigmask(void)
 {
clear_thread_flag(TIF_RESTORE_SIGMASK);
 }
-static inline bool test_tsk_restore_sigmask(struct task_struct *tsk)
+static inline bool test_tsk_restore_sigmask(struct task_struct *task)
 {
-   return test_tsk_thread_flag(tsk, TIF_RESTORE_SIGMASK);
+   return test_tsk_thread_flag(task, TIF_RESTORE_SIGMASK);
 }
 static inline bool test_restore_sigmask(void)
 {
@@ -448,9 +449,9 @@ static inline void set_restore_sigmask(void)
current->restore_sigmask = true;
WARN_ON(!test_thread_flag(TIF_SIGPENDING));
 }
-static inline void clear_tsk_restore_sigmask(struct task_struct *tsk)
+static inline void clear_tsk_restore_sigmask(struct task_struct *task)
 {
-   tsk->restore_sigmask = false;
+   task->restore_sigmask = false;
 }
 static inline void clear_restore_sigmask(void)
 {
@@ -460,9 +461,9 @@ static inline bool test_restore_sigmask(void)
 {
return current->restore_sigmask;
 }
-static inline bool test_tsk_restore_sigmask(struct task_struct *tsk)
+static inline bool test_tsk_restore_sigmask(struct task_struct *task)
 {
-   return tsk->restore_sigmask;
+   return task->restore_sigmask;
 }
 static inline bool test_and_clear_restore_sigmask(void)
 {
@@ -616,9 +617,9 @@ static inline struct pid *task_session(struct task_struct 
*task)
return task->signal->pids[PIDTYPE_SID];
 }
 
-static inline int get_nr_threads(struct task_struct *tsk)
+static inline int get_nr_threads(struct task_struct *task)
 {
-   return tsk->signal->nr_threads;
+   return task->signal->nr_threads;
 }
 
 static inline bool thread_group_leader(struct task_struct *p)
@@ -657,35 +658,35 @@ static inline int thread_group_empty(struct task_struct 
*p)
 #define delay_group_leader(p) \
(thread_group_leader(p) && !thread_group_empty(p))
 
-extern struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
+extern struct sighand_struct *__lock_task_sighand(struct task_struct *task,
unsigned long *flags);
 
-static inline struct sighand_struct *lock_task_sighand(struct task_struct *tsk,
+static inline struct sighand_struct *lock_task_sighand(struct task_struct 
*task,
   unsigned long *flags)
 {
struct sighand_struct *ret;
 
-   ret = __lock_task_sighand(tsk, flags);
-   (void)__cond_lock(>sighand->siglock, ret);
+   ret = __lock_task_sighand(task, flags);
+   (void)__cond_lock(>sighand->siglock, ret);
return ret;
 }
 
-static inline void unlock_task_sighand(struct task_struct *tsk,
+static inline void unlock_task_sighand(struct task_struct *task,
unsigned long *flags)
 {
-   spin_unlock_irqrestore(>sighand->siglock, *flags);
+   spin_unlock_irqrestore(>sighand->siglock, *flags);
 }
 
-static inline unsigned long task_rlimit(const struct task_struct *tsk,
+static inline unsigned long task_rlimit(const struct task_st

Re: [PATCH] ptrace: take into account saved_sigmask in PTRACE_{GET,SET}SIGMASK

2018-11-26 Thread Andrei Vagin
On Thu, Nov 22, 2018 at 12:47:52PM +0100, Oleg Nesterov wrote:
> On 11/19, Andrei Vagin wrote:
> >
> > case PTRACE_SETSIGMASK: {
> > sigset_t new_set;
> > @@ -962,6 +971,8 @@ int ptrace_request(struct task_struct *child, long 
> > request,
> > child->blocked = new_set;
> > spin_unlock_irq(>sighand->siglock);
> >
> > +   clear_tsk_restore_sigmask(child);
> > +
> 
> I am not sure I understand this change...
> 
> I forgot everything I knew about criu, but iiuc PTRACE_SETSIGMASK is used
> at "restore" time, doesn't this mean that TIF_RESTORE_SIGMASK/restore_sigmask
> can not be set?

PTRACE_SETSIGMASK isn't used on restore. On restore, criu generates
a sigframe and calls sigreturn to restore registers, fpu state, sigmask
and resume a process.  When the kernel constructs a signal frame, it
calls sigmask_to_save() to get a process signal mask. With this patch,
PTRACE_GETSIGMASK returns the same signal mask that is returned by
sigmask_to_save().

In CRIU, we don't need to set TIF_RESTORE_SIGMASK, because all processes
are dumped when they are in user-space.

> 
> IOW, could you please explain how PTRACE_SETSIGMASK should be used, and why
> it doesn't do something like
> 

CRIU uses PTRACE_SETSIGMASK when it injects parasite code into a
target process. In this case, we have to be sure that when the process
is resumed by PTRACE_CONT, it will not start handling signals and
executing signal handlers.

>   if (test_tsk_restore_sigmask(child))
>   child->saved_sigmask = new_set;
>   else
>   child->blocked = new_set;
> 
> which looks symmetrical to PTRACE_GETSIGMASK?

If we set child->saved_sigmask, the child can start handling signals
which are not set in child->blocked.

> 
> Oleg.
> 


[PATCH] ptrace: take into account saved_sigmask in PTRACE_{GET,SET}SIGMASK

2018-11-19 Thread Andrei Vagin
There are a few system calls (pselect, ppoll, etc) which replace a task
sigmask while they are running in kernel-space.

When a task calls one of these syscalls, the kernel saves a current
sigmask in task->saved_sigmask and sets a syscall sigmask.

On syscall-exit-stop, ptrace traps a task before restoring the
saved_sigmask, so PTRACE_GETSIGMASK returns the syscall sigmask and
PTRACE_SETSIGMASK does nothing, because its sigmask is replaced by
saved_sigmask, when the task returns to user-space.

This patch fixes this problem. PTRACE_GETSIGMASK returns saved_sigmask
if it's set. PTRACE_SETSIGMASK drops the TIF_RESTORE_SIGMASK flag.

Cc: Oleg Nesterov 
Cc: "Eric W. Biederman" 
Cc: Andrew Morton 
Fixes: 29000caecbe8 ("ptrace: add ability to get/set signal-blocked mask")
Signed-off-by: Andrei Vagin 
---
 include/linux/sched/signal.h | 18 ++
 kernel/ptrace.c  | 15 +--
 2 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 1be35729c2c5..660d78c9af6c 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -417,10 +417,20 @@ static inline void set_restore_sigmask(void)
set_thread_flag(TIF_RESTORE_SIGMASK);
WARN_ON(!test_thread_flag(TIF_SIGPENDING));
 }
+
+static inline void clear_tsk_restore_sigmask(struct task_struct *tsk)
+{
+   clear_tsk_thread_flag(tsk, TIF_RESTORE_SIGMASK);
+}
+
 static inline void clear_restore_sigmask(void)
 {
clear_thread_flag(TIF_RESTORE_SIGMASK);
 }
+static inline bool test_tsk_restore_sigmask(struct task_struct *tsk)
+{
+   return test_tsk_thread_flag(tsk, TIF_RESTORE_SIGMASK);
+}
 static inline bool test_restore_sigmask(void)
 {
return test_thread_flag(TIF_RESTORE_SIGMASK);
@@ -438,6 +448,10 @@ static inline void set_restore_sigmask(void)
current->restore_sigmask = true;
WARN_ON(!test_thread_flag(TIF_SIGPENDING));
 }
+static inline void clear_tsk_restore_sigmask(struct task_struct *tsk)
+{
+   tsk->restore_sigmask = false;
+}
 static inline void clear_restore_sigmask(void)
 {
current->restore_sigmask = false;
@@ -446,6 +460,10 @@ static inline bool test_restore_sigmask(void)
 {
return current->restore_sigmask;
 }
+static inline bool test_tsk_restore_sigmask(struct task_struct *tsk)
+{
+   return tsk->restore_sigmask;
+}
 static inline bool test_and_clear_restore_sigmask(void)
 {
if (!current->restore_sigmask)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 21fec73d45d4..fc0d667f5792 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * Access another process' address space via ptrace.
@@ -925,18 +926,26 @@ int ptrace_request(struct task_struct *child, long 
request,
ret = ptrace_setsiginfo(child, );
break;
 
-   case PTRACE_GETSIGMASK:
+   case PTRACE_GETSIGMASK: {
+   sigset_t *mask;
+
if (addr != sizeof(sigset_t)) {
ret = -EINVAL;
break;
}
 
-   if (copy_to_user(datavp, >blocked, sizeof(sigset_t)))
+   if (test_tsk_restore_sigmask(child))
+   mask = >saved_sigmask;
+   else
+   mask = >blocked;
+
+   if (copy_to_user(datavp, mask, sizeof(sigset_t)))
ret = -EFAULT;
else
ret = 0;
 
break;
+   }
 
case PTRACE_SETSIGMASK: {
sigset_t new_set;
@@ -962,6 +971,8 @@ int ptrace_request(struct task_struct *child, long request,
child->blocked = new_set;
spin_unlock_irq(>sighand->siglock);
 
+   clear_tsk_restore_sigmask(child);
+
ret = 0;
break;
}
-- 
2.17.2



Re: [PATCH 21/34] kernfs, sysfs, cgroup, intel_rdt: Support fs_context [ver #12]

2018-11-18 Thread Andrei Vagin
On Fri, Sep 21, 2018 at 05:33:01PM +0100, David Howells wrote:
> Make kernfs support superblock creation/mount/remount with fs_context.
> 
> This requires that sysfs, cgroup and intel_rdt, which are built on kernfs,
> be made to support fs_context also.
> 
> Notes:
> 
>  (1) A kernfs_fs_context struct is created to wrap fs_context and the
>  kernfs mount parameters are moved in here (or are in fs_context).
> 
>  (2) kernfs_mount{,_ns}() are made into kernfs_get_tree().  The extra
>  namespace tag parameter is passed in the context if desired
> 
>  (3) kernfs_free_fs_context() is provided as a destructor for the
>  kernfs_fs_context struct, but for the moment it does nothing except
>  get called in the right places.
> 
>  (4) sysfs doesn't wrap kernfs_fs_context since it has no parameters to
>  pass, but possibly this should be done anyway in case someone wants to
>  add a parameter in future.
> 
>  (5) A cgroup_fs_context struct is created to wrap kernfs_fs_context and
>  the cgroup v1 and v2 mount parameters are all moved there.
> 
>  (6) cgroup1 parameter parsing error messages are now handled by invalf(),
>  which allows userspace to collect them directly.
> 
>  (7) cgroup1 parameter cleanup is now done in the context destructor rather
>  than in the mount/get_tree and remount functions.
> 
> Weirdies:
> 
>  (*) cgroup_do_get_tree() calls cset_cgroup_from_root() with locks held,
>  but then uses the resulting pointer after dropping the locks.  I'm
>  told this is okay and needs commenting.
> 
>  (*) The cgroup refcount web.  This really needs documenting.
> 
>  (*) cgroup2 only has one root?
> 
> Add a suggestion from Thomas Gleixner in which the RDT enablement code is
> placed into its own function.
> 
> Signed-off-by: David Howells 
> cc: Greg Kroah-Hartman 
> cc: Tejun Heo 
> cc: Li Zefan 
> cc: Johannes Weiner 
> cc: cgro...@vger.kernel.org
> cc: fenghua...@intel.com
> ---
> 
>  arch/x86/kernel/cpu/intel_rdt.h  |   15 +
>  arch/x86/kernel/cpu/intel_rdt_rdtgroup.c |  183 ++--
>  fs/kernfs/mount.c|   88 
>  fs/sysfs/mount.c |   67 --
>  include/linux/cgroup.h   |3 
>  include/linux/kernfs.h   |   39 ++-
>  kernel/cgroup/cgroup-internal.h  |   50 +++-
>  kernel/cgroup/cgroup-v1.c|  345 
> --
>  kernel/cgroup/cgroup.c   |  264 +++
>  kernel/cgroup/cpuset.c   |4 
>  10 files changed, 640 insertions(+), 418 deletions(-)
> 
> diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
> index 4e588f36228f..1461adc2c5e8 100644
> --- a/arch/x86/kernel/cpu/intel_rdt.h
> +++ b/arch/x86/kernel/cpu/intel_rdt.h
> @@ -33,6 +33,21 @@
>  #define RMID_VAL_ERROR   BIT_ULL(63)
>  #define RMID_VAL_UNAVAIL BIT_ULL(62)
>  
> +
> +struct rdt_fs_context {
> + struct kernfs_fs_contextkfc;
> + boolenable_cdpl2;
> + boolenable_cdpl3;
> + boolenable_mba_mbps;
> +};
> +
> +static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc)
> +{
> + struct kernfs_fs_context *kfc = fc->fs_private;
> +
> + return container_of(kfc, struct rdt_fs_context, kfc);
> +}
> +
>  DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
>  
>  /**
> diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c 
> b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
> index d6cb04c3a28b..34733a221669 100644
> --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
> +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
> @@ -24,6 +24,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -1707,43 +1708,6 @@ static void cdp_disable_all(void)
>   cdpl2_disable();
>  }
>  
> -static int parse_rdtgroupfs_options(char *data)
> -{
> - char *token, *o = data;
> - int ret = 0;
> -
> - while ((token = strsep(, ",")) != NULL) {
> - if (!*token) {
> - ret = -EINVAL;
> - goto out;
> - }
> -
> - if (!strcmp(token, "cdp")) {
> - ret = cdpl3_enable();
> - if (ret)
> - goto out;
> - } else if (!strcmp(token, "cdpl2")) {
> - ret = cdpl2_enable();
> - if (ret)
> - goto out;
> - } else if (!strcmp(token, "mba_MBps")) {
> - ret = set_mba_sc(true);
> - if (ret)
> - goto out;
> - } else {
> - ret = -EINVAL;
> - goto out;
> - }
> - }
> -
> - return 0;
> -
> -out:
> - pr_err("Invalid mount option \"%s\"\n", token);
> -
> - return ret;

linux-next: Unable to mount a cgroup file system

2018-11-17 Thread Andrei Vagin
Hello,

We run CRIU tests on linux-next kernels. Today, I found that our test
robot hangs up on mounting a cgroup file system.

https://travis-ci.org/avagin/linux/jobs/455732006

  632 ?Ssl0:00 /usr/bin/containerd
  843 ?Sl 0:00  \_ containerd-shim -namespace moby
-workdir 
/var/lib/containerd/io.containerd.runtime.v1.linux/moby/c2311352d53eed1f5094580102e41c2a02eaf98b8626c86ccf314599101b26
  862 pts/0Ss+0:00  \_ python test/zdtm.py run -T
zdtm/static/cgroup.*
 1652 pts/0S+ 0:00  \_ flock zdtm_mount_cgroups.lock
./zdtm_umount_cgroups
 1653 pts/0S+ 0:00  \_ /bin/sh ./zdtm_umount_cgroups
 1659 pts/0D+ 0:06  \_ mount -t cgroup -o
none,name=zdtmtst.defaultroot zdtm zdtm.9QFGko

[root@fc24 ~]# cat /proc/1659/stack
[<0>] msleep+0x38/0x40
[<0>] cgroup1_get_tree+0x4e1/0x749
[<0>] vfs_get_tree+0x5e/0x140
[<0>] do_mount+0x326/0xc70
[<0>] ksys_mount+0xba/0xd0
[<0>] __x64_sys_mount+0x21/0x30
[<0>] do_syscall_64+0x60/0x210
[<0>] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[<0>] 0x

[root@fc24 ~]# cat /proc/1659/cgroup  | grep zdtm
13:name=zdtmtst.defaultroot:/
12:name=zdtmtst:/

[root@fc24 ~]# strace -fp 1659
strace: Process 1659 attached
mount("zdtm", "/criu/test/zdtm.9QFGko", "cgroup", MS_MGC_VAL,
"none,name=zdtmtst.defaultroot") = ? ERESTARTNOINTR (To be restarted)
mount("zdtm", "/criu/test/zdtm.9QFGko", "cgroup", MS_MGC_VAL,
"none,name=zdtmtst.defaultroot") = ? ERESTARTNOINTR (To be restarted)
mount("zdtm", "/criu/test/zdtm.9QFGko", "cgroup", MS_MGC_VAL,
"none,name=zdtmtst.defaultroot") = ? ERESTARTNOINTR (To be restarted)
mount("zdtm", "/criu/test/zdtm.9QFGko", "cgroup", MS_MGC_VAL,
"none,name=zdtmtst.defaultroot") = ? ERESTARTNOINTR (To be restarted)
mount("zdtm", "/criu/test/zdtm.9QFGko", "cgroup", MS_MGC_VAL,
"none,name=zdtmtst.defaultroot") = ? ERESTARTNOINTR (To be restarted)

Steps to reproduce:
I don't know how to reproduce this issue without running criu tests. I
tried to create a simple reproducer, but I failed. So I created a
docker container and the problem can be reproduced by running this
command:
docker run --rm -it --privileged -v /lib/modules:/lib/modules --tmpfs
/run docker.io/avagin/criu-fc29-cgroup python test/zdtm.py run -T
'zdtm/static/cgroup.*'

I found that something is wrong in
[16ec1a5d58ea67ba737d3a66efe9e53c6bb149f7] kernfs, sysfs, cgroup,
intel_rdt: Support fs_context

https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?h=next-20181116=16ec1a5d58ea67ba737d3a66efe9e53c6bb149f7

Thanks,
Andrei


Re: [RFC 00/20] ns: Introduce Time Namespace

2018-10-31 Thread Andrei Vagin
On Mon, Oct 29, 2018 at 09:33:14PM +0100, Thomas Gleixner wrote:
> Andrei,
> 
> On Sat, 20 Oct 2018, Andrei Vagin wrote:
> > When a container is migrated to another host, we have to restore its
> > monotonic and boottime clocks, but we still expect that the container
> > will continue using the host real-time clock.
> > 
> > Before starting this series, I was thinking about this, and I decided that
> > these cases can be solved independently. Probably, the full isolation of
> > the time sub-system will have much higher overhead than just offsets for
> > a few clocks. And the idea that isolation of the real-time clock should
> > be optional gives us another hint that offsets for monotonic and
> > boot-time clocks can be implemented independently.
> > 
> > Eric and Thomas, what do you think about this? If you agree that these
> > two cases can be implemented separately, what should we do with this
> > series to make it ready to be merged?
> > 
> > I know that we need to:
> > 
> > * look at device drivers that report timestamps in CLOCK_MONOTONIC base.
> 
> and CLOCK_BOOTTIME and that's quite a few.
> 
> > * forbid changing offsets after creating timers
> 
> There are more things to think about. What about interfaces which expose
> boot time or monotonic time in /proc?

We didn't find any proc files where boot or monotonic time is reported,
but we will double check this.

> 
> Aside of that (I finally came around to look at the series in more detail)
> I'm really unhappy about the unconditional overhead once the Time namespace
> config switch is enabled. This applies especially to the VDSO. We spent
> quite some time recently to squeeze a few cycles out of those functions and
> it would be a pity to pointlessly waste cycles for the !namespace case.

It is a good point. We will work on it.

> 
> I can see the urge for this, but please let us think it through properly
> before rushing anything in which we are going to regret once we want to do
> more sophisticated time domain management, e.g. support for isolated clock
> real time. I'm worried, that without a clear plan about the overall
> picture, we end up with duct tape which is hard to distangle after the
> fact.

Thomas, there is no rush at all. This functionality is critical for
CRIU, but we have enough time to solve it properly.

The only thing I want is that this functionality continues moving
forward and is not put on the back burner.

> 
> There have been a few other things brought up versus time management in
> general, like the TSN folks utilizing grand clock masters which expose
> random time instead of proper TAI. Plus some requirements for exposing some
> sort of 'monotonic' clocks which are derived from external synchronization
> mechanisms, but should not affect the regular time keeping clocks.
> 
> While different issues, these all fall into the category of separate time
> domains, so taking a step back to the drawing board is probably the best
> thing what we can do now.
> 
> There are certainly a few things which can be looked at independently,
> e.g. the VDSO mechanics or general mechanisms to avoid plastering the whole
> kernel with these name space functions applying offsets left and right. I
> rather have dedicated core functionality which replaces/amends existing
> timer functions to become time namespace aware.
> 
> I'll try to find some time in the next weeks to look deeper into that, but
> I can't promise anything before returning from LPC. Btw, LPC would be a
> great opportunity to discuss that. Are you and the other name space wizards
> there by any chance?

Dmitry and I are going to be there.

Thanks!
Andrei

> 
> Thanks,
> 
>   tglx
> 
> 


Re: [RFC 00/20] ns: Introduce Time Namespace

2018-10-20 Thread Andrei Vagin
On Sat, Oct 20, 2018 at 06:41:23PM -0700, Andrei Vagin wrote:
> On Fri, Sep 28, 2018 at 07:03:22PM +0200, Eric W. Biederman wrote:
> > Thomas Gleixner  writes:
> > 
> > > On Wed, 26 Sep 2018, Eric W. Biederman wrote:
> > >> Reading the code the calling sequence there is:
> > >> tick_sched_do_timer
> > >>tick_do_update_jiffies64
> > >>   update_wall_time
> > >>   timekeeping_advance
> > >>  timekeeping_update
> > >> 
> > >> If I read that properly under the right nohz circumstances that update
> > >> can be delayed indefinitely.
> > >> 
> > >> So I think we could prototype a time namespace that was per
> > >> timekeeping_update and just had update_wall_time iterate through
> > >> all of the time namespaces.
> > >
> > > Please don't go there. timekeeping_update() is already heavy and walking
> > > through a gazillion of namespaces will just make it horrible,
> > >
> > >> I don't think the naive version would scale to very many time
> > >> namespaces.
> > >
> > > :)
> > >
> > >> At the same time using the techniques from the nohz work and a little
> > >> smarts I expect we could get the code to scale.
> > >
> > > You'd need to invoke the update when the namespace is switched in and
> > > hasn't been updated since the last tick happened. That might be doable, 
> > > but
> > > you also need to take the wraparound constraints of the underlying
> > > clocksources into account, which again can cause walking all name spaces
> > > when they are all idle long enough.
> > 
> > The wrap around constraints being how long before the time sources wrap
> > around so you have to read them once per wrap around?  I have not dug
> > deeply enough into the code to see that yet.
> > 
> > > From there it becomes hairy, because it's not only timekeeping,
> > > i.e. reading time, this is also affecting all timers which are armed from 
> > > a
> > > namespace.
> > >
> > > That gets really ugly because when you do settimeofday() or adjtimex() for
> > > a particular namespace, then you have to search for all armed timers of
> > > that namespace and adjust them.
> > >
> > > The original posix timer code had the same issue because it mapped the
> > > clock realtime timers to the timer wheel so any setting of the clock 
> > > caused
> > > a full walk of all armed timers, disarming, adjusting and requeing
> > > them. That's horrible not only performance wise, it's also a locking
> > > nightmare of all sorts.
> > >
> > > Add time skew via NTP/PTP into the picture and you might have to adjust
> > > timers as well, because you need to guarantee that they are not expiring
> > > early.
> > >
> > > I haven't looked through Dimitry's patches yet, but I don't see how this
> > > can work at all without introducing subtle issues all over the place.
> > 
> > Then it sounds like this will take some more digging.
> > 
> > > Please pardon me for thinking out loud.
> > 
> > There are one or more time sources that we use to compute the time
> > and for each time source we have a conversion from ticks of the
> > time source to nanoseconds.
> > 
> > Each time source needs to be sampled at least once per wrap-around
> > > and something incremented so that we don't lose time when looking
> > at that time source.
> > 
> > There are several clocks presented to userspace and they all share the
> > same length of second and are all fundamentally offsets from
> > CLOCK_MONOTONIC.
> > 
> > I see two fundamental driving cases for a time namespace.
> > 1) Migration from one node to another node in a cluster in almost
> >real time.
> > 
> >The problem is that CLOCK_MONOTONIC between nodes in the cluster
> >has not relation ship to each other (except a synchronized length of
> >the second).  So applications that migrate can see CLOCK_MONOTONIC
> >and CLOCK_BOOTTIME go backwards.
> > 
> >This is the truly pressing problem and adding some kind of offset
> >sounds like it would be the solution.  Possibly by allowing a boot
> >time synchronization of CLOCK_BOOTTIME and CLOCK_MONOTONIC.
> > 
> > 2) Dealing with two separate time management domains.  Say a machine
> > >that needs to deal with both something inside of google where they
> >sl

Re: [RFC 00/20] ns: Introduce Time Namespace

2018-10-20 Thread Andrei Vagin
On Fri, Sep 28, 2018 at 07:03:22PM +0200, Eric W. Biederman wrote:
> Thomas Gleixner  writes:
> 
> > On Wed, 26 Sep 2018, Eric W. Biederman wrote:
> >> Reading the code the calling sequence there is:
> >> tick_sched_do_timer
> >>tick_do_update_jiffies64
> >>   update_wall_time
> >>   timekeeping_advance
> >>  timekeeping_update
> >> 
> >> If I read that properly under the right nohz circumstances that update
> >> can be delayed indefinitely.
> >> 
> >> So I think we could prototype a time namespace that was per
> >> timekeeping_update and just had update_wall_time iterate through
> >> all of the time namespaces.
> >
> > Please don't go there. timekeeping_update() is already heavy and walking
> > through a gazillion of namespaces will just make it horrible,
> >
> >> I don't think the naive version would scale to very many time
> >> namespaces.
> >
> > :)
> >
> >> At the same time using the techniques from the nohz work and a little
> >> smarts I expect we could get the code to scale.
> >
> > You'd need to invoke the update when the namespace is switched in and
> > hasn't been updated since the last tick happened. That might be doable, but
> > you also need to take the wraparound constraints of the underlying
> > clocksources into account, which again can cause walking all name spaces
> > when they are all idle long enough.
> 
> The wrap around constraints being how long before the time sources wrap
> around so you have to read them once per wrap around?  I have not dug
> deeply enough into the code to see that yet.
> 
> > From there it becomes hairy, because it's not only timekeeping,
> > i.e. reading time, this is also affecting all timers which are armed from a
> > namespace.
> >
> > That gets really ugly because when you do settimeofday() or adjtimex() for
> > a particular namespace, then you have to search for all armed timers of
> > that namespace and adjust them.
> >
> > The original posix timer code had the same issue because it mapped the
> > clock realtime timers to the timer wheel so any setting of the clock caused
> > a full walk of all armed timers, disarming, adjusting and requeing
> > them. That's horrible not only performance wise, it's also a locking
> > nightmare of all sorts.
> >
> > Add time skew via NTP/PTP into the picture and you might have to adjust
> > timers as well, because you need to guarantee that they are not expiring
> > early.
> >
> > I haven't looked through Dimitry's patches yet, but I don't see how this
> > can work at all without introducing subtle issues all over the place.
> 
> Then it sounds like this will take some more digging.
> 
> Please pardon me for thinking out loud.
> 
> There are one or more time sources that we use to compute the time
> and for each time source we have a conversion from ticks of the
> time source to nanoseconds.
> 
> Each time source needs to be sampled at least once per wrap-around
> and something incremented so that we don't lose time when looking
> at that time source.
> 
> There are several clocks presented to userspace and they all share the
> same length of second and are all fundamentally offsets from
> CLOCK_MONOTONIC.
> 
> I see two fundamental driving cases for a time namespace.
> 1) Migration from one node to another node in a cluster in almost
>real time.
> 
>The problem is that CLOCK_MONOTONIC between nodes in the cluster
>has no relationship to each other (except a synchronized length of
>the second).  So applications that migrate can see CLOCK_MONOTONIC
>and CLOCK_BOOTTIME go backwards.
> 
>This is the truly pressing problem and adding some kind of offset
>sounds like it would be the solution.  Possibly by allowing a boot
>time synchronization of CLOCK_BOOTTIME and CLOCK_MONOTONIC.
> 
> 2) Dealing with two separate time management domains.  Say a machine
>that needs to deal with both something inside of google where they
>slew time to avoid leap time seconds and something in the outside
>world where proper UTC time is kept as an offset from TAI with the
>occasional leap seconds.
> 
>In the latter case it would fundamentally require having seconds of
>different length.
> 

I want to add that the second case should be optional.

When a container is migrated to another host, we have to restore its
monotonic and boottime clocks, but we still expect that the container
will continue using the host real-time clock.

Before starting this series, I was thinking about this, and I decided that
these cases can be solved independently. Probably, the full isolation of
the time sub-system will have much higher overhead than just offsets for
a few clocks. And the idea that isolation of the real-time clock should
be optional gives us another hint that offsets for monotonic and
boot-time clocks can be implemented independently.

Eric and Tomas, what do you think about this? If you agree that these
two cases can be implemented separately, what should we do with this

Re: [PATCH v6 1/1] ns: add binfmt_misc to the user namespace

2018-10-16 Thread Andrei Vagin
On Wed, Oct 10, 2018 at 06:14:30PM +0200, Laurent Vivier wrote:
> This patch allows to have a different binfmt_misc configuration
> for each new user namespace. By default, the binfmt_misc configuration
> is the one of the previous level, but if the binfmt_misc filesystem is
> mounted in the new namespace a new empty binfmt instance is created and
> used in this namespace.
> 
> For instance, using "unshare" we can start a chroot of another
> architecture and configure the binfmt_misc interpreter without being root
> to run the binaries in this chroot.
> 
> Signed-off-by: Laurent Vivier 

Acked-by: Andrei Vagin 

Thanks,
Andrei

> ---
>  fs/binfmt_misc.c   | 111 -
>  include/linux/user_namespace.h |  15 +
>  kernel/user.c  |  14 +
>  kernel/user_namespace.c|   3 +
>  4 files changed, 115 insertions(+), 28 deletions(-)
> 
> diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
> index aa4a7a23ff99..df9dc3248b7b 100644
> --- a/fs/binfmt_misc.c
> +++ b/fs/binfmt_misc.c
> @@ -38,9 +38,6 @@ enum {
>   VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */
>  };
>  
> -static LIST_HEAD(entries);
> -static int enabled = 1;
> -
>  enum {Enabled, Magic};
>  #define MISC_FMT_PRESERVE_ARGV0 (1 << 31)
>  #define MISC_FMT_OPEN_BINARY (1 << 30)
> @@ -60,10 +57,7 @@ typedef struct {
>   struct file *interp_file;
>  } Node;
>  
> -static DEFINE_RWLOCK(entries_lock);
>  static struct file_system_type bm_fs_type;
> -static struct vfsmount *bm_mnt;
> -static int entry_count;
>  
>  /*
>   * Max length of the register string.  Determined by:
> @@ -80,18 +74,37 @@ static int entry_count;
>   */
>  #define MAX_REGISTER_LENGTH 1920
>  
> +static struct binfmt_namespace *binfmt_ns(struct user_namespace *ns)
> +{
> + struct binfmt_namespace *b_ns;
> +
> + while (ns) {
> + b_ns = READ_ONCE(ns->binfmt_ns);
> + if (b_ns)
> + return b_ns;
> + ns = ns->parent;
> + }
> + /* as the first user namespace is initialized with
> +  * _binfmt_ns we should never come here
> +  * but we try to stay safe by logging a warning
> +  * and returning a sane value
> +  */
> + WARN_ON_ONCE(1);
> + return _binfmt_ns;
> +}
> +
>  /*
>   * Check if we support the binfmt
>   * if we do, return the node, else NULL
>   * locking is done in load_misc_binary
>   */
> -static Node *check_file(struct linux_binprm *bprm)
> +static Node *check_file(struct binfmt_namespace *ns, struct linux_binprm 
> *bprm)
>  {
>   char *p = strrchr(bprm->interp, '.');
>   struct list_head *l;
>  
>   /* Walk all the registered handlers. */
> - list_for_each(l, ) {
> + list_for_each(l, >entries) {
>   Node *e = list_entry(l, Node, list);
>   char *s;
>   int j;
> @@ -133,17 +146,18 @@ static int load_misc_binary(struct linux_binprm *bprm)
>   struct file *interp_file = NULL;
>   int retval;
>   int fd_binary = -1;
> + struct binfmt_namespace *ns = binfmt_ns(current_user_ns());
>  
>   retval = -ENOEXEC;
> - if (!enabled)
> + if (!ns->enabled)
>   return retval;
>  
>   /* to keep locking time low, we copy the interpreter string */
> - read_lock(_lock);
> - fmt = check_file(bprm);
> + read_lock(>entries_lock);
> + fmt = check_file(ns, bprm);
>   if (fmt)
>   dget(fmt->dentry);
> - read_unlock(_lock);
> + read_unlock(>entries_lock);
>   if (!fmt)
>   return retval;
>  
> @@ -609,19 +623,19 @@ static void bm_evict_inode(struct inode *inode)
>   kfree(e);
>  }
>  
> -static void kill_node(Node *e)
> +static void kill_node(struct binfmt_namespace *ns, Node *e)
>  {
>   struct dentry *dentry;
>  
> - write_lock(_lock);
> + write_lock(>entries_lock);
>   list_del_init(>list);
> - write_unlock(_lock);
> + write_unlock(>entries_lock);
>  
>   dentry = e->dentry;
>   drop_nlink(d_inode(dentry));
>   d_drop(dentry);
>   dput(dentry);
> - simple_release_fs(_mnt, _count);
> + simple_release_fs(>bm_mnt, >entry_count);
>  }
>  
>  /* / */
> @@ -651,6 +665,9 @@ static ssize_t bm_entry_write(struct file *file, const 
> char __user *buffer,
>   struct dentry *root;
>   Node *e = file_inode(file)->i_private;
>   int res = parse_command(buffer, count);
> + struct binfmt_namespace *ns;
> +
> + ns = binfmt_ns(file

Re: [PATCH] ptrace: zero out siginfo_t in ptrace_peek_siginfo()

2018-10-10 Thread Andrei Vagin
On Wed, Sep 26, 2018 at 05:17:25PM +0200, Alexander Potapenko wrote:
> KMSAN reported the following infoleak:
> 
> ==
> BUG: KMSAN: kernel-infoleak in _copy_to_user+0x15d/0x1f0
> ...
> Call Trace:
>  __dump_stack lib/dump_stack.c:77
>  dump_stack+0x2f5/0x430 lib/dump_stack.c:113
>  kmsan_report+0x183/0x2b0 mm/kmsan/kmsan.c:917
>  kmsan_internal_check_memory+0x17e/0x1f0 mm/kmsan/kmsan.c:981
>  kmsan_copy_to_user+0x79/0xc0 mm/kmsan/kmsan_hooks.c:482
>  _copy_to_user+0x15d/0x1f0 lib/usercopy.c:31
>  copy_to_user ./include/linux/uaccess.h:183
>  copy_siginfo_to_user+0x81/0x130 kernel/signal.c:2897
>  ptrace_peek_siginfo kernel/ptrace.c:741
>  ptrace_request+0x2278/0x2680 kernel/ptrace.c:912
>  arch_ptrace+0xbdd/0x11a0 arch/x86/kernel/ptrace.c:877
>  __do_sys_ptrace kernel/ptrace.c:1145
>  __se_sys_ptrace+0x422/0x920 kernel/ptrace.c:1110
>  __x64_sys_ptrace+0x56/0x70 kernel/ptrace.c:1110
>  do_syscall_64+0xb8/0x100 arch/x86/entry/common.c:291
>  entry_SYSCALL_64_after_hwframe+0x63/0xe7 arch/x86/entry/entry_64.S:240
> ...
> Local variable description: info.i@ptrace_request
> Variable was created at:
>  ptrace_peek_siginfo kernel/ptrace.c:712
>  ptrace_request+0xdf/0x2680 kernel/ptrace.c:912
>  arch_ptrace+0xbdd/0x11a0 arch/x86/kernel/ptrace.c:877
> 
> Bytes 16-127 of 128 are uninitialized
> Memory access starts at 88007af6fc90
> ==
> 
> when calling ptrace(PTRACE_PEEKSIGINFO) for a traceable child process
> with args = {-1, 0, 1}.
> 
> Initialize the |info| structure to avoid leaking stack data.


"info" is filled up by copy_siginfo(), which overwrites everything.

static inline void copy_siginfo(struct siginfo *to, const struct siginfo *from)
{
memcpy(to, from, sizeof(*to));
}


so here is another problem. We handle arg.off incorrectly. The right fix
should look something like this:

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 21fec73d45d4..e336434a6f71 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -710,7 +710,7 @@ static int ptrace_peek_siginfo(struct task_struct *child,
 
for (i = 0; i < arg.nr; ) {
siginfo_t info;
-   s32 off = arg.off + i;
+   u64 off = arg.off + i;
 
spin_lock_irq(>sighand->siglock);
list_for_each_entry(q, >list, list) {
@@ -721,7 +721,7 @@ static int ptrace_peek_siginfo(struct task_struct *child,
}
spin_unlock_irq(>sighand->siglock);
 
-   if (off >= 0) /* beyond the end of the list */
+   if (off + 1 != 0) /* beyond the end of the list */
break;
 
 #ifdef CONFIG_COMPAT


> 
> Signed-off-by: Alexander Potapenko 
> Reported-by: syzbot+69c3bd9869b32e394...@syzkaller.appspotmail.com
> Fixes: 84c751bd4aebb ("ptrace: add ability to retrieve signals without
> removing from a queue (v4)")
> Cc: Andrey Vagin 
> Cc: Oleg Nesterov 
> Cc: Willy Tarreau 
> ---
>  kernel/ptrace.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> index 21fec73d45d4..92c3855c2b9c 100644
> --- a/kernel/ptrace.c
> +++ b/kernel/ptrace.c
> @@ -712,6 +712,7 @@ static int ptrace_peek_siginfo(struct task_struct *child,
>   siginfo_t info;
>   s32 off = arg.off + i;
>  
> + memset(, 0, sizeof(info));
>   spin_lock_irq(>sighand->siglock);
>   list_for_each_entry(q, >list, list) {
>   if (!off--) {
> -- 
> 2.19.0.605.g01d371f741-goog
> 


Re: [RFC v4 1/1] ns: add binfmt_misc to the user namespace

2018-10-07 Thread Andrei Vagin
On Sat, Oct 06, 2018 at 09:35:46PM +0200, Laurent Vivier wrote:
> This patch allows to have a different binfmt_misc configuration
> for each new user namespace. By default, the binfmt_misc configuration
> is the one of the previous level, but if the binfmt_misc filesystem is
> mounted in the new namespace a new empty binfmt instance is created and
> used in this namespace.
> 
> For instance, using "unshare" we can start a chroot of an another
> architecture and configure the binfmt_misc interpreter without being root
> to run the binaries in this chroot.
> 
> Signed-off-by: Laurent Vivier 
> ---
>  fs/binfmt_misc.c   | 99 --
>  include/linux/user_namespace.h | 13 +
>  kernel/user.c  | 13 +
>  kernel/user_namespace.c|  7 +++
>  4 files changed, 104 insertions(+), 28 deletions(-)
> 
> diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
> index aa4a7a23ff99..1beefafcb416 100644
> --- a/fs/binfmt_misc.c
> +++ b/fs/binfmt_misc.c
> @@ -38,9 +38,6 @@ enum {
>   VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */
>  };
>  
> -static LIST_HEAD(entries);
> -static int enabled = 1;
> -
>  enum {Enabled, Magic};
>  #define MISC_FMT_PRESERVE_ARGV0 (1 << 31)
>  #define MISC_FMT_OPEN_BINARY (1 << 30)
> @@ -60,10 +57,7 @@ typedef struct {
>   struct file *interp_file;
>  } Node;
>  
> -static DEFINE_RWLOCK(entries_lock);
>  static struct file_system_type bm_fs_type;
> -static struct vfsmount *bm_mnt;
> -static int entry_count;
>  
>  /*
>   * Max length of the register string.  Determined by:
> @@ -80,18 +74,28 @@ static int entry_count;
>   */
>  #define MAX_REGISTER_LENGTH 1920
>  
> +static struct binfmt_namespace *binfmt_ns(struct user_namespace *ns)
> +{
> + while (ns) {
> + if (ns->binfmt_ns)
> + return ns->binfmt_ns;
> + ns = ns->parent;
> + }
> + return NULL;
> +}
> +
>  /*
>   * Check if we support the binfmt
>   * if we do, return the node, else NULL
>   * locking is done in load_misc_binary
>   */
> -static Node *check_file(struct linux_binprm *bprm)
> +static Node *check_file(struct binfmt_namespace *ns, struct linux_binprm 
> *bprm)
>  {
>   char *p = strrchr(bprm->interp, '.');
>   struct list_head *l;
>  
>   /* Walk all the registered handlers. */
> - list_for_each(l, ) {
> + list_for_each(l, >entries) {
>   Node *e = list_entry(l, Node, list);
>   char *s;
>   int j;
> @@ -133,17 +137,18 @@ static int load_misc_binary(struct linux_binprm *bprm)
>   struct file *interp_file = NULL;
>   int retval;
>   int fd_binary = -1;
> + struct binfmt_namespace *ns = binfmt_ns(current_user_ns());
>  
>   retval = -ENOEXEC;
> - if (!enabled)
> + if (!ns->enabled)
>   return retval;
>  
>   /* to keep locking time low, we copy the interpreter string */
> - read_lock(_lock);
> - fmt = check_file(bprm);
> + read_lock(>entries_lock);
> + fmt = check_file(ns, bprm);
>   if (fmt)
>   dget(fmt->dentry);
> - read_unlock(_lock);
> + read_unlock(>entries_lock);
>   if (!fmt)
>   return retval;
>  
> @@ -609,19 +614,19 @@ static void bm_evict_inode(struct inode *inode)
>   kfree(e);
>  }
>  
> -static void kill_node(Node *e)
> +static void kill_node(struct binfmt_namespace *ns, Node *e)
>  {
>   struct dentry *dentry;
>  
> - write_lock(_lock);
> + write_lock(>entries_lock);
>   list_del_init(>list);
> - write_unlock(_lock);
> + write_unlock(>entries_lock);
>  
>   dentry = e->dentry;
>   drop_nlink(d_inode(dentry));
>   d_drop(dentry);
>   dput(dentry);
> - simple_release_fs(_mnt, _count);
> + simple_release_fs(>bm_mnt, >entry_count);
>  }
>  
>  /* / */
> @@ -651,6 +656,9 @@ static ssize_t bm_entry_write(struct file *file, const 
> char __user *buffer,
>   struct dentry *root;
>   Node *e = file_inode(file)->i_private;
>   int res = parse_command(buffer, count);
> + struct binfmt_namespace *ns;
> +
> + ns = binfmt_ns(file->f_path.dentry->d_sb->s_user_ns);
>  
>   switch (res) {
>   case 1:
> @@ -667,7 +675,7 @@ static ssize_t bm_entry_write(struct file *file, const 
> char __user *buffer,
>   inode_lock(d_inode(root));
>  
>   if (!list_empty(>list))
> - kill_node(e);
> + kill_node(ns, e);
>  
>   inode_unlock(d_inode(root));
>   break;
> @@ -693,6 +701,7 @@ static ssize_t bm_register_write(struct file *file, const 
> char __user *buffer,
>   struct inode *inode;
>   struct super_block *sb = file_inode(file)->i_sb;
>   struct dentry *root = sb->s_root, *dentry;
> + struct binfmt_namespace *ns;
>   int err = 0;
>  
>   e = create_entry(buffer, count);
> @@ -716,7 +725,9 @@ static ssize_t bm_register_write(struct file *file, const 
> 

Re: [RFC v3 1/1] ns: add binfmt_misc to the user namespace

2018-10-06 Thread Andrei Vagin
On Thu, Oct 04, 2018 at 12:50:22AM +0200, Laurent Vivier wrote:
> This patch allows to have a different binfmt_misc configuration
> for each new user namespace. By default, the binfmt_misc configuration
> is the one of the host, but if the binfmt_misc filesystem is mounted
> in the new namespace a new empty binfmt instance is created and used
> in this namespace.
> 
> For instance, using "unshare" we can start a chroot of an another
> architecture and configure the binfmt_misc interpreter without being root
> to run the binaries in this chroot.
> 
> Signed-off-by: Laurent Vivier 
> ---
>  fs/binfmt_misc.c   | 85 +++---
>  include/linux/user_namespace.h | 15 ++
>  kernel/user.c  | 14 ++
>  kernel/user_namespace.c|  9 
>  4 files changed, 95 insertions(+), 28 deletions(-)
> 
> diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
> index aa4a7a23ff99..78780bc87506 100644
> --- a/fs/binfmt_misc.c
> +++ b/fs/binfmt_misc.c
> @@ -38,9 +38,6 @@ enum {
>   VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */
>  };
>  
> -static LIST_HEAD(entries);
> -static int enabled = 1;
> -
>  enum {Enabled, Magic};
>  #define MISC_FMT_PRESERVE_ARGV0 (1 << 31)
>  #define MISC_FMT_OPEN_BINARY (1 << 30)
> @@ -60,10 +57,7 @@ typedef struct {
>   struct file *interp_file;
>  } Node;
>  
> -static DEFINE_RWLOCK(entries_lock);
>  static struct file_system_type bm_fs_type;
> -static struct vfsmount *bm_mnt;
> -static int entry_count;
>  
>  /*
>   * Max length of the register string.  Determined by:
> @@ -85,13 +79,13 @@ static int entry_count;
>   * if we do, return the node, else NULL
>   * locking is done in load_misc_binary
>   */
> -static Node *check_file(struct linux_binprm *bprm)
> +static Node *check_file(struct user_namespace *ns, struct linux_binprm *bprm)
>  {
>   char *p = strrchr(bprm->interp, '.');
>   struct list_head *l;
>  
>   /* Walk all the registered handlers. */
> - list_for_each(l, ) {
> + list_for_each(l, >binfmt_ns->entries) {
>   Node *e = list_entry(l, Node, list);
>   char *s;
>   int j;
> @@ -133,17 +127,18 @@ static int load_misc_binary(struct linux_binprm *bprm)
>   struct file *interp_file = NULL;
>   int retval;
>   int fd_binary = -1;
> + struct user_namespace *ns = current_user_ns();
>  
>   retval = -ENOEXEC;
> - if (!enabled)
> + if (!ns->binfmt_ns->enabled)
>   return retval;
>  
>   /* to keep locking time low, we copy the interpreter string */
> - read_lock(_lock);
> - fmt = check_file(bprm);
> + read_lock(>binfmt_ns->entries_lock);

It looks like ns->binfmt_ns isn't protected by any lock and
ns->binfmt_ns can be changed between read_lock() and read_unlock().

This can be fixed if ns->binfmt_ns is dereferenced only once in
this function:

struct binfmt_namespace *binfmt_ns = ns->binfmt_ns;

> + fmt = check_file(ns ,bprm);
>   if (fmt)
>   dget(fmt->dentry);
> - read_unlock(_lock);
> + read_unlock(>binfmt_ns->entries_lock);
>   if (!fmt)
>   return retval;
>  
> @@ -609,19 +604,19 @@ static void bm_evict_inode(struct inode *inode)
>   kfree(e);
>  }
>  
> -static void kill_node(Node *e)
> +static void kill_node(struct user_namespace *ns, Node *e)
>  {
>   struct dentry *dentry;
>  
> - write_lock(_lock);
> + write_lock(>binfmt_ns->entries_lock);
>   list_del_init(>list);
> - write_unlock(_lock);
> + write_unlock(>binfmt_ns->entries_lock);
>  
>   dentry = e->dentry;
>   drop_nlink(d_inode(dentry));
>   d_drop(dentry);
>   dput(dentry);
> - simple_release_fs(_mnt, _count);
> + simple_release_fs(>binfmt_ns->bm_mnt, >binfmt_ns->entry_count);
>  }
>  
>  /* / */
> @@ -651,6 +646,7 @@ static ssize_t bm_entry_write(struct file *file, const 
> char __user *buffer,
>   struct dentry *root;
>   Node *e = file_inode(file)->i_private;
>   int res = parse_command(buffer, count);
> + struct user_namespace *ns = file->f_path.dentry->d_sb->s_user_ns;
>  
>   switch (res) {
>   case 1:
> @@ -667,7 +663,7 @@ static ssize_t bm_entry_write(struct file *file, const 
> char __user *buffer,
>   inode_lock(d_inode(root));
>  
>   if (!list_empty(>list))
> - kill_node(e);
> + kill_node(ns, e);
>  
>   inode_unlock(d_inode(root));
>   break;
> @@ -693,6 +689,7 @@ static ssize_t bm_register_write(struct file *file, const 
> char __user *buffer,
>   struct inode *inode;
>   struct super_block *sb = file_inode(file)->i_sb;
>   struct dentry *root = sb->s_root, *dentry;
> + struct user_namespace *ns = file->f_path.dentry->d_sb->s_user_ns;
>   int err = 0;
>  
>   e = create_entry(buffer, count);
> @@ -716,7 +713,8 @@ static ssize_t bm_register_write(struct file *file, const 
> 

Re: [REVIEW][PATCH 2/6] signal: Fail sigqueueinfo if si_signo != sig

2018-10-05 Thread Andrei Vagin
On Tue, Sep 25, 2018 at 07:19:02PM +0200, Eric W. Biederman wrote:
> The kernel needs to validate that the contents of struct siginfo make
> sense as siginfo is copied into the kernel, so that the proper union
> members can be put in the appropriate locations.  The field si_signo
> is a fundamental part of that validation.  As such changing the
> contents of si_signo after the validation make no sense and can result
> in nonsense values in the kernel.

According to the man page, the user should not set si_signo, it has to be set
by the kernel.

$ man 2 rt_sigqueueinfo

The uinfo argument specifies the data to accompany  the  signal.   This
   argument  is  a  pointer to a structure of type siginfo_t, described in
   sigaction(2) (and defined  by  including  ).   The  caller
   should set the following fields in this structure:

   si_code
  This  must  be  one of the SI_* codes in the Linux kernel source
  file include/asm-generic/siginfo.h, with  the  restriction  that
  the  code  must  be  negative (i.e., cannot be SI_USER, which is
  used by the kernel to indicate a signal  sent  by  kill(2))  and
  cannot  (since  Linux  2.6.39) be SI_TKILL (which is used by the
  kernel to indicate a signal sent using tgkill(2)).

   si_pid This should be set to a process ID, typically the process ID  of
  the sender.

   si_uid This  should  be set to a user ID, typically the real user ID of
  the sender.

   si_value
  This field contains the user data to accompany the signal.   For
  more information, see the description of the last (union sigval)
  argument of sigqueue(3).

   Internally, the kernel sets the si_signo field to the  value  specified
   in  sig,  so that the receiver of the signal can also obtain the signal
   number via that field.

> 
> As such simply fail if someone is silly enough to set si_signo out of
> sync with the signal number passed to sigqueueinfo.
> 
> I don't expect a problem as glibc's sigqueue implementation sets
> "si_signo = sig" and CRIU just returns to the kernel what the kernel
> gave to it.
> 
> If there is some application that calls sigqueueinfo directly that has
> a problem with this added sanity check we can revisit this when we see
> what kind of crazy that application is doing.


I already know two "applications" ;)

https://github.com/torvalds/linux/blob/master/tools/testing/selftests/ptrace/peeksiginfo.c
https://github.com/checkpoint-restore/criu/blob/master/test/zdtm/static/sigpending.c

Disclaimer: I'm the author of both of them.

> 
> Signed-off-by: "Eric W. Biederman" 
> ---
>  kernel/signal.c | 6 --
>  1 file changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/kernel/signal.c b/kernel/signal.c
> index 7b49c31d3fdb..e445b0a63faa 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -3306,7 +3306,8 @@ static int do_rt_sigqueueinfo(pid_t pid, int sig, 
> siginfo_t *info)
>   (task_pid_vnr(current) != pid))
>   return -EPERM;
>  
> - info->si_signo = sig;
> + if (info->si_signo != sig)
> + return -EINVAL;
>  
>   /* POSIX.1b doesn't mention process groups.  */
>   return kill_proc_info(sig, info, pid);
> @@ -3354,7 +3355,8 @@ static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, 
> int sig, siginfo_t *info)
>   (task_pid_vnr(current) != pid))
>   return -EPERM;
>  
> - info->si_signo = sig;
> + if (info->si_signo != sig)
> + return -EINVAL;
>  
>   return do_send_specific(tgid, pid, sig, info);
>  }


Re: [PATCH 7/7] aio: implement io_pgetevents

2018-07-09 Thread Andrei Vagin
On Sun, Jul 08, 2018 at 10:44:00PM +0200, Christoph Hellwig wrote:
> On Wed, Jul 04, 2018 at 04:21:16PM +0200, Adrian Reber wrote:
> > In file included from /usr/include/linux/signal.h:5,
> >  from /usr/include/linux/aio_abi.h:32,
> >  from include.c:2:
> > /usr/include/asm/signal.h:16:23: error: conflicting types for ‘sigset_t’
> >  typedef unsigned long sigset_t;
> >^~~~
> > In file included from /usr/include/signal.h:35,
> >  from include.c:1:
> > /usr/include/bits/types/sigset_t.h:7:20: note: previous declaration of 
> > ‘sigset_t’ was here
> >  typedef __sigset_t sigset_t;
> 
> I guess we could do something like the patch below, although it is
> rather ugly:
> 
> diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
> index 75846164290e..b7705ad66d78 100644
> --- a/include/uapi/linux/aio_abi.h
> +++ b/include/uapi/linux/aio_abi.h
> @@ -29,7 +29,11 @@
>  
>  #include 
>  #include 
> +#ifdef __KERNEL__
>  #include 
> +#else
> +#include 
> +#endif

I think we can not do this because this header specifies the kernel
API, but signal.h is provided by libc and sigset_t can be defined
differently there:

[avagin@laptop ~]$ cat test.c 
#ifdef TEST_LINUX_SIGNAL
#  include 
#  include 
#else
#  include 
#endif
#include 
int main()
{
printf("sizeof(sigset_t) = %d\n", sizeof(sigset_t));
return 0;
}
[avagin@laptop ~]$ gcc -DTEST_LINUX_SIGNAL test.c && ./a.out
sizeof(sigset_t) = 8
[avagin@laptop ~]$ gcc test.c && ./a.out
sizeof(sigset_t) = 128

[avagin@laptop include]$ rpm -qf /usr/include/signal.h 
glibc-headers-2.27-8.fc28.i686
glibc-headers-2.27-8.fc28.x86_64
[avagin@laptop include]$ rpm -qf /usr/include/linux/signal.h 
kernel-headers-4.16.5-300.fc28.x86_64

>  #include 
>  
>  typedef __kernel_ulong_t aio_context_t;


Re: [12/24] proc: Add fs_context support to procfs [ver #7]

2018-06-27 Thread Andrei Vagin
On Tue, Jun 26, 2018 at 09:57:07AM +0100, David Howells wrote:
> Andrei Vagin  wrote:
> 
> > > > > - mnt = kern_mount_data(_fs_type, ns, 0);
> > > 
> > > Here ns->user_ns and get_current_cred()->user_ns are not always equal
> > 
> > What do you think about the attached patch?
> > ...
> > -   fc = vfs_new_fs_context(_fs_type, NULL, 0,
> > -   FS_CONTEXT_FOR_KERNEL_MOUNT);
> > +   fc = vfs_new_fs_context_userns(_fs_type, NULL, 0,
> > +   FS_CONTEXT_FOR_KERNEL_MOUNT, ns->user_ns);
> 
> Or you could just change fc->user_ns immediately after calling
> vfs_new_fs_context().  This is what network filesystems should do with
> fc->net_ns, for example.

Ok, it works for me. The patch is attached.

> 
> > -struct fs_context *vfs_new_fs_context(struct file_system_type *fs_type,
> > +struct fs_context *vfs_new_fs_context_userns(struct file_system_type 
> > *fs_type,
> >   struct dentry *reference,
> >   unsigned int sb_flags,
> > - enum fs_context_purpose purpose)
> > + enum fs_context_purpose purpose,
> > + struct user_namespace *user_ns)
> 
> 
> If you'd really rather add a new parameter, please don't rename the function
> to vfs_new_fs_context_userns() - just add a new parameter.  There don't need
> to be two versions of it.
> 
> 
> This brings me to another thought:  I want to add the ability to let
> namespaces be configured by userspace, for example:

It may be a good feature, but I am not sure about procfs. A procfs
instance is created per pidns, so they should have the same owner
userns.

> 
>   fd = fsopen("nfs");
>   sprintf(buf, "ns user %d", my_user_ns_fd);
>   write(fd, buf);
>   sprintf(buf, "ns net %d", my_net_ns_fd);
>   write(fd, buf);
>   write(fd, "s fedoraproject.org:/pub");
>   write(fd, "o intr");
>   ...
> 
> I think therefore, I might need to insert another phase between creating the
> context and calling the filesystem initialiser:
> 
>   fc = vfs_new_fs_context(_fs_type, mntpt, 0,
>   FS_CONTEXT_FOR_SUBMOUNT);
> 
> followed by:
> 
>   vfs_sb_set_namespace(fc, THIS_IS_USER_NS, user_ns);
>   vfs_sb_set_namespace(fc, THIS_IS_NET_NS, net_ns);
> 
> but then we'd need to do:
> 
>   vfs_begin_options(fc);
> 
> before continuing (unless we made this happen automatically on the receipt of
> the first option):
> 
>   afs_mntpt_set_params(fc, mntpt);
>   vfs_get_tree(fc);
>   mnt = vfs_create_mount(fc, 0);
> 
> Alternatively, we could do the namespace setting after initialisation and let
> the fs apply the changes itself.
> 
> David
>From 2297ffb333a7bcee466a5273a3fc84202b9695a6 Mon Sep 17 00:00:00 2001
From: Andrei Vagin 
Date: Wed, 27 Jun 2018 22:45:43 -0700
Subject: [PATCH] proc: set a proper user namespace for fs_context

A user namespace should be taken from a pidns for which a procfs is created.

Signed-off-by: Andrei Vagin 
---
 fs/proc/root.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/fs/proc/root.c b/fs/proc/root.c
index efbdc08a3c86..59aaf06a40c7 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -303,6 +303,11 @@ int pid_ns_prepare_proc(struct pid_namespace *ns)
if (IS_ERR(fc))
return PTR_ERR(fc);
 
+   if (fc->user_ns != ns->user_ns) {
+   put_user_ns(fc->user_ns);
+   fc->user_ns = get_user_ns(ns->user_ns);
+   }
+
ctx = fc->fs_private;
if (ctx->pid_ns != ns) {
put_pid_ns(ctx->pid_ns);
-- 
2.17.0



Re: [12/24] proc: Add fs_context support to procfs [ver #7]

2018-06-26 Thread Andrei Vagin
On Mon, Jun 25, 2018 at 11:13:20PM -0700, Andrei Vagin wrote:
> On Mon, Jun 18, 2018 at 08:34:50PM -0700, Andrei Vagin wrote:
> > Hi David,
> > 
> > > We run CRIU tests for vfs/for-next, and today a few of these tests failed. I
> > found that the problem appears after this patch..
> > 
> > >  int pid_ns_prepare_proc(struct pid_namespace *ns)
> > >  {
> > > + struct proc_fs_context *ctx;
> > > + struct fs_context *fc;
> > >   struct vfsmount *mnt;
> > > + int ret;
> > > +
> > > + fc = vfs_new_fs_context(_fs_type, NULL, 0,
> > > + FS_CONTEXT_FOR_KERNEL_MOUNT);
> > > + if (IS_ERR(fc))
> > > + return PTR_ERR(fc);
> > > +
> > > + ctx = container_of(fc, struct proc_fs_context, fc);
> > > + if (ctx->pid_ns != ns) {
> > > + put_pid_ns(ctx->pid_ns);
> > > + get_pid_ns(ns);
> > > + ctx->pid_ns = ns;
> > > + }
> > > +
> > > + ret = vfs_get_tree(fc);
> > > + if (ret < 0) {
> > > + put_fs_context(fc);
> > > + return ret;
> > > + }
> > >  
> > > - mnt = kern_mount_data(_fs_type, ns, 0);
> 
> Here ns->user_ns and get_current_cred()->user_ns are not always equal

What do you think about the attached patch?

> 
> > > + mnt = vfs_create_mount(fc);
> > > + put_fs_context(fc);
> > >   if (IS_ERR(mnt))
> > >   return PTR_ERR(mnt);
> > >  
> 
> > #define _GNU_SOURCE
> > #include 
> > #include 
> > #include 
> > #include 
> > #include 
> > #include 
> > #include 
> > #include 
> > #include 
> > #include 
> > #include 
> > 
> > 
> > #define NS_STACK_SIZE   4096
> > 
> > #define __stack_aligned__   __attribute__((aligned(16)))
> > 
> > /* All arguments should be above stack, because it grows down */
> > struct ns_exec_args {
> > char stack[NS_STACK_SIZE] __stack_aligned__;
> > char stack_ptr[0];
> > int pfd[2];
> > };
> > 
> > static int ns_exec(void *_arg)
> > {
> > struct ns_exec_args *args = (struct ns_exec_args *) _arg;
> > int ret;
> > 
> > close(args->pfd[1]);
> > if (read(args->pfd[0], , sizeof(ret)) != sizeof(ret))
> > return -1;
> > 
> > setsid();
> > 
> > if (setuid(0) || setgid(0) || setgroups(0, NULL)) {
> > fprintf(stderr, "set*id failed: %m\n");
> > return -1;
> > }
> > 
> > if (mount("proc", "/mnt", "proc", MS_MGC_VAL | MS_NOSUID | MS_NOEXEC | 
> > MS_NODEV, NULL)) {
> > fprintf(stderr, "mount(/proc) failed: %m\n");
> > return -1;
> > }
> > 
> > return 0;
> > }
> > 
> > #define UID_MAP "0 10 10\n10 20 5"
> > #define GID_MAP "0 40 5\n5 50 10"
> > int main()
> > {
> > pid_t pid;
> > int ret, status;
> > struct ns_exec_args args;
> > int flags;
> > char pname[PATH_MAX];
> > int fd, pfd[2];
> > 
> > if (pipe(pfd))
> > return 1;
> > 
> > args.pfd[0] = pfd[0];
> > args.pfd[1] = pfd[1];
> > 
> > flags = CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWUTS |
> > CLONE_NEWNET | CLONE_NEWIPC | CLONE_NEWUSER | SIGCHLD;
> > 
> > pid = clone(ns_exec, args.stack_ptr, flags, );
> > if (pid < 0) {
> > fprintf(stderr, "clone() failed: %m\n");
> > exit(1);
> > }
> > 
> > 
> > snprintf(pname, sizeof(pname), "/proc/%d/uid_map", pid);
> > fd = open(pname, O_WRONLY);
> > if (fd < 0) {
> > fprintf(stderr, "open(%s): %m\n", pname);
> > exit(1);
> > }
> > if (write(fd, UID_MAP, sizeof(UID_MAP)) < 0) {
> > fprintf(stderr, "write(" UID_MAP "): %m\n");
> > exit(1);
> > }
> > close(fd);
> > 
> > snprintf(pname, sizeof(pname), "/proc/%d/gid_map", pid);
> > fd = open(pname, O_WRONLY);
> > if (fd < 0) {
> > fprintf(stderr, "open(%s): %m\n", pname);
> > exit(1);
> > }
> > if (write(fd, GID_MAP, sizeof(GID_MAP)) < 0) {
> > fprintf(s

Re: [12/24] proc: Add fs_context support to procfs [ver #7]

2018-06-26 Thread Andrei Vagin
On Mon, Jun 18, 2018 at 08:34:50PM -0700, Andrei Vagin wrote:
> Hi David,
> 
> We run CRIU tests for vfs/for-next, and today a few of these tests failed. I
> found that the problem appears after this patch..
> 
> >  int pid_ns_prepare_proc(struct pid_namespace *ns)
> >  {
> > +   struct proc_fs_context *ctx;
> > +   struct fs_context *fc;
> > struct vfsmount *mnt;
> > +   int ret;
> > +
> > +   fc = vfs_new_fs_context(_fs_type, NULL, 0,
> > +   FS_CONTEXT_FOR_KERNEL_MOUNT);
> > +   if (IS_ERR(fc))
> > +   return PTR_ERR(fc);
> > +
> > +   ctx = container_of(fc, struct proc_fs_context, fc);
> > +   if (ctx->pid_ns != ns) {
> > +   put_pid_ns(ctx->pid_ns);
> > +   get_pid_ns(ns);
> > +   ctx->pid_ns = ns;
> > +   }
> > +
> > +   ret = vfs_get_tree(fc);
> > +   if (ret < 0) {
> > +   put_fs_context(fc);
> > +   return ret;
> > +   }
> >  
> > -   mnt = kern_mount_data(_fs_type, ns, 0);

Here ns->user_ns and get_current_cred()->user_ns are not always equal

> > +   mnt = vfs_create_mount(fc);
> > +   put_fs_context(fc);
> > if (IS_ERR(mnt))
> > return PTR_ERR(mnt);
> >  

> #define _GNU_SOURCE
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
> 
> 
> #define NS_STACK_SIZE 4096
> 
> #define __stack_aligned__ __attribute__((aligned(16)))
> 
> /* All arguments should be above stack, because it grows down */
> struct ns_exec_args {
>   char stack[NS_STACK_SIZE] __stack_aligned__;
>   char stack_ptr[0];
>   int pfd[2];
> };
> 
> static int ns_exec(void *_arg)
> {
>   struct ns_exec_args *args = (struct ns_exec_args *) _arg;
>   int ret;
> 
>   close(args->pfd[1]);
>   if (read(args->pfd[0], , sizeof(ret)) != sizeof(ret))
>   return -1;
> 
>   setsid();
> 
>   if (setuid(0) || setgid(0) || setgroups(0, NULL)) {
>   fprintf(stderr, "set*id failed: %m\n");
>   return -1;
>   }
> 
>   if (mount("proc", "/mnt", "proc", MS_MGC_VAL | MS_NOSUID | MS_NOEXEC | 
> MS_NODEV, NULL)) {
>   fprintf(stderr, "mount(/proc) failed: %m\n");
>   return -1;
>   }
> 
>   return 0;
> }
> 
> #define UID_MAP "0 10 10\n10 20 5"
> #define GID_MAP "0 40 5\n5 50 10"
> int main()
> {
>   pid_t pid;
>   int ret, status;
>   struct ns_exec_args args;
>   int flags;
>   char pname[PATH_MAX];
>   int fd, pfd[2];
> 
>   if (pipe(pfd))
>   return 1;
> 
>   args.pfd[0] = pfd[0];
>   args.pfd[1] = pfd[1];
> 
>   flags = CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWUTS |
>   CLONE_NEWNET | CLONE_NEWIPC | CLONE_NEWUSER | SIGCHLD;
> 
>   pid = clone(ns_exec, args.stack_ptr, flags, );
>   if (pid < 0) {
>   fprintf(stderr, "clone() failed: %m\n");
>   exit(1);
>   }
> 
> 
>   snprintf(pname, sizeof(pname), "/proc/%d/uid_map", pid);
>   fd = open(pname, O_WRONLY);
>   if (fd < 0) {
>   fprintf(stderr, "open(%s): %m\n", pname);
>   exit(1);
>   }
>   if (write(fd, UID_MAP, sizeof(UID_MAP)) < 0) {
>   fprintf(stderr, "write(" UID_MAP "): %m\n");
>   exit(1);
>   }
>   close(fd);
> 
>   snprintf(pname, sizeof(pname), "/proc/%d/gid_map", pid);
>   fd = open(pname, O_WRONLY);
>   if (fd < 0) {
>   fprintf(stderr, "open(%s): %m\n", pname);
>   exit(1);
>   }
>   if (write(fd, GID_MAP, sizeof(GID_MAP)) < 0) {
>   fprintf(stderr, "write(" GID_MAP "): %m\n");
>   exit(1);
>   }
>   close(fd);
> 
>   if (write(pfd[1], , sizeof(ret)) != sizeof(ret))
>   return 1;
> 
>   if (waitpid(pid, , 0) != pid)
>   return 1;
>   if (status)
>   return 1;
> 
>   return 0;
> }



Re: [16/32] kernfs, sysfs, cgroup, intel_rdt: Support fs_context [ver #8]

2018-06-22 Thread Andrei Vagin
On Fri, Jun 22, 2018 at 08:30:29AM -0700, Andrei Vagin wrote:
> On Fri, Jun 22, 2018 at 01:52:16PM +0100, David Howells wrote:
> > Andrei Vagin  wrote:
> > 
> > > ret = 0;
> > > +   ctx->root = root;
> > > goto out_unlock;
> > 
> > Okay, I can see that.
> > 
> > > percpu_ref_reinit(>cgrp.self.refcnt);
> > > mutex_unlock(_mutex);
> > > }
> > > +   cgroup_get(>cgrp);
> > 
> > This probably needs to be conditional on ret == 0.
> 
> yes, you are right


I've read the code and I think it isn't obvious. A reference will be
released in cgroup_fs_context_free() even if ret isn't zero here.

I look at do_new_mount()

vfs_new_fs_context()
...
if (vfs_get_tree()) 
goto out_fc;

out_fc:
put_fs_context(fc);
fc->ops->free(fc);
cgroup_fs_context_free()
cgroup_put(>root->cgrp);

> 
> > 
> > Which version are you testing btw?  The patches in git have been fixed a
> > little from what was last posted.
> 
> I'm testing linux-next-20180621
> 
> commit 8439c34f07a3f58245e933ca2703239417288363 (tag: next-20180621,
> linux-next/master)
> Author: Stephen Rothwell 
> Date:   Thu Jun 21 14:09:41 2018 +1000
> 
> Add linux-next specific files for 20180621
> 
> Signed-off-by: Stephen Rothwell 
> 
> > 
> > David


Re: [16/32] kernfs, sysfs, cgroup, intel_rdt: Support fs_context [ver #8]

2018-06-22 Thread Andrei Vagin
On Fri, Jun 22, 2018 at 01:52:16PM +0100, David Howells wrote:
> Andrei Vagin  wrote:
> 
> > ret = 0;
> > +   ctx->root = root;
> > goto out_unlock;
> 
> Okay, I can see that.
> 
> > percpu_ref_reinit(>cgrp.self.refcnt);
> > mutex_unlock(_mutex);
> > }
> > +   cgroup_get(>cgrp);
> 
> This probably needs to be conditional on ret == 0.

yes, you are right

> 
> Which version are you testing btw?  The patches in git have been fixed a
> little from what was last posted.

I'm testing linux-next-20180621

commit 8439c34f07a3f58245e933ca2703239417288363 (tag: next-20180621,
linux-next/master)
Author: Stephen Rothwell 
Date:   Thu Jun 21 14:09:41 2018 +1000

Add linux-next specific files for 20180621

Signed-off-by: Stephen Rothwell 

> 
> David


Re: [16/32] kernfs, sysfs, cgroup, intel_rdt: Support fs_context [ver #8]

2018-06-21 Thread Andrei Vagin
On Fri, May 25, 2018 at 01:07:08AM +0100, David Howells wrote:

...

> @@ -1972,57 +1957,51 @@ int cgroup_setup_root(struct cgroup_root *root, u16 
> ss_mask, int ref_flags)
>   return ret;
>  }
>  
> -struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
> -struct cgroup_root *root, unsigned long magic,
> -struct cgroup_namespace *ns)
> +int cgroup_do_get_tree(struct fs_context *fc)
>  {
> - struct dentry *dentry;
> - bool new_sb;
> + struct cgroup_fs_context *ctx = fc->fs_private;
> + int ret;
> +
> + ctx->kfc.root = ctx->root->kf_root;

[root@fc24 ~]# mount -t cgroup -o none,name=zdtmtst xxx /mnt/test
[root@fc24 ~]# mkdir /mnt/test/holder
[root@fc24 ~]# umount /mnt/test 
[root@fc24 ~]# mount -t cgroup -o none,name=zdtmtst xxx /mnt/test
Killed

ctx->root can be NULL here

[   93.719897] BUG: unable to handle kernel NULL pointer dereference at 

[   93.720097] PGD 8002115f5067 P4D 8002115f5067 PUD 1ef421067 PMD 0 
[   93.720179] Oops:  [#1] SMP PTI
[   93.720257] CPU: 1 PID: 13843 Comm: cgroup04 Not tainted 
4.18.0-rc1-next-20180621+ #1
[   93.720342] Hardware name: Google Google Compute Engine/Google Compute 
Engine, BIOS Google 01/01/2011
[   93.720432] RIP: 0010:cgroup_do_get_tree+0x1b/0xf0
[   93.720515] Code: 00 00 02 5b 5d c3 66 0f 1f 84 00 00 00 00 00 0f 1f 44 00 
00 41 54 55 49 89 fc 53 48 83 ec 08 48 8b 9f 90 00 00 00 48 8b 43 20 <48> 8b 00 
48 89 03 e8 8a cc 1e 00 85 c0 0f 88 97 00 00 00 48 81 7b 
[   93.720655] RSP: 0018:b07941b03df8 EFLAGS: 00010292
[   93.720740] RAX:  RBX: 9ba3527da300 RCX: 
[   93.720819] RDX:  RSI: 0001 RDI: 9ba34d47b400
[   93.720897] RBP: b07941b03e58 R08:  R09: 0002
[   93.720975] R10:  R11: 4aee8a3cb0beb9ec R12: 9ba34d47b400
[   93.721053] R13: 9ba351518000 R14: 961705d4 R15: 9ba35143f000
[   93.721131] FS:  15418d893740() GS:9ba35fd0() 
knlGS:
[   93.721233] CS:  0010 DS:  ES:  CR0: 80050033
[   93.721336] CR2:  CR3: 0001c4658004 CR4: 001606e0
[   93.721421] Call Trace:
[   93.721508]  cgroup1_get_tree+0x57c/0x640
[   93.721587]  vfs_get_tree+0x6e/0x180
[   93.721665]  do_mount+0x76b/0xa80
[   93.721753]  ksys_mount+0x80/0xd0
[   93.721831]  __x64_sys_mount+0x21/0x30
[   93.721908]  do_syscall_64+0x60/0x1b0
[   93.721987]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[   93.722065] RIP: 0033:0x15418d3bc85a

I think we need something like this:

diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index e12c0a91b8a4..b1340bd5f5fc 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -1192,6 +1192,7 @@ int cgroup1_get_tree(struct fs_context *fc)
}
 
ret = 0;
+   ctx->root = root;
goto out_unlock;
}
 
@@ -1241,6 +1242,7 @@ int cgroup1_get_tree(struct fs_context *fc)
percpu_ref_reinit(>cgrp.self.refcnt);
mutex_unlock(_mutex);
}
+   cgroup_get(>cgrp);
 
/*
 * If @pinned_sb, we're reusing an existing root and holding an

https://travis-ci.org/avagin/linux/jobs/394887987

>  
> - dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, _sb);
> + ret = kernfs_get_tree(fc);
> + if (ret < 0)
> + goto out_cgrp;
>  
>   /*
>* In non-init cgroup namespace, instead of root cgroup's dentry,
>* we return the dentry corresponding to the cgroupns->root_cgrp.
>*/
> - if (!IS_ERR(dentry) && ns != _cgroup_ns) {


Re: [12/24] proc: Add fs_context support to procfs [ver #7]

2018-06-18 Thread Andrei Vagin
Hi David,

We run CRIU tests for vfs/for-next, and today a few of these tests failed. I
found that the problem appears after this patch.

https://travis-ci.org/avagin/linux/jobs/393766778

The reproducer is attached. It creates a process in a new set of namespaces
(user, mount, etc) and then this process fails to mount procfs, the mount
syscall returns EBUSY.

666   pipe([3, 4])  = 0
666   clone(child_stack=0x7ffc23a89400, 
flags=CLONE_NEWNS|CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWNET|SIGCHLD)
 = 667
666   openat(AT_FDCWD, "/proc/667/uid_map", O_WRONLY 
667   close(4 
666   <... openat resumed> )= 5
666   write(5, "0 10 10\n10 20 50"..., 36 
667   <... close resumed> ) = 0
666   <... write resumed> ) = 36
666   close(5 
667   read(3,  
666   <... close resumed> ) = 0
666   openat(AT_FDCWD, "/proc/667/gid_map", O_WRONLY) = 5
666   write(5, "0 40 5\n5 50 1000"..., 35) = 35
666   close(5)  = 0
666   write(4, " \225\250#", 4) = 4
667   <... read resumed> " \225\250#", 4) = 4
666   wait4(667,  
667   setsid()  = 1
667   setuid(0) = 0
667   setgid(0) = 0
667   setgroups(0, NULL)= 0
667   mount("proc", "/mnt", "proc", MS_MGC_VAL|MS_NOSUID|MS_NODEV|MS_NOEXEC, 
NULL) = -1 EBUSY (Device or resource busy)

Thanks,
Andrei

On Thu, Apr 19, 2018 at 02:32:28PM +0100, David Howells wrote:
> Add fs_context support to procfs.
> 
> Signed-off-by: David Howells 
> ---
> 
>  fs/proc/inode.c|2 -
>  fs/proc/internal.h |2 -
>  fs/proc/root.c |  169 
> ++--
>  3 files changed, 113 insertions(+), 60 deletions(-)
> 
> diff --git a/fs/proc/inode.c b/fs/proc/inode.c
> index 0b13cf6eb6d7..7aa86dd65ba8 100644
> --- a/fs/proc/inode.c
> +++ b/fs/proc/inode.c
> @@ -128,7 +128,7 @@ const struct super_operations proc_sops = {
>   .drop_inode = generic_delete_inode,
>   .evict_inode= proc_evict_inode,
>   .statfs = simple_statfs,
> - .remount_fs = proc_remount,
> + .reconfigure= proc_reconfigure,
>   .show_options   = proc_show_options,
>  };
>  
> diff --git a/fs/proc/internal.h b/fs/proc/internal.h
> index 3182e1b636d3..a5ab9504768a 100644
> --- a/fs/proc/internal.h
> +++ b/fs/proc/internal.h
> @@ -254,7 +254,7 @@ static inline void proc_tty_init(void) {}
>  extern struct proc_dir_entry proc_root;
>  
>  extern void proc_self_init(void);
> -extern int proc_remount(struct super_block *, int *, char *, size_t);
> +extern int proc_reconfigure(struct super_block *, struct fs_context *);
>  
>  /*
>   * task_[no]mmu.c
> diff --git a/fs/proc/root.c b/fs/proc/root.c
> index 2fbc177f37a8..e6bd31fbc714 100644
> --- a/fs/proc/root.c
> +++ b/fs/proc/root.c
> @@ -19,14 +19,24 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include "internal.h"
>  
> +struct proc_fs_context {
> + struct fs_context   fc;
> + struct pid_namespace*pid_ns;
> + unsigned long   mask;
> + int hidepid;
> + int gid;
> +};
> +
>  enum {
>   Opt_gid, Opt_hidepid, Opt_err,
>  };
> @@ -37,56 +47,60 @@ static const match_table_t tokens = {
>   {Opt_err, NULL},
>  };
>  
> -static int proc_parse_options(char *options, struct pid_namespace *pid)
> +static int proc_parse_option(struct fs_context *fc, char *opt, size_t len)
>  {
> - char *p;
> + struct proc_fs_context *ctx = container_of(fc, struct proc_fs_context, 
> fc);
>   substring_t args[MAX_OPT_ARGS];
> - int option;
> -
> - if (!options)
> - return 1;
> -
> - while ((p = strsep(, ",")) != NULL) {
> - int token;
> - if (!*p)
> - continue;
> -
> - args[0].to = args[0].from = NULL;
> - token = match_token(p, tokens, args);
> - switch (token) {
> - case Opt_gid:
> - if (match_int([0], ))
> - return 0;
> - pid->pid_gid = make_kgid(current_user_ns(), option);
> - break;
> - case Opt_hidepid:
> - if (match_int([0], ))
> - return 0;
> - if (option < HIDEPID_OFF ||
> - option > HIDEPID_INVISIBLE) {
> - pr_err("proc: hidepid value must be between 0 
> and 2.\n");
> - return 0;
> - }
> - pid->hide_pid = option;
> - break;
> - default:
> - pr_err("proc: unrecognized mount option \"%s\" "
> -"or missing value\n", p);
> -   

Re: [v5, 03/31] kconfig: reference environment variables directly and remove 'option env='

2018-06-08 Thread Andrei Vagin
Hi Masahiro,

localyesconfig doesn't work, and git bisect points to this patch.

[avagin@laptop linux]$ make  localyesconfig
using config: '.config'
Can't open arch/$(SRCARCH)/Kconfig at ./scripts/kconfig/streamline_config.pl 
line 174, <$kinfile> line 8.
make[1]: *** [scripts/kconfig/Makefile:45: localyesconfig] Error 2
make: *** [Makefile:526: localyesconfig] Error 2

On Mon, May 28, 2018 at 06:21:40PM +0900, Masahiro Yamada wrote:
> To get access to environment variables, Kconfig needs to define a
> symbol using "option env=" syntax.  It is tedious to add a symbol entry
> for each environment variable given that we need to define much more
> such as 'CC', 'AS', 'srctree' etc. to evaluate the compiler capability
> in Kconfig.
> 
> Adding '$' for symbol references is grammatically inconsistent.
> Looking at the code, the symbols prefixed with '$' are expanded by:
>  - conf_expand_value()
>This is used to expand 'arch/$ARCH/defconfig' and 'defconfig_list'
>  - sym_expand_string_value()
>This is used to expand strings in 'source' and 'mainmenu'
> 
> All of them are fixed values independent of user configuration.  So,
> they can be changed into the direct expansion instead of symbols.
> 
> This change makes the code much cleaner.  The bounce symbols 'SRCARCH',
> 'ARCH', 'SUBARCH', 'KERNELVERSION' are gone.
> 
> sym_init() hard-coding 'UNAME_RELEASE' is also gone.  'UNAME_RELEASE'
> should be replaced with an environment variable.
> 
> ARCH_DEFCONFIG is a normal symbol, so it should be simply referenced
> without '$' prefix.
> 
> The new syntax is addicted by Make.  The variable reference needs
> parentheses, like $(FOO), but you can omit them for single-letter
> variables, like $F.  Yet, in Makefiles, people tend to use the
> parenthetical form for consistency / clarification.
> 
> At this moment, only the environment variable is supported, but I will
> extend the concept of 'variable' later on.
> 
> The variables are expanded in the lexer so we can simplify the token
> handling on the parser side.
> 
> For example, the following code works.
> 
> [Example code]
> 
>   config MY_TOOLCHAIN_LIST
>   string
>   default "My tools: CC=$(CC), AS=$(AS), CPP=$(CPP)"
> 
> [Result]
> 
>   $ make -s alldefconfig && tail -n 1 .config
>   CONFIG_MY_TOOLCHAIN_LIST="My tools: CC=gcc, AS=as, CPP=gcc -E"
> 
> Signed-off-by: Masahiro Yamada 
> Reviewed-by: Kees Cook 
> ---
> 
> Changes in v5:
>   - More comments like "advance pointer to ..."
>   - Factor out the duplicated code into __expand_string()
>   - Move the empty name check to env_expand().
>   - Remove escape sequence of '$'
> 
> Changes in v4:
>   - Enclose ARCH in conf_defname
>   - Drop single-letter support
> 
> Changes in v3:
>   - Reimplement
>   - Variable reference need parentheses except single-letter variable
> 
> Changes in v2:
>   - Move the string expansion to the lexer phase.
>   - Split environment helpers to env.c
> 
>  Documentation/kbuild/kconfig-language.txt |   8 -
>  Kconfig   |   8 +-
>  Makefile  |   3 +-
>  arch/sh/Kconfig   |   4 +-
>  arch/sparc/Kconfig|   4 +-
>  arch/um/Kconfig.common|   4 -
>  arch/x86/Kconfig  |   4 +-
>  arch/x86/um/Kconfig   |   6 +-
>  init/Kconfig  |  16 +-
>  scripts/kconfig/confdata.c|  33 +
>  scripts/kconfig/kconf_id.c|   1 -
>  scripts/kconfig/lkc.h |   5 +-
>  scripts/kconfig/lkc_proto.h   |   6 +
>  scripts/kconfig/menu.c|   3 -
>  scripts/kconfig/preprocess.c  | 238 
> ++
>  scripts/kconfig/symbol.c  |  56 ---
>  scripts/kconfig/util.c|  29 ++--
>  scripts/kconfig/zconf.l   |  67 -
>  scripts/kconfig/zconf.y   |   2 +-
>  19 files changed, 343 insertions(+), 154 deletions(-)
>  create mode 100644 scripts/kconfig/preprocess.c
> 
> diff --git a/Documentation/kbuild/kconfig-language.txt 
> b/Documentation/kbuild/kconfig-language.txt
> index f5b9493..0e966e8 100644
> --- a/Documentation/kbuild/kconfig-language.txt
> +++ b/Documentation/kbuild/kconfig-language.txt
> @@ -198,14 +198,6 @@ applicable everywhere (see syntax).
>  enables the third modular state for all config symbols.
>  At most one symbol may have the "modules" option set.
>  
> -  - "env"=
> -This imports the environment variable into Kconfig. It behaves like
> -a default, except that the value comes from the environment, this
> -also means that the behaviour when mixing it with normal defaults is
> -undefined at this point. The symbol is currently not exported back
> -to the build environment (if this is desired, it can be done via
> -another symbol).
> -
>- "allnoconfig_y"
>  This 

Re: possible deadlock in sk_diag_fill

2018-05-15 Thread Andrei Vagin
On Tue, May 15, 2018 at 07:19:39AM +0200, Dmitry Vyukov wrote:
> On Mon, May 14, 2018 at 8:00 PM, Andrei Vagin <ava...@virtuozzo.com> wrote:
> >> >> Hello,
> >> >>
> >> >> syzbot found the following crash on:
> >> >>
> >> >> HEAD commit:c1c07416cdd4 Merge tag 'kbuild-fixes-v4.17' of 
> >> >> git://git.k..
> >> >> git tree:   upstream
> >> >> console output: https://syzkaller.appspot.com/x/log.txt?x=12164c9780
> >> >> kernel config:  
> >> >> https://syzkaller.appspot.com/x/.config?x=5a1dc06635c10d27
> >> >> dashboard link: 
> >> >> https://syzkaller.appspot.com/bug?extid=c1872be62e587eae9669
> >> >> compiler:   gcc (GCC) 8.0.1 20180413 (experimental)
> >> >> userspace arch: i386
> >> >>
> >> >> Unfortunately, I don't have any reproducer for this crash yet.
> >> >>
> >> >> IMPORTANT: if you fix the bug, please add the following tag to the 
> >> >> commit:
> >> >> Reported-by: syzbot+c1872be62e587eae9...@syzkaller.appspotmail.com
> >> >>
> >> >>
> >> >> ==
> >> >> WARNING: possible circular locking dependency detected
> >> >> 4.17.0-rc3+ #59 Not tainted
> >> >> --
> >> >> syz-executor1/25282 is trying to acquire lock:
> >> >> 4fddf743 (&(>lock)->rlock/1){+.+.}, at: sk_diag_dump_icons
> >> >> net/unix/diag.c:82 [inline]
> >> >> 4fddf743 (&(>lock)->rlock/1){+.+.}, at:
> >> >> sk_diag_fill.isra.5+0xa43/0x10d0 net/unix/diag.c:144
> >> >>
> >> >> but task is already holding lock:
> >> >> b6895645 (rlock-AF_UNIX){+.+.}, at: spin_lock
> >> >> include/linux/spinlock.h:310 [inline]
> >> >> b6895645 (rlock-AF_UNIX){+.+.}, at: sk_diag_dump_icons
> >> >> net/unix/diag.c:64 [inline]
> >> >> b6895645 (rlock-AF_UNIX){+.+.}, at: 
> >> >> sk_diag_fill.isra.5+0x94e/0x10d0
> >> >> net/unix/diag.c:144
> >> >>
> >> >> which lock already depends on the new lock.
> >> >
> >> > In the code, we have a comment which explains why it is safe to take 
> >> > this lock
> >> >
> >> > /*
> >> >  * The state lock is outer for the same sk's
> >> >  * queue lock. With the other's queue locked it's
> >> >  * OK to lock the state.
> >> >  */
> >> > unix_state_lock_nested(req);
> >> >
> >> > It is a question how to explain this to lockdep.
> >>
> >> Do I understand it correctly that (>lock)->rlock associated with
> >> AF_UNIX is locked under rlock-AF_UNIX, and then rlock-AF_UNIX is
> >> locked under (>lock)->rlock associated with AF_NETLINK? If so, I
> >> think we need to split (>lock)->rlock by family too, so that we
> >> have u->lock-AF_UNIX and u->lock-AF_NETLINK.
> >
> > I think here is another problem. lockdep woried about
> > sk->sk_receive_queue vs unix_sk(s)->lock.
> >
> > sk_diag_dump_icons() takes sk->sk_receive_queue and then
> > unix_sk(s)->lock.
> >
> > unix_dgram_sendmsg takes unix_sk(sk)->lock and then sk->sk_receive_queue.
> >
> > sk_diag_dump_icons() takes locks for two different sockets, but
> > unix_dgram_sendmsg() takes locks for one socket.
> >
> > sk_diag_dump_icons
> > if (sk->sk_state == TCP_LISTEN) {
> > spin_lock(>sk_receive_queue.lock);
> > skb_queue_walk(>sk_receive_queue, skb) {
> > unix_state_lock_nested(req);
> > spin_lock_nested(_sk(s)->lock,
> >
> >
> > unix_dgram_sendmsg
> > unix_state_lock(other)
> > spin_lock(_sk(s)->lock)
> > skb_queue_tail(>sk_receive_queue, skb);
> > spin_lock_irqsave(>lock, flags);
> 
> 
> Do you mean the following?
> There is socket 1 with state lock (S1) and queue lock (Q1), and socket
> 2 with state lock (S2) and queue lock (Q2). unix_dgram_sendmsg locks
> S1->Q1. And sk_diag_dump_icons locks Q1->S2.
> If yes, then this looks pretty much as deadlock. Consider that 2
> unix_dgram_sendmsg in 2 different threads lock S1 and S2 respectively.
> Now 2  sk_diag_dump_icons in 2 different threads lock Q1 and Q2
> respectively. Now sk_diag_dump_icons want to lock S's, and
> unix_dgram_sendmsg want to lock Q's. Nobody can proceed.

Q1 and S1 belong to a listening socket, so they can't be taken from
unix_dgram_sendmsg().



Re: possible deadlock in sk_diag_fill

2018-05-14 Thread Andrei Vagin
On Sat, May 12, 2018 at 09:46:25AM +0200, Dmitry Vyukov wrote:
> On Fri, May 11, 2018 at 8:33 PM, Andrei Vagin <ava...@virtuozzo.com> wrote:
> > On Sat, May 05, 2018 at 10:59:02AM -0700, syzbot wrote:
> >> Hello,
> >>
> >> syzbot found the following crash on:
> >>
> >> HEAD commit:c1c07416cdd4 Merge tag 'kbuild-fixes-v4.17' of 
> >> git://git.k..
> >> git tree:   upstream
> >> console output: https://syzkaller.appspot.com/x/log.txt?x=12164c9780
> >> kernel config:  https://syzkaller.appspot.com/x/.config?x=5a1dc06635c10d27
> >> dashboard link: 
> >> https://syzkaller.appspot.com/bug?extid=c1872be62e587eae9669
> >> compiler:   gcc (GCC) 8.0.1 20180413 (experimental)
> >> userspace arch: i386
> >>
> >> Unfortunately, I don't have any reproducer for this crash yet.
> >>
> >> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> >> Reported-by: syzbot+c1872be62e587eae9...@syzkaller.appspotmail.com
> >>
> >>
> >> ==
> >> WARNING: possible circular locking dependency detected
> >> 4.17.0-rc3+ #59 Not tainted
> >> --
> >> syz-executor1/25282 is trying to acquire lock:
> >> 4fddf743 (&(>lock)->rlock/1){+.+.}, at: sk_diag_dump_icons
> >> net/unix/diag.c:82 [inline]
> >> 4fddf743 (&(>lock)->rlock/1){+.+.}, at:
> >> sk_diag_fill.isra.5+0xa43/0x10d0 net/unix/diag.c:144
> >>
> >> but task is already holding lock:
> >> b6895645 (rlock-AF_UNIX){+.+.}, at: spin_lock
> >> include/linux/spinlock.h:310 [inline]
> >> b6895645 (rlock-AF_UNIX){+.+.}, at: sk_diag_dump_icons
> >> net/unix/diag.c:64 [inline]
> >> b6895645 (rlock-AF_UNIX){+.+.}, at: 
> >> sk_diag_fill.isra.5+0x94e/0x10d0
> >> net/unix/diag.c:144
> >>
> >> which lock already depends on the new lock.
> >
> > In the code, we have a comment which explains why it is safe to take this 
> > lock
> >
> > /*
> >  * The state lock is outer for the same sk's
> >  * queue lock. With the other's queue locked it's
> >  * OK to lock the state.
> >  */
> > unix_state_lock_nested(req);
> >
> > It is a question how to explain this to lockdep.
> 
> Do I understand it correctly that (>lock)->rlock associated with
> AF_UNIX is locked under rlock-AF_UNIX, and then rlock-AF_UNIX is
> locked under (>lock)->rlock associated with AF_NETLINK? If so, I
> think we need to split (>lock)->rlock by family too, so that we
> have u->lock-AF_UNIX and u->lock-AF_NETLINK.

I think there is another problem here: lockdep is worried about
sk->sk_receive_queue vs unix_sk(s)->lock.

sk_diag_dump_icons() takes sk->sk_receive_queue and then
unix_sk(s)->lock.

unix_dgram_sendmsg takes unix_sk(sk)->lock and then sk->sk_receive_queue.

sk_diag_dump_icons() takes locks for two different sockets, but
unix_dgram_sendmsg() takes locks for one socket.

sk_diag_dump_icons
if (sk->sk_state == TCP_LISTEN) {
spin_lock(>sk_receive_queue.lock);
skb_queue_walk(>sk_receive_queue, skb) {
unix_state_lock_nested(req);
spin_lock_nested(_sk(s)->lock,


unix_dgram_sendmsg
unix_state_lock(other)
spin_lock(_sk(s)->lock)
skb_queue_tail(>sk_receive_queue, skb);
spin_lock_irqsave(>lock, flags);

> 
> 
> 
> >> the existing dependency chain (in reverse order) is:
> >>
> >> -> #1 (rlock-AF_UNIX){+.+.}:
> >>__raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 
> >> [inline]
> >>_raw_spin_lock_irqsave+0x96/0xc0 kernel/locking/spinlock.c:152
> >>skb_queue_tail+0x26/0x150 net/core/skbuff.c:2900
> >>unix_dgram_sendmsg+0xf77/0x1730 net/unix/af_unix.c:1797
> >>sock_sendmsg_nosec net/socket.c:629 [inline]
> >>sock_sendmsg+0xd5/0x120 net/socket.c:639
> >>___sys_sendmsg+0x525/0x940 net/socket.c:2117
> >>__sys_sendmmsg+0x3bb/0x6f0 net/socket.c:2205
> >>__compat_sys_sendmmsg net/compat.c:770 [inline]
> >>__do_compat_sys_sendmmsg net/compat.c:777 [inline]
> >>__se_compat_sys_sendmmsg net/compat.c:774 [inline]
> >>__ia32_compat_sys_sendmmsg+0x9f/0x100 net/compat.c:774
> >>do_syscall_32_irqs_on arch/x86/entry/common.c:323 [inli

Re: possible deadlock in sk_diag_fill

2018-05-11 Thread Andrei Vagin
On Sat, May 05, 2018 at 10:59:02AM -0700, syzbot wrote:
> Hello,
> 
> syzbot found the following crash on:
> 
> HEAD commit:c1c07416cdd4 Merge tag 'kbuild-fixes-v4.17' of git://git.k..
> git tree:   upstream
> console output: https://syzkaller.appspot.com/x/log.txt?x=12164c9780
> kernel config:  https://syzkaller.appspot.com/x/.config?x=5a1dc06635c10d27
> dashboard link: https://syzkaller.appspot.com/bug?extid=c1872be62e587eae9669
> compiler:   gcc (GCC) 8.0.1 20180413 (experimental)
> userspace arch: i386
> 
> Unfortunately, I don't have any reproducer for this crash yet.
> 
> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+c1872be62e587eae9...@syzkaller.appspotmail.com
> 
> 
> ==
> WARNING: possible circular locking dependency detected
> 4.17.0-rc3+ #59 Not tainted
> --
> syz-executor1/25282 is trying to acquire lock:
> 4fddf743 (&(>lock)->rlock/1){+.+.}, at: sk_diag_dump_icons
> net/unix/diag.c:82 [inline]
> 4fddf743 (&(>lock)->rlock/1){+.+.}, at:
> sk_diag_fill.isra.5+0xa43/0x10d0 net/unix/diag.c:144
> 
> but task is already holding lock:
> b6895645 (rlock-AF_UNIX){+.+.}, at: spin_lock
> include/linux/spinlock.h:310 [inline]
> b6895645 (rlock-AF_UNIX){+.+.}, at: sk_diag_dump_icons
> net/unix/diag.c:64 [inline]
> b6895645 (rlock-AF_UNIX){+.+.}, at: sk_diag_fill.isra.5+0x94e/0x10d0
> net/unix/diag.c:144
> 
> which lock already depends on the new lock.

In the code, we have a comment which explains why it is safe to take this lock

/*
 * The state lock is outer for the same sk's
 * queue lock. With the other's queue locked it's
 * OK to lock the state.
 */
unix_state_lock_nested(req);

It is a question how to explain this to lockdep.

> 
> 
> the existing dependency chain (in reverse order) is:
> 
> -> #1 (rlock-AF_UNIX){+.+.}:
>__raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
>_raw_spin_lock_irqsave+0x96/0xc0 kernel/locking/spinlock.c:152
>skb_queue_tail+0x26/0x150 net/core/skbuff.c:2900
>unix_dgram_sendmsg+0xf77/0x1730 net/unix/af_unix.c:1797
>sock_sendmsg_nosec net/socket.c:629 [inline]
>sock_sendmsg+0xd5/0x120 net/socket.c:639
>___sys_sendmsg+0x525/0x940 net/socket.c:2117
>__sys_sendmmsg+0x3bb/0x6f0 net/socket.c:2205
>__compat_sys_sendmmsg net/compat.c:770 [inline]
>__do_compat_sys_sendmmsg net/compat.c:777 [inline]
>__se_compat_sys_sendmmsg net/compat.c:774 [inline]
>__ia32_compat_sys_sendmmsg+0x9f/0x100 net/compat.c:774
>do_syscall_32_irqs_on arch/x86/entry/common.c:323 [inline]
>do_fast_syscall_32+0x345/0xf9b arch/x86/entry/common.c:394
>entry_SYSENTER_compat+0x70/0x7f arch/x86/entry/entry_64_compat.S:139
> 
> -> #0 (&(>lock)->rlock/1){+.+.}:
>lock_acquire+0x1dc/0x520 kernel/locking/lockdep.c:3920
>_raw_spin_lock_nested+0x28/0x40 kernel/locking/spinlock.c:354
>sk_diag_dump_icons net/unix/diag.c:82 [inline]
>sk_diag_fill.isra.5+0xa43/0x10d0 net/unix/diag.c:144
>sk_diag_dump net/unix/diag.c:178 [inline]
>unix_diag_dump+0x35f/0x550 net/unix/diag.c:206
>netlink_dump+0x507/0xd20 net/netlink/af_netlink.c:2226
>__netlink_dump_start+0x51a/0x780 net/netlink/af_netlink.c:2323
>netlink_dump_start include/linux/netlink.h:214 [inline]
>unix_diag_handler_dump+0x3f4/0x7b0 net/unix/diag.c:307
>__sock_diag_cmd net/core/sock_diag.c:230 [inline]
>sock_diag_rcv_msg+0x2e0/0x3d0 net/core/sock_diag.c:261
>netlink_rcv_skb+0x172/0x440 net/netlink/af_netlink.c:2448
>sock_diag_rcv+0x2a/0x40 net/core/sock_diag.c:272
>netlink_unicast_kernel net/netlink/af_netlink.c:1310 [inline]
>netlink_unicast+0x58b/0x740 net/netlink/af_netlink.c:1336
>netlink_sendmsg+0x9f0/0xfa0 net/netlink/af_netlink.c:1901
>sock_sendmsg_nosec net/socket.c:629 [inline]
>sock_sendmsg+0xd5/0x120 net/socket.c:639
>sock_write_iter+0x35a/0x5a0 net/socket.c:908
>call_write_iter include/linux/fs.h:1784 [inline]
>new_sync_write fs/read_write.c:474 [inline]
>__vfs_write+0x64d/0x960 fs/read_write.c:487
>vfs_write+0x1f8/0x560 fs/read_write.c:549
>ksys_write+0xf9/0x250 fs/read_write.c:598
>__do_sys_write fs/read_write.c:610 [inline]
>__se_sys_write fs/read_write.c:607 [inline]
>__ia32_sys_write+0x71/0xb0 fs/read_write.c:607
>do_syscall_32_irqs_on arch/x86/entry/common.c:323 [inline]
>do_fast_syscall_32+0x345/0xf9b arch/x86/entry/common.c:394
>entry_SYSENTER_compat+0x70/0x7f arch/x86/entry/entry_64_compat.S:139
> 
> other info that might help us debug this:
> 
>  Possible unsafe locking scenario:
> 
>CPU0CPU1
>
>   

Re: [v2] mm: access to uninitialized struct page

2018-05-04 Thread Andrei Vagin
On Fri, May 04, 2018 at 12:47:53PM +, Pavel Tatashin wrote:
> Hi Andrei,
> 
> Could you please provide me with scripts to reproduce this issue?

I boot this kernel in a kvm virtual machine. The kernel is built without
modules. A config file is attached.

Here is the qemu command line that I use to reproduce the problem:

qemu-kvm -kernel /home/avagin/git/linux-next/arch/x86/boot/bzImage  \
-append 'root=/dev/vda2 ro debug console=ttyS0,115200 LANG=en_US.UTF-8 
slub_debug=FZP raid=noautodetect selinux=0 earlyprintk=serial,ttyS0,115200' \
-boot c \
-smp 2,sockets=2,cores=1,threads=1 \
-drive file=/home/vms/fc22.img,format=raw,if=none,id=drive-virtio-disk0 
\
--display none \
-serial telnet:127.0.0.1:,server,nowait -cpu 
Skylake-Client-IBRS,ss=on,hypervisor=on,tsc_adjust=on,clflushopt=on,xsaves=on,pdpe1gb=on,ibpb=on
 \
-m 4096 \
-realtime mlock=off \
-machine pc-i440fx-2.3,accel=kvm,usb=off,dump-guest-core=off \
-device ich9-usb-ehci1,id=usb,bus=pci.0,addr=0x6.0x7 -device 
ich9-usb-uhci1,masterbus=usb.0,firstport=0,bus=pci.0,multifunction=on,addr=0x6 \
-device 
ich9-usb-uhci2,masterbus=usb.0,firstport=2,bus=pci.0,addr=0x6.0x1 \
-device 
ich9-usb-uhci3,masterbus=usb.0,firstport=4,bus=pci.0,addr=0x6.0x2 \
-device 
virtio-blk-pci,scsi=off,bus=pci.0,addr=0x7,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1
 -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x8 -msg timestamp=on


[avagin@laptop linux-next]$ cat /proc/cpuinfo 
processor   : 0
vendor_id   : GenuineIntel
cpu family  : 6
model   : 78
model name  : Intel(R) Core(TM) i5-6300U CPU @ 2.40GHz
stepping: 3
microcode   : 0xc2
cpu MHz : 1213.986
cache size  : 3072 KB
physical id : 0
siblings: 4
core id : 0
cpu cores   : 2
apicid  : 0
initial apicid  : 0
fpu : yes
fpu_exception   : yes
cpuid level : 22
wp  : yes
flags   : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov 
pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb 
rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology 
nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq dtes64 monitor ds_cpl 
vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe 
popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch 
cpuid_fault epb invpcid_single pti tpr_shadow vnmi flexpriority ept vpid 
fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx rdseed adx 
smap clflushopt intel_pt xsaveopt xsavec xgetbv1 xsaves ibpb ibrs stibp dtherm 
ida arat pln pts hwp hwp_notify hwp_act_window hwp_epp
bugs: cpu_meltdown spectre_v1 spectre_v2
bogomips: 4992.00
clflush size: 64
cache_alignment : 64
address sizes   : 39 bits physical, 48 bits virtual
power management:

> 
> Thank you,
> Pavel
> On Fri, May 4, 2018 at 4:27 AM Andrei Vagin <ava...@virtuozzo.com> wrote:
> 
> > Hello,
> 
> > We have a robot which runs criu tests on linux-next kernels.
> 
> > All tests passed on 4.17.0-rc3-next-20180502.
> 
> > But the 4.17.0-rc3-next-20180504 kernel didn't boot.
> 
> > git bisect points on this patch.
> 
> > On Thu, Apr 26, 2018 at 04:26:19PM -0400, Pavel Tatashin wrote:
> > > The following two bugs were reported by Fengguang Wu:
> > >
> > > kernel reboot-without-warning in early-boot stage, last printk:
> > > early console in setup code
> > >
> > >
> http://lkml.kernel.org/r/20180418135300.inazvpxjxowog...@wfg-t540p.sh.intel.com
> 
> > The problem looks similar with this one.
> 
> > [5.596975] devtmpfs: mounted
> > [5.855754] Freeing unused kernel memory: 1704K
> > [5.858162] Write protecting the kernel read-only data: 18432k
> > [5.860772] Freeing unused kernel memory: 2012K
> > [5.861838] Freeing unused kernel memory: 160K
> > [5.862572] rodata_test: all tests were successful
> > [5.866857] random: fast init done
> > early console in setup code
> > [0.00] Linux version 4.17.0-rc3-00023-g7c4cc2d022a1
> > (avagin@laptop) (gcc version 8.0.1 20180324 (Red Hat 8.0.1-0.20) (GCC))
> > #13 SMP Fri May 4 01:10:51 PDT 2018
> > [0.00] Command line: root=/dev/vda2 ro debug
> > console=ttyS0,115200 LANG=en_US.UTF-8 slub_debug=FZP raid=noautodetect
> > selinux=0 earlyprintk=serial,ttyS0,115200
> > [0.00] x86/fpu: Supporting XSAVE feature 0x001: 'x87 floating
> > point registers'
> > [0.00] x86/fpu: Supporting XSAVE feature 0x002: 'SSE registers'
> > [0.00] x86/fpu: Supporting XSAVE feature 0x004: 'AVX registers'
> > [0.00] x86/fpu: Support

Re: [v2] mm: access to uninitialized struct page

2018-05-04 Thread Andrei Vagin
Hello,

We have a robot which runs criu tests on linux-next kernels.

All tests passed on 4.17.0-rc3-next-20180502.

But the 4.17.0-rc3-next-20180504 kernel didn't boot.

git bisect points to this patch.

On Thu, Apr 26, 2018 at 04:26:19PM -0400, Pavel Tatashin wrote:
> The following two bugs were reported by Fengguang Wu:
> 
> kernel reboot-without-warning in early-boot stage, last printk:
> early console in setup code
> 
> http://lkml.kernel.org/r/20180418135300.inazvpxjxowog...@wfg-t540p.sh.intel.com

The problem looks similar to this one.

[5.596975] devtmpfs: mounted
[5.855754] Freeing unused kernel memory: 1704K
[5.858162] Write protecting the kernel read-only data: 18432k
[5.860772] Freeing unused kernel memory: 2012K
[5.861838] Freeing unused kernel memory: 160K
[5.862572] rodata_test: all tests were successful
[5.866857] random: fast init done
early console in setup code
[0.00] Linux version 4.17.0-rc3-00023-g7c4cc2d022a1
(avagin@laptop) (gcc version 8.0.1 20180324 (Red Hat 8.0.1-0.20) (GCC))
#13 SMP Fri May 4 01:10:51 PDT 2018
[0.00] Command line: root=/dev/vda2 ro debug
console=ttyS0,115200 LANG=en_US.UTF-8 slub_debug=FZP raid=noautodetect
selinux=0 earlyprintk=serial,ttyS0,115200
[0.00] x86/fpu: Supporting XSAVE feature 0x001: 'x87 floating
point registers'
[0.00] x86/fpu: Supporting XSAVE feature 0x002: 'SSE registers'
[0.00] x86/fpu: Supporting XSAVE feature 0x004: 'AVX registers'
[0.00] x86/fpu: Supporting XSAVE feature 0x008: 'MPX bounds
registers'

$ git describe HEAD
v4.17-rc3-23-g7c4cc2d022a1

[avagin@laptop linux-next]$ git log --pretty=oneline  | head -n 1
7c4cc2d022a1fd56eb2ee33b8666bc780f1e mm: access to uninitialized struct page


> 
> And, also:
> [per_cpu_ptr_to_phys] PANIC: early exception 0x0d
> IP 10:a892f15f error 0 cr2 0x88001fbff000
> 
> http://lkml.kernel.org/r/20180419013128.iurzouiqxvcnp...@wfg-t540p.sh.intel.com
> 
> Both of the problems are due to accessing uninitialized struct page from
> trap_init(). We must first do mm_init() in order to initialize allocated
> struct pages, and then we can access fields of any struct page that belongs
> to memory that's been allocated.
> 
> Below is explanation of the root cause.
> 
> The issue arises in this stack:
> 
> start_kernel()
>  trap_init()
>   setup_cpu_entry_areas()
>setup_cpu_entry_area(cpu)
> get_cpu_gdt_paddr(cpu)
>  per_cpu_ptr_to_phys(addr)
>   pcpu_addr_to_page(addr)
>virt_to_page(addr)
> pfn_to_page(__pa(addr) >> PAGE_SHIFT)
> The returned "struct page" is sometimes uninitialized, and thus
> fails later when used. It turns out that "sometimes" is because it depends
> on KASLR.
> 
> When boot is failing we have this when  pfn_to_page() is called:
> kaslr: 0x0d60
>  addr: 83e0d000
> pa: 1040d000
>pfn: 1040d
> page: 88001f113340
> page->flags  <- Uninitialized!
> 
> When boot is successful:
> kaslr: 0x0a80
>  addr: 83e0d000
>  pa: d60d000
> pfn: d60d
>  page: 88001f05b340
> page->flags 2800 <- Initialized!
> 
> Here are physical addresses that BIOS provided to us:
> e820: BIOS-provided physical RAM map:
> BIOS-e820: [mem 0x-0x0009fbff] usable
> BIOS-e820: [mem 0x0009fc00-0x0009] reserved
> BIOS-e820: [mem 0x000f-0x000f] reserved
> BIOS-e820: [mem 0x0010-0x1ffd] usable
> BIOS-e820: [mem 0x1ffe-0x1fff] reserved
> BIOS-e820: [mem 0xfeffc000-0xfeff] reserved
> BIOS-e820: [mem 0xfffc-0x] reserved
> 
> In both cases, working and non-working the real physical address is
> the same:
> 
> pa - kaslr = 0x2E0D000
> 
> The only thing that is different is PFN.
> 
> We initialize struct pages in four places:
> 
> 1. Early in boot a small set of struct pages is initialized to fill
> the first section, and lower zones.
> 2. During mm_init() we initialize "struct pages" for all the memory
> that is allocated, i.e. reserved in memblock.
> 3. Using on-demand logic when pages are allocated after mm_init call
> 4. After smp_init() when the rest of the free deferred pages are initialized.
> 
> The above path happens before deferred memory is initialized, and thus
> it must be covered either by 1, 2 or 3.
> 
> So, let's check what PFNs are initialized after (1).
> 
> memmap_init_zone() is called for pfn ranges:
> 1 - 1000, and 1000 - 1ffe0, but it quits after reaching pfn 0x1,
> as it leaves the rest to be initialized as deferred pages.
> 
> In the working scenario pfn ended up being below 1000, but in the
> failing scenario it is above. Hence, we must initialize this page in
> (2). But trap_init() is called before mm_init().
> 
> The bug was introduced by "mm: initialize pages on demand during boot"
> because we lowered amount of pages that is initialized in the step
> (1). 

[PATCH] scsi: qla2xxx: remove the unused tcm_qla2xxx_cmd_wq

2018-05-02 Thread Andrei Vagin
Signed-off-by: Andrei Vagin <ava...@openvz.org>
---
 drivers/scsi/qla2xxx/tcm_qla2xxx.c | 10 --
 1 file changed, 10 deletions(-)

diff --git a/drivers/scsi/qla2xxx/tcm_qla2xxx.c 
b/drivers/scsi/qla2xxx/tcm_qla2xxx.c
index aadfeaac3898..b63440fec18e 100644
--- a/drivers/scsi/qla2xxx/tcm_qla2xxx.c
+++ b/drivers/scsi/qla2xxx/tcm_qla2xxx.c
@@ -48,7 +48,6 @@
 #include "tcm_qla2xxx.h"
 
 static struct workqueue_struct *tcm_qla2xxx_free_wq;
-static struct workqueue_struct *tcm_qla2xxx_cmd_wq;
 
 /*
  * Parse WWN.
@@ -1976,16 +1975,8 @@ static int tcm_qla2xxx_register_configfs(void)
goto out_fabric_npiv;
}
 
-   tcm_qla2xxx_cmd_wq = alloc_workqueue("tcm_qla2xxx_cmd", 0, 0);
-   if (!tcm_qla2xxx_cmd_wq) {
-   ret = -ENOMEM;
-   goto out_free_wq;
-   }
-
return 0;
 
-out_free_wq:
-   destroy_workqueue(tcm_qla2xxx_free_wq);
 out_fabric_npiv:
target_unregister_template(&tcm_qla2xxx_npiv_ops);
 out_fabric:
@@ -1995,7 +1986,6 @@ static int tcm_qla2xxx_register_configfs(void)
 
 static void tcm_qla2xxx_deregister_configfs(void)
 {
-   destroy_workqueue(tcm_qla2xxx_cmd_wq);
destroy_workqueue(tcm_qla2xxx_free_wq);
 
target_unregister_template(&tcm_qla2xxx_ops);
-- 
2.14.3



Re: [v8, 11/18] mm, dax: enable filesystems to trigger dev_pagemap ->page_free callbacks

2018-04-04 Thread Andrei Vagin
On Wed, Apr 04, 2018 at 02:23:40PM -0700, Andrei Vagin wrote:
> Hi Dan,
> 
> I caught the following bug on the linux-next 20180404. git bisect brought me 
> to this commit:


The next patch fixes the problem:

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 5b13da127982..a67a7fe75fd5 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -228,6 +228,10 @@ static void __fs_dax_release(struct dax_device *dax_dev, 
void *owner)
 
 void fs_dax_release(struct dax_device *dax_dev, void *owner)
 {
+   if (!dax_dev) {
+   printk("%s:%d: dax_dev == NULL\n", __func__, __LINE__);
+   return;
+   }
if (dax_dev->ops->fs_release)
dax_dev->ops->fs_release(dax_dev, owner);
else


And here is dmesg from my test vm:
[root@fc24 ~]# dmesg | grep -A 2 -B 2 dax
[   14.659318] md: Skipping autodetection of RAID arrays. (raid=autodetect will 
force)
[   14.662436] EXT4-fs (vda2): couldn't mount as ext3 due to feature 
incompatibilities
[   14.663983] fs_dax_release:232: dax_dev == NULL
[   14.665646] EXT4-fs (vda2): couldn't mount as ext2 due to feature 
incompatibilities
[   14.667047] fs_dax_release:232: dax_dev == NULL
[   14.668933] EXT4-fs (vda2): INFO: recovery required on readonly filesystem
[   14.670039] EXT4-fs (vda2): write access will be enabled during recovery

> 
> commit 8e4d1ccc5286d2c3da6515b92323a3529aa64496 (HEAD, refs/bisect/bad)
> Author: Dan Williams <dan.j.willi...@intel.com>
> Date:   Sat Oct 21 14:41:13 2017 -0700
> 
> mm, dax: enable filesystems to trigger dev_pagemap ->page_free callbacks
> 
> 
> [   11.278768] BUG: unable to handle kernel NULL pointer dereference at 
> 0440
> [   11.27] IP: fs_dax_release+0x5/0x90
> [   11.280587] PGD 0 P4D 0 
> [   11.280973] Oops:  [#1] SMP PTI
> [   11.281500] Modules linked in:
> [   11.281968] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 
> 4.16.0-rc4-00193-g8e4d1ccc5286 #7
> [   11.283163] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
> 1.10.2-1.fc26 04/01/2014
> [   11.284418] RIP: 0010:fs_dax_release+0x5/0x90
> [   11.285068] RSP: :b1480062fbd8 EFLAGS: 00010287
> [   11.285845] RAX: 0001 RBX: 9e2cb823c088 RCX: 
> 0003
> [   11.286896] RDX:  RSI: 9e2cb823c088 RDI: 
> 
> [   11.287980] RBP: b1480062fcd8 R08: 0001 R09: 
> 
> [   11.289147] R10: b1480062fb20 R11:  R12: 
> ffea
> [   11.290576] R13:  R14:  R15: 
> 9e2cb823a048
> [   11.291630] FS:  () GS:9e2cbfd0() 
> knlGS:
> [   11.292781] CS:  0010 DS:  ES:  CR0: 80050033
> [   11.293602] CR2: 0440 CR3: 7d21e001 CR4: 
> 003606e0
> [   11.294817] DR0:  DR1:  DR2: 
> 
> [   11.296827] DR3:  DR6: fffe0ff0 DR7: 
> 0400
> [   11.298293] Call Trace:
> [   11.298728]  ext4_fill_super+0x31b/0x39d0
> [   11.299441]  ? sget_userns+0x155/0x500
> [   11.300144]  ? vsnprintf+0x253/0x4b0
> [   11.301223]  ? ext4_calculate_overhead+0x4a0/0x4a0
> [   11.301801]  ? snprintf+0x45/0x70
> [   11.302214]  ? ext4_calculate_overhead+0x4a0/0x4a0
> [   11.302822]  mount_bdev+0x17b/0x1b0
> [   11.303332]  mount_fs+0x35/0x150
> [   11.303803]  vfs_kern_mount.part.25+0x54/0x150
> [   11.304443]  do_mount+0x620/0xd60
> [   11.304935]  ? memdup_user+0x3e/0x70
> [   11.305458]  SyS_mount+0x80/0xd0
> [   11.305931]  mount_block_root+0x105/0x2b7
> [   11.306512]  ? SyS_mknod+0x16b/0x1f0
> [   11.307035]  ? set_debug_rodata+0x11/0x11
> [   11.307616]  prepare_namespace+0x135/0x16b
> [   11.308215]  kernel_init_freeable+0x271/0x297
> [   11.308838]  ? rest_init+0xd0/0xd0
> [   11.309322]  kernel_init+0xa/0x110
> [   11.309821]  ret_from_fork+0x3a/0x50
> [   11.310347] Code: a5 45 31 ed e8 5d 5e 36 00 eb d7 48 c7 c7 20 48 2f a5 e8 
> 4f 5e 36 00 eb c9 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 <48> 
> 8b 87 40 04 00 00 48 8b 40 18 48 85 c0 74 05 e9 c6 7e 60 00 
> [   11.313168] RIP: fs_dax_release+0x5/0x90 RSP: b1480062fbd8
> [   11.313991] CR2: 0440
> [   11.314475] ---[ end trace 8acbb19b74409665 ]---
> 
> 
> On Fri, Mar 30, 2018 at 09:03:08PM -0700, Dan Williams wrote:
> > In order to resolve collisions between filesystem operations and DMA to
> > DAX mapped pages we need a callback when DMA completes. With a callback
> > we can hold off filesystem operations while DMA is in-flight and then
> > resume those operations when the last put_page() occurs on a DMA page.
> > 
> >

Re: [v8, 11/18] mm, dax: enable filesystems to trigger dev_pagemap ->page_free callbacks

2018-04-04 Thread Andrei Vagin
Hi Dan,

I caught the following bug on the linux-next 20180404. git bisect brought me to 
this commit:

commit 8e4d1ccc5286d2c3da6515b92323a3529aa64496 (HEAD, refs/bisect/bad)
Author: Dan Williams 
Date:   Sat Oct 21 14:41:13 2017 -0700

mm, dax: enable filesystems to trigger dev_pagemap ->page_free callbacks


[   11.278768] BUG: unable to handle kernel NULL pointer dereference at 
0440
[   11.27] IP: fs_dax_release+0x5/0x90
[   11.280587] PGD 0 P4D 0 
[   11.280973] Oops:  [#1] SMP PTI
[   11.281500] Modules linked in:
[   11.281968] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 
4.16.0-rc4-00193-g8e4d1ccc5286 #7
[   11.283163] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
1.10.2-1.fc26 04/01/2014
[   11.284418] RIP: 0010:fs_dax_release+0x5/0x90
[   11.285068] RSP: :b1480062fbd8 EFLAGS: 00010287
[   11.285845] RAX: 0001 RBX: 9e2cb823c088 RCX: 0003
[   11.286896] RDX:  RSI: 9e2cb823c088 RDI: 
[   11.287980] RBP: b1480062fcd8 R08: 0001 R09: 
[   11.289147] R10: b1480062fb20 R11:  R12: ffea
[   11.290576] R13:  R14:  R15: 9e2cb823a048
[   11.291630] FS:  () GS:9e2cbfd0() 
knlGS:
[   11.292781] CS:  0010 DS:  ES:  CR0: 80050033
[   11.293602] CR2: 0440 CR3: 7d21e001 CR4: 003606e0
[   11.294817] DR0:  DR1:  DR2: 
[   11.296827] DR3:  DR6: fffe0ff0 DR7: 0400
[   11.298293] Call Trace:
[   11.298728]  ext4_fill_super+0x31b/0x39d0
[   11.299441]  ? sget_userns+0x155/0x500
[   11.300144]  ? vsnprintf+0x253/0x4b0
[   11.301223]  ? ext4_calculate_overhead+0x4a0/0x4a0
[   11.301801]  ? snprintf+0x45/0x70
[   11.302214]  ? ext4_calculate_overhead+0x4a0/0x4a0
[   11.302822]  mount_bdev+0x17b/0x1b0
[   11.303332]  mount_fs+0x35/0x150
[   11.303803]  vfs_kern_mount.part.25+0x54/0x150
[   11.304443]  do_mount+0x620/0xd60
[   11.304935]  ? memdup_user+0x3e/0x70
[   11.305458]  SyS_mount+0x80/0xd0
[   11.305931]  mount_block_root+0x105/0x2b7
[   11.306512]  ? SyS_mknod+0x16b/0x1f0
[   11.307035]  ? set_debug_rodata+0x11/0x11
[   11.307616]  prepare_namespace+0x135/0x16b
[   11.308215]  kernel_init_freeable+0x271/0x297
[   11.308838]  ? rest_init+0xd0/0xd0
[   11.309322]  kernel_init+0xa/0x110
[   11.309821]  ret_from_fork+0x3a/0x50
[   11.310347] Code: a5 45 31 ed e8 5d 5e 36 00 eb d7 48 c7 c7 20 48 2f a5 e8 
4f 5e 36 00 eb c9 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 <48> 8b 
87 40 04 00 00 48 8b 40 18 48 85 c0 74 05 e9 c6 7e 60 00 
[   11.313168] RIP: fs_dax_release+0x5/0x90 RSP: b1480062fbd8
[   11.313991] CR2: 0440
[   11.314475] ---[ end trace 8acbb19b74409665 ]---


On Fri, Mar 30, 2018 at 09:03:08PM -0700, Dan Williams wrote:
> In order to resolve collisions between filesystem operations and DMA to
> DAX mapped pages we need a callback when DMA completes. With a callback
> we can hold off filesystem operations while DMA is in-flight and then
> resume those operations when the last put_page() occurs on a DMA page.
> 
> Recall that the 'struct page' entries for DAX memory are created with
> devm_memremap_pages(). That routine arranges for the pages to be
> allocated, but never onlined, so a DAX page is DMA-idle when its
> reference count reaches one.
> 
> Also recall that the HMM sub-system added infrastructure to trap the
> page-idle (2-to-1 reference count) transition of the pages allocated by
> devm_memremap_pages() and trigger a callback via the 'struct
> dev_pagemap' associated with the page range. Whereas the HMM callbacks
> are going to a device driver to manage bounce pages in device-memory in
> the filesystem-dax case we will call back to filesystem specified
> callback.
> 
> Since the callback is not known at devm_memremap_pages() time we arrange
> for the filesystem to install it at mount time. No functional changes
> are expected as this only registers a nop handler for the ->page_free()
> event for device-mapped pages.
> 
> Cc: Michal Hocko 
> Reviewed-by: "Jérôme Glisse" 
> Reviewed-by: Christoph Hellwig 
> Reviewed-by: Jan Kara 
> Signed-off-by: Dan Williams 
> ---
>  drivers/dax/super.c   |   21 +++--
>  drivers/nvdimm/pmem.c |3 ++-
>  fs/ext2/super.c   |6 +++---
>  fs/ext4/super.c   |6 +++---
>  fs/xfs/xfs_super.c|   20 ++--
>  include/linux/dax.h   |   23 ++-
>  6 files changed, 43 insertions(+), 36 deletions(-)
> 
> diff --git a/drivers/dax/super.c b/drivers/dax/super.c
> index c4cf284dfe1c..7d260f118a39 100644
> --- a/drivers/dax/super.c
> +++ b/drivers/dax/super.c
> @@ -63,16 +63,6 @@ int 

Re: [PATCH] autofs4: use wake_up() instead of wake_up_interruptible

2018-04-01 Thread Andrei Vagin
On Sun, Apr 01, 2018 at 10:01:41AM +0800, Ian Kent wrote:
> On 01/04/18 09:31, Ian Kent wrote:
> > On 31/03/18 10:28, Andrei Vagin wrote:
> >> In "autofs4: use wait_event_killable",  wait_event_interruptible() was
> >> replaced by wait_event_killable(), but in this case we have to use
> >> wake_up() instead of wake_up_interruptible().
> > 
> > Why do you believe wake_up() is needed rather than wake_up_interruptible()?
> > 
> > Now that I'm thinking about the wake up I'm wondering if this is in fact
> > what's needed. Rather, I think maybe wake_up_all() is probably the only
> > one that will actually do what's needed.
> 
> Ok, so that 1 is the number of exclusive waiters.
> So what is the difference between the two wake_up calls in this case?

In CRIU, we have the autofs test:
https://github.com/checkpoint-restore/criu/blob/master/test/zdtm/static/autofs.c

We run CRIU tests on the linux-next kernels and a few days ago this test
started to fail, actually it hangs up.

I found that wake_up_interruptible() doesn't wake up a thread, which is
waiting.

try_to_wake_up() has the argument "state", it is the mask of task states
that can be woken.

For wake_up_interruptible(), state is TASK_INTERRUPTIBLE.
For wake_up(), state is TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)

If we use wait_event_killable(), the task sleeps in the TASK_KILLABLE
state, so wake_up_interruptible() isn't suitable in this case.

#define TASK_KILLABLE   (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)

I checked that our test passes with this patch. I mean that we had a
real problem and we checked that it is fixed by this patch.

Thanks,
Andrei

> 
> > 
> > There's an individual wait queue for each mount, there can be multiple
> > waiters for a mount, they all should be woken up when the daemon signals
> > mount completion.
> > 
> >>
> >> Cc: Matthew Wilcox <mawil...@microsoft.com>
> >> Cc: Ian Kent <ra...@themaw.net>
> >> Cc: Andrew Morton <a...@linux-foundation.org>
> >> Cc: Stephen Rothwell <s...@canb.auug.org.au>
> >> Signed-off-by: Andrei Vagin <ava...@openvz.org>
> >> ---
> >>  fs/autofs4/waitq.c | 2 +-
> >>  1 file changed, 1 insertion(+), 1 deletion(-)
> >>
> >> diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
> >> index c160e9b3aa0f..be9c3dc048ab 100644
> >> --- a/fs/autofs4/waitq.c
> >> +++ b/fs/autofs4/waitq.c
> >> @@ -549,7 +549,7 @@ int autofs4_wait_release(struct autofs_sb_info *sbi, 
> >> autofs_wqt_t wait_queue_tok
> >>kfree(wq->name.name);
> >>wq->name.name = NULL;   /* Do not wait on this queue */
> >>wq->status = status;
> >> -  wake_up_interruptible(&wq->queue);
> >> +  wake_up(&wq->queue);
> >>if (!--wq->wait_ctr)
> >>kfree(wq);
> >>mutex_unlock(&sbi->wq_mutex);
> >>
> > 
> 


Re: [03/10] genksyms: generate lexer and parser during build instead of shipping

2018-03-30 Thread Andrei Vagin
On Sat, Mar 31, 2018 at 11:20:22AM +0900, Masahiro Yamada wrote:
> 2018-03-31 7:21 GMT+09:00 Andrei Vagin <ava...@virtuozzo.com>:
> > On Fri, Mar 30, 2018 at 10:40:22AM -0700, Andrei Vagin wrote:
> >> On Fri, Mar 23, 2018 at 10:04:32PM +0900, Masahiro Yamada wrote:
> >> > Now that the kernel build supports flex and bison, remove the _shipped
> >> > files and generate them during the build instead.
> >> >
> >> > There are no more shipped lexer and parser, so I ripped off the rules
> >> > in scripts/Makefile.lib that were used for REGENERATE_PARSERS.
> >> >
> >> > The genksyms parser has ambiguous grammar, which would emit warnings:
> >> >
> >> >  scripts/genksyms/parse.y: warning: 9 shift/reduce conflicts 
> >> > [-Wconflicts-sr]
> >> >  scripts/genksyms/parse.y: warning: 5 reduce/reduce conflicts 
> >> > [-Wconflicts-rr]
> >> >
> >> > They are normally suppressed, but displayed when W=1 is given.
> >> >
> >> > Signed-off-by: Masahiro Yamada <yamada.masah...@socionext.com>
> >> > ---
> >> >
> >> >  scripts/Makefile.lib |   24 +-
> >> >  scripts/genksyms/Makefile|   23 +
> >> >  scripts/genksyms/lex.lex.c_shipped   | 2291 
> >> > 
> >> >  scripts/genksyms/parse.tab.c_shipped | 2394 
> >> > --
> >> >  scripts/genksyms/parse.tab.h_shipped |  119 --
> >> >  5 files changed, 26 insertions(+), 4825 deletions(-)
> >> >  delete mode 100644 scripts/genksyms/lex.lex.c_shipped
> >> >  delete mode 100644 scripts/genksyms/parse.tab.c_shipped
> >> >  delete mode 100644 scripts/genksyms/parse.tab.h_shipped
> >> >
> >> > diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
> >> > index 2fde810..b7d2c97 100644
> >> > --- a/scripts/Makefile.lib
> >> > +++ b/scripts/Makefile.lib
> >> > @@ -183,14 +183,8 @@ endef
> >> >  quiet_cmd_flex = LEX $@
> >> >cmd_flex = $(LEX) -o$@ -L $<
> >> >
> >> > -ifdef REGENERATE_PARSERS
> >> > -.PRECIOUS: $(src)/%.lex.c_shipped
> >> > -$(src)/%.lex.c_shipped: $(src)/%.l
> >> > -   $(call cmd,flex)
> >> > -endif
> >> > -
> >> >  .PRECIOUS: $(obj)/%.lex.c
> >> > -$(filter %.lex.c,$(targets)): $(obj)/%.lex.c: $(src)/%.l FORCE
> >> > +$(obj)/%.lex.c: $(src)/%.l FORCE
> >> > $(call if_changed,flex)
> >> >
> >> >  # YACC
> >> > @@ -198,27 +192,15 @@ $(filter %.lex.c,$(targets)): $(obj)/%.lex.c: 
> >> > $(src)/%.l FORCE
> >> >  quiet_cmd_bison = YACC$@
> >> >cmd_bison = $(YACC) -o$@ -t -l $<
> >> >
> >> > -ifdef REGENERATE_PARSERS
> >> > -.PRECIOUS: $(src)/%.tab.c_shipped
> >> > -$(src)/%.tab.c_shipped: $(src)/%.y
> >> > -   $(call cmd,bison)
> >> > -endif
> >> > -
> >> >  .PRECIOUS: $(obj)/%.tab.c
> >> > -$(filter %.tab.c,$(targets)): $(obj)/%.tab.c: $(src)/%.y FORCE
> >> > +$(obj)/%.tab.c: $(src)/%.y FORCE
> >> > $(call if_changed,bison)
> >> >
> >> >  quiet_cmd_bison_h = YACC$@
> >> >cmd_bison_h = bison -o/dev/null --defines=$@ -t -l $<
> >> >
> >> > -ifdef REGENERATE_PARSERS
> >> > -.PRECIOUS: $(src)/%.tab.h_shipped
> >> > -$(src)/%.tab.h_shipped: $(src)/%.y
> >> > -   $(call cmd,bison_h)
> >> > -endif
> >> > -
> >> >  .PRECIOUS: $(obj)/%.tab.h
> >> > -$(filter %.tab.h,$(targets)): $(obj)/%.tab.h: $(src)/%.y FORCE
> >> > +$(obj)/%.tab.h: $(src)/%.y FORCE
> >> > $(call if_changed,bison_h)
> >> >
> >> >  # Shipped files
> >> > diff --git a/scripts/genksyms/Makefile b/scripts/genksyms/Makefile
> >> > index 0ccac51..f4749e8 100644
> >> > --- a/scripts/genksyms/Makefile
> >> > +++ b/scripts/genksyms/Makefile
> >> > @@ -5,9 +5,32 @@ always := $(hostprogs-y)
> >> >
> >> >  genksyms-objs  := genksyms.o parse.tab.o lex.lex.o
> >> >
> >> > +# FIXME: fix the ambiguous grammar in parse.y and delete this hack
> >> > +#
> >> > +# Suppress shift/reduce, reduce/reduce conflicts warnings
> >> > +# unless W=1 is specified.

[PATCH] autofs4: use wake_up() instead of wake_up_interruptible

2018-03-30 Thread Andrei Vagin
In "autofs4: use wait_event_killable",  wait_event_interruptible() was
replaced by wait_event_killable(), but in this case we have to use
wake_up() instead of wake_up_interruptible().

Cc: Matthew Wilcox <mawil...@microsoft.com>
Cc: Ian Kent <ra...@themaw.net>
Cc: Andrew Morton <a...@linux-foundation.org>
Cc: Stephen Rothwell <s...@canb.auug.org.au>
Signed-off-by: Andrei Vagin <ava...@openvz.org>
---
 fs/autofs4/waitq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index c160e9b3aa0f..be9c3dc048ab 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -549,7 +549,7 @@ int autofs4_wait_release(struct autofs_sb_info *sbi, 
autofs_wqt_t wait_queue_tok
kfree(wq->name.name);
wq->name.name = NULL;   /* Do not wait on this queue */
wq->status = status;
-   wake_up_interruptible(&wq->queue);
+   wake_up(&wq->queue);
if (!--wq->wait_ctr)
kfree(wq);
mutex_unlock(&sbi->wq_mutex);
-- 
2.13.6



Re: [PATCH v5 0/4] vm: add a syscall to map a process memory into a pipe

2018-02-28 Thread Andrei Vagin
On Wed, Feb 28, 2018 at 10:12:55AM +0300, Pavel Emelyanov wrote:
> On 02/27/2018 05:18 AM, Dmitry V. Levin wrote:
> > On Mon, Feb 26, 2018 at 12:02:25PM +0300, Pavel Emelyanov wrote:
> >> On 02/21/2018 03:44 AM, Andrew Morton wrote:
> >>> On Tue,  9 Jan 2018 08:30:49 +0200 Mike Rapoport 
> >>>  wrote:
> >>>
>  This patches introduces new process_vmsplice system call that combines
>  functionality of process_vm_read and vmsplice.
> >>>
> >>> All seems fairly straightforward.  The big question is: do we know that
> >>> people will actually use this, and get sufficient value from it to
> >>> justify its addition?
> >>
> >> Yes, that's what bothers us a lot too :) I've tried to start with finding 
> >> out if anyone 
> >> used the sys_read/write_process_vm() calls, but failed :( Does anybody 
> >> know how popular
> >> these syscalls are?
> > 
> > Well, process_vm_readv itself is quite popular, it's used by debuggers 
> > nowadays,
> > see e.g.
> > $ strace -qq -esignal=none -eprocess_vm_readv strace -qq -o/dev/null cat 
> > /dev/null
> 
> I see. Well, yes, this use-case will not benefit much from remote splice. How 
> about more
> interactive debug by, say, gdb? It may attach, then splice all the memory, 
> then analyze
> the victim code/data w/o copying it to its address space?

Hmm, in this case, you probably will want to be able to map pipe pages
into memory.

> 
> -- Pavel


Re: [PATCH v5 0/4] vm: add a syscall to map a process memory into a pipe

2018-02-27 Thread Andrei Vagin
On Tue, Feb 27, 2018 at 05:18:18AM +0300, Dmitry V. Levin wrote:
> On Mon, Feb 26, 2018 at 12:02:25PM +0300, Pavel Emelyanov wrote:
> > On 02/21/2018 03:44 AM, Andrew Morton wrote:
> > > On Tue,  9 Jan 2018 08:30:49 +0200 Mike Rapoport 
> > >  wrote:
> > > 
> > >> This patches introduces new process_vmsplice system call that combines
> > >> functionality of process_vm_read and vmsplice.
> > > 
> > > All seems fairly straightforward.  The big question is: do we know that
> > > people will actually use this, and get sufficient value from it to
> > > justify its addition?
> > 
> > Yes, that's what bothers us a lot too :) I've tried to start with finding 
> > out if anyone 
> > used the sys_read/write_process_vm() calls, but failed :( Does anybody know 
> > how popular
> > these syscalls are?
> 
> Well, process_vm_readv itself is quite popular, it's used by debuggers 
> nowadays,
> see e.g.
> $ strace -qq -esignal=none -eprocess_vm_readv strace -qq -o/dev/null cat 
> /dev/null

For this case, there is no advantage from process_vmsplice().

But it can significantly optimize the process of generating a core file.
In this case, we need to read a process's memory and save its content into a
file. process_vmsplice() allows doing this more efficiently than
process_vm_readv(), because it doesn't copy data into userspace.

Here is part of an strace log showing how gdb saves memory content into a core file:

10593 open("/proc/10193/mem", O_RDONLY|O_CLOEXEC) = 17
10593 pread64(17, ""..., 1048576, 
140009356111872) = 1048576
10593 close(17) = 0
10593 write(16, ""..., 4096) = 4096
10593 write(16, ""..., 1044480) = 1044480
10593 open("/proc/10193/mem", O_RDONLY|O_CLOEXEC) = 17
10593 pread64(17, ""..., 1048576, 
140009357160448) = 1048576
10593 close(17) = 0
10593 write(16, ""..., 4096) = 4096
10593 write(16, ""..., 1044480) = 1044480
10593 open("/proc/10193/mem", O_RDONLY|O_CLOEXEC) = 17
10593 pread64(17, ""..., 1048576, 
140009358209024) = 1048576
10593 close(17) = 0
10593 write(16, ""..., 4096) = 4096
10593 write(16, ""..., 1044480) = 1044480
10593 open("/proc/10193/mem", O_RDONLY|O_CLOEXEC) = 17
10593 pread64(17, ""..., 1048576, 
140009359257600) = 1048576
10593 close(17)

It is strange that process_vm_readv() isn't used and that
/proc/10193/mem is opened many times.

BTW: "strace -fo strace-gdb.log gdb -p PID" doesn't work properly.

Thanks,
Andrei

> 
> 
> -- 
> ldv




Re: [tip:x86/boot] x86/boot/compressed/64: Handle 5-level paging boot if kernel is above 4G

2018-02-13 Thread Andrei Vagin
On Tue, Feb 13, 2018 at 12:02:49PM +0300, Kirill A. Shutemov wrote:
> On Tue, Feb 13, 2018 at 12:41:22AM -0800, Andrei Vagin wrote:
> > On Tue, Feb 13, 2018 at 11:08:16AM +0300, Kirill A. Shutemov wrote:
> > > On Mon, Feb 12, 2018 at 10:51:56PM -0800, Andrei Vagin wrote:
> > > > Hi Kirill,
> > > > 
> > > > Something is wrong in this patch.
> 
> Could you please check if this makes a difference?

The kernel booted with this patch. Thanks!
https://travis-ci.org/avagin/linux/jobs/341030882

> 
> diff --git a/arch/x86/boot/compressed/head_64.S 
> b/arch/x86/boot/compressed/head_64.S
> index 70b30f2bc9e0..99a0e7993252 100644
> --- a/arch/x86/boot/compressed/head_64.S
> +++ b/arch/x86/boot/compressed/head_64.S
> @@ -332,7 +332,7 @@ ENTRY(startup_64)
>  
>   /* Make sure we have GDT with 32-bit code segment */
>   leaqgdt(%rip), %rax
> - movl%eax, gdt64+2(%rip)
> + movq%rax, gdt64+2(%rip)
>   lgdtgdt64(%rip)
>  
>   /*
> -- 
>  Kirill A. Shutemov


Re: [tip:x86/boot] x86/boot/compressed/64: Handle 5-level paging boot if kernel is above 4G

2018-02-13 Thread Andrei Vagin
On Tue, Feb 13, 2018 at 11:08:16AM +0300, Kirill A. Shutemov wrote:
> On Mon, Feb 12, 2018 at 10:51:56PM -0800, Andrei Vagin wrote:
> > Hi Kirill,
> > 
> > Something is wrong in this patch.
> 
> Was it bisected to exactly this patch? Is the previous one fine?

Yes. Yes.
> 
> > We regularly run CRIU tests on linux-next, and yesterday I found that a
> > kernel didn't boot. We run these tests in Travis-CI, and we don't have
> > access to kernel logs. I tried to reproduce the problem locally, but I
> > failed.
> 
> Do you know anything about host kernel which handles kexec?


Distributor ID: Ubuntu
Description:Ubuntu 14.04.5 LTS
Release:14.04
Codename:   trusty
Linux Version   4.4.0-51-generic

+ uname -a
Linux travis-job-43f4b617-65d3-4621-bd05-911efa0d69df 4.4.0-51-generic 
#72~14.04.1-Ubuntu SMP Thu Nov 24 19:22:30 UTC 2016 x86_64 x86_64 x86_64 
GNU/Linux

> 
> -- 
>  Kirill A. Shutemov


Re: [tip:x86/boot] x86/boot/compressed/64: Handle 5-level paging boot if kernel is above 4G

2018-02-12 Thread Andrei Vagin
Hi Kirill,

Something is wrong in this patch. We regularly run CRIU tests on
linux-next, and yesterday I found that a kernel didn't boot. We run these
tests in Travis-CI, and we don't have access to kernel logs. I tried to
reproduce the problem locally, but I failed.

In Travis-CI, we build the kernel, then dump the travis daemon, boot the
kernel with the help of kexec and restore the travis daemon.

Here are the logs without this patch:
https://travis-ci.org/avagin/linux/jobs/340820418

Here are the logs with this patch:
https://travis-ci.org/avagin/linux/jobs/340820584

Thanks,
Andrei

On Sun, Feb 11, 2018 at 04:20:04AM -0800, tip-bot for Jacob Shin wrote:
> Commit-ID:  b4b56015ed1c98cbc9469e35ebbc4373a2844030
> Gitweb: 
> https://git.kernel.org/tip/b4b56015ed1c98cbc9469e35ebbc4373a2844030
> Author: Kirill A. Shutemov 
> AuthorDate: Fri, 9 Feb 2018 17:22:28 +0300
> Committer:  Ingo Molnar 
> CommitDate: Sun, 11 Feb 2018 12:36:19 +0100
> 
> x86/boot/compressed/64: Handle 5-level paging boot if kernel is above 4G
> 
> This patch addresses a shortcoming in current boot process on machines
> that supports 5-level paging.
> 
> If a bootloader enables 64-bit mode with 4-level paging, we might need to
> switch over to 5-level paging. The switching requires the disabling
> paging. It works fine if kernel itself is loaded below 4G.
> 
> But if the bootloader put the kernel above 4G (not sure if anybody does
> this), we would lose control as soon as paging is disabled, because the
> code becomes unreachable to the CPU.
> 
> This patch implements a trampoline in lower memory to handle this
> situation.
> 
> We only need the memory for a very short time, until the main kernel
> image sets up own page tables.
> 
> We go through the trampoline even if we don't have to: if we're already
> in 5-level paging mode or if we don't need to switch to it. This way the
> trampoline gets tested on every boot.
> 
> Signed-off-by: Kirill A. Shutemov 
> Cc: Andy Lutomirski 
> Cc: Borislav Petkov 
> Cc: Cyrill Gorcunov 
> Cc: Linus Torvalds 
> Cc: Matthew Wilcox 
> Cc: Peter Zijlstra 
> Cc: Thomas Gleixner 
> Cc: linux...@kvack.org
> Link: 
> http://lkml.kernel.org/r/20180209142228.21231-5-kirill.shute...@linux.intel.com
> Signed-off-by: Ingo Molnar 
> ---
>  arch/x86/boot/compressed/head_64.S | 127 
> ++---
>  1 file changed, 89 insertions(+), 38 deletions(-)
> 
> diff --git a/arch/x86/boot/compressed/head_64.S 
> b/arch/x86/boot/compressed/head_64.S
> index af9ffbd..70b30f2 100644
> --- a/arch/x86/boot/compressed/head_64.S
> +++ b/arch/x86/boot/compressed/head_64.S
> @@ -307,13 +307,34 @@ ENTRY(startup_64)
>  
>   /*
>* At this point we are in long mode with 4-level paging enabled,
> -  * but we want to enable 5-level paging.
> +  * but we might want to enable 5-level paging or vice versa.
>*
> -  * The problem is that we cannot do it directly. Setting LA57 in
> -  * long mode would trigger #GP. So we need to switch off long mode
> -  * first.
> +  * The problem is that we cannot do it directly. Setting or clearing
> +  * CR4.LA57 in long mode would trigger #GP. So we need to switch off
> +  * long mode and paging first.
> +  *
> +  * We also need a trampoline in lower memory to switch over from
> +  * 4- to 5-level paging for cases when the bootloader puts the kernel
> +  * above 4G, but didn't enable 5-level paging for us.
> +  *
> +  * The same trampoline can be used to switch from 5- to 4-level paging
> +  * mode, like when starting 4-level paging kernel via kexec() when
> +  * original kernel worked in 5-level paging mode.
> +  *
> +  * For the trampoline, we need the top page table to reside in lower
> +  * memory as we don't have a way to load 64-bit values into CR3 in
> +  * 32-bit mode.
> +  *
> +  * We go though the trampoline even if we don't have to: if we're
> +  * already in a desired paging mode. This way the trampoline code gets
> +  * tested on every boot.
>*/
>  
> + /* Make sure we have GDT with 32-bit code segment */
> + leaqgdt(%rip), %rax
> + movl%eax, gdt64+2(%rip)
> + lgdtgdt64(%rip)
> +
>   /*
>* paging_prepare() sets up the trampoline and checks if we need to
>* enable 5-level paging.
> @@ -331,30 +352,20 @@ ENTRY(startup_64)
>   /* Save the trampoline address in RCX */
>   movq%rax, %rcx
>  
> - /* Check if we need to enable 5-level paging */
> - cmpq$0, %rdx
> - jz  lvl5
> -
> - /* Clear additional page table */
> - leaqlvl5_pgtable(%rbx), %rdi
> - xorq%rax, %rax
> - movq$(PAGE_SIZE/8), %rcx
> - rep   

Re: [PATCH v2 00/31] Replacing net_mutex with rw_semaphore

2018-01-18 Thread Andrei Vagin
On Mon, Nov 20, 2017 at 09:32:08PM +0300, Kirill Tkhai wrote:
> Hi,
> 
> there is the second version of patchset introducing net_sem
> instead of net_mutex. The patchset adds net_sem in addition
> to net_mutex and allows pernet_operations to be async. This
> flag means, the pernet_operations methods are safe to be
> executed with any other pernet_operations (un)initializing
> another net.
> 
> If there are only async pernet_operations in the system,
> net_mutex is not used either for setup_net() or for cleanup_net().
> 
> The flag is a little easier than (un)register_pernet_sys(),
> as it changes one line only. Also, it requires less changes
> in code. In future, when all pernet_operations are async,
> we'll just remove this struct field.
> 
> The pernet_operations converted in this patchset allow
> to create minimal .config to have network working, and
> the changes improve the performance like you may see
> below:
> 
> %for i in {1..1}; do unshare -n bash -c exit; done
> 
> *before*
> real 1m40,377s
> user 0m9,672s
> sys 0m19,928s
> 
> *after*
> real 0m17,007s
>     user 0m5,311s
> sys 0m11,779
> 
> (5.8 times faster)

Good job!

Acked-by: Andrei Vagin <ava...@virtuozzo.com>

> ---
> 
> Kirill Tkhai (31):
>   net: Assign net to net_namespace_list in setup_net()
>   net: Cleanup copy_net_ns()
>   net: Introduce net_sem for protection of pernet_list
>   net: Move mutex_unlock() in cleanup_net() up
>   net: Allow pernet_operations to be executed in parallel
>   net: Convert proc_net_ns_ops
>   net: Convert net_ns_ops methods
>   net: Convert sysctl_pernet_ops
>   net: Convert netfilter_net_ops
>   net: Convert nf_log_net_ops
>   net: Convert net_inuse_ops
>   net: Convert net_defaults_ops
>   net: Convert netlink_net_ops
>   net: Convert rtnetlink_net_ops
>   net: Convert audit_net_ops
>   net: Convert uevent_net_ops
>   net: Convert proto_net_ops
>   net: Convert pernet_subsys ops, registered via net_dev_init()
>   net: Convert fib_* pernet_operations, registered via subsys_initcall
>   net: Convert subsys_initcall() registered pernet_operations from 
> net/sched
>   net: Convert genl_pernet_ops
>   net: Convert wext_pernet_ops
>   net: Convert sysctl_core_ops
>   net: Convert pernet_subsys, registered from inet_init()
>   net: Convert unix_net_ops
>   net: Convert packet_net_ops
>   net: Convert ipv4_sysctl_ops
>   net: Convert addrconf_ops
>   net: Convert loopback_net_ops
>   net: Convert default_device_ops
>   net: Convert diag_net_ops
> 
> 
>  drivers/net/loopback.c  |1 
>  fs/proc/proc_net.c  |1 
>  include/linux/rtnetlink.h   |1 
>  include/net/net_namespace.h |6 +++
>  kernel/audit.c  |1 
>  lib/kobject_uevent.c|1 
>  net/core/dev.c  |2 +
>  net/core/fib_notifier.c |1 
>  net/core/fib_rules.c|1 
>  net/core/net-procfs.c   |2 +
>  net/core/net_namespace.c|   94 
> +--
>  net/core/rtnetlink.c|5 +-
>  net/core/sock.c |2 +
>  net/core/sock_diag.c|1 
>  net/core/sysctl_net_core.c  |1 
>  net/ipv4/af_inet.c  |2 +
>  net/ipv4/arp.c  |1 
>  net/ipv4/devinet.c  |1 
>  net/ipv4/fib_frontend.c |1 
>  net/ipv4/icmp.c |1 
>  net/ipv4/igmp.c |1 
>  net/ipv4/ip_fragment.c  |1 
>  net/ipv4/ipmr.c |1 
>  net/ipv4/ping.c |1 
>  net/ipv4/proc.c |1 
>  net/ipv4/raw.c  |1 
>  net/ipv4/route.c|4 ++
>  net/ipv4/sysctl_net_ipv4.c  |1 
>  net/ipv4/tcp_ipv4.c |2 +
>  net/ipv4/tcp_metrics.c  |1 
>  net/ipv4/udp.c  |1 
>  net/ipv4/udplite.c  |1 
>  net/ipv4/xfrm4_policy.c |1 
>  net/ipv6/addrconf.c |1 
>  net/netfilter/core.c|1 
>  net/netfilter/nf_log.c  |1 
>  net/netlink/af_netlink.c|1 
>  net/netlink/genetlink.c |1 
>  net/packet/af_packet.c  |1 
>  net/sched/act_api.c |1 
>  net/sched/sch_api.c |1 
>  net/sysctl_net.c|1 
>  net/unix/af_unix.c  |1 
>  net/wireless/wext-core.c|1 
>  net/xfrm/xfrm_policy.c  |1 
>  45 files changed, 114 insertions(+), 41 deletions(-)
> 
> --
> Signed-off-by: Kirill Tkhai <ktk...@virtuozzo.com>


Re: [PATCH net-next] net: core: Expose number of link up/down transitions

2018-01-17 Thread Andrei Vagin
On Thu, Jan 18, 2018 at 01:06:52AM +0100, Andrew Lunn wrote:
> > What is the idea behind having two separate counters? Can the delta
> > between them be bigger than 1?
> 
> Yes, it can.
> 
> These counters are incremented in netif_carrier_on() /
> netif_carrier_off(). They are not always called in pairs and they can
> be called multiple times for the same event. The phylib will call them
> when it notices the PHY saying the link is down/up, and the MAC driver
> sometimes also calls them.

We check the __LINK_STATE_NOCARRIER bit before changing these counters,
so if we call netif_carrier_on() twice, the counter will be incremented
only by one, will it not?

void netif_carrier_on(struct net_device *dev)
if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
atomic_inc(&dev->carrier_changes);

...

void netif_carrier_off(struct net_device *dev)
if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
atomic_inc(&dev->carrier_changes);


> 
> Andrew


Re: [PATCH net-next] net: core: Expose number of link up/down transitions

2018-01-17 Thread Andrei Vagin
On Wed, Jan 17, 2018 at 03:06:57PM -0800, Florian Fainelli wrote:
> From: David Decotigny 
> 
> Expose the number of times the link has been going UP or DOWN, and
> update the "carrier_changes" counter to be the sum of these two events.
> While at it, also update the sysfs-class-net documentation to cover:
> carrier_changes (3.15), count_link_up (4.16) and count_link_down (4.16)

What is the idea behind having two separate counters? Can the delta
between them be bigger than 1?

> 
> Signed-off-by: David Decotigny 
> [Florian:
> * rebase
> * add documentation
> * merge carrier_changes with up/down counters]
> Signed-off-by: Florian Fainelli 
> ---
>  Documentation/ABI/testing/sysfs-class-net | 24 
>  include/linux/netdevice.h |  6 --
>  include/uapi/linux/if_link.h  |  2 ++
>  net/core/net-sysfs.c  | 23 ++-
>  net/core/rtnetlink.c  | 13 +++--
>  net/sched/sch_generic.c   |  4 ++--
>  6 files changed, 65 insertions(+), 7 deletions(-)
> 
> diff --git a/Documentation/ABI/testing/sysfs-class-net 
> b/Documentation/ABI/testing/sysfs-class-net
> index 6856da99b6f7..e4b0d5157305 100644
> --- a/Documentation/ABI/testing/sysfs-class-net
> +++ b/Documentation/ABI/testing/sysfs-class-net
> @@ -259,3 +259,27 @@ Contact: net...@vger.kernel.org
>  Description:
>   Symbolic link to the PHY device this network device is attached
>   to.
> +
> +What:/sys/class/net/ +Date:Mar 2014
> +KernelVersion:   3.15
> +Contact: net...@vger.kernel.org
> +Description:
> + 32-bit unsigned integer counting the number of times the link 
> has
> + seen a change from UP to DOWN and vice versa
> +
> +What:/sys/class/net/ +Date:Jan 2018
> +KernelVersion:   4.16
> +Contact: net...@vger.kernel.org
> +Description:
> + 32-bit unsigned integer counting the number of times the link 
> has
> + been up
> +
> +What:/sys/class/net/ +Date:Jan 2018
> +KernelVersion:   4.16
> +Contact: net...@vger.kernel.org
> +Description:
> + 32-bit unsigned integer counting the number of times the link 
> has
> + been down
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index ed0799a12bf2..28f68f7513d0 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1680,8 +1680,6 @@ struct net_device {
>   unsigned long   base_addr;
>   int irq;
>  
> - atomic_tcarrier_changes;
> -
>   /*
>*  Some hardware also needs these fields (state,dev_list,
>*  napi_list,unreg_list,close_list) but they are not
> @@ -1719,6 +1717,10 @@ struct net_device {
>   atomic_long_t   tx_dropped;
>   atomic_long_t   rx_nohandler;
>  
> + /* Stats to monitor link on/off, flapping */
> + atomic_tcount_link_up;
> + atomic_tcount_link_down;
> +
>  #ifdef CONFIG_WIRELESS_EXT
>   const struct iw_handler_def *wireless_handlers;
>   struct iw_public_data   *wireless_data;
> diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
> index f8f04fed6186..6e44b0674ba4 100644
> --- a/include/uapi/linux/if_link.h
> +++ b/include/uapi/linux/if_link.h
> @@ -161,6 +161,8 @@ enum {
>   IFLA_EVENT,
>   IFLA_NEW_NETNSID,
>   IFLA_IF_NETNSID,
> + IFLA_COUNT_LINK_UP,
> + IFLA_COUNT_LINK_DOWN,
>   __IFLA_MAX
>  };
>  
> diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
> index 7bf8b85ade16..9f732c3dc2ce 100644
> --- a/net/core/net-sysfs.c
> +++ b/net/core/net-sysfs.c
> @@ -295,10 +295,29 @@ static ssize_t carrier_changes_show(struct device *dev,
>   struct net_device *netdev = to_net_dev(dev);
>  
>   return sprintf(buf, fmt_dec,
> -atomic_read(>carrier_changes));
> +atomic_read(>count_link_up) +
> +atomic_read(>count_link_down));
>  }
>  static DEVICE_ATTR_RO(carrier_changes);
>  
> +static ssize_t count_link_up_show(struct device *dev,
> +   struct device_attribute *attr, char *buf)
> +{
> + struct net_device *netdev = to_net_dev(dev);
> +
> + return sprintf(buf, fmt_dec, atomic_read(>count_link_up));
> +}
> +static DEVICE_ATTR_RO(count_link_up);
> +
> +static ssize_t count_link_down_show(struct device *dev,
> + struct device_attribute *attr, char *buf)
> +{
> + struct net_device *netdev = to_net_dev(dev);
> +
> + return sprintf(buf, fmt_dec, atomic_read(>count_link_down));
> +}
> +static DEVICE_ATTR_RO(count_link_down);
> +
>  /* read-write attributes */
>  
>  static int change_mtu(struct net_device *dev, unsigned long 

Re: [PATCH v2 03/31] net: Introduce net_sem for protection of pernet_list

2018-01-17 Thread Andrei Vagin
On Mon, Nov 20, 2017 at 09:32:34PM +0300, Kirill Tkhai wrote:
> Currently mutex is used to protect pernet operations list. It makes
> cleanup_net() to execute ->exit methods of the same operations set,
> which was used on the time of ->init, even after net namespace is
> unlinked from net_namespace_list.
> 
> But the problem is it's need to synchronize_rcu() after net is removed
> from net_namespace_list():
> 
> Destroy net_ns:
> cleanup_net()
>   mutex_lock(_mutex)
>   list_del_rcu(>list)
>   synchronize_rcu()  <--- Sleep there for ages
>   list_for_each_entry_reverse(ops, _list, list)
> ops_exit_list(ops, _exit_list)
>   list_for_each_entry_reverse(ops, _list, list)
> ops_free_list(ops, _exit_list)
>   mutex_unlock(_mutex)
> 
> This primitive is not fast, especially on the systems with many processors
> and/or when preemptible RCU is enabled in config. So, all the time, while
> cleanup_net() is waiting for RCU grace period, creation of new net namespaces
> is not possible, the tasks, who makes it, are sleeping on the same mutex:
> 
> Create net_ns:
> copy_net_ns()
>   mutex_lock_killable(_mutex)<--- Sleep there for ages
> 
> I observed 20-30 seconds hangs of "unshare -n" on ordinary 8-cpu laptop
> with preemptible RCU enabled.
> 
> The solution is to convert net_mutex to the rw_semaphore and add small locks
> to really small number of pernet_operations, what really need them. Then,
> pernet_operations::init/::exit methods, modifying the net-related data,
> will require down_read() locking only, while down_write() will be used
> for changing pernet_list.
> 
> This gives signify performance increase, after all patch set is applied,
> like you may see here:
> 
> %for i in {1..1}; do unshare -n bash -c exit; done
> 
> *before*
> real 1m40,377s
> user 0m9,672s
> sys 0m19,928s
> 
> *after*
> real 0m17,007s
> user 0m5,311s
> sys 0m11,779
> 
> (5.8 times faster)
> 
> This patch starts replacing net_mutex to net_sem. It adds rw_semaphore,
> describes the variables it protects, and makes to use where appropriate.
> net_mutex is still present, and next patches will kick it out step-by-step.
> 
> Signed-off-by: Kirill Tkhai 
> ---
>  include/linux/rtnetlink.h |1 +
>  net/core/net_namespace.c  |   39 ++-
>  net/core/rtnetlink.c  |4 ++--
>  3 files changed, 29 insertions(+), 15 deletions(-)
> 
> diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
> index 2032ce2eb20b..f640fc87fe1d 100644
> --- a/include/linux/rtnetlink.h
> +++ b/include/linux/rtnetlink.h
> @@ -35,6 +35,7 @@ extern int rtnl_is_locked(void);
>  
>  extern wait_queue_head_t netdev_unregistering_wq;
>  extern struct mutex net_mutex;
> +extern struct rw_semaphore net_sem;
>  
>  #ifdef CONFIG_PROVE_LOCKING
>  extern bool lockdep_rtnl_is_held(void);
> diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
> index 2e512965bf42..859dce31e37e 100644
> --- a/net/core/net_namespace.c
> +++ b/net/core/net_namespace.c
> @@ -41,6 +41,11 @@ struct net init_net = {

> static LIST_HEAD(pernet_list);
> static struct list_head *first_device = _list;
> DEFINE_MUTEX(net_mutex);

With all patches applied, we still have the net_mutex. I think we need to
add a comment which explains why we need it. Are "sync" pernet operations
deprecated after this series? Or is it ok to have them?


>  EXPORT_SYMBOL(init_net);
>  
>  static bool init_net_initialized;
> +/*
> + * net_sem: protects: pernet_list, net_generic_ids,
> + * init_net_initialized and first_device pointer.
> + */
> +DECLARE_RWSEM(net_sem);
>  
>  #define MIN_PERNET_OPS_ID\
>   ((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *))
> @@ -279,7 +284,7 @@ struct net *get_net_ns_by_id(struct net *net, int id)
>   */
>  static __net_init int setup_net(struct net *net, struct user_namespace 
> *user_ns)
>  {
> - /* Must be called with net_mutex held */
> + /* Must be called with net_sem held */
>   const struct pernet_operations *ops, *saved_ops;
>   int error = 0;
>   LIST_HEAD(net_exit_list);
> @@ -411,12 +416,16 @@ struct net *copy_net_ns(unsigned long flags,
>   net->ucounts = ucounts;
>   get_user_ns(user_ns);
>  
> - rv = mutex_lock_killable(_mutex);
> + rv = down_read_killable(_sem);
>   if (rv < 0)
>   goto put_userns;
> -
> + rv = mutex_lock_killable(_mutex);
> + if (rv < 0)
> + goto up_read;
>   rv = setup_net(net, user_ns);
>   mutex_unlock(_mutex);
> +up_read:
> + up_read(_sem);
>   if (rv < 0) {
>  put_userns:
>   put_user_ns(user_ns);
> @@ -443,6 +452,7 @@ static void cleanup_net(struct work_struct *work)
>   list_replace_init(_list, _kill_list);
>   spin_unlock_irq(_list_lock);
>  
> + down_read(_sem);
>   mutex_lock(_mutex);
>  
>   /* Don't let anyone else find us. */
> @@ -484,6 +494,7 @@ static void 

Re: [PATCH v2 05/31] net: Allow pernet_operations to be executed in parallel

2018-01-17 Thread Andrei Vagin
On Mon, Nov 20, 2017 at 09:32:55PM +0300, Kirill Tkhai wrote:
> This adds new pernet_operations::async flag to indicate operations,
> which ->init(), ->exit() and ->exit_batch() methods are allowed
> to be executed in parallel with the methods of any other pernet_operations.
> 
> When there are only asynchronous pernet_operations in the system,
> net_mutex won't be taken for a net construction and destruction.
> 
> Also, remove BUG_ON(mutex_is_locked()) from net_assign_generic()
> without replacing with the equivalent net_sem check, as there is
> one more lockdep assert below.
> 
> Suggested-by: Eric W. Biederman 
> Signed-off-by: Kirill Tkhai 
> ---
>  include/net/net_namespace.h |6 ++
>  net/core/net_namespace.c|   29 +++--
>  2 files changed, 25 insertions(+), 10 deletions(-)
> 
> diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
> index 10f99dafd5ac..db978c4755f7 100644
> --- a/include/net/net_namespace.h
> +++ b/include/net/net_namespace.h
> @@ -303,6 +303,12 @@ struct pernet_operations {
>   void (*exit_batch)(struct list_head *net_exit_list);
>   unsigned int *id;
>   size_t size;
> + /*
> +  * Indicates above methods are allowe to be executed in parallel
> +  * with methods of any other pernet_operations, i.e. they are not
> +  * need synchronization via net_mutex.
> +  */
> + bool async;
>  };
>  
>  /*
> diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
> index c4f7452906bb..550c766f73aa 100644
> --- a/net/core/net_namespace.c
> +++ b/net/core/net_namespace.c
> @@ -41,8 +41,9 @@ struct net init_net = {
>  EXPORT_SYMBOL(init_net);
>  
>  static bool init_net_initialized;
> +static unsigned nr_sync_pernet_ops;
>  /*
> - * net_sem: protects: pernet_list, net_generic_ids,
> + * net_sem: protects: pernet_list, net_generic_ids, nr_sync_pernet_ops,
>   * init_net_initialized and first_device pointer.
>   */
>  DECLARE_RWSEM(net_sem);
> @@ -70,11 +71,10 @@ static int net_assign_generic(struct net *net, unsigned 
> int id, void *data)
>  {
>   struct net_generic *ng, *old_ng;
>  
> - BUG_ON(!mutex_is_locked(_mutex));
>   BUG_ON(id < MIN_PERNET_OPS_ID);
>  
>   old_ng = rcu_dereference_protected(net->gen,
> -lockdep_is_held(_mutex));
> +lockdep_is_held(_sem));
>   if (old_ng->s.len > id) {
>   old_ng->ptr[id] = data;
>   return 0;
> @@ -419,11 +419,14 @@ struct net *copy_net_ns(unsigned long flags,
>   rv = down_read_killable(_sem);
>   if (rv < 0)
>   goto put_userns;
> - rv = mutex_lock_killable(_mutex);
> - if (rv < 0)
> - goto up_read;
> + if (nr_sync_pernet_ops) {
> + rv = mutex_lock_killable(_mutex);
> + if (rv < 0)
> + goto up_read;
> + }
>   rv = setup_net(net, user_ns);
> - mutex_unlock(_mutex);
> + if (nr_sync_pernet_ops)
> + mutex_unlock(_mutex);
>  up_read:
>   up_read(_sem);
>   if (rv < 0) {
> @@ -453,7 +456,8 @@ static void cleanup_net(struct work_struct *work)
>   spin_unlock_irq(_list_lock);
>  
>   down_read(_sem);
> - mutex_lock(_mutex);
> + if (nr_sync_pernet_ops)
> + mutex_lock(_mutex);
>  
>   /* Don't let anyone else find us. */
>   rtnl_lock();
> @@ -489,7 +493,8 @@ static void cleanup_net(struct work_struct *work)
>   list_for_each_entry_reverse(ops, _list, list)
>   ops_exit_list(ops, _exit_list);
>  
> - mutex_unlock(_mutex);
> + if (nr_sync_pernet_ops)
> + mutex_unlock(_mutex);
>  
>   /* Free the net generic variables */
>   list_for_each_entry_reverse(ops, _list, list)
> @@ -961,6 +966,9 @@ static int register_pernet_operations(struct list_head 
> *list,
>   rcu_barrier();
>   if (ops->id)
>   ida_remove(_generic_ids, *ops->id);
> + } else if (!ops->async) {
> + pr_info_once("Pernet operations %ps are sync.\n", ops);

As far as I understand, we have this sync mode for backward
compatibility with non-upstream modules, don't we? If the answer is yes,
it may be better to add WARN_ONCE here?

> + nr_sync_pernet_ops++;
>   }
>  
>   return error;
> @@ -968,7 +976,8 @@ static int register_pernet_operations(struct list_head 
> *list,
>  
>  static void unregister_pernet_operations(struct pernet_operations *ops)
>  {
> - 
> + if (!ops->async)
> + BUG_ON(nr_sync_pernet_ops-- == 0);
>   __unregister_pernet_operations(ops);
>   rcu_barrier();
>   if (ops->id)
> 


Re: [v7, 05/11] x86/retpoline/entry: Convert entry assembler indirect jumps

2018-01-09 Thread Andrei Vagin
On Tue, Jan 09, 2018 at 08:39:21PM -0800, Dave Hansen wrote:
> On 01/09/2018 08:30 PM, Andi Kleen wrote:
> > On Tue, Jan 09, 2018 at 07:54:08PM -0800, Andrei Vagin wrote:
> >>
> >> In my test environment, the kernel with this patch crashes.
> > 
> > I posted a patch for this.
> 
> It's called:
> 
> [PATCH v3 2/3] x86/retpoline: Use better sequences for NOSPEC_CALL/JMP
> 
> right, Andi?

This patch fixed the problem. Thanks!

> 
> BTW, that's a fun oops.  It looks like it's call'ing (or jumping) to an
> address that's used by cpu_entry_area->exception_stacks[1].


Re: [v7, 05/11] x86/retpoline/entry: Convert entry assembler indirect jumps

2018-01-09 Thread Andrei Vagin

Hi,

In my test environment, the kernel with this patch crashes.

https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?h=x86/pti=f3433c1010c6af61c9897f0f0447f81b991feac1

My config and a kernel log are attached.

[1.985901] Freeing unused kernel memory: 524K
[1.987505] rodata_test: all tests were successful
[2.019787] kernel tried to execute NX-protected page - exploit attempt? 
(uid: 0)
[2.023023] BUG: unable to handle kernel paging request at fe007000
[2.027524] IP: 0xfe007000
[2.029371] PGD 13ffda067 P4D 13ffda067 PUD 13ffcf067 PMD 13ffce067 PTE 
80013fc09063
[2.032847] Oops: 0011 [#1] SMP PTI
[2.034598] Modules linked in:
[2.036420] CPU: 0 PID: 1 Comm: init Not tainted 4.14.0-00209-gf3433c1010c6 
#7
[2.039005] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
1.10.2-1.fc26 04/01/2014
[2.041685] task: 96dc3ab88000 task.stack: b7aa4062c000
[2.044508] RIP: 0010:0xfe007000
[2.046511] RSP: 0018:b7aa4062ffd0 EFLAGS: 00010082
[2.049264] RAX: 000c RBX: 0001 RCX: 7fe8a02f8889
[2.054181] RDX: 004d RSI: 0041 RDI: b9a00010
[2.056528] RBP: 55b931deb040 R08: 0008 R09: 7fe8a02fdfc4
[2.058622] R10:  R11: 0246 R12: 0009
[2.060367] R13: 7fe8a02df3a0 R14: 0001 R15: 1000
[2.062107] FS:  () GS:96dc3fc0() 
knlGS:
[2.063949] CS:  0010 DS:  ES:  CR0: 80050033
[2.065207] CR2: fe007000 CR3: 0001393d2002 CR4: 003606f0
[2.066768] DR0:  DR1:  DR2: 
[2.068190] DR3:  DR6: fffe0ff0 DR7: 0400
[2.069406] Call Trace:
[2.069864] Code: 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 
90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 <00> 00 
00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[2.073535] RIP: 0xfe007000 RSP: b7aa4062ffd0
[2.074540] CR2: fe007000
[2.075165] ---[ end trace 394308f539cb80d2 ]--

Thanks,
Andrei

On Tue, Jan 09, 2018 at 02:43:11PM +, Woodhouse, David wrote:
> Convert indirect jumps in core 32/64bit entry assembler code to use
> non-speculative sequences when CONFIG_RETPOLINE is enabled.
> 
> Don't use CALL_NOSPEC in entry_SYSCALL_64_fastpath because the return
> address after the 'call' instruction must be *precisely* at the
> .Lentry_SYSCALL_64_after_fastpath label for stub_ptregs_64 to work,
> and the use of alternatives will mess that up unless we play horrid
> games to prepend with NOPs and make the variants the same length. It's
> not worth it; in the case where we ALTERNATIVE out the retpoline, the
> first instruction at __x86.indirect_thunk.rax is going to be a bare
> jmp *%rax anyway.
> 
> Signed-off-by: David Woodhouse 
> Acked-By: Arjan van de Ven 
> Acked-by: Ingo Molnar 
> Cc: gno...@lxorguk.ukuu.org.uk
> Cc: Rik van Riel 
> Cc: Andi Kleen 
> Cc: Peter Zijlstra 
> Cc: Linus Torvalds 
> Cc: Jiri Kosina 
> Cc: Andy Lutomirski 
> Cc: Dave Hansen 
> Cc: Kees Cook 
> Cc: Tim Chen 
> Cc: Greg Kroah-Hartman 
> Cc: Paul Turner 
> ---
>  arch/x86/entry/entry_32.S |  5 +++--
>  arch/x86/entry/entry_64.S | 12 +---
>  2 files changed, 12 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
> index ace8f32..a1f28a5 100644
> --- a/arch/x86/entry/entry_32.S
> +++ b/arch/x86/entry/entry_32.S
> @@ -44,6 +44,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>   .section .entry.text, "ax"
>  
> @@ -290,7 +291,7 @@ ENTRY(ret_from_fork)
>  
>   /* kernel thread */
>  1:   movl%edi, %eax
> - call*%ebx
> + CALL_NOSPEC %ebx
>   /*
>* A kernel thread is allowed to return here after successfully
>* calling do_execve().  Exit to userspace to complete the execve()
> @@ -919,7 +920,7 @@ common_exception:
>   movl%ecx, %es
>   TRACE_IRQS_OFF
>   movl%esp, %eax  # pt_regs pointer
> - call*%edi
> + CALL_NOSPEC %edi
>   jmp ret_from_exception
>  END(common_exception)
>  
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index ed31d00..59874bc 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -37,6 +37,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  
>  #include "calling.h"
> @@ -187,7 +188,7 @@ ENTRY(entry_SYSCALL_64_trampoline)
>*/

Re: general protection fault in __netlink_ns_capable

2018-01-04 Thread Andrei Vagin
On Thu, Jan 04, 2018 at 01:01:17PM +0100, Dmitry Vyukov wrote:
> On Wed, Jan 3, 2018 at 8:37 AM, Andrei Vagin <ava...@virtuozzo.com> wrote:
> >> > Hello,
> >> >
> >> > syzkaller hit the following crash on
> >> > 75aa5540627fdb3d8f86229776ea87f995275351
> >> > git://git.cmpxchg.org/linux-mmots.git/master
> >> > compiler: gcc (GCC) 7.1.1 20170620
> >> > .config is attached
> >> > Raw console output is attached.
> >> > C reproducer is attached
> >> > syzkaller reproducer is attached. See https://goo.gl/kgGztJ
> >> > for information about syzkaller reproducers
> >> >
> >> >
> >> > IMPORTANT: if you fix the bug, please add the following tag to the 
> >> > commit:
> >> > Reported-by: syzbot+e432865c29eb4c48c...@syzkaller.appspotmail.com
> >> > It will help syzbot understand when the bug is fixed. See footer for
> >> > details.
> >> > If you forward the report, please keep this part and the footer.
> >> >
> >> > netlink: 3 bytes leftover after parsing attributes in process
> >> > `syzkaller140561'.
> >> > netlink: 3 bytes leftover after parsing attributes in process
> >> > `syzkaller140561'.
> >> > netlink: 3 bytes leftover after parsing attributes in process
> >> > `syzkaller140561'.
> >> > kasan: CONFIG_KASAN_INLINE enabled
> >> > kasan: GPF could be caused by NULL-ptr deref or user memory access
> >> > general protection fault:  [#1] SMP KASAN
> >> > Dumping ftrace buffer:
> >> >(ftrace buffer empty)
> >> > Modules linked in:
> >> > CPU: 1 PID: 3149 Comm: syzkaller140561 Not tainted 4.15.0-rc4-mm1+ #47
> >> > Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
> >> > Google 01/01/2011
> >> > RIP: 0010:__netlink_ns_capable+0x8b/0x120 net/netlink/af_netlink.c:868
> >>
> >> NETLINK_CB(skb).sk is NULL here. It looks like we have to use
> >> sk_ns_capable instead of netlink_ns_capable:
> >>
> >> diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
> >> index c688dc564b11..408c75de52ea 100644
> >> --- a/net/core/rtnetlink.c
> >> +++ b/net/core/rtnetlink.c
> >> @@ -1762,7 +1762,7 @@ static struct net *get_target_net(struct sk_buff
> >> *skb, int netnsid)
> >> /* For now, the caller is required to have CAP_NET_ADMIN in
> >>  * the user namespace owning the target net ns.
> >>  */
> >> -   if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
> >> +   if (!sk_ns_capable(skb->sk, net->user_ns, CAP_NET_ADMIN)) {
> >> put_net(net);
> >> return ERR_PTR(-EACCES);
> >> }
> >>
> >
> > get_target_net() is used twice in the code. In rtnl_getlink(), we need
> > to use netlink_ns_capable(skb, ...), but in rtnl_dump_ifinfo, we need to
> > use sk_ns_capable(skb->sk, ...).
> >
> > Pls, take a look at this patch:
> > https://patchwork.ozlabs.org/patch/854896/
> > Subject: rtnetlink: give a user socket to get_target_net()
> 
> 
> Please include this tag into the commit:
> 

I sent v2 with this tag. Sorry for the inconvenience.
https://patchwork.ozlabs.org/patch/855147/

> > > IMPORTANT: if you fix the bug, please add the following tag to the commit:
> > > Reported-by: syzbot+e432865c29eb4c48c...@syzkaller.appspotmail.com
> > > It will help syzbot understand when the bug is fixed.


Re: general protection fault in __netlink_ns_capable

2018-01-02 Thread Andrei Vagin
On Tue, Jan 02, 2018 at 04:35:11PM -0800, Andrei Vagin wrote:
> On Tue, Jan 02, 2018 at 10:58:01AM -0800, syzbot wrote:
> > Hello,
> > 
> > syzkaller hit the following crash on
> > 75aa5540627fdb3d8f86229776ea87f995275351
> > git://git.cmpxchg.org/linux-mmots.git/master
> > compiler: gcc (GCC) 7.1.1 20170620
> > .config is attached
> > Raw console output is attached.
> > C reproducer is attached
> > syzkaller reproducer is attached. See https://goo.gl/kgGztJ
> > for information about syzkaller reproducers
> > 
> > 
> > IMPORTANT: if you fix the bug, please add the following tag to the commit:
> > Reported-by: syzbot+e432865c29eb4c48c...@syzkaller.appspotmail.com
> > It will help syzbot understand when the bug is fixed. See footer for
> > details.
> > If you forward the report, please keep this part and the footer.
> > 
> > netlink: 3 bytes leftover after parsing attributes in process
> > `syzkaller140561'.
> > netlink: 3 bytes leftover after parsing attributes in process
> > `syzkaller140561'.
> > netlink: 3 bytes leftover after parsing attributes in process
> > `syzkaller140561'.
> > kasan: CONFIG_KASAN_INLINE enabled
> > kasan: GPF could be caused by NULL-ptr deref or user memory access
> > general protection fault:  [#1] SMP KASAN
> > Dumping ftrace buffer:
> >(ftrace buffer empty)
> > Modules linked in:
> > CPU: 1 PID: 3149 Comm: syzkaller140561 Not tainted 4.15.0-rc4-mm1+ #47
> > Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
> > Google 01/01/2011
> > RIP: 0010:__netlink_ns_capable+0x8b/0x120 net/netlink/af_netlink.c:868
> 
> NETLINK_CB(skb).sk is NULL here. It looks like we have to use
> sk_ns_capable instead of netlink_ns_capable:
> 
> diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
> index c688dc564b11..408c75de52ea 100644
> --- a/net/core/rtnetlink.c
> +++ b/net/core/rtnetlink.c
> @@ -1762,7 +1762,7 @@ static struct net *get_target_net(struct sk_buff
> *skb, int netnsid)
> /* For now, the caller is required to have CAP_NET_ADMIN in
>  * the user namespace owning the target net ns.
>  */
> -   if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
> +   if (!sk_ns_capable(skb->sk, net->user_ns, CAP_NET_ADMIN)) {
> put_net(net);
> return ERR_PTR(-EACCES);
> }
>

get_target_net() is used twice in the code. In rtnl_getlink(), we need
to use netlink_ns_capable(skb, ...), but in rtnl_dump_ifinfo, we need to
use sk_ns_capable(skb->sk, ...).

Please take a look at this patch:
https://patchwork.ozlabs.org/patch/854896/
Subject: rtnetlink: give a user socket to get_target_net()


Re: general protection fault in __netlink_ns_capable

2018-01-02 Thread Andrei Vagin
On Tue, Jan 02, 2018 at 10:58:01AM -0800, syzbot wrote:
> Hello,
> 
> syzkaller hit the following crash on
> 75aa5540627fdb3d8f86229776ea87f995275351
> git://git.cmpxchg.org/linux-mmots.git/master
> compiler: gcc (GCC) 7.1.1 20170620
> .config is attached
> Raw console output is attached.
> C reproducer is attached
> syzkaller reproducer is attached. See https://goo.gl/kgGztJ
> for information about syzkaller reproducers
> 
> 
> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+e432865c29eb4c48c...@syzkaller.appspotmail.com
> It will help syzbot understand when the bug is fixed. See footer for
> details.
> If you forward the report, please keep this part and the footer.
> 
> netlink: 3 bytes leftover after parsing attributes in process
> `syzkaller140561'.
> netlink: 3 bytes leftover after parsing attributes in process
> `syzkaller140561'.
> netlink: 3 bytes leftover after parsing attributes in process
> `syzkaller140561'.
> kasan: CONFIG_KASAN_INLINE enabled
> kasan: GPF could be caused by NULL-ptr deref or user memory access
> general protection fault:  [#1] SMP KASAN
> Dumping ftrace buffer:
>(ftrace buffer empty)
> Modules linked in:
> CPU: 1 PID: 3149 Comm: syzkaller140561 Not tainted 4.15.0-rc4-mm1+ #47
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
> Google 01/01/2011
> RIP: 0010:__netlink_ns_capable+0x8b/0x120 net/netlink/af_netlink.c:868

NETLINK_CB(skb).sk is NULL here. It looks like we have to use
sk_ns_capable instead of netlink_ns_capable:

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index c688dc564b11..408c75de52ea 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1762,7 +1762,7 @@ static struct net *get_target_net(struct sk_buff
*skb, int netnsid)
/* For now, the caller is required to have CAP_NET_ADMIN in
 * the user namespace owning the target net ns.
 */
-   if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
+   if (!sk_ns_capable(skb->sk, net->user_ns, CAP_NET_ADMIN)) {
put_net(net);
return ERR_PTR(-EACCES);
}

> RSP: 0018:8801c880f348 EFLAGS: 00010206
> RAX: dc00 RBX:  RCX: 8443f900
> RDX: 007b RSI: 86510f40 RDI: 03d8
> RBP: 8801c880f360 R08:  R09: 110039101e4f
> R10:  R11: 0001 R12: 86510f40
> R13: 000c R14: 0004 R15: 0011
> FS:  01a1a880() GS:8801db30() knlGS:
> CS:  0010 DS:  ES:  CR0: 80050033
> CR2: 20151000 CR3: 0001c9511005 CR4: 001606e0
> DR0:  DR1:  DR2: 
> DR3:  DR6: fffe0ff0 DR7: 0400
> Call Trace:
>  netlink_ns_capable+0x26/0x30 net/netlink/af_netlink.c:886
>  get_target_net+0x9d/0x120 net/core/rtnetlink.c:1765
>  rtnl_dump_ifinfo+0x2e5/0xee0 net/core/rtnetlink.c:1806
>  netlink_dump+0x48c/0xce0 net/netlink/af_netlink.c:
>  __netlink_dump_start+0x4f0/0x6d0 net/netlink/af_netlink.c:2319
>  netlink_dump_start include/linux/netlink.h:214 [inline]
>  rtnetlink_rcv_msg+0x7f0/0xb10 net/core/rtnetlink.c:4485
>  netlink_rcv_skb+0x21e/0x460 net/netlink/af_netlink.c:2441
>  rtnetlink_rcv+0x1c/0x20 net/core/rtnetlink.c:4540
>  netlink_unicast_kernel net/netlink/af_netlink.c:1308 [inline]
>  netlink_unicast+0x4be/0x6a0 net/netlink/af_netlink.c:1334
>  netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1897
>  sock_sendmsg_nosec net/socket.c:628 [inline]
>  sock_sendmsg+0xca/0x110 net/socket.c:638
>  sock_write_iter+0x31a/0x5d0 net/socket.c:907
>  call_write_iter include/linux/fs.h:1776 [inline]
>  new_sync_write fs/read_write.c:469 [inline]
>  __vfs_write+0x684/0x970 fs/read_write.c:482
>  vfs_write+0x189/0x510 fs/read_write.c:544
>  SYSC_write fs/read_write.c:589 [inline]
>  SyS_write+0xef/0x220 fs/read_write.c:581
>  entry_SYSCALL_64_fastpath+0x1f/0x96
> RIP: 0033:0x43fd49
> RSP: 002b:7ffc7fb92238 EFLAGS: 0203 ORIG_RAX: 0001
> RAX: ffda RBX:  RCX: 0043fd49
> RDX: 001f RSI: 20151000 RDI: 0005
> RBP: 006ca018 R08:  R09: 
> R10:  R11: 0203 R12: 004016b0
> R13: 00401740 R14:  R15: 
> Code: fa 48 c1 ea 03 80 3c 02 00 0f 85 95 00 00 00 48 8b 5b 18 48 b8 00 00
> 00 00 00 fc ff df 48 8d bb d8 03 00 00 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f
> 85 80 00 00 00 48 8b 9b d8 03 00 00 48 b8 00 00
> RIP: __netlink_ns_capable+0x8b/0x120 net/netlink/af_netlink.c:868 RSP:
> 8801c880f348
> ---[ end trace d7574f6bd3eea534 ]---
> 
> 
> ---
> This bug is generated by a dumb bot. It may contain errors.
> See https://goo.gl/tpsmEJ for details.
> Direct all questions to 

Re: [V181,22/54] x86/cpu_entry_area: Move it out of fixmap

2017-12-21 Thread Andrei Vagin
Hi Thomas,

The kernel with this patch doesn't boot, if CONFIG_KASAN is set:
[0.00] Linux version 4.14.0-00142-g8604322546c0 (avagin@laptop) (gcc 
version 7.2.1 20170915 (Red Hat 7.2.1-2) (GCC)) #11 SMP Thu Dec 21 18:38:44 PST 
2017
[0.00] Command line: root=/dev/vda2 ro debug console=ttyS0,115200 
LANG=en_US.UTF-8 slub_debug=FZP raid=noautodetect selinux=0 
earlyprintk=serial,ttyS0,115200
[0.00] x86/fpu: Supporting XSAVE feature 0x001: 'x87 floating point 
registers'
[0.00] x86/fpu: Supporting XSAVE feature 0x002: 'SSE registers'
[0.00] x86/fpu: Supporting XSAVE feature 0x004: 'AVX registers'
[0.00] x86/fpu: Supporting XSAVE feature 0x008: 'MPX bounds registers'
[0.00] x86/fpu: Supporting XSAVE feature 0x010: 'MPX CSR'
[0.00] x86/fpu: xstate_offset[2]:  576, xstate_sizes[2]:  256
[0.00] x86/fpu: xstate_offset[3]:  832, xstate_sizes[3]:   64
[0.00] x86/fpu: xstate_offset[4]:  896, xstate_sizes[4]:   64
[0.00] x86/fpu: Enabled xstate features 0x1f, context size is 960 
bytes, using 'compacted' format.
[0.00] e820: BIOS-provided physical RAM map:
[0.00] BIOS-e820: [mem 0x-0x0009fbff] usable
[0.00] BIOS-e820: [mem 0x0009fc00-0x0009] reserved
[0.00] BIOS-e820: [mem 0x000f-0x000f] reserved
[0.00] BIOS-e820: [mem 0x0010-0x7ffd8fff] usable
[0.00] BIOS-e820: [mem 0x7ffd9000-0x7fff] reserved
[0.00] BIOS-e820: [mem 0xfeffc000-0xfeff] reserved
[0.00] BIOS-e820: [mem 0xfffc-0x] reserved
[0.00] bootconsole [earlyser0] enabled
[0.00] NX (Execute Disable) protection: active
[0.00] random: fast init done
[0.00] SMBIOS 2.8 present.
[0.00] DMI: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1.fc26 
04/01/2014
[0.00] Hypervisor detected: KVM
[0.00] tsc: Fast TSC calibration using PIT
[0.00] e820: update [mem 0x-0x0fff] usable ==> reserved
[0.00] e820: remove [mem 0x000a-0x000f] usable
[0.00] e820: last_pfn = 0x7ffd9 max_arch_pfn = 0x4
[0.00] MTRR default type: write-back
[0.00] MTRR fixed ranges enabled:
[0.00]   0-9 write-back
[0.00]   A-B uncachable
[0.00]   C-F write-protect
[0.00] MTRR variable ranges enabled:
[0.00]   0 base 008000 mask FF8000 uncachable
[0.00]   1 disabled
[0.00]   2 disabled
[0.00]   3 disabled
[0.00]   4 disabled
[0.00]   5 disabled
[0.00]   6 disabled
[0.00]   7 disabled
[0.00] x86/PAT: Configuration [0-7]: WB  WC  UC- UC  WB  WP  UC- WT  
[0.00] found SMP MP-table at [mem 0x000f6bd0-0x000f6bdf] mapped at 
[ff200bd0]
[0.00] Base memory trampoline at [88099000] 99000 size 24576
[0.00] Using GB pages for direct mapping
[0.00] BRK [0x5bf4e000, 0x5bf4efff] PGTABLE
[0.00] BRK [0x5bf4f000, 0x5bf4] PGTABLE
[0.00] BRK [0x5bf5, 0x5bf50fff] PGTABLE
[0.00] BRK [0x5bf51000, 0x5bf51fff] PGTABLE
[0.00] BRK [0x5bf52000, 0x5bf52fff] PGTABLE
[0.00] ACPI: Early table checksum verification disabled
[0.00] ACPI: RSDP 0x000F69C0 14 (v00 BOCHS )
[0.00] ACPI: RSDT 0x7FFE12FF 2C (v01 BOCHS  BXPCRSDT 
0001 BXPC 0001)
[0.00] ACPI: FACP 0x7FFE120B 74 (v01 BOCHS  BXPCFACP 
0001 BXPC 0001)
[0.00] ACPI: DSDT 0x7FFE0040 0011CB (v01 BOCHS  BXPCDSDT 
0001 BXPC 0001)
[0.00] ACPI: FACS 0x7FFE 40
[0.00] ACPI: APIC 0x7FFE127F 80 (v01 BOCHS  BXPCAPIC 
0001 BXPC 0001)
[0.00] ACPI: Local APIC address 0xfee0
[0.00] No NUMA configuration found
[0.00] Faking a node at [mem 0x-0x7ffd8fff]
[0.00] NODE_DATA(0) allocated [mem 0x7ffc2000-0x7ffd8fff]
[0.00] kvm-clock: Using msrs 4b564d01 and 4b564d00
[0.00] kvm-clock: cpu 0, msr 0:7ffc1001, primary cpu clock
[0.00] kvm-clock: using sched offset of 137192604594 cycles
[0.00] clocksource: kvm-clock: mask: 0x max_cycles: 
0x1cd42e4dffb, max_idle_ns: 881590591483 ns
[0.00] Zone ranges:
[0.00]   DMA  [mem 0x1000-0x00ff]
[0.00]   DMA32[mem 0x0100-0x7ffd8fff]
[0.00]   Normal   empty
[0.00]   Device   empty
[0.00] Movable zone start for each node
[0.00] Early memory node ranges
[0.00]   node   0: [mem 0x1000-0x0009efff]
[0.00]   node   0: [mem 0x0010-0x7ffd8fff]
[0.00] Initmem setup node 0 [mem 

Re: virtio: make VIRTIO a menuconfig to ease disabling it all

2017-12-20 Thread Andrei Vagin
On Sat, Dec 09, 2017 at 04:26:57PM +0100, Vincent Legoll wrote:
> No need to get into the submenu to disable all VIRTIO-related
> config entries.
> 
> This makes it easier to disable all VIRTIO config options
> without entering the submenu. It will also enable one
> to see that en/dis-abled state from the outside menu.
> 
> This is only intended to change menuconfig UI, not change
> the config dependencies.
> 
> Signed-off-by: Vincent Legoll 
> ---
>  drivers/virtio/Kconfig | 7 +--
>  1 file changed, 5 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> index cff773f15b7e..d485a63a8233 100644
> --- a/drivers/virtio/Kconfig
> +++ b/drivers/virtio/Kconfig
> @@ -5,7 +5,10 @@ config VIRTIO
> bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_RPMSG
> or CONFIG_S390_GUEST.
>  
> -menu "Virtio drivers"
> +menuconfig VIRTIO_MENU
> + bool "Virtio drivers"

Hi Vincent,

make localyesconfig and make localmodconfig don't work with this
patch.

My scenario looks like this:
* Create a virtual machine with Ubuntu 14.04 in GCE
* git clone git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
* cd linux-next
* curl -o .config 
https://raw.githubusercontent.com/avagin/criu/linux-next/scripts/linux-next-config
* make localyesconfig

Without this patch:

$ cat .config | grep VIRTIO
CONFIG_BLK_MQ_VIRTIO=y
CONFIG_VIRTIO_BLK=y
# CONFIG_VIRTIO_BLK_SCSI is not set
CONFIG_SCSI_VIRTIO=y
CONFIG_VIRTIO_NET=y
CONFIG_VIRTIO_CONSOLE=y
# CONFIG_HW_RANDOM_VIRTIO is not set
CONFIG_VIRTIO=y
CONFIG_VIRTIO_PCI=y
CONFIG_VIRTIO_PCI_LEGACY=y
CONFIG_VIRTIO_BALLOON=y
# CONFIG_VIRTIO_INPUT is not set
CONFIG_VIRTIO_MMIO=y
CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES=y
# CONFIG_RPMSG_VIRTIO is not set
# CONFIG_CRYPTO_DEV_VIRTIO is not set


With this patch:

$ cat .config | grep VIRTIO
CONFIG_BLK_MQ_VIRTIO=y
# CONFIG_VIRTIO_BLK is not set
CONFIG_SCSI_VIRTIO=y
# CONFIG_VIRTIO_NET is not set
CONFIG_CAIF_VIRTIO=y
# CONFIG_VIRTIO_CONSOLE is not set
# CONFIG_HW_RANDOM_VIRTIO is not set
CONFIG_VIRTIO=y
# CONFIG_VIRTIO_MENU is not set
# CONFIG_RPMSG_VIRTIO is not set
# CONFIG_CRYPTO_DEV_VIRTIO is not set


You can see that with this patch CONFIG_VIRTIO_BLK is not set.
It is wrong, and the kernel with this config will not be able to boot.

We can add "default y" to fix this problem.

https://travis-ci.org/avagin/linux/jobs/313348334
https://travis-ci.org/avagin/linux/jobs/318491188

Thanks,
Andrei

> +
> +if VIRTIO_MENU
>  
>  config VIRTIO_PCI
>   tristate "PCI driver for virtio devices"
> @@ -79,4 +82,4 @@ config VIRTIO_MMIO_CMDLINE_DEVICES
>  
>If unsure, say 'N'.
>  
> -endmenu
> +endif # VIRTIO_MENU


[PATCH] mm: don't use the same value for MAP_FIXED_SAFE and MAP_SYNC

2017-12-18 Thread Andrei Vagin
Cc: Michal Hocko <mho...@kernel.org>
Fixes: ("fs, elf: drop MAP_FIXED usage from elf_map")
Signed-off-by: Andrei Vagin <ava...@openvz.org>
---
 include/uapi/asm-generic/mman-common.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/uapi/asm-generic/mman-common.h 
b/include/uapi/asm-generic/mman-common.h
index b37502cbbef7..2db3fa287274 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -26,7 +26,9 @@
 #else
 # define MAP_UNINITIALIZED 0x0 /* Don't support this flag */
 #endif
-#define MAP_FIXED_SAFE 0x8 /* MAP_FIXED which doesn't unmap 
underlying mapping */
+
+/* 0x0100 - 0x8 flags are defined in asm-generic/mman.h */
+#define MAP_FIXED_SAFE 0x10/* MAP_FIXED which doesn't 
unmap underlying mapping */
 
 /*
  * Flags for mlock
-- 
2.13.6



Re: [2/2] fs, elf: drop MAP_FIXED usage from elf_map

2017-12-18 Thread Andrei Vagin
On Mon, Dec 18, 2017 at 10:13:02AM +0100, Michal Hocko wrote:
> On Fri 15-12-17 16:49:28, Andrei Vagin wrote:
> > Hi Michal,
> > 
> > We run CRIU tests for linux-next and the 4.15.0-rc3-next-20171215 kernel
> > doesn't boot:
> > 
> > [3.492549] Freeing unused kernel memory: 1640K
> > [3.494547] Write protecting the kernel read-only data: 18432k
> > [3.498781] Freeing unused kernel memory: 2016K
> > [3.503330] Freeing unused kernel memory: 512K
> > [3.505232] rodata_test: all tests were successful
> > [3.515355] 1 (init): Uhuuh, elf segement at 928fda3e requested 
> > but the memory is mapped already
> 
> Hmm, this is interesting. What does the test actually do? Could you add some
> instrumentation to see what is actually mapped there? Something like

There is nothing mapped there. It returns -95 (EOPNOTSUPP).

The kernel boots with the following patch applied:

+   int ttype = type & ~MAP_FIXED_SAFE;
if (total_size) {
total_size = ELF_PAGEALIGN(total_size);
-   map_addr = vm_mmap(filep, addr, total_size, prot, type,
off);
+   map_addr = vm_mmap(filep, addr, total_size, prot, ttype, off);
if (!BAD_ADDR(map_addr))
vm_munmap(map_addr+size, total_size-size);
} else
-   map_addr = vm_mmap(filep, addr, size, prot, type, off);
+   map_addr = vm_mmap(filep, addr, size, prot, ttype, off);


> 
> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> index 0e50230ce53d..1b68ddc34043 100644
> --- a/fs/binfmt_elf.c
> +++ b/fs/binfmt_elf.c
> @@ -372,10 +372,28 @@ static unsigned long elf_map(struct file *filep, 
> unsigned long addr,
>   } else
>   map_addr = vm_mmap(filep, addr, size, prot, type, off);
>  
> - if ((type & MAP_FIXED_SAFE) && BAD_ADDR(map_addr))
> + if ((type & MAP_FIXED_SAFE) && BAD_ADDR(map_addr)) {
> + struct vm_area_struct *vma;
> +
>   pr_info("%d (%s): Uhuuh, elf segment at %p requested but the 
> memory is mapped already\n",
>   task_pid_nr(current), current->comm,
>   (void *)addr);
> + vma = find_vma(current->mm, map_addr);
> + if (vma && vma->vm_start < addr) {
> + pr_info("requested [%lx, %lx] mapped [%lx, %lx] %lx ", 
> addr, addr + total_size,
> + vma->vm_start, vma->vm_end, 
> vma->vm_flags);
> + if (!vma->vm_file) {
> + pr_cont("anon\n");
> + } else {
> + char path[512];
> + char *p = file_path(vma->vm_file, path, 
> sizeof(path));
> + if (IS_ERR(p))
> + p = "?";
> + pr_cont("\"%s\"\n", kbasename(p));
> + }
> + dump_stack();
> + }
> + }
>  
>   return(map_addr);
>  }
> 
> > [3.519533] Starting init: /sbin/init exists but couldn't execute it 
> > (error -95)
> > [3.528993] Starting init: /bin/sh exists but couldn't execute it (error 
> > -14)
> > [3.532127] Kernel panic - not syncing: No working init found.  Try 
> > passing init= option to kernel. See Linux 
> > Documentation/admin-guide/init.rst for guidance.
> > [3.538328] CPU: 0 PID: 1 Comm: init Not tainted 
> > 4.15.0-rc3-next-20171215-1-g6d6aea478fce #11
> > [3.542201] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
> > 1.10.2-1.fc26 04/01/2014
> > [3.546081] Call Trace:
> > [3.547221]  dump_stack+0x5c/0x79
> > [3.548768]  ? rest_init+0x30/0xb0
> > [3.550320]  panic+0xe4/0x232
> > [3.551669]  ? rest_init+0xb0/0xb0
> > [3.553110]  kernel_init+0xeb/0x100
> > [3.554701]  ret_from_fork+0x1f/0x30
> > [3.558964] Kernel Offset: 0x200 from 0x8100 (relocation 
> > range: 0x8000-0xbfff)
> > [3.564160] ---[ end Kernel panic - not syncing: No working init found.  
> > Try passing init= option to kernel. See Linux 
> > Documentation/admin-guide/init.rst for guidance.
> > 
> > If I revert this patch, it boots normally.
> > 
> > Thanks,
> > Andrei
> > 
> > On Wed, Dec 13, 2017 at 10:25:50AM +0100, Michal Hocko wrote:
> > > From: Michal Hocko <mho...@suse.com>
> > > 
> > > Both load_elf_interp and load

Re: [2/2] fs, elf: drop MAP_FIXED usage from elf_map

2017-12-15 Thread Andrei Vagin
Hi Michal,

We run CRIU tests for linux-next and the 4.15.0-rc3-next-20171215 kernel
doesn't boot:

[3.492549] Freeing unused kernel memory: 1640K
[3.494547] Write protecting the kernel read-only data: 18432k
[3.498781] Freeing unused kernel memory: 2016K
[3.503330] Freeing unused kernel memory: 512K
[3.505232] rodata_test: all tests were successful
[3.515355] 1 (init): Uhuuh, elf segement at 928fda3e requested but 
the memory is mapped already
[3.519533] Starting init: /sbin/init exists but couldn't execute it (error 
-95)
[3.528993] Starting init: /bin/sh exists but couldn't execute it (error -14)
[3.532127] Kernel panic - not syncing: No working init found.  Try passing 
init= option to kernel. See Linux Documentation/admin-guide/init.rst for 
guidance.
[3.538328] CPU: 0 PID: 1 Comm: init Not tainted 
4.15.0-rc3-next-20171215-1-g6d6aea478fce #11
[3.542201] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
1.10.2-1.fc26 04/01/2014
[3.546081] Call Trace:
[3.547221]  dump_stack+0x5c/0x79
[3.548768]  ? rest_init+0x30/0xb0
[3.550320]  panic+0xe4/0x232
[3.551669]  ? rest_init+0xb0/0xb0
[3.553110]  kernel_init+0xeb/0x100
[3.554701]  ret_from_fork+0x1f/0x30
[3.558964] Kernel Offset: 0x200 from 0x8100 (relocation 
range: 0x8000-0xbfff)
[3.564160] ---[ end Kernel panic - not syncing: No working init found.  Try 
passing init= option to kernel. See Linux Documentation/admin-guide/init.rst 
for guidance.

If I revert this patch, it boots normally.

Thanks,
Andrei

On Wed, Dec 13, 2017 at 10:25:50AM +0100, Michal Hocko wrote:
> From: Michal Hocko 
> 
> Both load_elf_interp and load_elf_binary rely on elf_map to map segments
> on a controlled address and they use MAP_FIXED to enforce that. This is
> however a dangerous thing prone to silent data corruption which can
> even be exploitable. Let's take CVE-2017-1000253 as an example. At the time
> (before eab09532d400 ("binfmt_elf: use ELF_ET_DYN_BASE only for PIE"))
> ELF_ET_DYN_BASE was at TASK_SIZE / 3 * 2 which is not that far away from
> the stack top on 32b (legacy) memory layout (only 1GB away). Therefore
> we could end up mapping over the existing stack with some luck.
> 
> The issue has been fixed since then (a87938b2e246 ("fs/binfmt_elf.c:
> fix bug in loading of PIE binaries")), ELF_ET_DYN_BASE moved much
> further from the stack (eab09532d400 and later by c715b72c1ba4 ("mm:
> revert x86_64 and arm64 ELF_ET_DYN_BASE base changes")) and excessive
> stack consumption early during execve fully stopped by da029c11e6b1
> ("exec: Limit arg stack to at most 75% of _STK_LIM"). So we should be
> safe and any attack should be impractical. On the other hand this is
> just too subtle an assumption, so it can break quite easily and be hard to
> spot.
> 
> I believe that the MAP_FIXED usage in load_elf_binary (et. al) is still
> fundamentally dangerous. Moreover it shouldn't be even needed. We are
> at the early process stage and so there shouldn't be unrelated mappings
> (except for stack and loader) existing so mmap for a given address
> should succeed even without MAP_FIXED. Something is terribly wrong if
> this is not the case and we should rather fail than silently corrupt the
> underlying mapping.
> 
> Address this issue by changing MAP_FIXED to the newly added
> MAP_FIXED_SAFE. This will mean that mmap will fail if there is an
> existing mapping clashing with the requested one without clobbering it.
> 
> Cc: Abdul Haleem 
> Cc: Joel Stanley 
> Acked-by: Kees Cook 
> Reviewed-by: Khalid Aziz 
> Signed-off-by: Michal Hocko 
> ---
>  arch/metag/kernel/process.c |  6 +-
>  fs/binfmt_elf.c | 12 
>  2 files changed, 13 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/metag/kernel/process.c b/arch/metag/kernel/process.c
> index 0909834c83a7..867c8d0a5fb4 100644
> --- a/arch/metag/kernel/process.c
> +++ b/arch/metag/kernel/process.c
> @@ -399,7 +399,7 @@ unsigned long __metag_elf_map(struct file *filep, 
> unsigned long addr,
>   tcm_tag = tcm_lookup_tag(addr);
>  
>   if (tcm_tag != TCM_INVALID_TAG)
> - type &= ~MAP_FIXED;
> + type &= ~(MAP_FIXED | MAP_FIXED_SAFE);
>  
>   /*
>   * total_size is the size of the ELF (interpreter) image.
> @@ -417,6 +417,10 @@ unsigned long __metag_elf_map(struct file *filep, 
> unsigned long addr,
>   } else
>   map_addr = vm_mmap(filep, addr, size, prot, type, off);
>  
> + if ((type & MAP_FIXED_SAFE) && BAD_ADDR(map_addr))
> + pr_info("%d (%s): Uhuuh, elf segement at %p requested but the 
> memory is mapped already\n",
> + task_pid_nr(current), tsk->comm, (void*)addr);
> +
>   if (!BAD_ADDR(map_addr) && tcm_tag != TCM_INVALID_TAG) {
>  

BUG: unable to handle kernel NULL pointer dereference in fdb_find_rcu

2017-12-15 Thread Andrei Vagin
Hi,

We run criu tests for linux-next and today we get this bug:

The kernel version is 4.15.0-rc3-next-20171215

[  235.397328] BUG: unable to handle kernel NULL pointer dereference
at 000c
[  235.398624] IP: fdb_find_rcu+0x3c/0x130
[  235.399365] PGD 51970067 P4D 51970067 PUD 51971067 PMD 0
[  235.400400] Oops:  [#1] SMP
[  235.400959] Modules linked in:
[  235.401455] CPU: 1 PID: 32057 Comm: criu Not tainted
4.15.0-rc3-next-20171215-2-g2e56147d7dc8 #10
[  235.402935] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS 1.10.2-1.fc26 04/01/2014
[  235.404356] RIP: 0010:fdb_find_rcu+0x3c/0x130
[  235.405069] RSP: 0018:bc64c05f76e8 EFLAGS: 00010246
[  235.405908] RAX: e871 RBX:  RCX: 
[  235.407062] RDX:  RSI: 9d9b69a04798 RDI: 9d9b64c14948
[  235.408370] RBP: 9d9b64c14948 R08: bc64c05f7c18 R09: 9d9b64c140a0
[  235.409518] R10: 64c140a0 R11: 9d9b75743968 R12: 9d9b757ce260
[  235.410674] R13:  R14: bc64c05f7c18 R15: a5384560
[  235.411837] FS:  7ffac2660100() GS:9d9b7fd0()
knlGS:
[  235.412920] CS:  0010 DS:  ES:  CR0: 80050033
[  235.413764] CR2: 000c CR3: 6a7ff004 CR4: 003606e0
[  235.414829] DR0:  DR1:  DR2: 
[  235.416096] DR3:  DR6: fffe0ff0 DR7: 0400
[  235.417365] Call Trace:
[  235.417774]  br_fdb_change_mac_address+0x31/0x80
[  235.418538]  br_stp_change_bridge_id+0x25/0x110
[  235.419280]  br_dev_newlink+0x43/0xa0
[  235.419796]  rtnl_newlink+0x70f/0x940
[  235.420391]  ? nla_parse+0x83/0xf0
[  235.420948]  ? nla_strlcpy+0x48/0x50
[  235.421540]  ? rtnl_link_ops_get+0x34/0x50
[  235.422211]  ? rtnl_newlink+0x193/0x940
[  235.422843]  ? deactivate_slab.isra.78+0x11b/0x3b0
[  235.423599]  rtnetlink_rcv_msg+0x25d/0x2d0
[  235.424189]  ? __alloc_skb+0x82/0x1e0
[  235.424718]  ? __slab_alloc+0x1c/0x30
[  235.425420]  ? rtnl_calcit.isra.27+0x110/0x110
[  235.426354]  netlink_rcv_skb+0x8d/0x130
[  235.426995]  netlink_unicast+0x19d/0x250
[  235.427754]  netlink_sendmsg+0x2a5/0x3a0
[  235.428387]  sock_sendmsg+0x30/0x40
[  235.428807]  ___sys_sendmsg+0x269/0x2c0
[  235.429277]  ? generic_perform_write+0x122/0x1b0
[  235.429828]  ? __generic_file_write_iter+0x192/0x1c0
[  235.430422]  ? ext4_file_write_iter+0x20c/0x3e0
[  235.430975]  ? SyS_setns+0xc2/0xd0
[  235.431425]  ? __vfs_write+0xf9/0x170
[  235.431886]  ? __sys_sendmsg+0x51/0x90
[  235.432501]  __sys_sendmsg+0x51/0x90
[  235.433098]  entry_SYSCALL_64_fastpath+0x1a/0x7d
[  235.433822] RIP: 0033:0x7ffac1e170f7
[  235.434409] RSP: 002b:7ffc5610c688 EFLAGS: 0246 ORIG_RAX:
002e
[  235.435635] RAX: ffda RBX: 022eb010 RCX: 7ffac1e170f7
[  235.436796] RDX:  RSI: 7ffc5610c6c0 RDI: 0002
[  235.437956] RBP:  R08:  R09: 
[  235.439125] R10:  R11: 0246 R12: 7ffc5610cbac
[  235.440556] R13: 000500580011 R14: 7ffac2666158 R15: 0001
[  235.442175] Code: fd 48 83 ec 10 65 48 8b 04 25 28 00 00 00 48 89
44 24 08 31 c0 8b 06 48 8b 1f 66 89 54 24 06 89 04 24 0f b7 46 04 66
89 44 24 04 <8b> 43 0c 8b 14 24 8d b0 f7 be ad de 8b 44 24 04 01 f2 01
f0 89
[  235.444717] RIP: fdb_find_rcu+0x3c/0x130 RSP: bc64c05f76e8
[  235.445636] CR2: 000c
[  235.446278] ---[ end trace b77358b1e42a9dd7 ]---
[  235.447170] Kernel panic - not syncing: Fatal exception in interrupt
[  235.448379] Kernel Offset: 0x2300 from 0x8100
(relocation range: 0x8000-0xbfff)
[  235.449750] ---[ end Kernel panic - not syncing: Fatal exception in interrupt


[PATCH] target: don't call an unmap callback if a range length is zero

2017-12-13 Thread Andrei Vagin
If a length of a range is zero, it means there is nothing to unmap
and we can skip this range.

Here is one more reason why we have to skip such ranges.  An unmap
callback calls file_operations->fallocate(), but the man page for the
fallocate syscall says that fallocate(fd, mode, offset, len) returns
EINVAL if len is zero. It means that file_operations->fallocate() isn't
obligated to handle zero-length ranges either.

Signed-off-by: Andrei Vagin <ava...@openvz.org>
---
 drivers/target/target_core_sbc.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/target/target_core_sbc.c b/drivers/target/target_core_sbc.c
index 750a04ed0e93..b054682e974f 100644
--- a/drivers/target/target_core_sbc.c
+++ b/drivers/target/target_core_sbc.c
@@ -1216,9 +1216,11 @@ sbc_execute_unmap(struct se_cmd *cmd)
goto err;
}
 
-   ret = ops->execute_unmap(cmd, lba, range);
-   if (ret)
-   goto err;
+   if (range) {
+   ret = ops->execute_unmap(cmd, lba, range);
+   if (ret)
+   goto err;
+   }
 
ptr += 16;
size -= 16;
-- 
2.13.6



Re: proc: fix /proc/*/map_files lookup

2017-11-29 Thread Andrei Vagin
On Wed, Nov 29, 2017 at 02:56:03PM -0800, Andrew Morton wrote:
> On Mon, 27 Nov 2017 21:29:25 -0800 Andrei Vagin <ava...@virtuozzo.com> wrote:
> 
> > On Tue, Nov 21, 2017 at 12:27:06AM +0300, Alexey Dobriyan wrote:
> > > Current code does:
> > > 
> > >   if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2)
> > > 
> > > However sscanf() is broken garbage.
> > > 
> > > It silently accepts whitespace between format specifiers
> > > (did you know that?).
> > > 
> > > It silently accepts valid strings which result in integer overflow.
> > > 
> > > Do not use sscanf() for any even remotely reliable parsing code.
> > 
> > This patch breaks criu: criu has one place where a file name is generated
> > as map_files/%p-%p
> > 
> > openat(1048572, "map_files/0x7f9912dd5000-0x7f9912de4000", O_RDWR) = -1 
> > ENOENT (No such file or directory) <0.15>
> > 
> > And this code worked before this patch and it doesn't work with this
> > patch. And you have to know that we never break user-space programs ;)
> > 
> > But seriously, the patch looks good to me, but I would prefer to not queue
> > it into stable kernels.
> 
> The patch breaks CRIU but you're OK with merging it?  How does that work ;)

It was a bug in criu. And this bug is on a minor path, which is taken only
when memfd_create() isn't available. That is the reason why I ask not to
backport this patch to stable kernels.

In CRIU this bug can be triggered only if this patch is backported
to a kernel whose version is lower than v3.16.

> 
> Now I'm worried that it will break other things.

I think the chance is very small. All programs should use the names which
are listed in /proc/PID/map_files/.


Re: [PATCH v4 2/4] vm: add a syscall to map a process memory into a pipe

2017-11-28 Thread Andrei Vagin
On Mon, Nov 27, 2017 at 03:42:49PM -0800, Andrew Morton wrote:
> On Mon, 27 Nov 2017 09:19:39 +0200 Mike Rapoport <r...@linux.vnet.ibm.com> 
> wrote:
> 
> > From: Andrei Vagin <ava...@virtuozzo.com>
> > 
> > It is a hybrid of process_vm_readv() and vmsplice().
> > 
> > vmsplice can map memory from a current address space into a pipe.
> > process_vm_readv can read memory of another process.
> > 
> > A new system call can map memory of another process into a pipe.
> > 
> > ssize_t process_vmsplice(pid_t pid, int fd, const struct iovec *iov,
> > unsigned long nr_segs, unsigned int flags)
> > 
> > All arguments are identical with vmsplice except pid which specifies a
> > target process.
> > 
> > Currently if we want to dump a process memory to a file or to a socket,
> > we can use process_vm_readv() + write(), but it works slow, because data
> > are copied into a temporary user-space buffer.
> > 
> > A second way is to use vmsplice() + splice(). It is more effective,
> > because data are not copied into a temporary buffer, but here is another
> > problem. vmsplice works with the current address space, so it can be
> > used only if we inject our code into a target process.
> > 
> > The second way suffers from a few other issues:
> > * a process has to be stopped to run a parasite code
> > * a number of pipes is limited, so it may be impossible to dump all
> >   memory in one iteration, and we have to stop process and inject our
> >   code a few times.
> > * pages in pipes are unreclaimable, so it isn't good to hold a lot of
> >   memory in pipes.
> > 
> > The introduced syscall allows to use a second way without injecting any
> > code into a target process.
> > 
> > My experiments show that process_vmsplice() + splice() works two times
> > faster than process_vm_readv() + write().
> >
> > It is particularly useful on a pre-dump stage. On this stage we enable a
> > memory tracker, and then we are dumping  a process memory while a
> > process continues work. On the first iteration we are dumping all
> > memory, and then we are dumping only modified memory from a previous
> > iteration.  After a few pre-dump operations, a process is stopped and
> > dumped finally. The pre-dump operations allow to significantly decrease
> > a process downtime, when a process is migrated to another host.
> 
> What is the overall improvement in a typical dumping operation?
> 
> Does that improvement justify the addition of a new syscall, and all
> that this entails?  If so, why?

In criu, we have a pre-dump operation, which is used to reduce a process
downtime during live migration of processes. The pre-dump operation
allows to dump memory without stopping processes. On the first
iteration, criu pre-dump dumps the whole memory of processes, on the
second iteration it saves only changed pages after the first pre-dump
and so on.

The primary goal here is to do this operation without any downtime of
processes, or at least to keep this downtime as small as possible.

Currently, when we are doing a pre-dump, we perform the following steps:

1. stop all processes by ptrace
2. inject a parasite code into each process to call vmsplice
3. read /proc/pid/pagemap and splice all dirty pages into pipes
4. reset the soft-dirty memory tracker
5. resume processes
6. splice memory from pipe to sockets

But this way has a few limitations:

1. We need to inject parasite code into processes. This operation is
slow, and it requires stopping the processes, so we can't do this step many
times. As a result, we have to splice the whole memory to pipes at once.

2. The number of pipes is limited, and the size of each pipe is limited.

A default limit for a number of file descriptors is 1024.  The reliable
maximum pipe size is 3354624 bytes.

        pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
                             GFP_KERNEL_ACCOUNT);

so the maximum pipe size can be calculated by this formula:
((1 << PAGE_ALLOC_COSTLY_ORDER) * PAGE_SIZE / sizeof(struct pipe_buffer)) * PAGE_SIZE

This means that we can dump only 1.5 GB of memory.

The major issue of this way is that we need to inject a parasite code
and we can't do this many times, so we have to splice the whole memory
in one iteration.

With the introduced syscall, we are able to splice memory without a
parasite code and even without stopping processes, so we can dump memory
in a few iterations.

> 
> Are there any other applications of this syscall?
> 


For example, gdb can use it to generate a core file: it can splice the
memory of a process into a pipe and then splice it from the pipe to a file.
This method works much faster than using PTRACE_PEEK* commands.
[tip:perf/core] perf trace: Fix an exit code of trace__symbols_init

2017-11-28 Thread tip-bot for Andrei Vagin
Commit-ID:  35c33633abc14b906e84b7b0115ede6df6830120
Gitweb: https://git.kernel.org/tip/35c33633abc14b906e84b7b0115ede6df6830120
Author: Andrei Vagin <ava...@openvz.org>
AuthorDate: Tue, 7 Nov 2017 16:22:46 -0800
Committer:  Arnaldo Carvalho de Melo <a...@redhat.com>
CommitDate: Tue, 28 Nov 2017 14:20:15 -0300

perf trace: Fix an exit code of trace__symbols_init

Currently if trace_event__register_resolver() fails, we return -errno,
but we can't be sure that errno isn't zero in this case.

Signed-off-by: Andrei Vagin <ava...@openvz.org>
Reviewed-by: Jiri Olsa <jo...@redhat.com>
Cc: Alexander Shishkin <alexander.shish...@linux.intel.com>
Cc: Namhyung Kim <namhy...@kernel.org>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Vasily Averin <v...@virtuozzo.com>
Link: http://lkml.kernel.org/r/20171108002246.8924-2-ava...@openvz.org
Signed-off-by: Arnaldo Carvalho de Melo <a...@redhat.com>
---
 tools/perf/builtin-trace.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index f2757d3..84debdb 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -1152,12 +1152,14 @@ static int trace__symbols_init(struct trace *trace, 
struct perf_evlist *evlist)
if (trace->host == NULL)
return -ENOMEM;
 
-   if (trace_event__register_resolver(trace->host, 
trace__machine__resolve_kernel_addr) < 0)
-   return -errno;
+   err = trace_event__register_resolver(trace->host, 
trace__machine__resolve_kernel_addr);
+   if (err < 0)
+   goto out;
 
err = __machine__synthesize_threads(trace->host, >tool, 
>opts.target,
evlist->threads, 
trace__tool_process, false,
trace->opts.proc_map_timeout, 1);
+out:
if (err)
symbol__exit();
 


Re: proc: fix /proc/*/map_files lookup

2017-11-27 Thread Andrei Vagin
On Tue, Nov 21, 2017 at 12:27:06AM +0300, Alexey Dobriyan wrote:
> Current code does:
> 
>   if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2)
> 
> However sscanf() is broken garbage.
> 
> It silently accepts whitespace between format specifiers
> (did you know that?).
> 
> It silently accepts valid strings which result in integer overflow.
> 
> Do not use sscanf() for any even remotely reliable parsing code.

This patch breaks criu: criu has one place where a file name is generated
as map_files/%p-%p

openat(1048572, "map_files/0x7f9912dd5000-0x7f9912de4000", O_RDWR) = -1 ENOENT 
(No such file or directory) <0.15>

And this code worked before this patch and it doesn't work with this
patch. And you have to know that we never break user-space programs ;)

But seriously, the patch looks good to me, but I would prefer to not queue
it into stable kernels.

Thanks,
Andrei


> 
>   OK
>   # readlink '/proc/1/map_files/55a23af39000-55a23b05b000'
>   /lib/systemd/systemd
> 
>   broken
>   # readlink '/proc/1/map_files/   55a23af39000-55a23b05b000'
>   /lib/systemd/systemd
> 
>   broken
>   # readlink '/proc/1/map_files/55a23af39000-55a23b05b000'
>   /lib/systemd/systemd
> 
>   very broken
>   # readlink 
> '/proc/1/map_files/155a23af39000-55a23b05b000'
>   /lib/systemd/systemd
> 
> Signed-off-by: Alexey Dobriyan 
> Cc: sta...@kernel.org
> ---
> 
>  fs/proc/base.c |   29 -
>  1 file changed, 28 insertions(+), 1 deletion(-)
> 
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -100,6 +100,8 @@
>  #include "internal.h"
>  #include "fd.h"
>  
> +#include "../../lib/kstrtox.h"
> +
>  /* NOTE:
>   *   Implementing inode permission operations in /proc is almost
>   *   certainly an error.  Permission checks need to happen during
> @@ -1907,8 +1909,33 @@ bool proc_fill_cache(struct file *file, struct 
> dir_context *ctx,
>  static int dname_to_vma_addr(struct dentry *dentry,
>unsigned long *start, unsigned long *end)
>  {
> - if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2)
> + const char *str = dentry->d_name.name;
> + unsigned long long sval, eval;
> + unsigned int len;
> +
> + len = _parse_integer(str, 16, );
> + if (len & KSTRTOX_OVERFLOW)
> + return -EINVAL;
> + if (sval != (unsigned long)sval)
> + return -EINVAL;
> + str += len;
> +
> + if (*str != '-')
>   return -EINVAL;
> + str++;
> +
> + len = _parse_integer(str, 16, );
> + if (len & KSTRTOX_OVERFLOW)
> + return -EINVAL;
> + if (eval != (unsigned long)eval)
> + return -EINVAL;
> + str += len;
> +
> + if (*str != '\0')
> + return -EINVAL;
> +
> + *start = sval;
> + *end = eval;
>  
>   return 0;
>  }


[tip:perf/core] perf trace: Fix an exit code of trace__symbols_init

2017-11-18 Thread tip-bot for Andrei Vagin
Commit-ID:  cbd5c1787bab4643e5959522275b46de94eba5ac
Gitweb: https://git.kernel.org/tip/cbd5c1787bab4643e5959522275b46de94eba5ac
Author: Andrei Vagin <ava...@openvz.org>
AuthorDate: Tue, 7 Nov 2017 16:22:46 -0800
Committer:  Arnaldo Carvalho de Melo <a...@redhat.com>
CommitDate: Thu, 16 Nov 2017 14:49:52 -0300

perf trace: Fix an exit code of trace__symbols_init

Currently if trace_event__register_resolver() fails, we return -errno,
but we can't be sure that errno isn't zero in this case.

Signed-off-by: Andrei Vagin <ava...@openvz.org>
Reviewed-by: Jiri Olsa <jo...@redhat.com>
Cc: Alexander Shishkin <alexander.shish...@linux.intel.com>
Cc: Namhyung Kim <namhy...@kernel.org>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Vasily Averin <v...@virtuozzo.com>
Link: http://lkml.kernel.org/r/20171108002246.8924-2-ava...@openvz.org
Signed-off-by: Arnaldo Carvalho de Melo <a...@redhat.com>
---
 tools/perf/builtin-trace.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index f2757d3..84debdb 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -1152,12 +1152,14 @@ static int trace__symbols_init(struct trace *trace, 
struct perf_evlist *evlist)
if (trace->host == NULL)
return -ENOMEM;
 
-   if (trace_event__register_resolver(trace->host, 
trace__machine__resolve_kernel_addr) < 0)
-   return -errno;
+   err = trace_event__register_resolver(trace->host, 
trace__machine__resolve_kernel_addr);
+   if (err < 0)
+   goto out;
 
err = __machine__synthesize_threads(trace->host, >tool, 
>opts.target,
evlist->threads, 
trace__tool_process, false,
trace->opts.proc_map_timeout, 1);
+out:
if (err)
symbol__exit();
 


Re: [PATCH] net: Convert net_mutex into rw_semaphore and down read it on net->init/->exit

2017-11-14 Thread Andrei Vagin
On Tue, Nov 14, 2017 at 10:00:59AM -0800, Eric Dumazet wrote:
> On Tue, 2017-11-14 at 09:44 -0800, Andrei Vagin wrote:
> > On Tue, Nov 14, 2017 at 04:53:33PM +0300, Kirill Tkhai wrote:
> > > Curently mutex is used to protect pernet operations list. It makes
> > > cleanup_net() to execute ->exit methods of the same operations set,
> > > which was used on the time of ->init, even after net namespace is
> > > unlinked from net_namespace_list.
> > > 
> > > But the problem is it's need to synchronize_rcu() after net is removed
> > > from net_namespace_list():
> > > 
> > > Destroy net_ns:
> > > cleanup_net()
> > >   mutex_lock(_mutex)
> > >   list_del_rcu(>list)
> > >   synchronize_rcu()  <--- Sleep there for 
> > > ages
> > >   list_for_each_entry_reverse(ops, _list, list)
> > > ops_exit_list(ops, _exit_list)
> > >   list_for_each_entry_reverse(ops, _list, list)
> > > ops_free_list(ops, _exit_list)
> > >   mutex_unlock(_mutex)
> > > 
> > > This primitive is not fast, especially on the systems with many processors
> > > and/or when preemptible RCU is enabled in config. So, all the time, while
> > > cleanup_net() is waiting for RCU grace period, creation of new net 
> > > namespaces
> > > is not possible, the tasks, who makes it, are sleeping on the same mutex:
> > > 
> > > Create net_ns:
> > > copy_net_ns()
> > >   mutex_lock_killable(_mutex)<--- Sleep there for 
> > > ages
> > > 
> > > The solution is to convert net_mutex to the rw_semaphore. Then,
> > > pernet_operations::init/::exit methods, modifying the net-related data,
> > > will require down_read() locking only, while down_write() will be used
> > > for changing pernet_list.
> > > 
> > > This gives signify performance increase, like you may see below. There
> > > is measured sequential net namespace creation in a cycle, in single
> > > thread, without other tasks (single user mode):
> > > 
> > > 1)int main(int argc, char *argv[])
> > > {
> > > unsigned nr;
> > > if (argc < 2) {
> > > fprintf(stderr, "Provide nr iterations arg\n");
> > > return 1;
> > > }
> > > nr = atoi(argv[1]);
> > > while (nr-- > 0) {
> > > if (unshare(CLONE_NEWNET)) {
> > > perror("Can't unshare");
> > > return 1;
> > > }
> > > }
> > > return 0;
> > > }
> > > 
> > > Origin, 10 unshare():
> > > 0.03user 23.14system 1:39.85elapsed 23%CPU
> > > 
> > > Patched, 10 unshare():
> > > 0.03user 67.49system 1:08.34elapsed 98%CPU
> > > 
> > > 2)for i in {1..1}; do unshare -n bash -c exit; done
> > 
> > Hi Kirill,
> > 
> > This mutex has another role. You know that net namespaces are destroyed
> > asynchronously, and the net mutex gurantees that a backlog will be not
> > big. If we have something in backlog, we know that it will be handled
> > before creating a new net ns.
> > 
> > As far as I remember net namespaces are created much faster than
> > they are destroyed, so with this changes we can create a really big
> > backlog, can't we?
> 
> Please take a look at the recent patches I did :
> 
> 8ca712c373a462cfa1b62272870b6c2c74aa83f9 Merge branch 
> 'net-speedup-netns-create-delete-time'
> 64bc17811b72758753e2b64cd8f2a63812c61fe1 ipv4: speedup ipv6 tunnels dismantle
> bb401caefe9d2c65e0c0fa23b21deecfbfa473fe ipv6: speedup ipv6 tunnels dismantle
> 789e6ddb0b2fb5d5024b760b178a47876e4de7a6 tcp: batch tcp_net_metrics_exit
> a90c9347e90ed1e9323d71402ed18023bc910cd8 ipv6: addrlabel: per netns list
> d464e84eed02993d40ad55fdc19f4523e4deee5b kobject: factorize skb setup in 
> kobject_uevent_net_broadcast()
> 4a336a23d619e96aef37d4d054cfadcdd1b581ba kobject: copy env blob in one go
> 16dff336b33d87c15d9cbe933cfd275aae2a8251 kobject: add 
> kobject_uevent_net_broadcast()
> 

Good job! Now it really works much faster. I tested these patches with
Kirill's one and everything works well. I could not reproduce a
situation in which the backlog starts growing.

Thanks Kirill and Eric.


Re: [PATCH] net: Convert net_mutex into rw_semaphore and down read it on net->init/->exit

2017-11-14 Thread Andrei Vagin
On Tue, Nov 14, 2017 at 09:04:06PM +0300, Kirill Tkhai wrote:
> On 14.11.2017 20:44, Andrei Vagin wrote:
> > On Tue, Nov 14, 2017 at 04:53:33PM +0300, Kirill Tkhai wrote:
> >> Curently mutex is used to protect pernet operations list. It makes
> >> cleanup_net() to execute ->exit methods of the same operations set,
> >> which was used on the time of ->init, even after net namespace is
> >> unlinked from net_namespace_list.
> >>
> >> But the problem is it's need to synchronize_rcu() after net is removed
> >> from net_namespace_list():
> >>
> >> Destroy net_ns:
> >> cleanup_net()
> >>   mutex_lock(_mutex)
> >>   list_del_rcu(>list)
> >>   synchronize_rcu()  <--- Sleep there for 
> >> ages
> >>   list_for_each_entry_reverse(ops, _list, list)
> >> ops_exit_list(ops, _exit_list)
> >>   list_for_each_entry_reverse(ops, _list, list)
> >> ops_free_list(ops, _exit_list)
> >>   mutex_unlock(_mutex)
> >>
> >> This primitive is not fast, especially on the systems with many processors
> >> and/or when preemptible RCU is enabled in config. So, all the time, while
> >> cleanup_net() is waiting for RCU grace period, creation of new net 
> >> namespaces
> >> is not possible, the tasks, who makes it, are sleeping on the same mutex:
> >>
> >> Create net_ns:
> >> copy_net_ns()
> >>   mutex_lock_killable(_mutex)<--- Sleep there for 
> >> ages
> >>
> >> The solution is to convert net_mutex to the rw_semaphore. Then,
> >> pernet_operations::init/::exit methods, modifying the net-related data,
> >> will require down_read() locking only, while down_write() will be used
> >> for changing pernet_list.
> >>
> >> This gives signify performance increase, like you may see below. There
> >> is measured sequential net namespace creation in a cycle, in single
> >> thread, without other tasks (single user mode):
> >>
> >> 1)int main(int argc, char *argv[])
> >> {
> >> unsigned nr;
> >> if (argc < 2) {
> >> fprintf(stderr, "Provide nr iterations arg\n");
> >> return 1;
> >> }
> >> nr = atoi(argv[1]);
> >> while (nr-- > 0) {
> >> if (unshare(CLONE_NEWNET)) {
> >> perror("Can't unshare");
> >> return 1;
> >> }
> >> }
> >> return 0;
> >> }
> >>
> >> Origin, 10 unshare():
> >> 0.03user 23.14system 1:39.85elapsed 23%CPU
> >>
> >> Patched, 10 unshare():
> >> 0.03user 67.49system 1:08.34elapsed 98%CPU
> >>
> >> 2)for i in {1..1}; do unshare -n bash -c exit; done
> > 
> > Hi Kirill,
> > 
> > This mutex has another role. You know that net namespaces are destroyed
> > asynchronously, and the net mutex gurantees that a backlog will be not
> > big. If we have something in backlog, we know that it will be handled
> > before creating a new net ns.
> > 
> > As far as I remember net namespaces are created much faster than
> > they are destroyed, so with this changes we can create a really big
> > backlog, can't we?
> 
> I don't think limitation is a good goal or a gool for the mutex,
> because it's very easy to create many net namespaces in case of
> the mutex exists. You may open /proc/[pid]/ns/net like a file,
> and net_ns counter will increment. Then, do unshare(), and
> the mutex has no a way to protect against that.

You are right, but with the mutex a user cannot sustain a big backlog
for a long time; it is shrunk to zero periodically. With these changes
he can sustain a big backlog for a long time.

A big backlog affects other users. If someone creates namespaces, he
probably expects that they will be destroyed within a reasonable time.

But currently someone else can increase the destroy time to a really big
value. This problem existed before your patches, but they may make this
problem worse. The question here is: Should we think about this problem
in the context of these patches?


> Anyway, mutex
> can't limit a number of something in general, I've never seen
> a (good) example in kernel.

I agree with you here.


> 
> As I see, the real limitation happen in inc_net_namespaces(),
> which is decremented after RCU grace period in cleanup_net(),
> and it has not changed.

ucount limits are too big to handle this problem.


> 
> > There was a discussion a few month ago:
> > https://lists.onap.org/pipermail/containers/2016-October/037509.html
> > 
> > 
> >>
> >> Origin:
> >> real 1m24,190s
> >> user 0m6,225s
> >> sys 0m15,132s
> > 
> > Here you measure time of creating and destroying net namespaces.
> > 
> >>
> >> Patched:
> >> real 0m18,235s   (4.6 times faster)
> >> user 0m4,544s
> >> sys 0m13,796s
> > 
> > But here you measure time of crearing namespaces and you know nothing
> > when they will be destroyed.
> 
> You're right, and I predict, the sum time, spent on cpu, will remain the same,
> but the think is that now creation and destroying may be executed in parallel.


Re: [PATCH] net: Convert net_mutex into rw_semaphore and down read it on net->init/->exit

2017-11-14 Thread Andrei Vagin
On Tue, Nov 14, 2017 at 04:53:33PM +0300, Kirill Tkhai wrote:
> Curently mutex is used to protect pernet operations list. It makes
> cleanup_net() to execute ->exit methods of the same operations set,
> which was used on the time of ->init, even after net namespace is
> unlinked from net_namespace_list.
> 
> But the problem is it's need to synchronize_rcu() after net is removed
> from net_namespace_list():
> 
> Destroy net_ns:
> cleanup_net()
>   mutex_lock(_mutex)
>   list_del_rcu(>list)
>   synchronize_rcu()  <--- Sleep there for ages
>   list_for_each_entry_reverse(ops, _list, list)
> ops_exit_list(ops, _exit_list)
>   list_for_each_entry_reverse(ops, _list, list)
> ops_free_list(ops, _exit_list)
>   mutex_unlock(_mutex)
> 
> This primitive is not fast, especially on the systems with many processors
> and/or when preemptible RCU is enabled in config. So, all the time, while
> cleanup_net() is waiting for RCU grace period, creation of new net namespaces
> is not possible, the tasks, who makes it, are sleeping on the same mutex:
> 
> Create net_ns:
> copy_net_ns()
>   mutex_lock_killable(_mutex)<--- Sleep there for ages
> 
> The solution is to convert net_mutex to the rw_semaphore. Then,
> pernet_operations::init/::exit methods, modifying the net-related data,
> will require down_read() locking only, while down_write() will be used
> for changing pernet_list.
> 
> This gives signify performance increase, like you may see below. There
> is measured sequential net namespace creation in a cycle, in single
> thread, without other tasks (single user mode):
> 
> 1)int main(int argc, char *argv[])
> {
> unsigned nr;
> if (argc < 2) {
> fprintf(stderr, "Provide nr iterations arg\n");
> return 1;
> }
> nr = atoi(argv[1]);
> while (nr-- > 0) {
> if (unshare(CLONE_NEWNET)) {
> perror("Can't unshare");
> return 1;
> }
> }
> return 0;
> }
> 
> Origin, 10 unshare():
> 0.03user 23.14system 1:39.85elapsed 23%CPU
> 
> Patched, 10 unshare():
> 0.03user 67.49system 1:08.34elapsed 98%CPU
> 
> 2)for i in {1..1}; do unshare -n bash -c exit; done

Hi Kirill,

This mutex has another role. You know that net namespaces are destroyed
asynchronously, and the net mutex guarantees that the backlog will not be
big. If we have something in the backlog, we know that it will be handled
before creating a new net ns.

As far as I remember, net namespaces are created much faster than
they are destroyed, so with these changes we can create a really big
backlog, can't we?

There was a discussion a few months ago:
https://lists.onap.org/pipermail/containers/2016-October/037509.html


> 
> Origin:
> real 1m24,190s
> user 0m6,225s
> sys 0m15,132s

Here you measure time of creating and destroying net namespaces.

> 
> Patched:
> real 0m18,235s   (4.6 times faster)
> user 0m4,544s
> sys 0m13,796s

But here you measure the time of creating namespaces, and you know nothing
about when they will be destroyed.

Thanks,
Andrei


Re: [PATCH 1/3] x86/entry: Fix idtentry unwind hint

2017-11-13 Thread Andrei Vagin
Hi Josh,

On Thu, Oct 26, 2017 at 8:24 AM, Josh Poimboeuf <jpoim...@redhat.com> wrote:
> On Wed, Oct 25, 2017 at 02:07:38PM -0700, Andrei Vagin wrote:
>> Hi Josh,
>>
>> Here is one more warning:
>> [5.852094] WARNING: can't dereference iret registers at b6ce01b7ffe0 
>> for ip entry_SYSCALL_64_fastpath+0xa/0xc2
>
> Thanks, I hadn't seen this one yet.
>
> I suspect this is in the middle of the ENABLE_INTERRUPTS() paravirt
> code, which would mean this is another issue that will be fixed by my
> "Make pv ops code generation more closely match reality" patches.
>
> If you can share either the entry_64.o file or the .config, and what
> virt platform it's running on (kvm, xen, native), I should be able to
> confirm the issue.
>
> I'm in Prague this week but I should have a v2 of those patches in a
> week or two (will cc you).

Do you have any news? We still see this warning in the kernel log. Thanks!

>
> --
> Josh


[tip:perf/urgent] perf trace: Call machine__exit() at exit

2017-11-11 Thread tip-bot for Andrei Vagin
Commit-ID:  33974a414ce2324554f75dbd204ff0868f499e32
Gitweb: https://git.kernel.org/tip/33974a414ce2324554f75dbd204ff0868f499e32
Author: Andrei Vagin <ava...@openvz.org>
AuthorDate: Tue, 7 Nov 2017 16:22:45 -0800
Committer:  Arnaldo Carvalho de Melo <a...@redhat.com>
CommitDate: Thu, 9 Nov 2017 10:17:32 -0300

perf trace: Call machine__exit() at exit

Otherwise 'perf trace' leaves a temporary file /tmp/perf-vdso.so-XX.

  $ perf trace -o log true
  $ ls -l /tmp/perf-vdso.*
  -rw--- 1 root root 8192 Nov  8 03:08 /tmp/perf-vdso.so-5bCpD0

Signed-off-by: Andrei Vagin <ava...@openvz.org>
Reviewed-by: Jiri Olsa <jo...@redhat.com>
Cc: Alexander Shishkin <alexander.shish...@linux.intel.com>
Cc: Namhyung Kim <namhy...@kernel.org>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Vasily Averin <v...@virtuozzo.com>
Link: http://lkml.kernel.org/r/20171108002246.8924-1-ava...@openvz.org
Signed-off-by: Arnaldo Carvalho de Melo <a...@redhat.com>
---
 tools/perf/builtin-trace.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 771ddab..d5d7fff 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -1138,6 +1138,14 @@ static int trace__symbols_init(struct trace *trace, 
struct perf_evlist *evlist)
return err;
 }
 
+static void trace__symbols__exit(struct trace *trace)
+{
+   machine__exit(trace->host);
+   trace->host = NULL;
+
+   symbol__exit();
+}
+
 static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
 {
int idx;
@@ -2481,6 +2489,8 @@ out_disable:
}
 
 out_delete_evlist:
+   trace__symbols__exit(trace);
+
perf_evlist__delete(evlist);
trace->evlist = NULL;
trace->live = false;


[PATCH 1/2] perf/trace: call machine__exit() at exiting

2017-11-07 Thread Andrei Vagin
Otherwise perf trace leaves a temporary file /tmp/perf-vdso.so-XXXXXX.

$ perf trace -o log true
$ ls -l /tmp/perf-vdso.*
-rw--- 1 root root 8192 Nov  8 03:08 /tmp/perf-vdso.so-5bCpD0

Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Ingo Molnar <mi...@redhat.com>
Cc: Arnaldo Carvalho de Melo <a...@kernel.org>
Cc: Alexander Shishkin <alexander.shish...@linux.intel.com>
Cc: Jiri Olsa <jo...@redhat.com>
Cc: Namhyung Kim <namhy...@kernel.org>
Signed-off-by: Andrei Vagin <ava...@openvz.org>
---
 tools/perf/builtin-trace.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 505b871fdc82..f2757d38c7d7 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -1164,6 +1164,14 @@ static int trace__symbols_init(struct trace *trace, 
struct perf_evlist *evlist)
return err;
 }
 
+static void trace__symbols__exit(struct trace *trace)
+{
+   machine__exit(trace->host);
+   trace->host = NULL;
+
+   symbol__exit();
+}
+
 static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
 {
int idx;
@@ -2508,6 +2516,8 @@ static int trace__run(struct trace *trace, int argc, 
const char **argv)
}
 
 out_delete_evlist:
+   trace__symbols__exit(trace);
+
perf_evlist__delete(evlist);
trace->evlist = NULL;
trace->live = false;
-- 
2.13.6



[PATCH 2/2] perf/trace: fix an exit code of trace__symbols_init

2017-11-07 Thread Andrei Vagin
Currently if trace_event__register_resolver() fails, we return -errno,
but we can't be sure that errno isn't zero in this case.

Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Ingo Molnar <mi...@redhat.com>
Cc: Arnaldo Carvalho de Melo <a...@kernel.org>
Cc: Alexander Shishkin <alexander.shish...@linux.intel.com>
Cc: Jiri Olsa <jo...@redhat.com>
Cc: Namhyung Kim <namhy...@kernel.org>
Signed-off-by: Andrei Vagin <ava...@openvz.org>
---
 tools/perf/builtin-trace.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index f2757d38c7d7..84debdbad327 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -1152,12 +1152,14 @@ static int trace__symbols_init(struct trace *trace, 
struct perf_evlist *evlist)
if (trace->host == NULL)
return -ENOMEM;
 
-   if (trace_event__register_resolver(trace->host, 
trace__machine__resolve_kernel_addr) < 0)
-   return -errno;
+   err = trace_event__register_resolver(trace->host, 
trace__machine__resolve_kernel_addr);
+   if (err < 0)
+   goto out;
 
err = __machine__synthesize_threads(trace->host, >tool, 
>opts.target,
evlist->threads, 
trace__tool_process, false,
trace->opts.proc_map_timeout, 1);
+out:
if (err)
symbol__exit();
 
-- 
2.13.6



Re: lost path_put in perf_fill_ns_link_info

2017-11-07 Thread Andrei Vagin
On Tue, Nov 07, 2017 at 11:03:18AM -0800, Andrei Vagin wrote:
> Hi Vasily and all,
> 
> The patch looks correct for me. I tried to reproduce this issue and
> checked that this patch fixes it. Bellow you can find my test program
> and a command line to run it. The problem still exists even with this patch.
> 
> $ cat test.c
> #define _GNU_SOURCE
> #include 
> 
> int main(int argc, char **argv)
> {
>   while (1)
>   unshare(CLONE_NEWUTS);
>   return 0;
> }
> 
> $ gcc -o test_unshare test.c
> $ for i in `seq 1`; do perf trace -o log unshare -u true; done &
> [5] 28766
> $ for i in `seq 1`; do perf trace -o log unshare -u true; done &
> [6] 28840
> 
> $ echo 3 > /proc/sys/vm/drop_caches | cat /proc/slabinfo | grep dentry
> dentry 74848  78660224   181 : tunables000 : 
> slabdata   4370   4370  0
> 
> $ sleep 10
> $ echo 3 > /proc/sys/vm/drop_caches | cat /proc/slabinfo | grep dentry
> dentry 75145  79002224   181 : tunables000 : 
> slabdata   4389   4389  0
> 
> $ sleep 10
> $ echo 3 > /proc/sys/vm/drop_caches | cat /proc/slabinfo | grep dentry
> dentry 75921  79776224   181 : tunables000 : 
> slabdata   4432   4432  0

Actually here is another issue, and it is reproduced by another script:

for i in `seq 1`; do perf trace -o /dev/null unshare -u true; done &

And it is due to the files that perf creates in /tmp:

//tmp/perf-vdso.so-84wDCZ
//tmp/perf-vdso.so-YDfUuX
//tmp/perf-vdso.so-KkTBfU
//tmp/perf-vdso.so-srXfvU
//tmp/perf-vdso.so-QrPscR
//tmp/perf-vdso.so-wlxIZO
//tmp/perf-vdso.so-ur4fBP
//tmp/perf-vdso.so-gBMExN
//tmp/perf-vdso.so-6sCehK
//tmp/perf-vdso.so-cZ4GDK
//tmp/perf-vdso.so-ImQLoH
//tmp/perf-vdso.so-y4rMuF
//tmp/perf-vdso.so-Wx1qIG
//tmp/perf-vdso.so-g5mYOD
...

Do we really need all these files? They all are identical.

[root@fc24 ~]# diff -up /tmp/perf-vdso.so-fnVcRz /tmp/perf-vdso.so-FQGYzh
[root@fc24 ~]# echo $?
0
> $ git log --pretty=oneline | head -n 2
> c83bceb10b36cef895def4b2dfe0aff6ca7c9784 lost path_put in 
> perf_fill_ns_link_info
> 8b82a8a7ab53ee1a065ac69c835737a701f46b2e Add linux-next specific files for 
> 20171107
> 
> Thanks,
> Andrei
> 
> On Mon, Nov 06, 2017 at 09:22:18AM +0300, Vasily Averin wrote:
> > Fixes: commit e422267322cd ("perf: Add PERF_RECORD_NAMESPACES to include 
> > namespaces related info")
> > Signed-off-by: Vasily Averin <v...@virtuozzo.com>
> > ---
> >  kernel/events/core.c | 1 +
> >  1 file changed, 1 insertion(+)
> > 
> > diff --git a/kernel/events/core.c b/kernel/events/core.c
> > index 10cdb9c..ab5ac84 100644
> > --- a/kernel/events/core.c
> > +++ b/kernel/events/core.c
> > @@ -6756,6 +6756,7 @@ static void perf_fill_ns_link_info(struct 
> > perf_ns_link_info *ns_link_info,
> > ns_inode = ns_path.dentry->d_inode;
> > ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
> > ns_link_info->ino = ns_inode->i_ino;
> > +   path_put(_path);
> > }
> >  }
> >  


Re: lost path_put in perf_fill_ns_link_info

2017-11-07 Thread Andrei Vagin
Hi Vasily and all,

The patch looks correct to me. I tried to reproduce this issue and
checked that this patch fixes it. Below you can find my test program
and a command line to run it. The problem still exists even with this patch.

$ cat test.c
#define _GNU_SOURCE
#include 

int main(int argc, char **argv)
{
while (1)
unshare(CLONE_NEWUTS);
return 0;
}

$ gcc -o test_unshare test.c
$ for i in `seq 1`; do perf trace -o log unshare -u true; done &
[5] 28766
$ for i in `seq 1`; do perf trace -o log unshare -u true; done &
[6] 28840

$ echo 3 > /proc/sys/vm/drop_caches | cat /proc/slabinfo | grep dentry
dentry 74848  78660224   181 : tunables000 : 
slabdata   4370   4370  0

$ sleep 10
$ echo 3 > /proc/sys/vm/drop_caches | cat /proc/slabinfo | grep dentry
dentry 75145  79002224   181 : tunables000 : 
slabdata   4389   4389  0

$ sleep 10
$ echo 3 > /proc/sys/vm/drop_caches | cat /proc/slabinfo | grep dentry
dentry 75921  79776224   181 : tunables000 : 
slabdata   4432   4432  0
$ git log --pretty=oneline | head -n 2
c83bceb10b36cef895def4b2dfe0aff6ca7c9784 lost path_put in perf_fill_ns_link_info
8b82a8a7ab53ee1a065ac69c835737a701f46b2e Add linux-next specific files for 
20171107

Thanks,
Andrei

On Mon, Nov 06, 2017 at 09:22:18AM +0300, Vasily Averin wrote:
> Fixes: commit e422267322cd ("perf: Add PERF_RECORD_NAMESPACES to include 
> namespaces related info")
> Signed-off-by: Vasily Averin 
> ---
>  kernel/events/core.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 10cdb9c..ab5ac84 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -6756,6 +6756,7 @@ static void perf_fill_ns_link_info(struct 
> perf_ns_link_info *ns_link_info,
>   ns_inode = ns_path.dentry->d_inode;
>   ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
>   ns_link_info->ino = ns_inode->i_ino;
> + path_put(_path);
>   }
>  }
>  


[PATCH v2] pid: restore the old behaviour of the ns_last_pid sysctl

2017-11-06 Thread Andrei Vagin
CRIU uses ns_last_pid to fork a process with a specified pid. For
example, if we need to create a process with the pid of 10000,
we write 9999 into /proc/sys/kernel/ns_last_pid:

$ echo 9999 > /proc/sys/kernel/ns_last_pid; sh -c 'echo $$'
10000

This behaviour has been broken and now if we write 9999 to ns_last_pid,
a process will get the pid 9999. This patch restores the old behaviour.

v2: make code a bit more readable // Oleg

fixes: ("pid: replace pid bitmap implementation with IDR API")
Cc: Gargi Sharma <gs051...@gmail.com>
Cc: Oleg Nesterov <o...@redhat.com>
Acked-by: Oleg Nesterov <o...@redhat.com>
Signed-off-by: Andrei Vagin <ava...@openvz.org>
---
 kernel/pid_namespace.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index fea2c24fa460..0b53eef7d34b 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -287,6 +287,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int 
write,
 {
struct pid_namespace *pid_ns = task_active_pid_ns(current);
struct ctl_table tmp = *table;
+   int ret, next;
 
if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
@@ -297,8 +298,14 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int 
write,
 * it should synchronize its usage with external means.
 */
 
-   tmp.data = _ns->idr.idr_next;
-   return proc_dointvec_minmax(, write, buffer, lenp, ppos);
+   next = idr_get_cursor(_ns->idr) - 1;
+
+   tmp.data = 
+   ret = proc_dointvec_minmax(, write, buffer, lenp, ppos);
+   if (!ret && write)
+   idr_set_cursor(_ns->idr, next + 1);
+
+   return ret;
 }
 
 extern int pid_max;
-- 
2.13.6



[PATCH] pid: restore the old behaviour of the ns_last_pid sysctl

2017-11-03 Thread Andrei Vagin
CRIU uses ns_last_pid to fork a process with a specified pid. For
example, if we need to create a process with the pid of 10000,
we write 9999 into /proc/sys/kernel/ns_last_pid:

$ echo 9999 > /proc/sys/kernel/ns_last_pid; sh -c 'echo $$'
10000

This behaviour has been broken and now if we write 9999 to ns_last_pid,
a process will get the pid 9999. This patch restores the old behaviour.

fixes: ("pid: replace pid bitmap implementation with IDR API")
Cc: Gargi Sharma <gs051...@gmail.com>
Cc: Oleg Nesterov <o...@redhat.com>
Signed-off-by: Andrei Vagin <ava...@openvz.org>
---
 kernel/pid_namespace.c | 15 +--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index fea2c24fa460..504dadb1d920 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -287,6 +287,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int 
write,
 {
struct pid_namespace *pid_ns = task_active_pid_ns(current);
struct ctl_table tmp = *table;
+   int ret, next;
 
if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
@@ -297,8 +298,18 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int 
write,
 * it should synchronize its usage with external means.
 */
 
-   tmp.data = _ns->idr.idr_next;
-   return proc_dointvec_minmax(, write, buffer, lenp, ppos);
+   next = idr_get_cursor(_ns->idr) - 1;
+
+   tmp.data = 
+   ret = proc_dointvec_minmax(, write, buffer, lenp, ppos);
+   if (ret < 0)
+   return ret;
+
+   if (!write)
+   return 0;
+
+   idr_set_cursor(_ns->idr, next + 1);
+   return 0;
 }
 
 extern int pid_max;
-- 
2.13.6



Re: [PATCH] [RFC] vm: add a syscall to map a process memory into a pipe

2017-11-01 Thread Andrei Vagin
Hi Michael,

On Mon, Oct 30, 2017 at 01:47:31PM +0100, Michael Kerrisk (man-pages) wrote:
> On 10 August 2017 at 20:46, Andrei Vagin <ava...@openvz.org> wrote:
> > It is a hybrid of process_vm_readv() and vmsplice().
> >
> > vmsplice can map memory from a current address space into a pipe.
> > process_vm_readv can read memory of another process.
> >
> > A new system call can map memory of another process into a pipe.
> >
> > ssize_t process_vmsplice(pid_t pid, int fd, const struct iovec *iov,
> > unsigned long nr_segs, unsigned int flags)
> >
> > All arguments are identical with vmsplice except pid which specifies a
> > target process.
> 
> Can we have a man page for this new syscall please?

I think we can add a description for process_vmsplice into
man2/vmsplice.2. The patch is attached.

Thanks,
Andrei

> 
> Thanks,
> 
> Michael
> 
> 
>From 923cbd38805f8017b6d86ac6a12c8f45a4117399 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <ava...@openvz.org>
Date: Wed, 1 Nov 2017 10:26:06 -0700
Subject: [PATCH] vmsplice.2: add description for process_vmsplice

Signed-off-by: Andrei Vagin <ava...@openvz.org>
---
 man2/vmsplice.2 | 20 +++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/man2/vmsplice.2 b/man2/vmsplice.2
index e3e61cf27..8b28b6fff 100644
--- a/man2/vmsplice.2
+++ b/man2/vmsplice.2
@@ -25,7 +25,7 @@
 .\"
 .TH VMSPLICE 2 2014-10-02 "Linux" "Linux Programmer's Manual"
 .SH NAME
-vmsplice \- splice user pages into a pipe
+vmsplice, process_vmsplice \- splice user pages into a pipe
 .SH SYNOPSIS
 .nf
 .BR "#define _GNU_SOURCE" " /* See feature_test_macros(7) */"
@@ -34,6 +34,8 @@ vmsplice \- splice user pages into a pipe
 
 .BI "ssize_t vmsplice(int " fd ", const struct iovec *" iov ,
 .BI " unsigned long " nr_segs ", unsigned int " flags );
+.BI "ssize_t process_vmsplice(pid_t " pid ", int " fd ", const struct iovec *" 
iov ,
+.BI " unsigned long " nr_segs ", unsigned int " flags );
 .fi
 .\" Return type was long before glibc 2.7
 .SH DESCRIPTION
@@ -55,6 +57,12 @@ The file descriptor
 .I fd
 must refer to a pipe.
 
+The
+.BR process_vmsplice()
+system call maps user memory from the process identified by
+.I pid
+to the local pipe.
+
 The pointer
 .I iov
 points to an array of
@@ -114,6 +122,7 @@ Data must also be properly page aligned, both in memory and 
length.
 .\"commit bd1a68b59c8e3bce45fb76632c64e1e063c3962d
 .\"
 .\"  if we expect to later SPLICE_F_MOVE to the cache.
+
 .SH RETURN VALUE
 Upon successful completion,
 .BR vmsplice ()
@@ -145,6 +154,15 @@ set.
 .TP
 .B ENOMEM
 Out of memory.
+.TP
+.B ESRCH
+No process with ID
+.I pid
+exists.
+.TP
+.B EPERM
+The caller does not have permission to access the address space of the process
+.IR pid .
 .SH VERSIONS
 The
 .BR vmsplice ()
-- 
2.13.6



Re: [PATCH 1/3] x86/entry: Fix idtentry unwind hint

2017-10-26 Thread Andrei Vagin
On Thu, Oct 26, 2017 at 10:24:29AM -0500, Josh Poimboeuf wrote:
> On Wed, Oct 25, 2017 at 02:07:38PM -0700, Andrei Vagin wrote:
> > Hi Josh,
> > 
> > Here is one more warning:
> > [5.852094] WARNING: can't dereference iret registers at 
> > b6ce01b7ffe0 for ip entry_SYSCALL_64_fastpath+0xa/0xc2
> 
> Thanks, I hadn't seen this one yet.
> 
> I suspect this is in the middle of the ENABLE_INTERRUPTS() paravirt
> code, which would mean this is another issue that will be fixed by my
> "Make pv ops code generation more closely match reality" patches.
> 
> If you can share either the entry_64.o file or the .config, and what
> virt platform it's running on (kvm, xen, native), I should be able to
> confirm the issue.

The config is attached to this e-mail. It is a travis-ci vm. I think
they are hosted in Google Cloud:

[0.00] DMI: Google Google Compute Engine/Google Compute Engine, BIOS 
Google 01/01/2011
[0.00] Hypervisor detected: KVM

https://travis-ci.org/avagin/linux/jobs/292773933

> 
> I'm in Prague this week but I should have a v2 of those patches in a
> week or two (will cc you).

Good!

> 
> -- 
> Josh


[PATCH 2/3] x86: Write up the process_vmsplice syscall

2017-10-25 Thread Andrei Vagin
Signed-off-by: Andrei Vagin <ava...@openvz.org>
---
 arch/x86/entry/syscalls/syscall_32.tbl | 1 +
 arch/x86/entry/syscalls/syscall_64.tbl | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl 
b/arch/x86/entry/syscalls/syscall_32.tbl
index 448ac2161112..dc64bf577b17 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -391,3 +391,4 @@
 382i386pkey_free   sys_pkey_free
 383i386statx   sys_statx
 384i386arch_prctl  sys_arch_prctl  
compat_sys_arch_prctl
+385i386process_vmsplicesys_process_vmsplice
compat_sys_process_vmsplice
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl 
b/arch/x86/entry/syscalls/syscall_64.tbl
index 5aef183e2f85..d2f916c0309a 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -339,6 +339,7 @@
 330common  pkey_alloc  sys_pkey_alloc
 331common  pkey_free   sys_pkey_free
 332common  statx   sys_statx
+33364  process_vmsplicesys_process_vmsplice
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
@@ -380,3 +381,4 @@
 545x32 execveatcompat_sys_execveat/ptregs
 546x32 preadv2 compat_sys_preadv64v2
 547x32 pwritev2compat_sys_pwritev64v2
+548x32 process_vmsplicecompat_sys_process_vmsplice
-- 
2.13.6



[PATCH 1/3] [v2] vm: add a syscall to map a process memory into a pipe

2017-10-25 Thread Andrei Vagin
It is a hybrid of process_vm_readv() and vmsplice().

vmsplice can map memory from a current address space into a pipe.
process_vm_readv can read memory of another process.

A new system call can map memory of another process into a pipe.

ssize_t process_vmsplice(pid_t pid, int fd, const struct iovec *iov,
unsigned long nr_segs, unsigned int flags)

All arguments are identical with vmsplice except pid which specifies a
target process.

Currently if we want to dump a process memory to a file or to a socket,
we can use process_vm_readv() + write(), but it works slow, because data
are copied into a temporary user-space buffer.

A second way is to use vmsplice() + splice(). It is more effective,
because data are not copied into a temporary buffer, but here is another
problem. vmsplice works with the current address space, so it can be
used only if we inject our code into a target process.

The second way suffers from a few other issues:
* a process has to be stopped to run a parasite code
* a number of pipes is limited, so it may be impossible to dump all
  memory in one iteration, and we have to stop process and inject our
  code a few times.
* pages in pipes are unreclaimable, so it isn't good to hold a lot of
  memory in pipes.

The introduced syscall allows to use a second way without injecting any
code into a target process.

My experiments show that process_vmsplice() + splice() works two times
faster than process_vm_readv() + write().

It is particularly useful on a pre-dump stage. On this stage we enable a
memory tracker, and then we are dumping  a process memory while a
process continues work. On the first iteration we are dumping all
memory, and then we are dumping only modified memory from a previous
iteration.  After a few pre-dump operations, a process is stopped and
dumped finally. The pre-dump operations allow to significantly decrease
a process downtime, when a process is migrated to another host.

v2: move this syscall under CONFIG_CROSS_MEMORY_ATTACH
give correct flags to get_user_pages_remote()

Cc: Alexander Viro <v...@zeniv.linux.org.uk>
Cc: Arnd Bergmann <a...@arndb.de>
Cc: Pavel Emelyanov <xe...@virtuozzo.com>
Cc: Michael Kerrisk <mtk.manpa...@gmail.com>
Cc: Thomas Gleixner <t...@linutronix.de>
Cc: Andrew Morton <a...@linux-foundation.org>
Cc: Josh Triplett <j...@joshtriplett.org>
Cc: Jann Horn <ja...@google.com>
Signed-off-by: Andrei Vagin <ava...@openvz.org>
---
 fs/splice.c   | 223 ++
 include/linux/compat.h|   3 +
 include/linux/syscalls.h  |   4 +
 include/uapi/asm-generic/unistd.h |   5 +-
 kernel/sys_ni.c   |   2 +
 5 files changed, 236 insertions(+), 1 deletion(-)

diff --git a/fs/splice.c b/fs/splice.c
index f3084cce0ea6..4bf37207feb9 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -34,6 +34,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "internal.h"
 
@@ -1358,6 +1359,228 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec 
__user *, iov,
return error;
 }
 
+#ifdef CONFIG_CROSS_MEMORY_ATTACH
+/*
+ * Map pages from a specified task into a pipe
+ */
+static int remote_single_vec_to_pipe(struct task_struct *task,
+   struct mm_struct *mm,
+   const struct iovec *rvec,
+   struct pipe_inode_info *pipe,
+   unsigned int flags,
+   size_t *total)
+{
+   struct pipe_buffer buf = {
+   .ops = _page_pipe_buf_ops,
+   .flags = flags
+   };
+   unsigned long addr = (unsigned long) rvec->iov_base;
+   unsigned long pa = addr & PAGE_MASK;
+   unsigned long start_offset = addr - pa;
+   unsigned long nr_pages;
+   ssize_t len = rvec->iov_len;
+   struct page *process_pages[16];
+   bool failed = false;
+   int ret = 0;
+
+   nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1;
+   while (nr_pages) {
+   long pages = min(nr_pages, 16UL);
+   int locked = 1, n;
+   ssize_t copied;
+
+   /*
+* Get the pages we're interested in.  We must
+* access remotely because task/mm might not
+* current/current->mm
+*/
+   down_read(>mmap_sem);
+   pages = get_user_pages_remote(task, mm, pa, pages, 0,
+ process_pages, NULL, );
+   if (locked)
+   up_read(>mmap_sem);
+   if (pages <= 0) {
+   failed = true;
+   ret = -EFAULT;
+   break;
+   }
+
+   copied = pages * PAGE_SIZE - start_offset;
+   if (copied > len)
+   copied = len;
+   len -= copied;
+

[PATCH 3/3] test: add a test for the process_vmsplice syscall

2017-10-25 Thread Andrei Vagin
This test checks that process_vmsplice() can splice pages from a remote
process and returns EFAULT if process_vmsplice() tries to splice pages
from an inaccessible address.

Signed-off-by: Andrei Vagin <ava...@openvz.org>
---
 tools/testing/selftests/process_vmsplice/Makefile  |   5 +
 .../process_vmsplice/process_vmsplice_test.c   | 188 +
 2 files changed, 193 insertions(+)
 create mode 100644 tools/testing/selftests/process_vmsplice/Makefile
 create mode 100644 
tools/testing/selftests/process_vmsplice/process_vmsplice_test.c

diff --git a/tools/testing/selftests/process_vmsplice/Makefile 
b/tools/testing/selftests/process_vmsplice/Makefile
new file mode 100644
index ..246d5a7dfed6
--- /dev/null
+++ b/tools/testing/selftests/process_vmsplice/Makefile
@@ -0,0 +1,5 @@
+CFLAGS += -I../../../../usr/include/
+
+TEST_GEN_PROGS := process_vmsplice_test
+
+include ../lib.mk
diff --git a/tools/testing/selftests/process_vmsplice/process_vmsplice_test.c 
b/tools/testing/selftests/process_vmsplice/process_vmsplice_test.c
new file mode 100644
index ..8abf59b9c567
--- /dev/null
+++ b/tools/testing/selftests/process_vmsplice/process_vmsplice_test.c
@@ -0,0 +1,188 @@
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "../kselftest.h"
+
+#ifndef __NR_process_vmsplice
+#define __NR_process_vmsplice 333
+#endif
+
+#define pr_err(fmt, ...) \
+   ({ \
+   fprintf(stderr, "%s:%d:" fmt, \
+   __func__, __LINE__, ##__VA_ARGS__); \
+   KSFT_FAIL; \
+   })
+#define pr_perror(fmt, ...) pr_err(fmt ": %m\n", ##__VA_ARGS__)
+#define fail(fmt, ...) pr_err("FAIL:" fmt, ##__VA_ARGS__)
+
+static ssize_t process_vmsplice(pid_t pid, int fd, const struct iovec *iov,
+   unsigned long nr_segs, unsigned int flags)
+{
+   return syscall(__NR_process_vmsplice, pid, fd, iov, nr_segs, flags);
+
+}
+
+#define MEM_SIZE (4096 * 100)
+#define MEM_WRONLY_SIZE (4096 * 10)
+
+int main(int argc, char **argv)
+{
+   char *addr, *addr_wronly;
+   int p[2];
+   struct iovec iov[2];
+   char buf[4096];
+   int status, ret;
+   pid_t pid;
+
+   ksft_print_header();
+
+   addr = mmap(0, MEM_SIZE, PROT_READ | PROT_WRITE,
+   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+   if (addr == MAP_FAILED)
+   return pr_perror("Unable to create a mapping");
+
+   addr_wronly = mmap(0, MEM_WRONLY_SIZE, PROT_WRITE,
+   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+   if (addr_wronly == MAP_FAILED)
+   return pr_perror("Unable to create a write-only mapping");
+
+   if (pipe(p))
+   return pr_perror("Unable to create a pipe");
+
+   pid = fork();
+   if (pid < 0)
+   return pr_perror("Unable to fork");
+
+   if (pid == 0) {
+   addr[0] = 'C';
+   addr[4096 + 128] = 'A';
+   addr[4096 + 128 + 4096 - 1] = 'B';
+
+   if (prctl(PR_SET_PDEATHSIG, SIGKILL))
+   return pr_perror("Unable to set PR_SET_PDEATHSIG");
+   if (write(p[1], "c", 1) != 1)
+   return pr_perror("Unable to write data into pipe");
+
+   while (1)
+   sleep(1);
+   return 1;
+   }
+   if (read(p[0], buf, 1) != 1) {
+   pr_perror("Unable to read data from pipe");
+   kill(pid, SIGKILL);
+   wait();
+   return 1;
+   }
+
+   munmap(addr, MEM_SIZE);
+   munmap(addr_wronly, MEM_WRONLY_SIZE);
+
+   iov[0].iov_base = addr;
+   iov[0].iov_len = 1;
+
+   iov[1].iov_base = addr + 4096 + 128;
+   iov[1].iov_len = 4096;
+
+   /* check one iovec */
+   if (process_vmsplice(pid, p[1], iov, 1, SPLICE_F_GIFT) != 1)
+   return pr_perror("Unable to splice pages");
+
+   if (read(p[0], buf, 1) != 1)
+   return pr_perror("Unable to read from pipe");
+
+   if (buf[0] != 'C')
+   ksft_test_result_fail("Get wrong data\n");
+   else
+   ksft_test_result_pass("Check process_vmsplice with one vec\n");
+
+   /* check two iovec-s */
+   if (process_vmsplice(pid, p[1], iov, 2, SPLICE_F_GIFT) != 4097)
+   return pr_perror("Unable to spice pages\n");
+
+   if (read(p[0], buf, 1) != 1)
+   return pr_perror("Unable to read from pipe\n");
+
+   if (buf[0] != 'C')
+   ksft_test_result_fail("Get wrong data\n");
+
+   if (read(p[0], buf, 4096) != 4096)
+   return pr_perror("Unable t

Re: [PATCH 1/3] x86/entry: Fix idtentry unwind hint

2017-10-25 Thread Andrei Vagin
Hi Josh,

Here is one more warning:
[5.852094] WARNING: can't dereference iret registers at b6ce01b7ffe0 
for ip entry_SYSCALL_64_fastpath+0xa/0xc2

[avagin@laptop linux]$ git describe tip/auto-latest
v4.14-rc6-471-g376214a8543d

On Fri, Oct 20, 2017 at 11:21:33AM -0500, Josh Poimboeuf wrote:
> This fixes the following ORC warning in the 'int3' entry code:
> 
>   WARNING: can't dereference iret registers at 8801c5f17fe0 for ip 
> 95f0d94b
> 
> The ORC metadata had the wrong stack offset for the iret registers.
> 
> Their location on the stack is dependent on whether the exception has an
> error code.
> 
> Reported-and-tested-by: Andrei Vagin <ava...@virtuozzo.com>
> Fixes: 8c1f75587a18 ("x86/entry/64: Add unwind hint annotations")
> Signed-off-by: Josh Poimboeuf <jpoim...@redhat.com>
> ---
>  arch/x86/entry/entry_64.S | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index 49167258d587..f6cdb7a1455e 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -808,7 +808,7 @@ apicinterrupt IRQ_WORK_VECTOR 
> irq_work_interrupt  smp_irq_work_interrupt
>  
>  .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
>  ENTRY(\sym)
> - UNWIND_HINT_IRET_REGS offset=8
> + UNWIND_HINT_IRET_REGS offset=\has_error_code*8
>  
>   /* Sanity check */
>   .if \shift_ist != -1 && \paranoid == 0
> -- 
> 2.13.6
> 


Re: [RFC] net/unix_diag: Provide UDIAG_SHOW_VFS2 attribute to fetch complete inode number

2017-10-24 Thread Andrei Vagin
On Wed, Oct 25, 2017 at 12:48:14AM +0300, Cyrill Gorcunov wrote:
> Currently unix_diag_vfs structure reports unix socket inode
> as u32 value which of course doesn't fit to ino_t type and

BTW: As far as I understand, it is not a problem right now, because
get_next_ino returns int. And I agree that it may be a problem in the
future and it is better to be ready.

> the number may be trimmed. Lets rather deprecate old UDIAG_SHOW_VFS
> interface and provide UDIAG_SHOW_VFS2 (with one field "__zero" reserved
> which we could extend in future).

There is one more place where we return ino as u32:

static int sk_diag_dump_peer(struct sock *sk, struct sk_buff *nlskb)

return nla_put_u32(nlskb, UNIX_DIAG_PEER, ino);

> 
> CC: Andrey Vagin 
> CC: David S. Miller 
> CC: Pavel Emelyanov 
> Signed-off-by: Cyrill Gorcunov 
> ---
> 
> I build-tested it only thus not for inclusion yet, but rather
> to discuss if there some better way to handle this potential
> problem.
> 
>  include/uapi/linux/unix_diag.h |8 
>  net/unix/diag.c|   25 -
>  2 files changed, 24 insertions(+), 9 deletions(-)
> 
> Index: linux-ml.git/include/uapi/linux/unix_diag.h
> ===
> --- linux-ml.git.orig/include/uapi/linux/unix_diag.h
> +++ linux-ml.git/include/uapi/linux/unix_diag.h
> @@ -19,6 +19,7 @@ struct unix_diag_req {
>  #define UDIAG_SHOW_ICONS 0x0008  /* show pending connections */
>  #define UDIAG_SHOW_RQLEN 0x0010  /* show skb receive queue len */
>  #define UDIAG_SHOW_MEMINFO   0x0020  /* show memory info of a socket 
> */
> +#define UDIAG_SHOW_VFS2  0x0040  /* show VFS inode info 
> v2 */
>  
>  struct unix_diag_msg {
>   __u8udiag_family;
> @@ -39,6 +40,7 @@ enum {
>   UNIX_DIAG_RQLEN,
>   UNIX_DIAG_MEMINFO,
>   UNIX_DIAG_SHUTDOWN,
> + UNIX_DIAG_VFS2,
>  
>   __UNIX_DIAG_MAX,
>  };
> @@ -50,6 +52,12 @@ struct unix_diag_vfs {
>   __u32   udiag_vfs_dev;
>  };
>  
> +struct unix_diag_vfs2 {
> + __u64   udiag_vfs_ino;
> + __u32   udiag_vfs_dev;
> + __u32   __zero; /* Reserve for future use */

How can a user understand whether this field is used or not?

Each netlink attribute has its size in a header. Any attribute can be
extended, and users can understand which fields are filled by
a size of an attribute.

> +};
> +
>  struct unix_diag_rqlen {
>   __u32   udiag_rqueue;
>   __u32   udiag_wqueue;
> Index: linux-ml.git/net/unix/diag.c
> ===
> --- linux-ml.git.orig/net/unix/diag.c
> +++ linux-ml.git/net/unix/diag.c
> @@ -19,17 +19,24 @@ static int sk_diag_dump_name(struct sock
>  addr->name->sun_path);
>  }
>  
> -static int sk_diag_dump_vfs(struct sock *sk, struct sk_buff *nlskb)
> +static int sk_diag_dump_vfs(struct sock *sk, struct sk_buff *nlskb, unsigned 
> int flags)
>  {
>   struct dentry *dentry = unix_sk(sk)->path.dentry;
>  
>   if (dentry) {
> - struct unix_diag_vfs uv = {
> - .udiag_vfs_ino = d_backing_inode(dentry)->i_ino,
> - .udiag_vfs_dev = dentry->d_sb->s_dev,
> - };
> -
> - return nla_put(nlskb, UNIX_DIAG_VFS, sizeof(uv), );
> + if (flags & UDIAG_SHOW_VFS2) {
> + struct unix_diag_vfs uv = {
> + .udiag_vfs_ino = d_backing_inode(dentry)->i_ino,
> + .udiag_vfs_dev = dentry->d_sb->s_dev,
> + };
> + return nla_put(nlskb, UNIX_DIAG_VFS, sizeof(uv), );
> + } else {
> + struct unix_diag_vfs2 uv = {
> + .udiag_vfs_ino = d_backing_inode(dentry)->i_ino,
> + .udiag_vfs_dev = dentry->d_sb->s_dev,
> + };
> + return nla_put(nlskb, UDIAG_SHOW_VFS2, sizeof(uv), );
> + }
>   }
>  
>   return 0;
> @@ -132,8 +139,8 @@ static int sk_diag_fill(struct sock *sk,
>   sk_diag_dump_name(sk, skb))
>   goto out_nlmsg_trim;
>  
> - if ((req->udiag_show & UDIAG_SHOW_VFS) &&
> - sk_diag_dump_vfs(sk, skb))
> + if ((req->udiag_show & (UDIAG_SHOW_VFS | UDIAG_SHOW_VFS2)) &&
> + sk_diag_dump_vfs(sk, skb, req->udiag_show))
>   goto out_nlmsg_trim;
>  
>   if ((req->udiag_show & UDIAG_SHOW_PEER) &&


Re: [v6,1/2] pid: Replace pid bitmap implementation with IDR API

2017-10-20 Thread Andrei Vagin
On Fri, Oct 20, 2017 at 05:06:47PM +0100, Gargi Sharma wrote:
> On Thu, Oct 19, 2017 at 5:18 PM, Oleg Nesterov <o...@redhat.com> wrote:
> > On 10/19, Andrei Vagin wrote:
> >>
> >> Hi Gargi,
> >>
> >> This patch breaks CRIU, because it changes a meaning of ns_last_pid.
> >
> > ...
> >
> >> > @@ -311,7 +297,7 @@ static int pid_ns_ctl_handler(struct ctl_table 
> >> > *table, int write,
> >> >  * it should synchronize its usage with external means.
> >> >  */
> >> >
> >> > -   tmp.data = _ns->last_pid;
> >> > +   tmp.data = _ns->idr.idr_next;
> >
> > Ah, yes, off-by-one error...
> >
> > Gargi, I don't think you need to make another version, I'd suggest you to 
> > send
> > the trivial fix to Andrew, afaics you just need to replace these 2 lines 
> > with
> >
> > unsigned int last;
> > int err;
> >
> > tmp.data = 
> > err = proc_dointvec_minmax(, write, buffer, lenp, ppos);
> > if (!err)
> > idr_set_cursor(_ns->idr, last + 1);
> > return err;
> I'm not sure entirely understand how this takes care of rolling over of PIDs?
> Can we ignore that? If yes, won't the tests for CRIU still break?

Gargi, I don't understand what you mean. Could you elaborate? Do you
mean a case when idr_next is bigger than pid_max? I think this logic
remains the same as what we had before switching to idr.

CRIU tests work with the following patch. It is a slightly modified version
of Oleg's patch.

diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index fea2c24..1c791b3 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -287,6 +287,7 @@ static int pid_ns_ctl_handler(struct ctl_table
*table, int write,
 {
struct pid_namespace *pid_ns = task_active_pid_ns(current);
struct ctl_table tmp = *table;
+   int ret;
 
if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
@@ -298,7 +299,12 @@ static int pid_ns_ctl_handler(struct ctl_table
*table, int write,
 */
 
tmp.data = _ns->idr.idr_next;
-   return proc_dointvec_minmax(, write, buffer, lenp, ppos);
+   ret = proc_dointvec_minmax(, write, buffer, lenp, ppos);
+   if (ret < 0)
+   return ret;
+
+   idr_set_cursor(_ns->idr, pid_ns->idr.idr_next + 1);
+   return 0;
 }
 
 extern int pid_max;


> 
> Thanks,
> Gargi
> >
> > Oleg.
> >


Re: [2/2] x86/unwind: Make CONFIG_UNWINDER_ORC=y the default in kconfig for 64-bit

2017-10-20 Thread Andrei Vagin
On Thu, Oct 19, 2017 at 08:28:04PM -0500, Josh Poimboeuf wrote:
> On Thu, Oct 19, 2017 at 03:35:22PM -0700, Andrei Vagin wrote:
> > On Thu, Oct 19, 2017 at 01:16:55PM -0500, Josh Poimboeuf wrote:
> > > On Thu, Oct 19, 2017 at 09:51:04AM -0700, Andrei Vagin wrote:
> > > > Hi,
> > > > 
> > > > We run CRIU tests for tip/auto-latest regularly, and a few days ago our
> > > > test job started to detect this warning in a kernel log:
> > > > 
> > > > [   44.235786] WARNING: can't dereference iret registers at 
> > > > 8801c5f17fe0 for ip 95f0d94b
> > > > 
> > > > What does it mean? How critical is it?
> > > > 
> > > > Our test job fails if it detects any warning in a kernel log. Maybe we
> > > > need to investigate reasons of this warning and try to eliminate it?
> > > > 
> > > > Here are logs:
> > > > https://travis-ci.org/avagin/linux/jobs/289676634
> > > 
> > > I think it means the unwinder found some bad ORC unwinder metadata.  Any
> > > chance you have access to the kernel binary?  I need to know what code
> > > corresponds to that 95f0d94b address.
> > > 
> > > Or if you can reproduce with the following patch, that should help:
> > > 
> > > 
> > > diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
> > > index 570b70d3f604..95b633f0ce51 100644
> > > --- a/arch/x86/kernel/unwind_orc.c
> > > +++ b/arch/x86/kernel/unwind_orc.c
> > > @@ -448,7 +448,7 @@ bool unwind_next_frame(struct unwind_state *state)
> > >  
> > >   case ORC_TYPE_REGS_IRET:
> > >   if (!deref_stack_regs(state, sp, >ip, >sp, 
> > > false)) {
> > > - orc_warn("can't dereference iret registers at %p for ip 
> > > %p\n",
> > > + orc_warn("can't dereference iret registers at %p for ip 
> > > %pB\n",
> > >(void *)sp, (void *)orig_ip);
> > >   goto done;
> > >   }
> > 
> > I applied your patch and rerun tests.
> > 
> > [   44.947699] WARNING: can't dereference iret registers at 
> > 880178f5ffe0 for ip int3+0x5b/0x60
> 
> Thanks, that was enough for me to figure it out.  Can you test the below fix?

This patch works for me. I ran the tests a few times and they found nothing
suspicious.

Tested-by: Andrei Vagin <ava...@virtuozzo.com>

Thank you!

> 
> > and now here is a warning from kasan:
> > 
> > [  477.775676] 
> > ==
> > [  477.775845] BUG: KASAN: stack-out-of-bounds in 
> > deref_stack_reg+0x11d/0x150
> 
> The KASAN warning is a known issue for which the fix is a little more
> complicated.  v1 of the patch was here:
> 
>   https://lkml.kernel.org/r/cover.1507128293.git.jpoim...@redhat.com
> 
> 
> 
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index 49167258d587..f6cdb7a1455e 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -808,7 +808,7 @@ apicinterrupt IRQ_WORK_VECTOR 
> irq_work_interrupt  smp_irq_work_interrupt
>  
>  .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
>  ENTRY(\sym)
> - UNWIND_HINT_IRET_REGS offset=8
> + UNWIND_HINT_IRET_REGS offset=\has_error_code*8
>  
>   /* Sanity check */
>   .if \shift_ist != -1 && \paranoid == 0
> 


Re: [2/2] x86/unwind: Make CONFIG_UNWINDER_ORC=y the default in kconfig for 64-bit

2017-10-19 Thread Andrei Vagin
On Thu, Oct 19, 2017 at 03:35:22PM -0700, Andrei Vagin wrote:
> On Thu, Oct 19, 2017 at 01:16:55PM -0500, Josh Poimboeuf wrote:
> > On Thu, Oct 19, 2017 at 09:51:04AM -0700, Andrei Vagin wrote:
> > > Hi,
> > > 
> > > We run CRIU tests for tip/auto-latest regularly, and a few days ago our
> > > test job started to detect this warning in a kernel log:
> > > 
> > > [   44.235786] WARNING: can't dereference iret registers at 
> > > 8801c5f17fe0 for ip 95f0d94b
> > > 
> > > What does it mean? How critical is it?
> > > 
> > > Our test job fails if it detects any warning in a kernel log. Maybe we
> > > need to investigate reasons of this warning and try to eliminate it?
> > > 
> > > Here are logs:
> > > https://travis-ci.org/avagin/linux/jobs/289676634
> > 
> > I think it means the unwinder found some bad ORC unwinder metadata.  Any
> > chance you have access to the kernel binary?  I need to know what code
> > corresponds to that 95f0d94b address.
> > 
> > Or if you can reproduce with the following patch, that should help:
> > 
> > 
> > diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
> > index 570b70d3f604..95b633f0ce51 100644
> > --- a/arch/x86/kernel/unwind_orc.c
> > +++ b/arch/x86/kernel/unwind_orc.c
> > @@ -448,7 +448,7 @@ bool unwind_next_frame(struct unwind_state *state)
> >  
> > case ORC_TYPE_REGS_IRET:
> > if (!deref_stack_regs(state, sp, >ip, >sp, 
> > false)) {
> > -   orc_warn("can't dereference iret registers at %p for ip 
> > %p\n",
> > +   orc_warn("can't dereference iret registers at %p for ip 
> > %pB\n",
> >  (void *)sp, (void *)orig_ip);
> > goto done;
> > }
> 
> I applied your patch and rerun tests.
> 
> [   44.947699] WARNING: can't dereference iret registers at 880178f5ffe0 
> for ip int3+0x5b/0x60
> 
> and now here is a warning from kasan:
> 
> [  477.775676] 
> ==
> [  477.775845] BUG: KASAN: stack-out-of-bounds in deref_stack_reg+0x11d/0x150
> [  477.775952] Read of size 8 at addr 880166b7fe90 by task make/16028
> [  477.776055] 
> [  477.776149] CPU: 0 PID: 16028 Comm: make Not tainted 4.14.0-rc5+ #1
> [  477.776152] Hardware name: Google Google Compute Engine/Google Compute 
> Engine, BIOS Google 01/01/2011
> [  477.776155] Call Trace:
> [  477.776159]  
> [  477.776167]  dump_stack+0x5c/0x7e
> [  477.776175]  print_address_description+0x6b/0x290
> [  477.776182]  ? deref_stack_reg+0x11d/0x150
> [  477.776186]  kasan_report+0x25d/0x340
> [  477.776194]  deref_stack_reg+0x11d/0x150
> [  477.776201]  ? __read_once_size_nocheck.constprop.6+0x10/0x10
> [  477.776206]  ? get_stack_info+0x37/0x170
> [  477.776212]  ? stack_access_ok+0xdc/0x150
> [  477.776221]  unwind_next_frame+0xe35/0x1c10
> [  477.776230]  ? do_execveat_common.isra.34+0x78e/0x1890
> [  477.776238]  ? deref_stack_reg+0x150/0x150
> [  477.776247]  ? is_bpf_text_address+0x54/0x60
> [  477.776253]  ? kernel_text_address+0xf4/0x100
> [  477.776257]  ? do_execveat_common.isra.34+0x78e/0x1890
> [  477.776266]  __save_stack_trace+0x73/0xd0
> [  477.776277]  ? do_execveat_common.isra.34+0x78e/0x1890
> [  477.776285]  save_stack+0x33/0xb0
> [  477.776291]  ? kasan_slab_free+0x70/0xc0
> [  477.776298]  ? kmem_cache_free+0x9f/0x230
> [  477.776303]  ? rcu_process_callbacks+0x451/0xd60
> [  477.776307]  ? __do_softirq+0x1d3/0x5e0
> [  477.776312]  ? irq_exit+0x146/0x170
> [  477.776322]  ? smp_apic_timer_interrupt+0x13e/0x3b0
> [  477.776326]  ? apic_timer_interrupt+0x8c/0xa0
> [  477.776331]  ? lock_acquire+0x6b/0x260
> [  477.776336]  ? do_execveat_common.isra.34+0x78e/0x1890
> [  477.776347]  ? update_curr+0x2d6/0x600
> [  477.776354]  ? posix_cpu_timers_exit_group+0x50/0x50
> [  477.776365]  ? trigger_load_balance+0x1fd/0x8a0
> [  477.776374]  ? note_gp_changes+0x14e/0x1b0
> [  477.776384]  ? lock_downgrade+0x590/0x590
> [  477.776389]  ? rcu_accelerate_cbs+0x106/0x5e0
> [  477.776398]  ? lock_acquire+0x113/0x260
> [  477.776402]  ? rcu_process_callbacks+0x407/0xd60
> [  477.776407]  kasan_slab_free+0x70/0xc0
> [  477.776414]  ? rcu_process_callbacks+0x451/0xd60
> [  477.776418]  kmem_cache_free+0x9f/0x230
> [  477.776425]  ? free_inode_nonrcu+0x20/0x20
> [  477.776430]  rcu_process_callbacks+0x451/0xd60
> [  477.776443]  ? note_gp_changes+0x1b0/0x1b0
> [  477.776451]  ? native_apic_msr_write+0x27/0x30
> [  477.776456]  ? 

Re: [2/2] x86/unwind: Make CONFIG_UNWINDER_ORC=y the default in kconfig for 64-bit

2017-10-19 Thread Andrei Vagin
On Thu, Oct 19, 2017 at 01:16:55PM -0500, Josh Poimboeuf wrote:
> On Thu, Oct 19, 2017 at 09:51:04AM -0700, Andrei Vagin wrote:
> > Hi,
> > 
> > We run CRIU tests for tip/auto-latest regularly, and a few days ago our
> > test job started to detect this warning in a kernel log:
> > 
> > [   44.235786] WARNING: can't dereference iret registers at 
> > 8801c5f17fe0 for ip 95f0d94b
> > 
> > What does it mean? How critical is it?
> > 
> > Our test job fails if it detects any warning in a kernel log. Maybe we
> > need to investigate reasons of this warning and try to eliminate it?
> > 
> > Here are logs:
> > https://travis-ci.org/avagin/linux/jobs/289676634
> 
> I think it means the unwinder found some bad ORC unwinder metadata.  Any
> chance you have access to the kernel binary?  I need to know what code
> corresponds to that 95f0d94b address.
> 
> Or if you can reproduce with the following patch, that should help:
> 
> 
> diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
> index 570b70d3f604..95b633f0ce51 100644
> --- a/arch/x86/kernel/unwind_orc.c
> +++ b/arch/x86/kernel/unwind_orc.c
> @@ -448,7 +448,7 @@ bool unwind_next_frame(struct unwind_state *state)
>  
>   case ORC_TYPE_REGS_IRET:
>   if (!deref_stack_regs(state, sp, >ip, >sp, 
> false)) {
> - orc_warn("can't dereference iret registers at %p for ip 
> %p\n",
> + orc_warn("can't dereference iret registers at %p for ip 
> %pB\n",
>(void *)sp, (void *)orig_ip);
>   goto done;
>   }

I applied your patch and rerun tests.

[   44.947699] WARNING: can't dereference iret registers at 880178f5ffe0 
for ip int3+0x5b/0x60

and now here is a warning from kasan:

[  477.775676] 
==
[  477.775845] BUG: KASAN: stack-out-of-bounds in deref_stack_reg+0x11d/0x150
[  477.775952] Read of size 8 at addr 880166b7fe90 by task make/16028
[  477.776055] 
[  477.776149] CPU: 0 PID: 16028 Comm: make Not tainted 4.14.0-rc5+ #1
[  477.776152] Hardware name: Google Google Compute Engine/Google Compute 
Engine, BIOS Google 01/01/2011
[  477.776155] Call Trace:
[  477.776159]  
[  477.776167]  dump_stack+0x5c/0x7e
[  477.776175]  print_address_description+0x6b/0x290
[  477.776182]  ? deref_stack_reg+0x11d/0x150
[  477.776186]  kasan_report+0x25d/0x340
[  477.776194]  deref_stack_reg+0x11d/0x150
[  477.776201]  ? __read_once_size_nocheck.constprop.6+0x10/0x10
[  477.776206]  ? get_stack_info+0x37/0x170
[  477.776212]  ? stack_access_ok+0xdc/0x150
[  477.776221]  unwind_next_frame+0xe35/0x1c10
[  477.776230]  ? do_execveat_common.isra.34+0x78e/0x1890
[  477.776238]  ? deref_stack_reg+0x150/0x150
[  477.776247]  ? is_bpf_text_address+0x54/0x60
[  477.776253]  ? kernel_text_address+0xf4/0x100
[  477.776257]  ? do_execveat_common.isra.34+0x78e/0x1890
[  477.776266]  __save_stack_trace+0x73/0xd0
[  477.776277]  ? do_execveat_common.isra.34+0x78e/0x1890
[  477.776285]  save_stack+0x33/0xb0
[  477.776291]  ? kasan_slab_free+0x70/0xc0
[  477.776298]  ? kmem_cache_free+0x9f/0x230
[  477.776303]  ? rcu_process_callbacks+0x451/0xd60
[  477.776307]  ? __do_softirq+0x1d3/0x5e0
[  477.776312]  ? irq_exit+0x146/0x170
[  477.776322]  ? smp_apic_timer_interrupt+0x13e/0x3b0
[  477.776326]  ? apic_timer_interrupt+0x8c/0xa0
[  477.776331]  ? lock_acquire+0x6b/0x260
[  477.776336]  ? do_execveat_common.isra.34+0x78e/0x1890
[  477.776347]  ? update_curr+0x2d6/0x600
[  477.776354]  ? posix_cpu_timers_exit_group+0x50/0x50
[  477.776365]  ? trigger_load_balance+0x1fd/0x8a0
[  477.776374]  ? note_gp_changes+0x14e/0x1b0
[  477.776384]  ? lock_downgrade+0x590/0x590
[  477.776389]  ? rcu_accelerate_cbs+0x106/0x5e0
[  477.776398]  ? lock_acquire+0x113/0x260
[  477.776402]  ? rcu_process_callbacks+0x407/0xd60
[  477.776407]  kasan_slab_free+0x70/0xc0
[  477.776414]  ? rcu_process_callbacks+0x451/0xd60
[  477.776418]  kmem_cache_free+0x9f/0x230
[  477.776425]  ? free_inode_nonrcu+0x20/0x20
[  477.776430]  rcu_process_callbacks+0x451/0xd60
[  477.776443]  ? note_gp_changes+0x1b0/0x1b0
[  477.776451]  ? native_apic_msr_write+0x27/0x30
[  477.776456]  ? lapic_next_event+0x55/0x80
[  477.776465]  __do_softirq+0x1d3/0x5e0
[  477.776479]  ? do_execveat_common.isra.34+0x78e/0x1890
[  477.776483]  irq_exit+0x146/0x170
[  477.776487]  smp_apic_timer_interrupt+0x13e/0x3b0
[  477.776494]  apic_timer_interrupt+0x8c/0xa0
[  477.776497]  
[  477.776502] RIP: 0010:lock_acquire+0x6b/0x260
[  477.776505] RSP: 0018:880166b7fd48 EFLAGS: 0246 ORIG_RAX: 
ff11
[  477.776512] RAX: 0007 RBX: 8801c91cb080 RCX: 
[  477.776515] RDX:  RSI: 0

Re: [2/2] x86/unwind: Make CONFIG_UNWINDER_ORC=y the default in kconfig for 64-bit

2017-10-19 Thread Andrei Vagin
Hi,

We run CRIU tests for tip/auto-latest regularly, and a few days ago our
test job started to detect this warning in a kernel log:

[   44.235786] WARNING: can't dereference iret registers at 8801c5f17fe0 
for ip 95f0d94b

What does it mean? How critical is it?

Our test job fails if it detects any warning in the kernel log. Maybe we
need to investigate the cause of this warning and try to eliminate it?

Here are logs:
https://travis-ci.org/avagin/linux/jobs/289676634

Thanks,
Andrei

On Fri, Oct 13, 2017 at 03:02:01PM -0500, Josh Poimboeuf wrote:
> The ORC unwinder has been stable in testing so far.  Give it much wider
> testing by making it the default in kconfig for x86_64.  It's not yet
> supported for 32-bit, so leave frame pointers as the default there.
> 
> Suggested-by: Ingo Molnar 
> Signed-off-by: Josh Poimboeuf 
> ---
>  arch/x86/Kconfig.debug | 33 +
>  1 file changed, 17 insertions(+), 16 deletions(-)
> 
> diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
> index f274dbb87c26..a4ff214fb760 100644
> --- a/arch/x86/Kconfig.debug
> +++ b/arch/x86/Kconfig.debug
> @@ -358,27 +358,13 @@ config PUNIT_ATOM_DEBUG
>  
>  choice
>   prompt "Choose kernel unwinder"
> - default UNWINDER_FRAME_POINTER
> + default UNWINDER_ORC if X86_64
> + default UNWINDER_FRAME_POINTER if X86_32
>   ---help---
> This determines which method will be used for unwinding kernel stack
> traces for panics, oopses, bugs, warnings, perf, /proc//stack,
> livepatch, lockdep, and more.
>  
> -config UNWINDER_FRAME_POINTER
> - bool "Frame pointer unwinder"
> - select FRAME_POINTER
> - ---help---
> -   This option enables the frame pointer unwinder for unwinding kernel
> -   stack traces.
> -
> -   The unwinder itself is fast and it uses less RAM than the ORC
> -   unwinder, but the kernel text size will grow by ~3% and the kernel's
> -   overall performance will degrade by roughly 5-10%.
> -
> -   This option is recommended if you want to use the livepatch
> -   consistency model, as this is currently the only way to get a
> -   reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE).
> -
>  config UNWINDER_ORC
>   bool "ORC unwinder"
>   depends on X86_64
> @@ -395,6 +381,21 @@ config UNWINDER_ORC
> Enabling this option will increase the kernel's runtime memory usage
> by roughly 2-4MB, depending on your kernel config.
>  
> +config UNWINDER_FRAME_POINTER
> + bool "Frame pointer unwinder"
> + select FRAME_POINTER
> + ---help---
> +   This option enables the frame pointer unwinder for unwinding kernel
> +   stack traces.
> +
> +   The unwinder itself is fast and it uses less RAM than the ORC
> +   unwinder, but the kernel text size will grow by ~3% and the kernel's
> +   overall performance will degrade by roughly 5-10%.
> +
> +   This option is recommended if you want to use the livepatch
> +   consistency model, as this is currently the only way to get a
> +   reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE).
> +
>  config UNWINDER_GUESS
>   bool "Guess unwinder"
>   depends on EXPERT


Re: [v6,1/2] pid: Replace pid bitmap implementation with IDR API

2017-10-19 Thread Andrei Vagin
Hi Gargi,

This patch breaks CRIU, because it changes the meaning of ns_last_pid.

== Run zdtm/static/env00 in h ==
 DEP   env00.d
 CCenv00.o
 LINK  env00
Start test
./env00 --pidfile=env00.pid --outfile=env00.out --envname=ENV_00_TEST
Run criu dump
Run criu restore
=[log]=> dump/zdtm/static/env00/52/1/restore.log
 grep Error 
(00.000587) No mountpoints-6.img image
(00.000593) mnt: Reading mountpoint images (id 6 pid 52)
(00.000653) Forking task with 52 pid (flags 0x0)
(00.007568) PID: real 51 virt 52
(00.010363) 52: Error (criu/cr-restore.c:1787): Pid 51 do not match 
expected 52 (task 52)
(00.010474) Error (criu/cr-restore.c:2449): Restoring FAILED.
 ERROR OVER 

Before this patch, ns_last_pid contained the pid of the last process. With
this patch, it contains the pid of the "next" process.

In CRIU we use ns_last_pid to restore a process with a specified pid,
and now this logic is broken:

$ uname -a
Linux laptop 4.11.11-200.fc25.x86_64 #1 SMP Mon Jul 17 17:41:12 UTC 2017 x86_64 
x86_64 x86_64 GNU/Linux
$ echo 1 > /proc/sys/kernel/ns_last_pid && sh -c 'echo $$'
2

$ uname -a
Linux fc24 4.14.0-rc5-next-20171018 #1 SMP Wed Oct 18 23:52:43 PDT 2017 x86_64 
x86_64 x86_64 GNU/Linux
$ echo 1 > /proc/sys/kernel/ns_last_pid && sh -c 'echo $$'
1

Thanks,
Andrei

On Wed, Oct 11, 2017 at 06:19:38PM -0400, Gargi Sharma wrote:
> This patch replaces the current bitmap implementation for
> Process ID allocation. Functions that are no longer required,
> for example, free_pidmap(), alloc_pidmap(), etc. are removed.
> The rest of the functions are modified to use the IDR API.
> The change was made to make the PID allocation less complex by
> replacing custom code with calls to generic API.
> 
> Signed-off-by: Gargi Sharma 
> Reviewed-by: Rik van Riel 
> ---
>  arch/powerpc/platforms/cell/spufs/sched.c |   2 +-
>  fs/proc/loadavg.c |   2 +-
>  include/linux/pid_namespace.h |  14 +--
>  init/main.c   |   2 +-
>  kernel/pid.c  | 201 
> ++
>  kernel/pid_namespace.c|  44 +++
>  6 files changed, 57 insertions(+), 208 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/cell/spufs/sched.c 
> b/arch/powerpc/platforms/cell/spufs/sched.c
> index 1fbb5da..e47761c 100644
> --- a/arch/powerpc/platforms/cell/spufs/sched.c
> +++ b/arch/powerpc/platforms/cell/spufs/sched.c
> @@ -1093,7 +1093,7 @@ static int show_spu_loadavg(struct seq_file *s, void 
> *private)
>   LOAD_INT(c), LOAD_FRAC(c),
>   count_active_contexts(),
>   atomic_read(_spu_contexts),
> - task_active_pid_ns(current)->last_pid);
> + idr_get_cursor(_active_pid_ns(current)->idr));
>   return 0;
>  }
>  
> diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
> index 983fce5..ba3d0e2 100644
> --- a/fs/proc/loadavg.c
> +++ b/fs/proc/loadavg.c
> @@ -23,7 +23,7 @@ static int loadavg_proc_show(struct seq_file *m, void *v)
>   LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
>   LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
>   nr_running(), nr_threads,
> - task_active_pid_ns(current)->last_pid);
> + idr_get_cursor(_active_pid_ns(current)->idr));
>   return 0;
>  }
>  
> diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
> index b09136f..f4db4a7 100644
> --- a/include/linux/pid_namespace.h
> +++ b/include/linux/pid_namespace.h
> @@ -9,15 +9,8 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
> -struct pidmap {
> -   atomic_t nr_free;
> -   void *page;
> -};
> -
> -#define BITS_PER_PAGE(PAGE_SIZE * 8)
> -#define BITS_PER_PAGE_MASK   (BITS_PER_PAGE-1)
> -#define PIDMAP_ENTRIES   
> ((PID_MAX_LIMIT+BITS_PER_PAGE-1)/BITS_PER_PAGE)
>  
>  struct fs_pin;
>  
> @@ -29,9 +22,8 @@ enum { /* definitions for pid_namespace's hide_pid field */
>  
>  struct pid_namespace {
>   struct kref kref;
> - struct pidmap pidmap[PIDMAP_ENTRIES];
> + struct idr idr;
>   struct rcu_head rcu;
> - int last_pid;
>   unsigned int nr_hashed;
>   struct task_struct *child_reaper;
>   struct kmem_cache *pid_cachep;
> @@ -105,6 +97,6 @@ static inline int reboot_pid_ns(struct pid_namespace 
> *pid_ns, int cmd)
>  
>  extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
>  void pidhash_init(void);
> -void pidmap_init(void);
> +void pid_idr_init(void);
>  
>  #endif /* _LINUX_PID_NS_H */
> diff --git a/init/main.c b/init/main.c
> index 0ee9c686..9f4db20 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -667,7 +667,7 @@ asmlinkage __visible void __init start_kernel(void)
>   if (late_time_init)
>   late_time_init();
>   

Re: [1/2,v2] fdmap(2)

2017-10-12 Thread Andrei Vagin
On Wed, Oct 11, 2017 at 09:12:34PM +0300, Alexey Dobriyan wrote:
> On Tue, Oct 10, 2017 at 03:08:06PM -0700, Andrei Vagin wrote:
> > On Sun, Sep 24, 2017 at 11:06:20PM +0300, Alexey Dobriyan wrote:
> > > From: Aliaksandr Patseyenak <aliaksandr_patseyen...@epam.com>
> > > 
> > > Implement system call for bulk retrieveing of opened descriptors
> > > in binary form.
> > > 
> > > Some daemons could use it to reliably close file descriptors
> > > before starting. Currently they close everything upto some number
> > > which formally is not reliable. Other natural users are lsof(1) and CRIU
> > > (although lsof does so much in /proc that the effect is thoroughly 
> > > buried).
> > 
> > Hello Alexey,
> > 
> > I am not sure about the idea to add syscalls for all sort of process
> > attributes. For example, in CRIU we need file descriptors with their
> > properties, which we currently get from /proc/pid/fdinfo/. How can
> > this interface be extended to achieve our goal?
> > 
> > Have you seen the task-diag interface what I sent about a year ago?
> 
> Of course, let's discuss /proc/task_diag.
> 
> Adding it as /proc file is obviously unnecessary: you do it only
> to hook ->read and ->write netlink style
> (and BTW you don't need .THIS_MODULE anymore ;-)
> 
> Transactional netlink send and recv aren't necessary either.
> As I understand it, it comes from old times when netlink was async,
> so 2 syscalls were necessary. Netlink is not async anymore.
> 
> Basically you want to do sys_task_diag(2) which accepts set of pids
> (maybe) and a mask (see statx()) and returns synchronously result into
> a buffer.

You are not quite right here. We send a request and then we read a
response, which can be bigger than what we can read in one call.

So we need something like a cursor; in your case it is the "start"
argument. But sometimes this cursor contains kernel-internal data
for better performance. We need a way to address this
cursor from userspace, and that is the reason why we need a file
descriptor in this scheme.

For example, you can look at the proc_maps_private structure.


> 
> > We had a discussion on the previous kernel summit how to rework
> > task-diag, so that it can be merged into the upstream kernel.
> > Unfortunately, I didn't send a summary for this discussion. But it's
> > better now than never. We decided to do something like this:
> > 
> > 1. Add a new syscall readfile(fname, buf, size), which can be
> > used to read small files without opening a file descriptor. It will be
> > useful for proc files, configs, etc.
> 
> If nothing, it should be done because the number of programmers capable
> of writing readfile() in userspace correctly handling all errors and
> short reads is very small indeed. Out of curiosity I once booted a kernel
> which made all reads short by default. It was fascinating I can tell you.
> 
> > 2. bin/text/bin conversion is very slow
> >  - 65.47% proc_pid_status
> >   - 20.81% render_sigset_t
> >- 18.27% seq_printf
> > + 15.77% seq_vprintf
> >   - 10.65% task_mem
> > + 8.78% seq_print
> > + 1.02% hugetlb_rep
> >   + 7.40% seq_printf
> > so a new interface has to use a binary format and the format of netlink
> > messages can be used here. It should be possible to extend a file
> > without breaking backward compatibility.
> 
> Binary -- yes.
> netlink attributes -- maybe.
> 
> There is statx() model which is perfect for this usecase:
> do not want pagecache of all block devices? sure, no problem.
> 
> > 3. There are a lot of objection to use a netlink sockets out of the network
> > subsystem. The idea of using a "transaction" file looks weird for many
> > people, so we decided to add a few files in /proc/pid/. I see
> > minimum two files. One file contains information about a task, it is
> > mostly what we have in /proc/pid/status and /proc/pid/stat. Another file
> > describes a task memory, it is what we have now in /proc/pid/smaps.
> > Here is one more major idea. All attributes in a file has to be equal in
> > term of performance, or by other words there should not be attributes,
> > which significantly affect a generation time of a whole file.
> > 
> > If we look at /proc/pid/smaps, we spend a lot of time to get memory
> > statistics. This file contains a lot of data and if you read it to get
> > VmFlags, the kernel will waste your time by generating a useless data
> > for you.
> 
> There is a unsolvable problem with /proc/*/stat style files. Anyone
> who wants to add ne

Re: [1/2,v2] fdmap(2)

2017-10-10 Thread Andrei Vagin
On Sun, Sep 24, 2017 at 11:06:20PM +0300, Alexey Dobriyan wrote:
> From: Aliaksandr Patseyenak 
> 
> Implement system call for bulk retrieving of opened descriptors
> in binary form.
> 
> Some daemons could use it to reliably close file descriptors
> before starting. Currently they close everything upto some number
> which formally is not reliable. Other natural users are lsof(1) and CRIU
> (although lsof does so much in /proc that the effect is thoroughly buried).

Hello Alexey,

I am not sure about the idea to add syscalls for all sort of process
attributes. For example, in CRIU we need file descriptors with their
properties, which we currently get from /proc/pid/fdinfo/. How can
this interface be extended to achieve our goal?

Have you seen the task-diag interface that I sent about a year ago?

We had a discussion at the previous kernel summit about how to rework
task-diag so that it can be merged into the upstream kernel.
Unfortunately, I didn't send a summary of this discussion. But
better late than never. We decided to do something like this:

1. Add a new syscall readfile(fname, buf, size), which can be
used to read small files without opening a file descriptor. It will be
useful for proc files, configs, etc.

2. bin/text/bin conversion is very slow
 - 65.47% proc_pid_status
  - 20.81% render_sigset_t
   - 18.27% seq_printf
+ 15.77% seq_vprintf
  - 10.65% task_mem
+ 8.78% seq_print
+ 1.02% hugetlb_rep
  + 7.40% seq_printf
so a new interface has to use a binary format and the format of netlink
messages can be used here. It should be possible to extend a file
without breaking backward compatibility.

3. There are a lot of objections to using netlink sockets outside of the
network subsystem. The idea of using a "transaction" file looks weird to many
people, so we decided to add a few files in /proc/pid/. I see
a minimum of two files. One file contains information about a task; it is
mostly what we have in /proc/pid/status and /proc/pid/stat. Another file
describes a task's memory; it is what we have now in /proc/pid/smaps.
Here is one more major idea. All attributes in a file have to be equal in
terms of performance, or in other words, there should not be attributes
which significantly affect the generation time of the whole file.

If we look at /proc/pid/smaps, we spend a lot of time getting memory
statistics. This file contains a lot of data, and if you read it to get
VmFlags, the kernel will waste your time by generating useless data
for you.

Here are my slides for this discussion:
https://www.linuxplumbersconf.org/2016/ocw/system/presentations/4599/original/Netlink-issues.pdf

Could you add me to the recipients for this sort of patch in the future?

Thanks,
Andrei

> 
> /proc, the only way to learn anything about file descriptors may not be
> available. There is unavoidable overhead associated with instantiating
> 3 dentries and 3 inodes and converting integers to strings and back.
> 
> Benchmark:
> 
>   N=1<<22 times
>   4 opened descriptors (0, 1, 2, 3)
>   opendir+readdir+closedir /proc/self/fd vs fdmap
> 
>   /proc 8.31 ± 0.37%
>   fdmap 0.32 ± 0.72%
> 
> 
> FDMAP(2)   Linux Programmer's Manual  FDMAP(2)
> 
> NAME
>fdmap - get open file descriptors of the process
> 
> SYNOPSIS
>long fdmap(pid_t pid, int *fd, unsigned int nfd, int start, int flags);
> 
> DESCRIPTION
>fdmap()  writes  open  file  descriptors  of the process into buffer fd
>starting from the start descriptor. At most nfd elements  are  written.
>flags argument is reserved and must be zero.
> 
>If pid is zero, syscall will work with the current process.
> 
> RETURN VALUE
>On success, number of descriptors written is returned.  On error, -1 is
>returned, and errno is set appropriately.
> 
> ERRORS
>ESRCH  No such process.
> 
>EACCES Permission denied.
> 
>EFAULT Invalid fd pointer.
> 
>EINVAL Negative start argument.
> 
> NOTES
>Glibc does not provide a wrapper for this system call; call  it  using
>syscall(2).
> 
> EXAMPLE
>The program below demonstrates fdmap() usage.
> 
>$ ./a.out $$
>0 1 2 255
> 
>$ ./a.out 420 1 2 42
>1023
> 
>Program source
> 
>#include 
>#include 
>#include 
> 
>static inline long fdmap(int pid, int *fd, unsigned int nfd, unsigned 
> int start, int flags)
>{
> register long r10 asm ("r10") = start;
> register long r8 asm ("r8") = flags;
> long rv;
> asm volatile (
>  "syscall"
>  : "=a" (rv)
>  : "0" (333), "D" (pid), "S" (fd), "d" (nfd), "r" (r10), "r" 
> (r8)
>  : "rcx", "r11", "cc", "memory"
> );
> return rv;
>}
> 
>int main(int argc, char *argv[])
>

Re: [PATCH] kcmp: Drop branch leftover typo

2017-09-18 Thread Andrei Vagin
Acked-by: Andrei Vagin <ava...@virtuozzo.com>

Fixes: 0791e3644e5e ("kcmp: add KCMP_EPOLL_TFD mode to compare epoll target 
files")

On Sun, Sep 17, 2017 at 07:58:38PM +0300, Cyrill Gorcunov wrote:
> The else branch been leftover and escaped the source
> code refresh. Not a problem but better clean it up.
> 
> Reported-by: Eugene Syromiatnikov <e...@redhat.com>
> Signed-off-by: Cyrill Gorcunov <gorcu...@openvz.org>
> CC: Andrey Vagin <ava...@openvz.org>
> CC: Andrew Morton <a...@linuxfoundation.org>
> CC: Pavel Emelyanov <xe...@virtuozzo.com>
> ---
>  kernel/kcmp.c |2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> Index: linux-ml.git/kernel/kcmp.c
> ===
> --- linux-ml.git.orig/kernel/kcmp.c
> +++ linux-ml.git/kernel/kcmp.c
> @@ -131,7 +131,7 @@ static int kcmp_epoll_target(struct task
>   if (filp_epoll) {
>   filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, 
> slot.toff);
>   fput(filp_epoll);
> - } else
> + }
>  
>   if (IS_ERR(filp_tgt))
>   return PTR_ERR(filp_tgt);


Re: [PATCH] x86/idt: Fix the X86_TRAP_BP gate

2017-09-01 Thread Andrei Vagin
On Fri, Sep 01, 2017 at 10:26:30AM +0200, Ingo Molnar wrote:
> 
> Hi Andrei,
> 
> * Andrei Vagin <ava...@virtuozzo.com> wrote:
> 
> > Hi Thomas,
> > 
> > We run CRIU tests for linux-next and today they detected an issue. I've
> > bisected this problem and it looks like a problem is in this patch.
> 
> Ok, there appears to be a bug in that conversion - does the patch below fix 
> the 
> regression for you?

Yes, it does.

Thank you!

> 
> Thanks,
> 
>   Ingo
> 
> >
> Subject: x86/idt: Fix the X86_TRAP_BP gate
> From: Ingo Molnar <mi...@kernel.org>
> 
> Andrei Vagin reported a CRIU regression and bisected it back to:
> 
>   90f6225fba0c ("x86/idt: Move IST stack based traps to table init")
> 
> This table init conversion loses the system-gate property of X86_TRAP_BP
> and erroneously moves it from DPL3 to DPL0.
> 
> Fix it.
> 
> Reported-by: Andrei Vagin <ava...@virtuozzo.com>
> Cc: Linus Torvalds <torva...@linux-foundation.org>
> Cc: Peter Zijlstra <pet...@infradead.org>
> Cc: Thomas Gleixner <t...@linutronix.de>
> Signed-off-by: Ingo Molnar <mi...@kernel.org>
> ---
> arch/x86/kernel/idt.c | 6 +-
>  arch/x86/kernel/idt.c |6 +-
>  1 file changed, 5 insertions(+), 1 deletion(-)
> 
> Index: tip/arch/x86/kernel/idt.c
> ===
> --- tip.orig/arch/x86/kernel/idt.c
> +++ tip/arch/x86/kernel/idt.c
> @@ -44,6 +44,10 @@ struct idt_data {
>  #define ISTG(_vector, _addr, _ist)   \
>   G(_vector, _addr, _ist, GATE_INTERRUPT, DPL0, __KERNEL_CS)
>  
> +/* System interrupt gate with interrupt stack */
> +#define SISTG(_vector, _addr, _ist)  \
> + G(_vector, _addr, _ist, GATE_INTERRUPT, DPL3, __KERNEL_CS)
> +
>  /* Task gate */
>  #define TSKG(_vector, _gdt)  \
>   G(_vector, NULL, DEFAULT_STACK, GATE_TASK, DPL0, _gdt << 3)
> @@ -181,7 +185,7 @@ gate_desc debug_idt_table[IDT_ENTRIES] _
>  static const __initdata struct idt_data ist_idts[] = {
>   ISTG(X86_TRAP_DB,   debug,  DEBUG_STACK),
>   ISTG(X86_TRAP_NMI,  nmi,NMI_STACK),
> - ISTG(X86_TRAP_BP,   int3,   DEBUG_STACK),
> + SISTG(X86_TRAP_BP,  int3,   DEBUG_STACK),
>   ISTG(X86_TRAP_DF,   double_fault,   DOUBLEFAULT_STACK),
>  #ifdef CONFIG_X86_MCE
>   ISTG(X86_TRAP_MC,   _check, MCE_STACK),


Re: [tip:x86/apic] x86/idt: Move IST stack based traps to table init

2017-09-01 Thread Andrei Vagin
Hi Thomas,

We run CRIU tests for linux-next and today they detected an issue. I've
bisected this problem and it looks like a problem is in this patch.

[root@fc24 criu]# python ./test/zdtm.py run -t zdtm/static/env00
=== Run 1/1  zdtm/static/env00

== Run zdtm/static/env00 in h ==
Start test
./env00 --pidfile=env00.pid --outfile=env00.out --envname=ENV_00_TEST
Run criu dump
=[log]=> dump/zdtm/static/env00/36/1/dump.log
 grep Error 
(00.004013) Dump private signals of 36
(00.004022) Dump shared signals of 36
(00.004036) Parasite syscall_ip at 0x40
(00.004097) ** delivering signal 11 si_code=128
(00.004103) Error (compel/src/lib/infect.c:528): Unexpected 36 task 
interruption, aborting
(00.004118) Error (criu/cr-dump.c:1371): Can't infect (pid: 36) with parasite
(00.004208) Unlock network
(00.004218) Unfreezing tasks into 1
(00.004223) Unseizing 36 into 1
(00.004244) Error (criu/cr-dump.c:1807): Dumping FAILED.
 ERROR OVER 
### Test zdtm/static/env00 FAIL at CRIU dump ###
Send the 9 signal to  36
Wait for zdtm/static/env00(36) to die for 0.10
# FAIL #

Here is the line in the code where we get this unexpected error:
https://github.com/xemul/criu/blob/criu-dev/compel/src/lib/infect.c#L735

At this moment criu tries to execute the memfd_create syscall in the
context of another task with the help of ptrace.

Here is a link to a test job:
https://travis-ci.org/avagin/linux/builds/270623449

Thanks,
Andrei

On Tue, Aug 29, 2017 at 04:18:51AM -0700, tip-bot for Jacob Shin wrote:
> Commit-ID:  90f6225fba0c732f3f5f9f5e265bdefa021ff12d
> Gitweb: http://git.kernel.org/tip/90f6225fba0c732f3f5f9f5e265bdefa021ff12d
> Author: Thomas Gleixner 
> AuthorDate: Mon, 28 Aug 2017 08:47:52 +0200
> Committer:  Ingo Molnar 
> CommitDate: Tue, 29 Aug 2017 12:07:27 +0200
> 
> x86/idt: Move IST stack based traps to table init
> 
> Initialize the IST based traps via a table.
> 
> Signed-off-by: Thomas Gleixner 
> Cc: Andy Lutomirski 
> Cc: Borislav Petkov 
> Cc: Brian Gerst 
> Cc: Denys Vlasenko 
> Cc: H. Peter Anvin 
> Cc: Josh Poimboeuf 
> Cc: Linus Torvalds 
> Cc: Peter Zijlstra 
> Cc: Steven Rostedt 
> Link: http://lkml.kernel.org/r/20170828064959.091328...@linutronix.de
> Signed-off-by: Ingo Molnar 
> ---
>  arch/x86/include/asm/desc.h |  2 ++
>  arch/x86/kernel/idt.c   | 22 ++
>  arch/x86/kernel/traps.c |  9 +
>  3 files changed, 25 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
> index 930acd5..e624527 100644
> --- a/arch/x86/include/asm/desc.h
> +++ b/arch/x86/include/asm/desc.h
> @@ -509,9 +509,11 @@ extern void idt_setup_early_traps(void);
>  
>  #ifdef CONFIG_X86_64
>  extern void idt_setup_early_pf(void);
> +extern void idt_setup_ist_traps(void);
>  extern void idt_setup_debugidt_traps(void);
>  #else
>  static inline void idt_setup_early_pf(void) { }
> +static inline void idt_setup_ist_traps(void) { }
>  static inline void idt_setup_debugidt_traps(void) { }
>  #endif
>  
> diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
> index f5281b8..a6326fd 100644
> --- a/arch/x86/kernel/idt.c
> +++ b/arch/x86/kernel/idt.c
> @@ -92,6 +92,20 @@ struct desc_ptr idt_descr __ro_after_init = {
>  gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss;
>  
>  /*
> + * The exceptions which use Interrupt stacks. They are setup after
> + * cpu_init() when the TSS has been initialized.
> + */
> +static const __initdata struct idt_data ist_idts[] = {
> + ISTG(X86_TRAP_DB,   debug,  DEBUG_STACK),
> + ISTG(X86_TRAP_NMI,  nmi,NMI_STACK),
> + ISTG(X86_TRAP_BP,   int3,   DEBUG_STACK),
> + ISTG(X86_TRAP_DF,   double_fault,   DOUBLEFAULT_STACK),
> +#ifdef CONFIG_X86_MCE
> + ISTG(X86_TRAP_MC,   _check, MCE_STACK),
> +#endif
> +};
> +
> +/*
>   * Override for the debug_idt. Same as the default, but with interrupt
>   * stack set to DEFAULT_STACK (0). Required for NMI trap handling.
>   */
> @@ -158,6 +172,14 @@ void __init idt_setup_early_pf(void)
>  }
>  
>  /**
> + * idt_setup_ist_traps - Initialize the idt table with traps using IST
> + */
> +void __init idt_setup_ist_traps(void)
> +{
> + idt_setup_from_table(idt_table, ist_idts, ARRAY_SIZE(ist_idts));
> +}
> +
> +/**
>   * idt_setup_debugidt_traps - Initialize the debug idt table with debug traps
>   */
>  void __init idt_setup_debugidt_traps(void)
> diff --git a/arch/x86/kernel/traps.c 

Re: [PATCH] [RFC] vm: add a syscall to map a process memory into a pipe

2017-08-12 Thread Andrei Vagin
On Thu, Aug 10, 2017 at 09:42:44PM +0200, Jann Horn wrote:
> On Thu, Aug 10, 2017 at 8:46 PM, Andrei Vagin <ava...@openvz.org> wrote:
> > It is a hybrid of process_vm_readv() and vmsplice().
> >
> > vmsplice can map memory from a current address space into a pipe.
> > process_vm_readv can read memory of another process.
> [...]
> > +/*
> > + * Map pages from a specified task into a pipe
> > + */
> > +static int remote_single_vec_to_pipe(struct task_struct *task,
> > +   struct mm_struct *mm,
> > +   const struct iovec *rvec,
> > +   struct pipe_inode_info *pipe,
> > +   unsigned int flags,
> > +   size_t *total)
> > +{
> > +   struct pipe_buffer buf = {
> > +   .ops = _page_pipe_buf_ops,
> > +   .flags = flags
> > +   };
> [...]
> > +   while (nr_pages) {
> [...]
> > +   /*
> > +* Get the pages we're interested in.  We must
> > +* access remotely because task/mm might not
> > +* current/current->mm
> > +*/
> > +   down_read(>mmap_sem);
> > +   pages = get_user_pages_remote(task, mm, pa, pages, flags,
> > + process_pages, NULL, );
> 
> This fifth "flags" argument of get_user_pages_remote() should contain
> GUP flags (FOLL_*), but it looks like you're actually passing in 0 or
> PIPE_BUF_FLAG_GIFT, which will be interpreted as FOLL_GET?
> (See the snippets quoted below.) This looks like a bug.
> 
> Maybe use a more meaningful variable name than "flags".

Good catch. I will fix and rename the variable. get_user_pages_remote
has to be called with zero flags here. Thank you.

> 
> > +static ssize_t remote_iovec_to_pipe(struct task_struct *task,
> > +   struct mm_struct *mm,
> > +   const struct iovec *rvec,
> > +   unsigned long riovcnt,
> > +   struct pipe_inode_info *pipe,
> > +   unsigned int flags)
> > +{
> [...]
> > +   ret = remote_single_vec_to_pipe(
> > +   task, mm, [i], pipe, flags, );
> [...]
> > +}
> > +
> > +static long process_vmsplice_to_pipe(struct task_struct *task,
> > +   struct mm_struct *mm, struct file *file,
> > +   const struct iovec __user *uiov,
> > +   unsigned long nr_segs, unsigned int flags)
> > +{
> [...]
> > +   unsigned int buf_flag = 0;
> [...]
> > +   if (flags & SPLICE_F_GIFT)
> > +   buf_flag = PIPE_BUF_FLAG_GIFT;
> [...]
> > +   if (!ret)
> > +   ret = remote_iovec_to_pipe(task, mm, iov,
> > +   nr_segs, pipe, buf_flag);
> [...]
> > +}


[PATCH] [RFC] vm: add a syscall to map a process memory into a pipe

2017-08-10 Thread Andrei Vagin
It is a hybrid of process_vm_readv() and vmsplice().

vmsplice can map memory from a current address space into a pipe.
process_vm_readv can read memory of another process.

A new system call can map memory of another process into a pipe.

ssize_t process_vmsplice(pid_t pid, int fd, const struct iovec *iov,
unsigned long nr_segs, unsigned int flags)

All arguments are identical with vmsplice except pid which specifies a
target process.

Currently, if we want to dump a process's memory to a file or to a socket,
we can use process_vm_readv() + write(), but it is slow, because the data
are copied into a temporary user-space buffer.

A second way is to use vmsplice() + splice(). It is more efficient,
because data are not copied into a temporary buffer, but there is another
problem: vmsplice works with the current address space, so it can be
used only if we inject our code into a target process.

The second way suffers from a few other issues:
* a process has to be stopped to run a parasite code
* a number of pipes is limited, so it may be impossible to dump all
  memory in one iteration, and we have to stop process and inject our
  code a few times.
* pages in pipes are unreclaimable, so it isn't good to hold a lot of
  memory in pipes.

The introduced syscall allows using the second way without injecting any
code into a target process.

My experiments show that process_vmsplice() + splice() works two times
faster than process_vm_readv() + write().

It is particularly useful in the pre-dump stage. In this stage we enable a
memory tracker, and then we dump the process's memory while the
process continues to work. On the first iteration we dump all
memory, and then we dump only the memory modified since the previous
iteration. After a few pre-dump operations, the process is stopped and
dumped finally. The pre-dump operations allow us to significantly decrease
the process's downtime when it is migrated to another host.

Cc: Alexander Viro <v...@zeniv.linux.org.uk>
Cc: Arnd Bergmann <a...@arndb.de>
Cc: Pavel Emelyanov <xe...@virtuozzo.com>
Cc: Michael Kerrisk <mtk.manpa...@gmail.com>
Cc: Thomas Gleixner <t...@linutronix.de>
Cc: Andrew Morton <a...@linux-foundation.org>
Signed-off-by: Andrei Vagin <ava...@openvz.org>
---
 fs/splice.c   | 219 ++
 include/linux/compat.h|   3 +
 include/linux/syscalls.h  |   4 +
 include/uapi/asm-generic/unistd.h |   5 +-
 4 files changed, 230 insertions(+), 1 deletion(-)

diff --git a/fs/splice.c b/fs/splice.c
index ae41201..4b050a4 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -34,6 +34,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "internal.h"
 
@@ -1374,6 +1375,201 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec 
__user *, iov,
return error;
 }
 
+/*
+ * Map pages from a specified task into a pipe
+ */
+static int remote_single_vec_to_pipe(struct task_struct *task,
+   struct mm_struct *mm,
+   const struct iovec *rvec,
+   struct pipe_inode_info *pipe,
+   unsigned int flags,
+   size_t *total)
+{
+   struct pipe_buffer buf = {
+   .ops = _page_pipe_buf_ops,
+   .flags = flags
+   };
+   unsigned long addr = (unsigned long) rvec->iov_base;
+   unsigned long pa = addr & PAGE_MASK;
+   unsigned long start_offset = addr - pa;
+   unsigned long nr_pages;
+   ssize_t len = rvec->iov_len;
+   struct page *process_pages[16];
+   bool failed = false;
+   int ret = 0;
+
+   nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1;
+   while (nr_pages) {
+   long pages = min(nr_pages, 16UL);
+   int locked = 1, n;
+   ssize_t copied;
+
+   /*
+* Get the pages we're interested in.  We must
+* access remotely because task/mm might not
+* current/current->mm
+*/
+   down_read(>mmap_sem);
+   pages = get_user_pages_remote(task, mm, pa, pages, flags,
+ process_pages, NULL, );
+   if (locked)
+   up_read(>mmap_sem);
+   if (pages <= 0) {
+   failed = true;
+   ret = -EFAULT;
+   break;
+   }
+
+   copied = pages * PAGE_SIZE - start_offset;
+   if (copied > len)
+   copied = len;
+   len -= copied;
+
+   for (n = 0; copied; n++, start_offset = 0) {
+   int size = min_t(int, copied, PAGE_SIZE - start_offset);
+
+   if (!failed) {
+   buf.page = process_pages[n];
+   

Re: [PATCH 8/8] signal: Remove kernel interal si_code magic

2017-07-12 Thread Andrei Vagin
On Fri, Jun 30, 2017 at 07:39:06AM -0500, Eric W. Biederman wrote:
> struct siginfo is a union and the kernel since 2.4 has been hiding a union
> tag in the high 16bits of si_code using the values:
> __SI_KILL
> __SI_TIMER
> __SI_POLL
> __SI_FAULT
> __SI_CHLD
> __SI_RT
> __SI_MESGQ
> __SI_SYS
> 
> While this looks plausible on the surface, in practice this situation has
> not worked well.
> 
> - Injected positive signals are not copied to user space properly
>   unless they have these magic high bits set.
> 
> - Injected positive signals are not reported properly by signalfd
>   unless they have these magic high bits set.
> 
> - These kernel internal values leaked to userspace via ptrace_peek_siginfo
> 
> - It was possible to inject these kernel internal values and cause the
>   the kernel to misbehave.
> 
> - Kernel developers got confused and expected these kernel internal values
>   in userspace in kernel self tests.
> 
> - Kernel developers got confused and set si_code to __SI_FAULT which
>   is SI_USER in userspace which causes userspace to think an ordinary user
>   sent the signal and that it was not kernel generated.
> 
> - The values make it impossible to reorganize the code to transform
>   siginfo_copy_to_user into a plain copy_to_user.  As si_code must
>   be massaged before being passed to userspace.
> 
> So remove these kernel internal si codes and make the kernel code simpler
> and more maintainable.
> 
> To replace these kernel internal magic si_codes introduce the helper
> function siginfo_layout, that takes a signal number and an si_code and
> computes which union member of siginfo is being used.  Have
> siginfo_layout return an enumeration so that gcc will have enough
> information to warn if a switch statement does not handle all of union
> members.
> 
> A couple of architectures have a messed up ABI that defines signal
> specific duplications of SI_USER which causes more special cases in
> siginfo_layout than I would like.  The good news is only problem
> architectures pay the cost.
> 
> Update all of the code that used the previous magic __SI_ values to
> use the new SIL_ values and to call siginfo_layout to get those
> values.  Escept where not all of the cases are handled remove the
> defaults in the switch statements so that if a new case is missed in
> the future the lack will show up at compile time.
> 
> Modify the code that copies siginfo si_code to userspace to just copy
> the value and not cast si_code to a short first.  The high bits are no
> longer used to hold a magic union member.
> 
> Fixup the siginfo header files to stop including the __SI_ values in
> their constants and for the headers that were missing it to properly
> update the number of si_codes for each signal type.
> 
> The fixes to copy_siginfo_from_user32 implementations has the
> interesting property that several of them perviously should never have
> worked as the __SI_ values they depended up where kernel internal.
> With that dependency gone those implementations should work much
> better.
> 
> The idea of not passing the __SI_ values out to userspace and then
> not reinserting them has been tested with criu and criu worked without
> changes.
> 
> Signed-off-by: "Eric W. Biederman" 
> ---
>  arch/alpha/include/uapi/asm/siginfo.h|   2 +-
>  arch/arm64/kernel/signal32.c |  23 +++
>  arch/blackfin/include/uapi/asm/siginfo.h |  30 +
>  arch/frv/include/uapi/asm/siginfo.h  |   2 +-
>  arch/ia64/include/uapi/asm/siginfo.h |  20 +++---
>  arch/ia64/kernel/signal.c|  17 +++---
>  arch/mips/include/uapi/asm/siginfo.h |   6 +-
>  arch/mips/kernel/signal32.c  |  19 +++---
>  arch/parisc/kernel/signal32.c|  31 +-
>  arch/powerpc/kernel/signal_32.c  |  20 +++---
>  arch/s390/kernel/compat_signal.c |  32 +-
>  arch/sparc/include/uapi/asm/siginfo.h|   4 +-
>  arch/sparc/kernel/signal32.c |  16 ++---
>  arch/tile/include/uapi/asm/siginfo.h |   4 +-
>  arch/tile/kernel/compat_signal.c |  18 +++---
>  arch/tile/kernel/traps.c |   1 -
>  arch/x86/kernel/signal_compat.c  |  21 +++
>  fs/fcntl.c   |   2 +-
>  fs/signalfd.c|  22 +++
>  include/asm-generic/siginfo.h|  22 ---
>  include/uapi/asm-generic/siginfo.h   | 102 
> ++-
>  kernel/compat.c  |   2 -
>  kernel/exit.c|   6 +-
>  kernel/ptrace.c  |   6 +-
>  kernel/signal.c  |  72 --
>  25 files changed, 254 insertions(+), 246 deletions(-)
> 
<...>
> diff --git a/arch/tile/kernel/traps.c b/arch/tile/kernel/traps.c
> index 54804866f238..4433d1dc28e6 100644
> --- a/arch/tile/kernel/traps.c
> +++ b/arch/tile/kernel/traps.c
> @@ -188,7 +188,6 @@ static int 

Re: [CRIU] BUG: Dentry ffff9f795a08fe60{i=af565f, n=lo} still in use (1) [unmount of proc proc]

2017-07-07 Thread Andrei Vagin
On Thu, Jul 06, 2017 at 08:41:00AM -0500, Eric W. Biederman wrote:
> Andrei Vagin <ava...@gmail.com> writes:
> 
> > I did a few experiments and found that the bug is reproduced for 6-12
> > hours on the our test server. Then I reverted two patches and the server
> > is working normally for more than 24 hours already, so the bug is
> > probably in one of these patches.
> >
> > commit e3d0065ab8535cbeee69a4c46a59f4d7360803ae
> > Author: Andrei Vagin <ava...@openvz.org>
> > Date:   Sun Jul 2 07:41:25 2017 +0200
> >
> > Revert "proc/sysctl: prune stale dentries during unregistering"
> > 
> > This reverts commit d6cffbbe9a7e51eb705182965a189457c17ba8a3.
> >
> > commit 2d3c50dac81011c1da4d2f7a63b84bd75287e320
> > Author: Andrei Vagin <ava...@openvz.org>
> > Date:   Sun Jul 2 07:40:08 2017 +0200
> >
> > Revert "proc/sysctl: Don't grab i_lock under sysctl_lock."
> > 
> > This reverts commit ace0c791e6c3cf5ef37cad2df69f0d90ccc40ffb.
> >
> >
> > FYI: This bug has been reproduced on 4.11.7
> 
> Instead of the revert could you test the patch below?

Our CI servers have been working with this patch for more than one day without
any problem.

Tested-by: Andrei Vagin <ava...@openvz.org>

Thanks,
Andrei
> 
> This should fix the issue by grabbing a s_active reference
> to the proc super block for every inode we flush.
> 
> diff --git a/fs/proc/internal.h b/fs/proc/internal.h
> index c5ae09b6c726..18694598bebf 100644
> --- a/fs/proc/internal.h
> +++ b/fs/proc/internal.h
> @@ -67,7 +67,7 @@ struct proc_inode {
>   struct proc_dir_entry *pde;
>   struct ctl_table_header *sysctl;
>   struct ctl_table *sysctl_entry;
> - struct list_head sysctl_inodes;
> + struct hlist_node sysctl_inodes;
>   const struct proc_ns_operations *ns_ops;
>   struct inode vfs_inode;
>  };
> diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
> index 67985a7233c2..9bf06e2b1284 100644
> --- a/fs/proc/proc_sysctl.c
> +++ b/fs/proc/proc_sysctl.c
> @@ -191,7 +191,7 @@ static void init_header(struct ctl_table_header *head,
>   head->set = set;
>   head->parent = NULL;
>   head->node = node;
> - INIT_LIST_HEAD(>inodes);
> + INIT_HLIST_HEAD(>inodes);
>   if (node) {
>   struct ctl_table *entry;
>   for (entry = table; entry->procname; entry++, node++)
> @@ -261,25 +261,42 @@ static void unuse_table(struct ctl_table_header *p)
>   complete(p->unregistering);
>  }
>  
> -/* called under sysctl_lock */
>  static void proc_sys_prune_dcache(struct ctl_table_header *head)
>  {
> - struct inode *inode, *prev = NULL;
> + struct inode *inode;
>   struct proc_inode *ei;
> + struct hlist_node *node;
> + struct super_block *sb;
>  
>   rcu_read_lock();
> - list_for_each_entry_rcu(ei, >inodes, sysctl_inodes) {
> - inode = igrab(>vfs_inode);
> - if (inode) {
> - rcu_read_unlock();
> - iput(prev);
> - prev = inode;
> - d_prune_aliases(inode);
> + for (;;) {
> + node = hlist_first_rcu(>inodes);
> + if (!node)
> + break;
> + ei = hlist_entry(node, struct proc_inode, sysctl_inodes);
> + spin_lock(_lock);
> + hlist_del_init_rcu(>sysctl_inodes);
> + spin_unlock(_lock);
> +
> + inode = >vfs_inode;
> + sb = inode->i_sb;
> + if (!atomic_inc_not_zero(>s_active))
> + continue;
> + inode = igrab(inode);
> + rcu_read_unlock();
> + if (unlikely(!inode)) {
> + deactivate_super(sb);
>   rcu_read_lock();
> + continue;
>   }
> +
> + d_prune_aliases(inode);
> + iput(inode);
> + deactivate_super(sb);
> +
> + rcu_read_lock();
>   }
>   rcu_read_unlock();
> - iput(prev);
>  }
>  
>  /* called under sysctl_lock, will reacquire if has to wait */
> @@ -461,7 +478,7 @@ static struct inode *proc_sys_make_inode(struct 
> super_block *sb,
>   }
>   ei->sysctl = head;
>   ei->sysctl_entry = table;
> - list_add_rcu(>sysctl_inodes, >inodes);
> + hlist_add_head_rcu(>sysctl_inodes, >inodes);
>   head->count++;
>   spin_unlock(_lock);
>  
> @@ -489,7 +506,7 @@ static struct inode *p

lockdep reports possible recursive locking for sb_writers from do_iter_write and do_sendfile

2017-07-07 Thread Andrei Vagin
Hello,

We run CRIU tests for Linus' tree and today we found this warning:

[   27.131931] 
[   27.132008] WARNING: possible recursive locking detected
[   27.132085] 4.12.0+ #1 Not tainted
[   27.132158] 
[   27.132243] criu/1537 is trying to acquire lock:
[   27.132330]  (sb_writers#5){.+.+.+}, at: []
do_iter_write+0x1d4/0x390
[   27.132429]
[   27.132429] but task is already holding lock:
[   27.132511]  (sb_writers#5){.+.+.+}, at: [] +0x671/0x6e0
[   27.132594]
[   27.132594] other info that might help us debug this:
[   27.132671]  Possible unsafe locking scenario:
[   27.132671]
[   27.132747]CPU0
[   27.132819]
[   27.132892]   lock(sb_writers#5);
[   27.132972]   lock(sb_writers#5);
[   27.133056]
[   27.133056]  *** DEADLOCK ***
[   27.133056]
[   27.133154]  May be due to missing lock nesting notation
[   27.133154]
[   27.133245] 1 lock held by criu/1537:
[   27.133325]  #0:  (sb_writers#5){.+.+.+}, at: []
do_sendfile+0x671/0x6e0
[   27.133424]
[   27.133424] stack backtrace:
[   27.133515] CPU: 0 PID: 1537 Comm: criu Not tainted 4.12.0+ #1
[   27.133598] Hardware name: Google Google Compute Engine/Google
Compute Engine, BIOS Google 01/01/2011
[   27.133690] Call Trace:
[   27.133774]  dump_stack+0x85/0xc2
[   27.133859]  __lock_acquire+0x1dbc/0x1e70
[   27.133949]  ? debug_check_no_locks_freed+0x1c0/0x1c0
[   27.134034]  ? do_splice_direct+0x113/0x160
[   27.134117]  ? do_sendfile+0x37e/0x6e0
[   27.134199]  ? SyS_sendfile64+0x122/0x130
[   27.134282]  ? entry_SYSCALL_64_fastpath+0x23/0xc2
[   27.134367]  ? find_held_lock+0x119/0x150
[   27.134456]  ? __lock_is_held+0x93/0x100
[   27.134539]  ? current_time+0x18/0x80
[   27.134623]  lock_acquire+0x101/0x220
[   27.134706]  ? lock_acquire+0x101/0x220
[   27.134791]  ? do_iter_write+0x1d4/0x390
[   27.134895]  __sb_start_write+0xc6/0x1e0
[   27.134998]  ? do_iter_write+0x1d4/0x390
[   27.135082]  do_iter_write+0x1d4/0x390
[   27.135167]  ? kasan_unpoison_shadow+0x36/0x50
[   27.135256]  ? kasan_kmalloc+0xad/0xe0
[   27.135346]  vfs_iter_write+0x4e/0x70
[   27.135430]  iter_file_splice_write+0x409/0x650
[   27.135515]  ? page_cache_pipe_buf_steal+0x140/0x140
[   27.135599]  ? mark_held_locks+0x8e/0xc0
[   27.135686]  ? common_file_perm+0xe6/0x2c0
[   27.135769]  ? trace_hardirqs_on_caller+0x18b/0x270
[   27.135853]  ? __fsnotify_parent+0x2c/0x130
[   27.135942]  ? rw_verify_area+0x78/0x140
[   27.136026]  direct_splice_actor+0x94/0xb0
[   27.136110]  splice_direct_to_actor+0x1b9/0x400
[   27.136194]  ? generic_pipe_buf_nosteal+0x10/0x10
[   27.136278]  ? do_splice_to+0xc0/0xc0
[   27.136362]  ? rw_verify_area+0x78/0x140
[   27.136446]  do_splice_direct+0x113/0x160
[   27.136537]  ? splice_direct_to_actor+0x400/0x400
[   27.136621]  ? __sb_start_write+0xed/0x1e0
[   27.136704]  ? do_sendfile+0x671/0x6e0
[   27.136788]  ? __fget_light+0xa7/0xc0
[   27.136872]  do_sendfile+0x37e/0x6e0
[   27.136961]  ? do_compat_pwritev64+0xa0/0xa0
[   27.137047]  ? ext4_find_unwritten_pgoff.isra.14+0x480/0x480
[   27.137132]  ? vfs_read+0x15d/0x1d0
[   27.137215]  SyS_sendfile64+0x122/0x130
[   27.137297]  ? SyS_sendfile+0x120/0x120
[   27.137385]  ? trace_hardirqs_on_caller+0x18b/0x270
[   27.137469]  ? trace_hardirqs_on_thunk+0x1a/0x1c
[   27.137554]  entry_SYSCALL_64_fastpath+0x23/0xc2
[   27.137638] RIP: 0033:0x7f2bb27e8f5a
[   27.137720] RSP: 002b:7ffc53a398d8 EFLAGS: 0206 ORIG_RAX:
0028
[   27.137813] RAX: ffda RBX: ac168175 RCX: 7f2bb27e8f5a
[   27.137897] RDX:  RSI:  RDI: 0001
[   27.137985] RBP: 8801c8547f98 R08: 004af768 R09: e6f0
[   27.138069] R10: 59a8 R11: 0206 R12: 01e49f00
[   27.138152] R13: 7f2bb33081e0 R14: 000b R15: 0001
[   27.138243]  ? trace_hardirqs_off_caller+0xc5/0x110

$ git describe origin/master
v4.12-7934-g9f45efb

Here is all logs: https://goo.gl/TK6VSy

Thanks.
Andrei


Re: [PATCH] selftests/nsfs: create kconfig fragments

2017-07-05 Thread Andrei Vagin
On Thu, Jun 29, 2017 at 02:25:53PM +0530, naresh.kamb...@linaro.org wrote:
> From: Naresh Kamboju <naresh.kamb...@linaro.org>
> 
> Create a config fragment for nsfs to enable additional config options.
> The config fragments can be used with the help of
> scripts/kconfig/merge_config.sh.
>

Acked-by: Andrei Vagin <ava...@virtuozzo.com>

Thank you.

> Signed-off-by: Naresh Kamboju <naresh.kamb...@linaro.org>
> ---
>  tools/testing/selftests/nsfs/config | 3 +++
>  1 file changed, 3 insertions(+)
>  create mode 100644 tools/testing/selftests/nsfs/config
> 
> diff --git a/tools/testing/selftests/nsfs/config 
> b/tools/testing/selftests/nsfs/config
> new file mode 100644
> index ..598d0a225fc9
> --- /dev/null
> +++ b/tools/testing/selftests/nsfs/config
> @@ -0,0 +1,3 @@
> +CONFIG_USER_NS=y
> +CONFIG_UTS_NS=y
> +CONFIG_PID_NS=y
> -- 
> 2.13.0
> 


Re: [PATCH] ptrace: Add compat PTRACE_{G,S}ETSIGMASK handlers

2017-07-05 Thread Andrei Vagin
On Thu, Jun 29, 2017 at 05:26:37PM +0100, James Morse wrote:
> compat_ptrace_request() lacks handlers for PTRACE_{G,S}ETSIGMASK,
> instead using those in ptrace_request(). The compat variant should
> read a compat_sigset_t from userspace instead of ptrace_request()s
> sigset_t.
> 
> While compat_sigset_t is the same size as sigset_t, it is defined as
> 2xu32, instead of a single u64. On a big-endian CPU this means that
> compat_sigset_t is passed to user-space using middle-endianness,
> where the least-significant u32 is written most significant byte
> first.
> 
> If ptrace_request()s code is used userspace will read the most
> significant u32 where it expected the least significant.
> 
> Instead of duplicating ptrace_request()s code as a special case in
> the arch code, handle it here.
>

Acked-by: Andrei Vagin <ava...@openvz.org>

> CC: Yury Norov <yno...@caviumnetworks.com>
> CC: Andrey Vagin <ava...@openvz.org>
> Reported-by: Zhou Chengming <zhouchengmi...@huawei.com>
> Signed-off-by: James Morse <james.mo...@arm.com>
> Fixes: 29000caecbe87 ("ptrace: add ability to get/set signal-blocked mask")
> ---
> LTP test case here:
> https://lists.linux.it/pipermail/ltp/2017-June/004932.html
> 
>  kernel/ptrace.c | 52 
>  1 file changed, 40 insertions(+), 12 deletions(-)
> 
> diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> index 8d2c10714530..a5bebb6713e8 100644
> --- a/kernel/ptrace.c
> +++ b/kernel/ptrace.c
> @@ -843,6 +843,22 @@ static int ptrace_regset(struct task_struct *task, int 
> req, unsigned int type,
>  EXPORT_SYMBOL_GPL(task_user_regset_view);
>  #endif
>  
> +static int ptrace_setsigmask(struct task_struct *child, sigset_t *new_set)
> +{
> + sigdelsetmask(new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
> +
> + /*
> +  * Every thread does recalc_sigpending() after resume, so
> +  * retarget_shared_pending() and recalc_sigpending() are not
> +  * called here.
> +  */
> + spin_lock_irq(>sighand->siglock);
> + child->blocked = *new_set;
> + spin_unlock_irq(>sighand->siglock);
> +
> + return 0;
> +}
> +
>  int ptrace_request(struct task_struct *child, long request,
>  unsigned long addr, unsigned long data)
>  {
> @@ -914,18 +930,7 @@ int ptrace_request(struct task_struct *child, long 
> request,
>   break;
>   }
>  
> - sigdelsetmask(_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
> -
> - /*
> -  * Every thread does recalc_sigpending() after resume, so
> -  * retarget_shared_pending() and recalc_sigpending() are not
> -  * called here.
> -  */
> - spin_lock_irq(>sighand->siglock);
> - child->blocked = new_set;
> - spin_unlock_irq(>sighand->siglock);
> -
> - ret = 0;
> + ret = ptrace_setsigmask(child, _set);
>   break;
>   }
>  
> @@ -1149,7 +1154,9 @@ int compat_ptrace_request(struct task_struct *child, 
> compat_long_t request,
> compat_ulong_t addr, compat_ulong_t data)
>  {
>   compat_ulong_t __user *datap = compat_ptr(data);
> + compat_sigset_t set32;
>   compat_ulong_t word;
> + sigset_t new_set;
>   siginfo_t siginfo;
>   int ret;
>  
> @@ -1189,6 +1196,27 @@ int compat_ptrace_request(struct task_struct *child, 
> compat_long_t request,
>   else
>   ret = ptrace_setsiginfo(child, );
>   break;
> + case PTRACE_GETSIGMASK:
> + if (addr != sizeof(compat_sigset_t))
> + return -EINVAL;
> +
> + sigset_to_compat(, >blocked);
> +
> + if (copy_to_user(datap, , sizeof(set32)))
> + return -EFAULT;
> +
> + ret = 0;
> + break;
> + case PTRACE_SETSIGMASK:
> + if (addr != sizeof(compat_sigset_t))
> + return -EINVAL;
> +
> + if (copy_from_user(, datap, sizeof(compat_sigset_t)))
> + return -EFAULT;
> +
> + sigset_from_compat(_set, );
> + ret = ptrace_setsigmask(child, _set);
> + break;
>  #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
>   case PTRACE_GETREGSET:
>   case PTRACE_SETREGSET:
> -- 
> 2.11.0
> 


Re: [CRIU] BUG: Dentry ffff9f795a08fe60{i=af565f, n=lo} still in use (1) [unmount of proc proc]

2017-07-03 Thread Andrei Vagin
On Fri, Jun 30, 2017 at 12:11:07PM -0700, Andrei Vagin wrote:
> On Thu, Jun 29, 2017 at 08:42:23PM -0500, Eric W. Biederman wrote:
> > Andrei Vagin <ava...@gmail.com> writes:
> > 
> > > On Thu, Jun 29, 2017 at 12:06 PM, Eric W. Biederman
> > > <ebied...@xmission.com> wrote:
> > >> Andrei Vagin <ava...@gmail.com> writes:
> > >>
> > >>> Hello,
> > >>>
> > >>> We run CRIU tests on linus' tree and today we found this issue.
> > >>>
> > >>> CRIU tests are the set of small programs to check checkpoint/restore
> > >>> of different primitives (files, sockets, signals, pipes, etc).
> > >>> https://github.com/xemul/criu/tree/master/test
> > >>>
> > >>> Each test is executed three times: without namespaces, in a set of all
> > >>> namespaces except userns, in a set of all namespaces. When a test
> > >>> passed the preparation tests, it sends a signal to an executer, and
> > >>> then the executer dumps and restores tests processes, and sends a
> > >>> signal to the test back to check that everything are restored
> > >>> correctly.
> > >>
> > >> I am not certain what you are saying, and you seem to have Cc'd
> > >> every list except the netdev and netfilter lists that are needed
> > >> to deal with this.
> > >>
> > >> Are you saing that the change from Liping Zhang is needed? Or are you
> > >> saying that change introduces the problem below?
> > >
> > > Hi Eric,
> > >
> > > Here I tried to explain our usecase. I don't know which changes in the
> > > kernel affect this issue.
> > >
> > > Actually I reported about the similar problem a few month ago on the 
> > > linux-next:
> > > https://lkml.org/lkml/2017/3/10/1586
> > >
> > > So I don't think that the change from Liping Zhang affects this issue
> > > somehow. I mentioned it just to describe what kernel we used.
> > >
> > > And I don't know how to reproduce the issue. You can see from the
> > > kernel log, that the kernel worked for more than 6 hours in out case.
> > > During this perioud we run all our tests a few times, so I think there
> > > is a kind of race.
> > >
> > >>
> > >> I could not find the mentioned commits.  Are the in Linus's tree or
> > >> someone's next tree that feeds into linux-next?
> > >
> > > Here is the patch from Liping Zhang
> > > https://patchwork.ozlabs.org/patch/770887/
> > >
> > > The second mentioned commit is HEAD of the master branch in Linus' tree:
> > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6474924e2b5ddb0030c38966adcbe3b49022
> > 
> > Apologies I somehow thought that g in the kernel version you mentioned
> > was part of the commit id and thus I could not find it.  Sigh.
> > 
> > Ok so with Linus's tree and that one extra patch from Liping Zhang you
> > have kernel problems sometimes.
> > 
> > The warning and the oops combined are quite suggestive of what is going
> > on.  It does sound like while the pid namespace is being unregistered
> > something under /proc/sys/net/ipv4/conf//... is being accessed
> > and keeping the inode busy.
> > 
> > Which then leads to an oops when the network namespace is being cleaned
> > up later, as it tries to purge all of the inodes.
> > 
> > Which raises the question how in the world can the count of the
> > superblock drop to zero with an inode in use.
> > 
> > As devinet is where things go strange this does seem completely
> > independent of what Liping Zhang was looking at.
> > 
> > This does smell like a bug in the generic code.  Hmm.
> > 
> > Is this consistently reproducible when you run your tests...
> 
> I'm not sure about that. I'm going to do some experiments to understand
> how often it is reproduced on our test system, and then will try to
> revert the patch from Konstantin.

I did a few experiments and found that the bug is reproduced for 6-12
hours on our test server. Then I reverted two patches and the server
is working normally for more than 24 hours already, so the bug is
probably in one of these patches.

commit e3d0065ab8535cbeee69a4c46a59f4d7360803ae
Author: Andrei Vagin <ava...@openvz.org>
Date:   Sun Jul 2 07:41:25 2017 +0200

Revert "proc/sysctl: prune stale dentries during unregistering"

This reverts commit d6cffbbe9a7e51eb70518296

Re: [CRIU] BUG: Dentry ffff9f795a08fe60{i=af565f, n=lo} still in use (1) [unmount of proc proc]

2017-06-30 Thread Andrei Vagin
On Thu, Jun 29, 2017 at 08:42:23PM -0500, Eric W. Biederman wrote:
> Andrei Vagin <ava...@gmail.com> writes:
> 
> > On Thu, Jun 29, 2017 at 12:06 PM, Eric W. Biederman
> > <ebied...@xmission.com> wrote:
> >> Andrei Vagin <ava...@gmail.com> writes:
> >>
> >>> Hello,
> >>>
> >>> We run CRIU tests on linus' tree and today we found this issue.
> >>>
> >>> CRIU tests are the set of small programs to check checkpoint/restore
> >>> of different primitives (files, sockets, signals, pipes, etc).
> >>> https://github.com/xemul/criu/tree/master/test
> >>>
> >>> Each test is executed three times: without namespaces, in a set of all
> >>> namespaces except userns, in a set of all namespaces. When a test
> >>> passed the preparation tests, it sends a signal to an executer, and
> >>> then the executer dumps and restores tests processes, and sends a
> >>> signal to the test back to check that everything are restored
> >>> correctly.
> >>
> >> I am not certain what you are saying, and you seem to have Cc'd
> >> every list except the netdev and netfilter lists that are needed
> >> to deal with this.
> >>
> >> Are you saing that the change from Liping Zhang is needed? Or are you
> >> saying that change introduces the problem below?
> >
> > Hi Eric,
> >
> > Here I tried to explain our usecase. I don't know which changes in the
> > kernel affect this issue.
> >
> > Actually I reported about the similar problem a few month ago on the 
> > linux-next:
> > https://lkml.org/lkml/2017/3/10/1586
> >
> > So I don't think that the change from Liping Zhang affects this issue
> > somehow. I mentioned it just to describe what kernel we used.
> >
> > And I don't know how to reproduce the issue. You can see from the
> > kernel log, that the kernel worked for more than 6 hours in out case.
> > During this perioud we run all our tests a few times, so I think there
> > is a kind of race.
> >
> >>
> >> I could not find the mentioned commits.  Are the in Linus's tree or
> >> someone's next tree that feeds into linux-next?
> >
> > Here is the patch from Liping Zhang
> > https://patchwork.ozlabs.org/patch/770887/
> >
> > The second mentioned commit is HEAD of the master branch in Linus' tree:
> > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6474924e2b5ddb0030c38966adcbe3b49022
> 
> Apologies I somehow thought that g in the kernel version you mentioned
> was part of the commit id and thus I could not find it.  Sigh.
> 
> Ok so with Linus's tree and that one extra patch from Liping Zhang you
> have kernel problems sometimes.
> 
> The warning and the oops combined are quite suggestive of what is going
> on.  It does sound like while the pid namespace is being unregistered
> something under /proc/sys/net/ipv4/conf//... is being accessed
> and keeping the inode busy.
> 
> Which then leads to an oops when the network namespace is being cleaned
> up later, as it tries to purge all of the inodes.
> 
> Which raises the question how in the world can the count of the
> superblock drop to zero with an inode in use.
> 
> As devinet is where things go strange this does seem completely
> independent of what Liping Zhang was looking at.
> 
> This does smell like a bug in the generic code.  Hmm.
> 
> Is this consistently reproducible when you run your tests...

I'm not sure about that. I'm going to do some experiments to understand
how often it is reproduced on our test system, and then will try to
revert the patch from Konstantin.

Thanks!

> 
> I think the following change where it uses igrab has the possibility
> of increasing the inode count while a pid namespace is being shut down.
> So it is worth a good hard look and fixes and possibly a revert if we
> can prove that this is the issue.
> 
> commit d6cffbbe9a7e51eb705182965a189457c17ba8a3
> Author: Konstantin Khlebnikov <khlebni...@yandex-team.ru>
> Date:   Fri Feb 10 10:35:02 2017 +0300
> 
> proc/sysctl: prune stale dentries during unregistering
> 
> Eric
> 
> 
> > Thanks,
> > Andrei
> >
> >>
> >> Eric
> >>
> >>> [root@zdtm linux]# git describe HEAD~1
> >>> v4.12-rc7-25-g6474924
> >>>
> >>> And there is one more patch from the netfilter tree:
> >>> commit b216759c0cb5d37d1eec3cd5b67ba38bace94fd8
> >>> Author: Liping Zhang
> >>> Date:   Sun Jun 4 19:17:34 2017 +0800

Re: BUG: Dentry ffff9f795a08fe60{i=af565f,n=lo} still in use (1) [unmount of proc proc]

2017-06-29 Thread Andrei Vagin
On Thu, Jun 29, 2017 at 12:06 PM, Eric W. Biederman
<ebied...@xmission.com> wrote:
> Andrei Vagin <ava...@gmail.com> writes:
>
>> Hello,
>>
>> We run CRIU tests on linus' tree and today we found this issue.
>>
>> CRIU tests are the set of small programs to check checkpoint/restore
>> of different primitives (files, sockets, signals, pipes, etc).
>> https://github.com/xemul/criu/tree/master/test
>>
>> Each test is executed three times: without namespaces, in a set of all
>> namespaces except userns, in a set of all namespaces. When a test
>> passed the preparation tests, it sends a signal to an executer, and
>> then the executer dumps and restores tests processes, and sends a
>> signal to the test back to check that everything are restored
>> correctly.
>
> I am not certain what you are saying, and you seem to have Cc'd
> every list except the netdev and netfilter lists that are needed
> to deal with this.
>
> Are you saing that the change from Liping Zhang is needed? Or are you
> saying that change introduces the problem below?

Hi Eric,

Here I tried to explain our usecase. I don't know which changes in the
kernel affect this issue.

Actually I reported a similar problem a few months ago on linux-next:
https://lkml.org/lkml/2017/3/10/1586

So I don't think that the change from Liping Zhang affects this issue
somehow. I mentioned it just to describe what kernel we used.

And I don't know how to reproduce the issue. You can see from the
kernel log, that the kernel worked for more than 6 hours in our case.
During this period we ran all our tests a few times, so I think there
is a kind of race.

>
> I could not find the mentioned commits.  Are the in Linus's tree or
> someone's next tree that feeds into linux-next?

Here is the patch from Liping Zhang
https://patchwork.ozlabs.org/patch/770887/

The second mentioned commit is HEAD of the master branch in Linus' tree:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6474924e2b5ddb0030c38966adcbe3b49022

Thanks,
Andrei

>
> Eric
>
>> [root@zdtm linux]# git describe HEAD~1
>> v4.12-rc7-25-g6474924
>>
>> And there is one more patch from the netfilter tree:
>> commit b216759c0cb5d37d1eec3cd5b67ba38bace94fd8
>> Author: Liping Zhang
>> Date:   Sun Jun 4 19:17:34 2017 +0800
>>
>> netfilter: nf_ct_dccp/sctp: fix memory leak after netns cleanup
>>
>> [root@zdtm linux]# uname -a
>> Linux zdtm.openvz.org 4.12.0-rc7+ #9 SMP Thu Jun 29 08:28:18 CEST 2017
>> x86_64 x86_64 x86_64 GNU/Linux
>>
>> A kernel config is attached.
>>
>>
>> [22458.504137] BUG: Dentry 9f795a08fe60{i=af565f,n=lo}  still in
>> use (1) [unmount of proc proc]
>> [22458.505117] [ cut here ]
>> [22458.505299] WARNING: CPU: 0 PID: 15036 at fs/dcache.c:1445
>> umount_check+0x66/0x80
>> [22458.505564] Modules linked in: nfsd auth_rpcgss nfs_acl lockd grace
>> sunrpc macvlan tun veth nf_conntrack_netlink xt_mark udp_diag tcp_diag
>> inet_diag netlink_diag af_packet_diag unix_diag binfmt_misc
>> ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 xt_conntrack ip_set nfnetlink
>> ebtable_broute bridge stp llc ebtable_nat ip6table_security
>> ip6table_mangle ip6table_raw ip6table_nat nf_conntrack_ipv6
>> nf_defrag_ipv6 nf_nat_ipv6 iptable_security iptable_mangle iptable_raw
>> iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat
>> nf_conntrack ebtable_filter ebtables ip6table_filter ip6_tables btrfs
>> xor raid6_pq loop ppdev crct10dif_pclmul crc32_pclmul
>> ghash_clmulni_intel lpc_ich sbs virtio_balloon shpchp sbshc parport_pc
>> parport tpm_tis tpm_tis_core tpm xfs libcrc32c crc32c_intel
>> ata_generic serio_raw pata_acpi
>> [22458.507586]  virtio_pci virtio virtio_ring e1000
>> [22458.507771] CPU: 0 PID: 15036 Comm: kworker/0:2 Not tainted 4.12.0-rc7+ #9
>> [22458.507830] systemd-journald[561]: Compressed data object 807 ->
>> 605 using LZ4
>> [22458.508184] Hardware name: Parallels Software International Inc.
>> Parallels Virtual Platform/Parallels Virtual Platform, BIOS
>> 6.12.26068.1232434 02/27/2017
>> [22458.508641] Workqueue: events proc_cleanup_work
>> [22458.508848] task: 9f797be8 task.stack: b2b8825f
>> [22458.509033] RIP: 0010:umount_check+0x66/0x80
>> [22458.509172] RSP: 0018:b2b8825f3c68 EFLAGS: 00010286
>> [22458.509363] RAX: 0054 RBX: 9f798492afa0 RCX: 
>> 
>> [22458.509589] RDX:  RSI: 9f79bce0e388 RDI: 
>> 9f79bce0e388
>> [22458.509823] RBP: b2b8825f3c70 R08:  R09: 
>> 000

  1   2   3   >