Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-08 Thread Jann Horn
On Sat, Oct 6, 2018 at 4:10 AM Aleksa Sarai  wrote:
> On 2018-10-05, Jann Horn  wrote:
> > > What if we took rename_lock (call it nd->r_seq) at the start of the
> > > resolution, and then only tried the __d_path-style check
> > >
> > >   if (read_seqretry(_lock, nd->r_seq) ||
> > >   read_seqretry(_lock, nd->m_seq))
> > >   /* do the __d_path lookup. */
> > >
> > > That way you would only hit the slow path if there were concurrent
> > > renames or mounts *and* you are doing a path resolution with
> > > AT_THIS_ROOT or AT_BENEATH. I've attached a modified patch that does
> > > this (and after some testing it also appears to work).
> >
> > Yeah, I think that might do the job.
>
> *phew* I was all out of other ideas. :P
>
> > > ---
> > >  fs/namei.c | 49 ++---
> > >  1 file changed, 46 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/fs/namei.c b/fs/namei.c
> > > index 6f995e6de6b1..12c9be175cb4 100644
> > > --- a/fs/namei.c
> > > +++ b/fs/namei.c
> > > @@ -493,7 +493,7 @@ struct nameidata {
> > > struct path root;
> > > struct inode*inode; /* path.dentry.d_inode */
> > > unsigned intflags;
> > > -   unsignedseq, m_seq;
> > > +   unsignedseq, m_seq, r_seq;
> > > int last_type;
> > > unsigneddepth;
> > > int total_link_count;
> > > @@ -1375,6 +1375,27 @@ static int follow_dotdot_rcu(struct nameidata *nd)
> > > return -EXDEV;
> > > break;
> > > }
> > > +   if (unlikely((nd->flags & (LOOKUP_BENEATH | 
> > > LOOKUP_CHROOT)) &&
> > > +(read_seqretry(_lock, nd->r_seq) ||
> > > + read_seqretry(_lock, nd->m_seq {
> > > +   char *pathbuf, *pathptr;
> > > +
> > > +   nd->r_seq = read_seqbegin(_lock);
> > > +   /* Cannot take m_seq here. */
> > > +
> > > +   pathbuf = kmalloc(PATH_MAX, GFP_ATOMIC);
> > > +   if (!pathbuf)
> > > +   return -ECHILD;
> > > +   pathptr = __d_path(>path, >root, pathbuf, 
> > > PATH_MAX);
> > > +   kfree(pathbuf);
> >
> > You're doing this check before actually looking up the parent, right?
> > So as long as I don't trigger the "path_equal(>path, >root)"
> > check that you do for O_BENEATH, escaping up by one level is possible,
> > right? You should probably move this check so that it happens after
> > following "..".
>
> Yup, you're right. I'll do that.
>
> > (Also: I assume that you're going to get rid of that memory allocation
> > in a future version.)
>
> Sure. Would you prefer adding some scratch space in nameidata, or that I
> change __d_path so it accepts NULL as the buffer (and thus it doesn't
> actually do any string operations)?

Well, I think accepting a NULL buffer would be much cleaner; but keep
in mind that I'm just someone making suggestions, Al Viro is the one
who has to like your code. :P

> > > if (nd->path.dentry != nd->path.mnt->mnt_root) {
> > > int ret = path_parent_directory(>path);
> > > if (ret)
> > > @@ -2269,6 +2311,9 @@ static const char *path_init(struct nameidata *nd, 
> > > unsigned flags)
> > > nd->last_type = LAST_ROOT; /* if there are only slashes... */
> > > nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT;
> > > nd->depth = 0;
> > > +   nd->m_seq = read_seqbegin(_lock);
> > > +   nd->r_seq = read_seqbegin(_lock);
> >
> > This means that now, attempting to perform a lookup while something is
> > holding the rename_lock will spin on the lock. I don't know whether
> > that's a problem in practice though. Does anyone on this thread know
> > whether this is problematic?
>
> I could make it so that we only take _lock
>   if (nd->flags & (FOLLOW_BENEATH | FOLLOW_CHROOT)),
> since it's not used outside of that path.

I think that might be a sensible change; but as I said, I don't
actually know whether it's necessary, and it would be very helpful if
someone who actually knows commented on this.


Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-08 Thread Jann Horn
On Sat, Oct 6, 2018 at 4:10 AM Aleksa Sarai  wrote:
> On 2018-10-05, Jann Horn  wrote:
> > > What if we took rename_lock (call it nd->r_seq) at the start of the
> > > resolution, and then only tried the __d_path-style check
> > >
> > >   if (read_seqretry(_lock, nd->r_seq) ||
> > >   read_seqretry(_lock, nd->m_seq))
> > >   /* do the __d_path lookup. */
> > >
> > > That way you would only hit the slow path if there were concurrent
> > > renames or mounts *and* you are doing a path resolution with
> > > AT_THIS_ROOT or AT_BENEATH. I've attached a modified patch that does
> > > this (and after some testing it also appears to work).
> >
> > Yeah, I think that might do the job.
>
> *phew* I was all out of other ideas. :P
>
> > > ---
> > >  fs/namei.c | 49 ++---
> > >  1 file changed, 46 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/fs/namei.c b/fs/namei.c
> > > index 6f995e6de6b1..12c9be175cb4 100644
> > > --- a/fs/namei.c
> > > +++ b/fs/namei.c
> > > @@ -493,7 +493,7 @@ struct nameidata {
> > > struct path root;
> > > struct inode*inode; /* path.dentry.d_inode */
> > > unsigned intflags;
> > > -   unsignedseq, m_seq;
> > > +   unsignedseq, m_seq, r_seq;
> > > int last_type;
> > > unsigneddepth;
> > > int total_link_count;
> > > @@ -1375,6 +1375,27 @@ static int follow_dotdot_rcu(struct nameidata *nd)
> > > return -EXDEV;
> > > break;
> > > }
> > > +   if (unlikely((nd->flags & (LOOKUP_BENEATH | 
> > > LOOKUP_CHROOT)) &&
> > > +(read_seqretry(_lock, nd->r_seq) ||
> > > + read_seqretry(_lock, nd->m_seq {
> > > +   char *pathbuf, *pathptr;
> > > +
> > > +   nd->r_seq = read_seqbegin(_lock);
> > > +   /* Cannot take m_seq here. */
> > > +
> > > +   pathbuf = kmalloc(PATH_MAX, GFP_ATOMIC);
> > > +   if (!pathbuf)
> > > +   return -ECHILD;
> > > +   pathptr = __d_path(>path, >root, pathbuf, 
> > > PATH_MAX);
> > > +   kfree(pathbuf);
> >
> > You're doing this check before actually looking up the parent, right?
> > So as long as I don't trigger the "path_equal(>path, >root)"
> > check that you do for O_BENEATH, escaping up by one level is possible,
> > right? You should probably move this check so that it happens after
> > following "..".
>
> Yup, you're right. I'll do that.
>
> > (Also: I assume that you're going to get rid of that memory allocation
> > in a future version.)
>
> Sure. Would you prefer adding some scratch space in nameidata, or that I
> change __d_path so it accepts NULL as the buffer (and thus it doesn't
> actually do any string operations)?

Well, I think accepting a NULL buffer would be much cleaner; but keep
in mind that I'm just someone making suggestions, Al Viro is the one
who has to like your code. :P

> > > if (nd->path.dentry != nd->path.mnt->mnt_root) {
> > > int ret = path_parent_directory(>path);
> > > if (ret)
> > > @@ -2269,6 +2311,9 @@ static const char *path_init(struct nameidata *nd, 
> > > unsigned flags)
> > > nd->last_type = LAST_ROOT; /* if there are only slashes... */
> > > nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT;
> > > nd->depth = 0;
> > > +   nd->m_seq = read_seqbegin(_lock);
> > > +   nd->r_seq = read_seqbegin(_lock);
> >
> > This means that now, attempting to perform a lookup while something is
> > holding the rename_lock will spin on the lock. I don't know whether
> > that's a problem in practice though. Does anyone on this thread know
> > whether this is problematic?
>
> I could make it so that we only take _lock
>   if (nd->flags & (FOLLOW_BENEATH | FOLLOW_CHROOT)),
> since it's not used outside of that path.

I think that might be a sensible change; but as I said, I don't
actually know whether it's necessary, and it would be very helpful if
someone who actually knows commented on this.


Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-06 Thread Christian Brauner
On Sat, Oct 6, 2018 at 10:56 PM Florian Weimer  wrote:
>
> * Aleksa Sarai:
>
> > On 2018-10-01, Andy Lutomirski  wrote:
> >> >>> Currently most container runtimes try to do this resolution in
> >> >>> userspace[1], causing many potential race conditions. In addition, the
> >> >>> "obvious" alternative (actually performing a {ch,pivot_}root(2))
> >> >>> requires a fork+exec which is *very* costly if necessary for every
> >> >>> filesystem operation involving a container.
> >> >>
> >> >> Wait. fork() I understand, but why exec? And actually, you don't need
> >> >> a full fork() either, clone() lets you do this with some process parts
> >> >> shared. And then you also shouldn't need to use SCM_RIGHTS, just keep
> >> >> the file descriptor table shared. And why chroot()/pivot_root(),
> >> >> wouldn't you want to use setns()?
> >> >
> >> > You're right about this -- for C runtimes. In Go we cannot do a raw
> >> > clone() or fork() (if you do it manually with RawSyscall you'll end with
> >> > broken runtime state). So you're forced to do fork+exec (which then
> >> > means that you can't use CLONE_FILES and must use SCM_RIGHTS). Same goes
> >> > for CLONE_VFORK.
> >>
> >> I must admit that I’m not very sympathetic to the argument that “Go’s
> >> runtime model is incompatible with the simpler solution.”
> >
> > Multi-threaded programs have a similar issue (though with Go it's much
> > worse). If you fork a multi-threaded C program then you can only safely
> > use AS-Safe glibc functions (those that are safe within a signal
> > handler). But if you're just doing three syscalls this shouldn't be as
> > big of a problem as Go where you can't even do said syscalls.
>
> The situation is a bit more complicated.  There are many programs out
> there which use malloc and free (at least indirectly) after a fork,
> and we cannot break them.  In glibc, we have a couple of subsystems
> which are put into a known state before calling the fork/clone system
> call if the application calls fork.  The price we pay for that is a
> fork which is not POSIX-compliant because it is not async-signal-safe.
> Admittedly, other libcs chose different trade-offs.
>
> However, what is the same across libcs is this: You cannot call the
> clone system call directly and get a fully working new process.  Some
> things break.  For example, for recursive mutexes, we need to know the
> TID of the current thread, and we cannot perform a system call to get
> it for performance reasons.  So everyone has a TID cache for that.
> But the TID cache does not get reset when you bypass the fork
> implementation in libc, so you end up with subtle corruption bugs on
> TID reuse.

Sure, but recursive mutexes etc. are very specific use-case.
I'd even go so far to say that if you use mutexes + threads and then
also fork in those threads you're hosed anyway. If you don't things get a little
cleaner assuming you don't call library functions that use mutexes
internally.
Event then you might (sometimes at least) still get around most problems
with atfork handlers (thought I really don't like him). But you know more
about this then I do. :)

>
> So I'd say that in most cases, the C situation is pretty much the same
> as the Go situation.  If I recall correctly, the problem for Go is
> that it cannot call setns from Go code because it fails in the kernel
> for multi-threaded processes, and Go processes are already
> multi-threaded when user Go code runs.

That is true for *some* namespaces (user, mount) but not for all.
For example, setns(CLONE_NEWNET) would be fine from go.
But the go runtime thinks it's clever to clone a new thread in between
entry and exit of a syscall. If you switch namespaces you might end
up with a new thread that belongs to the wrong namespace which is
very problematic.
So you can either rely on calling some go magic that locks
you to a specific os thread but that does only work in later go versions or
you go the constructor route, i.e. you e.g. implement a (dummy)
subcommand that you can call and that triggers the execution of a
C function that is marked with __attribute__((constructor)) that runs
before the go runtime and in which you can do setns(), fork() and
friends (somewhat) safely. This has very
bad performance and is a nasty hack but it's really unavoidable.


Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-06 Thread Christian Brauner
On Sat, Oct 6, 2018 at 10:56 PM Florian Weimer  wrote:
>
> * Aleksa Sarai:
>
> > On 2018-10-01, Andy Lutomirski  wrote:
> >> >>> Currently most container runtimes try to do this resolution in
> >> >>> userspace[1], causing many potential race conditions. In addition, the
> >> >>> "obvious" alternative (actually performing a {ch,pivot_}root(2))
> >> >>> requires a fork+exec which is *very* costly if necessary for every
> >> >>> filesystem operation involving a container.
> >> >>
> >> >> Wait. fork() I understand, but why exec? And actually, you don't need
> >> >> a full fork() either, clone() lets you do this with some process parts
> >> >> shared. And then you also shouldn't need to use SCM_RIGHTS, just keep
> >> >> the file descriptor table shared. And why chroot()/pivot_root(),
> >> >> wouldn't you want to use setns()?
> >> >
> >> > You're right about this -- for C runtimes. In Go we cannot do a raw
> >> > clone() or fork() (if you do it manually with RawSyscall you'll end with
> >> > broken runtime state). So you're forced to do fork+exec (which then
> >> > means that you can't use CLONE_FILES and must use SCM_RIGHTS). Same goes
> >> > for CLONE_VFORK.
> >>
> >> I must admit that I’m not very sympathetic to the argument that “Go’s
> >> runtime model is incompatible with the simpler solution.”
> >
> > Multi-threaded programs have a similar issue (though with Go it's much
> > worse). If you fork a multi-threaded C program then you can only safely
> > use AS-Safe glibc functions (those that are safe within a signal
> > handler). But if you're just doing three syscalls this shouldn't be as
> > big of a problem as Go where you can't even do said syscalls.
>
> The situation is a bit more complicated.  There are many programs out
> there which use malloc and free (at least indirectly) after a fork,
> and we cannot break them.  In glibc, we have a couple of subsystems
> which are put into a known state before calling the fork/clone system
> call if the application calls fork.  The price we pay for that is a
> fork which is not POSIX-compliant because it is not async-signal-safe.
> Admittedly, other libcs chose different trade-offs.
>
> However, what is the same across libcs is this: You cannot call the
> clone system call directly and get a fully working new process.  Some
> things break.  For example, for recursive mutexes, we need to know the
> TID of the current thread, and we cannot perform a system call to get
> it for performance reasons.  So everyone has a TID cache for that.
> But the TID cache does not get reset when you bypass the fork
> implementation in libc, so you end up with subtle corruption bugs on
> TID reuse.

Sure, but recursive mutexes etc. are very specific use-case.
I'd even go so far to say that if you use mutexes + threads and then
also fork in those threads you're hosed anyway. If you don't things get a little
cleaner assuming you don't call library functions that use mutexes
internally.
Event then you might (sometimes at least) still get around most problems
with atfork handlers (thought I really don't like him). But you know more
about this then I do. :)

>
> So I'd say that in most cases, the C situation is pretty much the same
> as the Go situation.  If I recall correctly, the problem for Go is
> that it cannot call setns from Go code because it fails in the kernel
> for multi-threaded processes, and Go processes are already
> multi-threaded when user Go code runs.

That is true for *some* namespaces (user, mount) but not for all.
For example, setns(CLONE_NEWNET) would be fine from go.
But the go runtime thinks it's clever to clone a new thread in between
entry and exit of a syscall. If you switch namespaces you might end
up with a new thread that belongs to the wrong namespace which is
very problematic.
So you can either rely on calling some go magic that locks
you to a specific os thread but that does only work in later go versions or
you go the constructor route, i.e. you e.g. implement a (dummy)
subcommand that you can call and that triggers the execution of a
C function that is marked with __attribute__((constructor)) that runs
before the go runtime and in which you can do setns(), fork() and
friends (somewhat) safely. This has very
bad performance and is a nasty hack but it's really unavoidable.


Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-06 Thread Florian Weimer
* Aleksa Sarai:

> On 2018-10-01, Andy Lutomirski  wrote:
>> >>> Currently most container runtimes try to do this resolution in
>> >>> userspace[1], causing many potential race conditions. In addition, the
>> >>> "obvious" alternative (actually performing a {ch,pivot_}root(2))
>> >>> requires a fork+exec which is *very* costly if necessary for every
>> >>> filesystem operation involving a container.
>> >> 
>> >> Wait. fork() I understand, but why exec? And actually, you don't need
>> >> a full fork() either, clone() lets you do this with some process parts
>> >> shared. And then you also shouldn't need to use SCM_RIGHTS, just keep
>> >> the file descriptor table shared. And why chroot()/pivot_root(),
>> >> wouldn't you want to use setns()?
>> > 
>> > You're right about this -- for C runtimes. In Go we cannot do a raw
>> > clone() or fork() (if you do it manually with RawSyscall you'll end with
>> > broken runtime state). So you're forced to do fork+exec (which then
>> > means that you can't use CLONE_FILES and must use SCM_RIGHTS). Same goes
>> > for CLONE_VFORK.
>> 
>> I must admit that I’m not very sympathetic to the argument that “Go’s
>> runtime model is incompatible with the simpler solution.”
>
> Multi-threaded programs have a similar issue (though with Go it's much
> worse). If you fork a multi-threaded C program then you can only safely
> use AS-Safe glibc functions (those that are safe within a signal
> handler). But if you're just doing three syscalls this shouldn't be as
> big of a problem as Go where you can't even do said syscalls.

The situation is a bit more complicated.  There are many programs out
there which use malloc and free (at least indirectly) after a fork,
and we cannot break them.  In glibc, we have a couple of subsystems
which are put into a known state before calling the fork/clone system
call if the application calls fork.  The price we pay for that is a
fork which is not POSIX-compliant because it is not async-signal-safe.
Admittedly, other libcs chose different trade-offs.

However, what is the same across libcs is this: You cannot call the
clone system call directly and get a fully working new process.  Some
things break.  For example, for recursive mutexes, we need to know the
TID of the current thread, and we cannot perform a system call to get
it for performance reasons.  So everyone has a TID cache for that.
But the TID cache does not get reset when you bypass the fork
implementation in libc, so you end up with subtle corruption bugs on
TID reuse.

So I'd say that in most cases, the C situation is pretty much the same
as the Go situation.  If I recall correctly, the problem for Go is
that it cannot call setns from Go code because it fails in the kernel
for multi-threaded processes, and Go processes are already
multi-threaded when user Go code runs.


Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-06 Thread Florian Weimer
* Aleksa Sarai:

> On 2018-10-01, Andy Lutomirski  wrote:
>> >>> Currently most container runtimes try to do this resolution in
>> >>> userspace[1], causing many potential race conditions. In addition, the
>> >>> "obvious" alternative (actually performing a {ch,pivot_}root(2))
>> >>> requires a fork+exec which is *very* costly if necessary for every
>> >>> filesystem operation involving a container.
>> >> 
>> >> Wait. fork() I understand, but why exec? And actually, you don't need
>> >> a full fork() either, clone() lets you do this with some process parts
>> >> shared. And then you also shouldn't need to use SCM_RIGHTS, just keep
>> >> the file descriptor table shared. And why chroot()/pivot_root(),
>> >> wouldn't you want to use setns()?
>> > 
>> > You're right about this -- for C runtimes. In Go we cannot do a raw
>> > clone() or fork() (if you do it manually with RawSyscall you'll end with
>> > broken runtime state). So you're forced to do fork+exec (which then
>> > means that you can't use CLONE_FILES and must use SCM_RIGHTS). Same goes
>> > for CLONE_VFORK.
>> 
>> I must admit that I’m not very sympathetic to the argument that “Go’s
>> runtime model is incompatible with the simpler solution.”
>
> Multi-threaded programs have a similar issue (though with Go it's much
> worse). If you fork a multi-threaded C program then you can only safely
> use AS-Safe glibc functions (those that are safe within a signal
> handler). But if you're just doing three syscalls this shouldn't be as
> big of a problem as Go where you can't even do said syscalls.

The situation is a bit more complicated.  There are many programs out
there which use malloc and free (at least indirectly) after a fork,
and we cannot break them.  In glibc, we have a couple of subsystems
which are put into a known state before calling the fork/clone system
call if the application calls fork.  The price we pay for that is a
fork which is not POSIX-compliant because it is not async-signal-safe.
Admittedly, other libcs chose different trade-offs.

However, what is the same across libcs is this: You cannot call the
clone system call directly and get a fully working new process.  Some
things break.  For example, for recursive mutexes, we need to know the
TID of the current thread, and we cannot perform a system call to get
it for performance reasons.  So everyone has a TID cache for that.
But the TID cache does not get reset when you bypass the fork
implementation in libc, so you end up with subtle corruption bugs on
TID reuse.

So I'd say that in most cases, the C situation is pretty much the same
as the Go situation.  If I recall correctly, the problem for Go is
that it cannot call setns from Go code because it fails in the kernel
for multi-threaded processes, and Go processes are already
multi-threaded when user Go code runs.


Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-05 Thread Aleksa Sarai
On 2018-10-05, Jann Horn  wrote:
> > What if we took rename_lock (call it nd->r_seq) at the start of the
> > resolution, and then only tried the __d_path-style check
> >
> >   if (read_seqretry(_lock, nd->r_seq) ||
> >   read_seqretry(_lock, nd->m_seq))
> >   /* do the __d_path lookup. */
> >
> > That way you would only hit the slow path if there were concurrent
> > renames or mounts *and* you are doing a path resolution with
> > AT_THIS_ROOT or AT_BENEATH. I've attached a modified patch that does
> > this (and after some testing it also appears to work).
> 
> Yeah, I think that might do the job.

*phew* I was all out of other ideas. :P

> > ---
> >  fs/namei.c | 49 ++---
> >  1 file changed, 46 insertions(+), 3 deletions(-)
> >
> > diff --git a/fs/namei.c b/fs/namei.c
> > index 6f995e6de6b1..12c9be175cb4 100644
> > --- a/fs/namei.c
> > +++ b/fs/namei.c
> > @@ -493,7 +493,7 @@ struct nameidata {
> > struct path root;
> > struct inode*inode; /* path.dentry.d_inode */
> > unsigned intflags;
> > -   unsignedseq, m_seq;
> > +   unsignedseq, m_seq, r_seq;
> > int last_type;
> > unsigneddepth;
> > int total_link_count;
> > @@ -1375,6 +1375,27 @@ static int follow_dotdot_rcu(struct nameidata *nd)
> > return -EXDEV;
> > break;
> > }
> > +   if (unlikely((nd->flags & (LOOKUP_BENEATH | LOOKUP_CHROOT)) 
> > &&
> > +(read_seqretry(_lock, nd->r_seq) ||
> > + read_seqretry(_lock, nd->m_seq {
> > +   char *pathbuf, *pathptr;
> > +
> > +   nd->r_seq = read_seqbegin(_lock);
> > +   /* Cannot take m_seq here. */
> > +
> > +   pathbuf = kmalloc(PATH_MAX, GFP_ATOMIC);
> > +   if (!pathbuf)
> > +   return -ECHILD;
> > +   pathptr = __d_path(>path, >root, pathbuf, 
> > PATH_MAX);
> > +   kfree(pathbuf);
> 
> You're doing this check before actually looking up the parent, right?
> So as long as I don't trigger the "path_equal(>path, >root)"
> check that you do for O_BENEATH, escaping up by one level is possible,
> right? You should probably move this check so that it happens after
> following "..".

Yup, you're right. I'll do that.

> (Also: I assume that you're going to get rid of that memory allocation
> in a future version.)

Sure. Would you prefer adding some scratch space in nameidata, or that I
change __d_path so it accepts NULL as the buffer (and thus it doesn't
actually do any string operations)?

> > if (nd->path.dentry != nd->path.mnt->mnt_root) {
> > int ret = path_parent_directory(>path);
> > if (ret)
> > @@ -2269,6 +2311,9 @@ static const char *path_init(struct nameidata *nd, 
> > unsigned flags)
> > nd->last_type = LAST_ROOT; /* if there are only slashes... */
> > nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT;
> > nd->depth = 0;
> > +   nd->m_seq = read_seqbegin(_lock);
> > +   nd->r_seq = read_seqbegin(_lock);
> 
> This means that now, attempting to perform a lookup while something is
> holding the rename_lock will spin on the lock. I don't know whether
> that's a problem in practice though. Does anyone on this thread know
> whether this is problematic?

I could make it so that we only take _lock
  if (nd->flags & (FOLLOW_BENEATH | FOLLOW_CHROOT)),
since it's not used outside of that path.

-- 
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH



signature.asc
Description: PGP signature


Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-05 Thread Aleksa Sarai
On 2018-10-05, Jann Horn  wrote:
> > What if we took rename_lock (call it nd->r_seq) at the start of the
> > resolution, and then only tried the __d_path-style check
> >
> >   if (read_seqretry(_lock, nd->r_seq) ||
> >   read_seqretry(_lock, nd->m_seq))
> >   /* do the __d_path lookup. */
> >
> > That way you would only hit the slow path if there were concurrent
> > renames or mounts *and* you are doing a path resolution with
> > AT_THIS_ROOT or AT_BENEATH. I've attached a modified patch that does
> > this (and after some testing it also appears to work).
> 
> Yeah, I think that might do the job.

*phew* I was all out of other ideas. :P

> > ---
> >  fs/namei.c | 49 ++---
> >  1 file changed, 46 insertions(+), 3 deletions(-)
> >
> > diff --git a/fs/namei.c b/fs/namei.c
> > index 6f995e6de6b1..12c9be175cb4 100644
> > --- a/fs/namei.c
> > +++ b/fs/namei.c
> > @@ -493,7 +493,7 @@ struct nameidata {
> > struct path root;
> > struct inode*inode; /* path.dentry.d_inode */
> > unsigned intflags;
> > -   unsignedseq, m_seq;
> > +   unsignedseq, m_seq, r_seq;
> > int last_type;
> > unsigneddepth;
> > int total_link_count;
> > @@ -1375,6 +1375,27 @@ static int follow_dotdot_rcu(struct nameidata *nd)
> > return -EXDEV;
> > break;
> > }
> > +   if (unlikely((nd->flags & (LOOKUP_BENEATH | LOOKUP_CHROOT)) 
> > &&
> > +(read_seqretry(_lock, nd->r_seq) ||
> > + read_seqretry(_lock, nd->m_seq {
> > +   char *pathbuf, *pathptr;
> > +
> > +   nd->r_seq = read_seqbegin(_lock);
> > +   /* Cannot take m_seq here. */
> > +
> > +   pathbuf = kmalloc(PATH_MAX, GFP_ATOMIC);
> > +   if (!pathbuf)
> > +   return -ECHILD;
> > +   pathptr = __d_path(>path, >root, pathbuf, 
> > PATH_MAX);
> > +   kfree(pathbuf);
> 
> You're doing this check before actually looking up the parent, right?
> So as long as I don't trigger the "path_equal(>path, >root)"
> check that you do for O_BENEATH, escaping up by one level is possible,
> right? You should probably move this check so that it happens after
> following "..".

Yup, you're right. I'll do that.

> (Also: I assume that you're going to get rid of that memory allocation
> in a future version.)

Sure. Would you prefer adding some scratch space in nameidata, or that I
change __d_path so it accepts NULL as the buffer (and thus it doesn't
actually do any string operations)?

> > if (nd->path.dentry != nd->path.mnt->mnt_root) {
> > int ret = path_parent_directory(>path);
> > if (ret)
> > @@ -2269,6 +2311,9 @@ static const char *path_init(struct nameidata *nd, 
> > unsigned flags)
> > nd->last_type = LAST_ROOT; /* if there are only slashes... */
> > nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT;
> > nd->depth = 0;
> > +   nd->m_seq = read_seqbegin(_lock);
> > +   nd->r_seq = read_seqbegin(_lock);
> 
> This means that now, attempting to perform a lookup while something is
> holding the rename_lock will spin on the lock. I don't know whether
> that's a problem in practice though. Does anyone on this thread know
> whether this is problematic?

I could make it so that we only take _lock
  if (nd->flags & (FOLLOW_BENEATH | FOLLOW_CHROOT)),
since it's not used outside of that path.

-- 
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH



signature.asc
Description: PGP signature


Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-05 Thread Jann Horn
On Fri, Oct 5, 2018 at 5:07 PM Aleksa Sarai  wrote:
> On 2018-10-04, Jann Horn  wrote:
> > On Thu, Oct 4, 2018 at 6:26 PM Aleksa Sarai  wrote:
> > > On 2018-09-29, Jann Horn  wrote:
> > > > You attempt to open "C/../../etc/passwd" under the root "/A/B".
> > > > Something else concurrently moves /A/B/C to /A/C. This can result in
> > > > the following:
> > > >
> > > > 1. You start the path walk and reach /A/B/C.
> > > > 2. The other process moves /A/B/C to /A/C. Your path walk is now at 
> > > > /A/C.
> > > > 3. Your path walk follows the first ".." up into /A. This is outside
> > > > the process root, but you never actually encountered the process root,
> > > > so you don't notice.
> > > > 4. Your path walk follows the second ".." up to /. Again, this is
> > > > outside the process root, but you don't notice.
> > > > 5. Your path walk walks down to /etc/passwd, and the open completes
> > > > successfully. You now have an fd pointing outside your chroot.
> > >
> > > I've been playing with this and I have the following patch, which
> > > according to my testing protects against attacks where ".." skips over
> > > nd->root. It abuses __d_path to figure out if nd->path can be resolved
> > > from nd->root (obviously a proper version of this patch would refactor
> > > __d_path so it could be used like this -- and would not return
> > > -EMULTIHOP).
> > >
> > > I've also attached my reproducer. With it, I was seeing fairly constant
> > > breakouts before this patch and after it I didn't see a single breakout
> > > after running it overnight. Obviously this is not conclusive, but I'm
> > > hoping that it can show what my idea for protecting against ".." was.
> > >
> > > Does this patch make sense? Or is there something wrong with it that I'm
> > > not seeing?
> > >
> > > --8<---
> > >
> > > There is a fairly easy-to-exploit race condition with chroot(2) (and
> > > thus by extension AT_THIS_ROOT and AT_BENEATH) where a rename(2) of a
> > > path can be used to "skip over" nd->root and thus escape to the
> > > filesystem above nd->root.
> > >
> > >   thread1 [attacker]:
> > > for (;;)
> > >   renameat2(AT_FDCWD, "/a/b/c", AT_FDCWD, "/a/d", RENAME_EXCHANGE);
> > >   thread2 [victim]:
> > > for (;;)
> > >   openat(dirb, "b/c/../../etc/shadow", O_THISROOT);
> > >
> > > With fairly significant regularity, thread2 will resolve to
> > > "/etc/shadow" rather than "/a/b/etc/shadow". With this patch, such cases
> > > will be detected during ".." resolution (which is the weak point of
> > > chroot(2) -- since walking *into* a subdirectory tautologically cannot
> > > result in you walking *outside* nd->root).
> > >
> > > The use of __d_path here might seem suspect, however we don't mind if a
> > > path is moved from within the chroot to outside the chroot and we
> > > incorrectly decide it is safe (because at that point we are still within
> > > the set of files which were accessible at the beginning of resolution).
> > > However, we can fail resolution on the next path component if it remains
> > > outside of the root. A path which has always been outside nd->root
> > > during resolution will never be resolveable from nd->root and thus will
> > > always be blocked.
> > >
> > > DO NOT MERGE: Currently this code returns -EMULTIHOP in this case,
> > >   purely as a debugging measure (so that you can see that
> > >   the protection actually does something). Obviously in the
> > >   proper patch this will return -EXDEV.
> > >
> > > Signed-off-by: Aleksa Sarai 
> > > ---
> > >  fs/namei.c | 32 ++--
> > >  1 file changed, 30 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/fs/namei.c b/fs/namei.c
> > > index 6f995e6de6b1..c8349693d47b 100644
> > > --- a/fs/namei.c
> > > +++ b/fs/namei.c
> > > @@ -53,8 +53,8 @@
> > >   * The new code replaces the old recursive symlink resolution with
> > >   * an iterative one (in case of non-nested symlink chains).  It does
> > >   * this with calls to _follow_link().
> > > - * As a side effect, dir_namei(), _namei() and follow_link() are now
> > > - * replaced with a single function lookup_dentry() that can handle all
> > > + * As a side effect, dir_namei(), _namei() and follow_link() are now
> > > + * replaced with a single function lookup_dentry() that can handle all
> > >   * the special cases of the former code.
> > >   *
> > >   * With the new dcache, the pathname is stored at each inode, at least as
> > > @@ -1375,6 +1375,20 @@ static int follow_dotdot_rcu(struct nameidata *nd)
> > > return -EXDEV;
> > > break;
> > > }
> > > +   if (unlikely(nd->flags & (LOOKUP_BENEATH | 
> > > LOOKUP_CHROOT))) {
> > > +   char *pathbuf, *pathptr;
> > > +
> > > +   pathbuf = kmalloc(PATH_MAX, GFP_ATOMIC);
> > > +   if 

Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-05 Thread Jann Horn
On Fri, Oct 5, 2018 at 5:07 PM Aleksa Sarai  wrote:
> On 2018-10-04, Jann Horn  wrote:
> > On Thu, Oct 4, 2018 at 6:26 PM Aleksa Sarai  wrote:
> > > On 2018-09-29, Jann Horn  wrote:
> > > > You attempt to open "C/../../etc/passwd" under the root "/A/B".
> > > > Something else concurrently moves /A/B/C to /A/C. This can result in
> > > > the following:
> > > >
> > > > 1. You start the path walk and reach /A/B/C.
> > > > 2. The other process moves /A/B/C to /A/C. Your path walk is now at 
> > > > /A/C.
> > > > 3. Your path walk follows the first ".." up into /A. This is outside
> > > > the process root, but you never actually encountered the process root,
> > > > so you don't notice.
> > > > 4. Your path walk follows the second ".." up to /. Again, this is
> > > > outside the process root, but you don't notice.
> > > > 5. Your path walk walks down to /etc/passwd, and the open completes
> > > > successfully. You now have an fd pointing outside your chroot.
> > >
> > > I've been playing with this and I have the following patch, which
> > > according to my testing protects against attacks where ".." skips over
> > > nd->root. It abuses __d_path to figure out if nd->path can be resolved
> > > from nd->root (obviously a proper version of this patch would refactor
> > > __d_path so it could be used like this -- and would not return
> > > -EMULTIHOP).
> > >
> > > I've also attached my reproducer. With it, I was seeing fairly constant
> > > breakouts before this patch and after it I didn't see a single breakout
> > > after running it overnight. Obviously this is not conclusive, but I'm
> > > hoping that it can show what my idea for protecting against ".." was.
> > >
> > > Does this patch make sense? Or is there something wrong with it that I'm
> > > not seeing?
> > >
> > > --8<---
> > >
> > > There is a fairly easy-to-exploit race condition with chroot(2) (and
> > > thus by extension AT_THIS_ROOT and AT_BENEATH) where a rename(2) of a
> > > path can be used to "skip over" nd->root and thus escape to the
> > > filesystem above nd->root.
> > >
> > >   thread1 [attacker]:
> > > for (;;)
> > >   renameat2(AT_FDCWD, "/a/b/c", AT_FDCWD, "/a/d", RENAME_EXCHANGE);
> > >   thread2 [victim]:
> > > for (;;)
> > >   openat(dirb, "b/c/../../etc/shadow", O_THISROOT);
> > >
> > > With fairly significant regularity, thread2 will resolve to
> > > "/etc/shadow" rather than "/a/b/etc/shadow". With this patch, such cases
> > > will be detected during ".." resolution (which is the weak point of
> > > chroot(2) -- since walking *into* a subdirectory tautologically cannot
> > > result in you walking *outside* nd->root).
> > >
> > > The use of __d_path here might seem suspect, however we don't mind if a
> > > path is moved from within the chroot to outside the chroot and we
> > > incorrectly decide it is safe (because at that point we are still within
> > > the set of files which were accessible at the beginning of resolution).
> > > However, we can fail resolution on the next path component if it remains
> > > outside of the root. A path which has always been outside nd->root
> > > during resolution will never be resolveable from nd->root and thus will
> > > always be blocked.
> > >
> > > DO NOT MERGE: Currently this code returns -EMULTIHOP in this case,
> > >   purely as a debugging measure (so that you can see that
> > >   the protection actually does something). Obviously in the
> > >   proper patch this will return -EXDEV.
> > >
> > > Signed-off-by: Aleksa Sarai 
> > > ---
> > >  fs/namei.c | 32 ++--
> > >  1 file changed, 30 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/fs/namei.c b/fs/namei.c
> > > index 6f995e6de6b1..c8349693d47b 100644
> > > --- a/fs/namei.c
> > > +++ b/fs/namei.c
> > > @@ -53,8 +53,8 @@
> > >   * The new code replaces the old recursive symlink resolution with
> > >   * an iterative one (in case of non-nested symlink chains).  It does
> > >   * this with calls to _follow_link().
> > > - * As a side effect, dir_namei(), _namei() and follow_link() are now
> > > - * replaced with a single function lookup_dentry() that can handle all
> > > + * As a side effect, dir_namei(), _namei() and follow_link() are now
> > > + * replaced with a single function lookup_dentry() that can handle all
> > >   * the special cases of the former code.
> > >   *
> > >   * With the new dcache, the pathname is stored at each inode, at least as
> > > @@ -1375,6 +1375,20 @@ static int follow_dotdot_rcu(struct nameidata *nd)
> > > return -EXDEV;
> > > break;
> > > }
> > > +   if (unlikely(nd->flags & (LOOKUP_BENEATH | 
> > > LOOKUP_CHROOT))) {
> > > +   char *pathbuf, *pathptr;
> > > +
> > > +   pathbuf = kmalloc(PATH_MAX, GFP_ATOMIC);
> > > +   if 

Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-05 Thread Aleksa Sarai
On 2018-10-04, Jann Horn  wrote:
> On Thu, Oct 4, 2018 at 6:26 PM Aleksa Sarai  wrote:
> > On 2018-09-29, Jann Horn  wrote:
> > > You attempt to open "C/../../etc/passwd" under the root "/A/B".
> > > Something else concurrently moves /A/B/C to /A/C. This can result in
> > > the following:
> > >
> > > 1. You start the path walk and reach /A/B/C.
> > > 2. The other process moves /A/B/C to /A/C. Your path walk is now at /A/C.
> > > 3. Your path walk follows the first ".." up into /A. This is outside
> > > the process root, but you never actually encountered the process root,
> > > so you don't notice.
> > > 4. Your path walk follows the second ".." up to /. Again, this is
> > > outside the process root, but you don't notice.
> > > 5. Your path walk walks down to /etc/passwd, and the open completes
> > > successfully. You now have an fd pointing outside your chroot.
> >
> > I've been playing with this and I have the following patch, which
> > according to my testing protects against attacks where ".." skips over
> > nd->root. It abuses __d_path to figure out if nd->path can be resolved
> > from nd->root (obviously a proper version of this patch would refactor
> > __d_path so it could be used like this -- and would not return
> > -EMULTIHOP).
> >
> > I've also attached my reproducer. With it, I was seeing fairly constant
> > breakouts before this patch and after it I didn't see a single breakout
> > after running it overnight. Obviously this is not conclusive, but I'm
> > hoping that it can show what my idea for protecting against ".." was.
> >
> > Does this patch make sense? Or is there something wrong with it that I'm
> > not seeing?
> >
> > --8<---
> >
> > There is a fairly easy-to-exploit race condition with chroot(2) (and
> > thus by extension AT_THIS_ROOT and AT_BENEATH) where a rename(2) of a
> > path can be used to "skip over" nd->root and thus escape to the
> > filesystem above nd->root.
> >
> >   thread1 [attacker]:
> > for (;;)
> >   renameat2(AT_FDCWD, "/a/b/c", AT_FDCWD, "/a/d", RENAME_EXCHANGE);
> >   thread2 [victim]:
> > for (;;)
> >   openat(dirb, "b/c/../../etc/shadow", O_THISROOT);
> >
> > With fairly significant regularity, thread2 will resolve to
> > "/etc/shadow" rather than "/a/b/etc/shadow". With this patch, such cases
> > will be detected during ".." resolution (which is the weak point of
> > chroot(2) -- since walking *into* a subdirectory tautologically cannot
> > result in you walking *outside* nd->root).
> >
> > The use of __d_path here might seem suspect, however we don't mind if a
> > path is moved from within the chroot to outside the chroot and we
> > incorrectly decide it is safe (because at that point we are still within
> > the set of files which were accessible at the beginning of resolution).
> > However, we can fail resolution on the next path component if it remains
> > outside of the root. A path which has always been outside nd->root
> > during resolution will never be resolveable from nd->root and thus will
> > always be blocked.
> >
> > DO NOT MERGE: Currently this code returns -EMULTIHOP in this case,
> >   purely as a debugging measure (so that you can see that
> >   the protection actually does something). Obviously in the
> >   proper patch this will return -EXDEV.
> >
> > Signed-off-by: Aleksa Sarai 
> > ---
> >  fs/namei.c | 32 ++--
> >  1 file changed, 30 insertions(+), 2 deletions(-)
> >
> > diff --git a/fs/namei.c b/fs/namei.c
> > index 6f995e6de6b1..c8349693d47b 100644
> > --- a/fs/namei.c
> > +++ b/fs/namei.c
> > @@ -53,8 +53,8 @@
> >   * The new code replaces the old recursive symlink resolution with
> >   * an iterative one (in case of non-nested symlink chains).  It does
> >   * this with calls to _follow_link().
> > - * As a side effect, dir_namei(), _namei() and follow_link() are now
> > - * replaced with a single function lookup_dentry() that can handle all
> > + * As a side effect, dir_namei(), _namei() and follow_link() are now
> > + * replaced with a single function lookup_dentry() that can handle all
> >   * the special cases of the former code.
> >   *
> >   * With the new dcache, the pathname is stored at each inode, at least as
> > @@ -1375,6 +1375,20 @@ static int follow_dotdot_rcu(struct nameidata *nd)
> > return -EXDEV;
> > break;
> > }
> > +   if (unlikely(nd->flags & (LOOKUP_BENEATH | LOOKUP_CHROOT))) 
> > {
> > +   char *pathbuf, *pathptr;
> > +
> > +   pathbuf = kmalloc(PATH_MAX, GFP_ATOMIC);
> > +   if (!pathbuf)
> > +   return -ECHILD;
> > +   pathptr = __d_path(>path, >root, pathbuf, 
> > PATH_MAX);
> > +   kfree(pathbuf);
> > +   if (IS_ERR_OR_NULL(pathptr)) {
> > 

Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-05 Thread Aleksa Sarai
On 2018-10-04, Jann Horn  wrote:
> On Thu, Oct 4, 2018 at 6:26 PM Aleksa Sarai  wrote:
> > On 2018-09-29, Jann Horn  wrote:
> > > You attempt to open "C/../../etc/passwd" under the root "/A/B".
> > > Something else concurrently moves /A/B/C to /A/C. This can result in
> > > the following:
> > >
> > > 1. You start the path walk and reach /A/B/C.
> > > 2. The other process moves /A/B/C to /A/C. Your path walk is now at /A/C.
> > > 3. Your path walk follows the first ".." up into /A. This is outside
> > > the process root, but you never actually encountered the process root,
> > > so you don't notice.
> > > 4. Your path walk follows the second ".." up to /. Again, this is
> > > outside the process root, but you don't notice.
> > > 5. Your path walk walks down to /etc/passwd, and the open completes
> > > successfully. You now have an fd pointing outside your chroot.
> >
> > I've been playing with this and I have the following patch, which
> > according to my testing protects against attacks where ".." skips over
> > nd->root. It abuses __d_path to figure out if nd->path can be resolved
> > from nd->root (obviously a proper version of this patch would refactor
> > __d_path so it could be used like this -- and would not return
> > -EMULTIHOP).
> >
> > I've also attached my reproducer. With it, I was seeing fairly constant
> > breakouts before this patch and after it I didn't see a single breakout
> > after running it overnight. Obviously this is not conclusive, but I'm
> > hoping that it can show what my idea for protecting against ".." was.
> >
> > Does this patch make sense? Or is there something wrong with it that I'm
> > not seeing?
> >
> > --8<---
> >
> > There is a fairly easy-to-exploit race condition with chroot(2) (and
> > thus by extension AT_THIS_ROOT and AT_BENEATH) where a rename(2) of a
> > path can be used to "skip over" nd->root and thus escape to the
> > filesystem above nd->root.
> >
> >   thread1 [attacker]:
> > for (;;)
> >   renameat2(AT_FDCWD, "/a/b/c", AT_FDCWD, "/a/d", RENAME_EXCHANGE);
> >   thread2 [victim]:
> > for (;;)
> >   openat(dirb, "b/c/../../etc/shadow", O_THISROOT);
> >
> > With fairly significant regularity, thread2 will resolve to
> > "/etc/shadow" rather than "/a/b/etc/shadow". With this patch, such cases
> > will be detected during ".." resolution (which is the weak point of
> > chroot(2) -- since walking *into* a subdirectory tautologically cannot
> > result in you walking *outside* nd->root).
> >
> > The use of __d_path here might seem suspect, however we don't mind if a
> > path is moved from within the chroot to outside the chroot and we
> > incorrectly decide it is safe (because at that point we are still within
> > the set of files which were accessible at the beginning of resolution).
> > However, we can fail resolution on the next path component if it remains
> > outside of the root. A path which has always been outside nd->root
> > during resolution will never be resolveable from nd->root and thus will
> > always be blocked.
> >
> > DO NOT MERGE: Currently this code returns -EMULTIHOP in this case,
> >   purely as a debugging measure (so that you can see that
> >   the protection actually does something). Obviously in the
> >   proper patch this will return -EXDEV.
> >
> > Signed-off-by: Aleksa Sarai 
> > ---
> >  fs/namei.c | 32 ++--
> >  1 file changed, 30 insertions(+), 2 deletions(-)
> >
> > diff --git a/fs/namei.c b/fs/namei.c
> > index 6f995e6de6b1..c8349693d47b 100644
> > --- a/fs/namei.c
> > +++ b/fs/namei.c
> > @@ -53,8 +53,8 @@
> >   * The new code replaces the old recursive symlink resolution with
> >   * an iterative one (in case of non-nested symlink chains).  It does
> >   * this with calls to _follow_link().
> > - * As a side effect, dir_namei(), _namei() and follow_link() are now
> > - * replaced with a single function lookup_dentry() that can handle all
> > + * As a side effect, dir_namei(), _namei() and follow_link() are now
> > + * replaced with a single function lookup_dentry() that can handle all
> >   * the special cases of the former code.
> >   *
> >   * With the new dcache, the pathname is stored at each inode, at least as
> > @@ -1375,6 +1375,20 @@ static int follow_dotdot_rcu(struct nameidata *nd)
> > return -EXDEV;
> > break;
> > }
> > +   if (unlikely(nd->flags & (LOOKUP_BENEATH | LOOKUP_CHROOT))) 
> > {
> > +   char *pathbuf, *pathptr;
> > +
> > +   pathbuf = kmalloc(PATH_MAX, GFP_ATOMIC);
> > +   if (!pathbuf)
> > +   return -ECHILD;
> > +   pathptr = __d_path(>path, >root, pathbuf, 
> > PATH_MAX);
> > +   kfree(pathbuf);
> > +   if (IS_ERR_OR_NULL(pathptr)) {
> > 

Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-04 Thread Jann Horn
On Thu, Oct 4, 2018 at 6:26 PM Aleksa Sarai  wrote:
> On 2018-09-29, Jann Horn  wrote:
> > You attempt to open "C/../../etc/passwd" under the root "/A/B".
> > Something else concurrently moves /A/B/C to /A/C. This can result in
> > the following:
> >
> > 1. You start the path walk and reach /A/B/C.
> > 2. The other process moves /A/B/C to /A/C. Your path walk is now at /A/C.
> > 3. Your path walk follows the first ".." up into /A. This is outside
> > the process root, but you never actually encountered the process root,
> > so you don't notice.
> > 4. Your path walk follows the second ".." up to /. Again, this is
> > outside the process root, but you don't notice.
> > 5. Your path walk walks down to /etc/passwd, and the open completes
> > successfully. You now have an fd pointing outside your chroot.
>
> I've been playing with this and I have the following patch, which
> according to my testing protects against attacks where ".." skips over
> nd->root. It abuses __d_path to figure out if nd->path can be resolved
> from nd->root (obviously a proper version of this patch would refactor
> __d_path so it could be used like this -- and would not return
> -EMULTIHOP).
>
> I've also attached my reproducer. With it, I was seeing fairly constant
> breakouts before this patch and after it I didn't see a single breakout
> after running it overnight. Obviously this is not conclusive, but I'm
> hoping that it can show what my idea for protecting against ".." was.
>
> Does this patch make sense? Or is there something wrong with it that I'm
> not seeing?
>
> --8<---
>
> There is a fairly easy-to-exploit race condition with chroot(2) (and
> thus by extension AT_THIS_ROOT and AT_BENEATH) where a rename(2) of a
> path can be used to "skip over" nd->root and thus escape to the
> filesystem above nd->root.
>
>   thread1 [attacker]:
> for (;;)
>   renameat2(AT_FDCWD, "/a/b/c", AT_FDCWD, "/a/d", RENAME_EXCHANGE);
>   thread2 [victim]:
> for (;;)
>   openat(dirb, "b/c/../../etc/shadow", O_THISROOT);
>
> With fairly significant regularity, thread2 will resolve to
> "/etc/shadow" rather than "/a/b/etc/shadow". With this patch, such cases
> will be detected during ".." resolution (which is the weak point of
> chroot(2) -- since walking *into* a subdirectory tautologically cannot
> result in you walking *outside* nd->root).
>
> The use of __d_path here might seem suspect, however we don't mind if a
> path is moved from within the chroot to outside the chroot and we
> incorrectly decide it is safe (because at that point we are still within
> the set of files which were accessible at the beginning of resolution).
> However, we can fail resolution on the next path component if it remains
> outside of the root. A path which has always been outside nd->root
> during resolution will never be resolveable from nd->root and thus will
> always be blocked.
>
> DO NOT MERGE: Currently this code returns -EMULTIHOP in this case,
>   purely as a debugging measure (so that you can see that
>   the protection actually does something). Obviously in the
>   proper patch this will return -EXDEV.
>
> Signed-off-by: Aleksa Sarai 
> ---
>  fs/namei.c | 32 ++--
>  1 file changed, 30 insertions(+), 2 deletions(-)
>
> diff --git a/fs/namei.c b/fs/namei.c
> index 6f995e6de6b1..c8349693d47b 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -53,8 +53,8 @@
>   * The new code replaces the old recursive symlink resolution with
>   * an iterative one (in case of non-nested symlink chains).  It does
>   * this with calls to _follow_link().
> - * As a side effect, dir_namei(), _namei() and follow_link() are now
> - * replaced with a single function lookup_dentry() that can handle all
> + * As a side effect, dir_namei(), _namei() and follow_link() are now
> + * replaced with a single function lookup_dentry() that can handle all
>   * the special cases of the former code.
>   *
>   * With the new dcache, the pathname is stored at each inode, at least as
> @@ -1375,6 +1375,20 @@ static int follow_dotdot_rcu(struct nameidata *nd)
> return -EXDEV;
> break;
> }
> +   if (unlikely(nd->flags & (LOOKUP_BENEATH | LOOKUP_CHROOT))) {
> +   char *pathbuf, *pathptr;
> +
> +   pathbuf = kmalloc(PATH_MAX, GFP_ATOMIC);
> +   if (!pathbuf)
> +   return -ECHILD;
> +   pathptr = __d_path(>path, >root, pathbuf, 
> PATH_MAX);
> +   kfree(pathbuf);
> +   if (IS_ERR_OR_NULL(pathptr)) {
> +   if (!pathptr)
> +   pathptr = ERR_PTR(-EMULTIHOP);
> +   return PTR_ERR(pathptr);
> +   }
> +   }

One 

Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-04 Thread Jann Horn
On Thu, Oct 4, 2018 at 6:26 PM Aleksa Sarai  wrote:
> On 2018-09-29, Jann Horn  wrote:
> > You attempt to open "C/../../etc/passwd" under the root "/A/B".
> > Something else concurrently moves /A/B/C to /A/C. This can result in
> > the following:
> >
> > 1. You start the path walk and reach /A/B/C.
> > 2. The other process moves /A/B/C to /A/C. Your path walk is now at /A/C.
> > 3. Your path walk follows the first ".." up into /A. This is outside
> > the process root, but you never actually encountered the process root,
> > so you don't notice.
> > 4. Your path walk follows the second ".." up to /. Again, this is
> > outside the process root, but you don't notice.
> > 5. Your path walk walks down to /etc/passwd, and the open completes
> > successfully. You now have an fd pointing outside your chroot.
>
> I've been playing with this and I have the following patch, which
> according to my testing protects against attacks where ".." skips over
> nd->root. It abuses __d_path to figure out if nd->path can be resolved
> from nd->root (obviously a proper version of this patch would refactor
> __d_path so it could be used like this -- and would not return
> -EMULTIHOP).
>
> I've also attached my reproducer. With it, I was seeing fairly constant
> breakouts before this patch and after it I didn't see a single breakout
> after running it overnight. Obviously this is not conclusive, but I'm
> hoping that it can show what my idea for protecting against ".." was.
>
> Does this patch make sense? Or is there something wrong with it that I'm
> not seeing?
>
> --8<---
>
> There is a fairly easy-to-exploit race condition with chroot(2) (and
> thus by extension AT_THIS_ROOT and AT_BENEATH) where a rename(2) of a
> path can be used to "skip over" nd->root and thus escape to the
> filesystem above nd->root.
>
>   thread1 [attacker]:
> for (;;)
>   renameat2(AT_FDCWD, "/a/b/c", AT_FDCWD, "/a/d", RENAME_EXCHANGE);
>   thread2 [victim]:
> for (;;)
>   openat(dirb, "b/c/../../etc/shadow", O_THISROOT);
>
> With fairly significant regularity, thread2 will resolve to
> "/etc/shadow" rather than "/a/b/etc/shadow". With this patch, such cases
> will be detected during ".." resolution (which is the weak point of
> chroot(2) -- since walking *into* a subdirectory tautologically cannot
> result in you walking *outside* nd->root).
>
> The use of __d_path here might seem suspect, however we don't mind if a
> path is moved from within the chroot to outside the chroot and we
> incorrectly decide it is safe (because at that point we are still within
> the set of files which were accessible at the beginning of resolution).
> However, we can fail resolution on the next path component if it remains
> outside of the root. A path which has always been outside nd->root
> during resolution will never be resolveable from nd->root and thus will
> always be blocked.
>
> DO NOT MERGE: Currently this code returns -EMULTIHOP in this case,
>   purely as a debugging measure (so that you can see that
>   the protection actually does something). Obviously in the
>   proper patch this will return -EXDEV.
>
> Signed-off-by: Aleksa Sarai 
> ---
>  fs/namei.c | 32 ++--
>  1 file changed, 30 insertions(+), 2 deletions(-)
>
> diff --git a/fs/namei.c b/fs/namei.c
> index 6f995e6de6b1..c8349693d47b 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -53,8 +53,8 @@
>   * The new code replaces the old recursive symlink resolution with
>   * an iterative one (in case of non-nested symlink chains).  It does
>   * this with calls to _follow_link().
> - * As a side effect, dir_namei(), _namei() and follow_link() are now
> - * replaced with a single function lookup_dentry() that can handle all
> + * As a side effect, dir_namei(), _namei() and follow_link() are now
> + * replaced with a single function lookup_dentry() that can handle all
>   * the special cases of the former code.
>   *
>   * With the new dcache, the pathname is stored at each inode, at least as
> @@ -1375,6 +1375,20 @@ static int follow_dotdot_rcu(struct nameidata *nd)
> return -EXDEV;
> break;
> }
> +   if (unlikely(nd->flags & (LOOKUP_BENEATH | LOOKUP_CHROOT))) {
> +   char *pathbuf, *pathptr;
> +
> +   pathbuf = kmalloc(PATH_MAX, GFP_ATOMIC);
> +   if (!pathbuf)
> +   return -ECHILD;
> +   pathptr = __d_path(>path, >root, pathbuf, 
> PATH_MAX);
> +   kfree(pathbuf);
> +   if (IS_ERR_OR_NULL(pathptr)) {
> +   if (!pathptr)
> +   pathptr = ERR_PTR(-EMULTIHOP);
> +   return PTR_ERR(pathptr);
> +   }
> +   }

One 

Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-04 Thread Christian Brauner
On Fri, Oct 05, 2018 at 02:26:11AM +1000, Aleksa Sarai wrote:
> On 2018-09-29, Jann Horn  wrote:
> > You attempt to open "C/../../etc/passwd" under the root "/A/B".
> > Something else concurrently moves /A/B/C to /A/C. This can result in
> > the following:
> > 
> > 1. You start the path walk and reach /A/B/C.
> > 2. The other process moves /A/B/C to /A/C. Your path walk is now at /A/C.
> > 3. Your path walk follows the first ".." up into /A. This is outside
> > the process root, but you never actually encountered the process root,
> > so you don't notice.
> > 4. Your path walk follows the second ".." up to /. Again, this is
> > outside the process root, but you don't notice.
> > 5. Your path walk walks down to /etc/passwd, and the open completes
> > successfully. You now have an fd pointing outside your chroot.
> 
> I've been playing with this and I have the following patch, which
> according to my testing protects against attacks where ".." skips over
> nd->root. It abuses __d_path to figure out if nd->path can be resolved
> from nd->root (obviously a proper version of this patch would refactor
> __d_path so it could be used like this -- and would not return
> -EMULTIHOP).
> 
> I've also attached my reproducer. With it, I was seeing fairly constant
> breakouts before this patch and after it I didn't see a single breakout
> after running it overnight. Obviously this is not conclusive, but I'm
> hoping that it can show what my idea for protecting against ".." was.
> 
> Does this patch make sense? Or is there something wrong with it that I'm
> not seeing?

Interesting.
Apart from the abuse of __d_path() :) the question I'd have is whether
this just minimizes the race window or if you can provide a sound
argument that this actually can't happen anymore with this patch.

> 
> --8<---
> 
> There is a fairly easy-to-exploit race condition with chroot(2) (and
> thus by extension AT_THIS_ROOT and AT_BENEATH) where a rename(2) of a
> path can be used to "skip over" nd->root and thus escape to the
> filesystem above nd->root.
> 
>   thread1 [attacker]:
> for (;;)
>   renameat2(AT_FDCWD, "/a/b/c", AT_FDCWD, "/a/d", RENAME_EXCHANGE);
>   thread2 [victim]:
> for (;;)
>   openat(dirb, "b/c/../../etc/shadow", O_THISROOT);
> 
> With fairly significant regularity, thread2 will resolve to
> "/etc/shadow" rather than "/a/b/etc/shadow". With this patch, such cases
> will be detected during ".." resolution (which is the weak point of
> chroot(2) -- since walking *into* a subdirectory tautologically cannot
> result in you walking *outside* nd->root).
> 
> The use of __d_path here might seem suspect, however we don't mind if a
> path is moved from within the chroot to outside the chroot and we
> incorrectly decide it is safe (because at that point we are still within
> the set of files which were accessible at the beginning of resolution).
> However, we can fail resolution on the next path component if it remains
> outside of the root. A path which has always been outside nd->root
> during resolution will never be resolveable from nd->root and thus will
> always be blocked.
> 
> DO NOT MERGE: Currently this code returns -EMULTIHOP in this case,
> purely as a debugging measure (so that you can see that
> the protection actually does something). Obviously in the
> proper patch this will return -EXDEV.
> 
> Signed-off-by: Aleksa Sarai 
> ---
>  fs/namei.c | 32 ++--
>  1 file changed, 30 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/namei.c b/fs/namei.c
> index 6f995e6de6b1..c8349693d47b 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -53,8 +53,8 @@
>   * The new code replaces the old recursive symlink resolution with
>   * an iterative one (in case of non-nested symlink chains).  It does
>   * this with calls to _follow_link().
> - * As a side effect, dir_namei(), _namei() and follow_link() are now 
> - * replaced with a single function lookup_dentry() that can handle all 
> + * As a side effect, dir_namei(), _namei() and follow_link() are now
> + * replaced with a single function lookup_dentry() that can handle all
>   * the special cases of the former code.
>   *
>   * With the new dcache, the pathname is stored at each inode, at least as
> @@ -1375,6 +1375,20 @@ static int follow_dotdot_rcu(struct nameidata *nd)
>   return -EXDEV;
>   break;
>   }
> + if (unlikely(nd->flags & (LOOKUP_BENEATH | LOOKUP_CHROOT))) {
> + char *pathbuf, *pathptr;
> +
> + pathbuf = kmalloc(PATH_MAX, GFP_ATOMIC);
> + if (!pathbuf)
> + return -ECHILD;
> + pathptr = __d_path(>path, >root, pathbuf, 
> PATH_MAX);
> + kfree(pathbuf);
> + if (IS_ERR_OR_NULL(pathptr)) {
> +   

Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-04 Thread Christian Brauner
On Fri, Oct 05, 2018 at 02:26:11AM +1000, Aleksa Sarai wrote:
> On 2018-09-29, Jann Horn  wrote:
> > You attempt to open "C/../../etc/passwd" under the root "/A/B".
> > Something else concurrently moves /A/B/C to /A/C. This can result in
> > the following:
> > 
> > 1. You start the path walk and reach /A/B/C.
> > 2. The other process moves /A/B/C to /A/C. Your path walk is now at /A/C.
> > 3. Your path walk follows the first ".." up into /A. This is outside
> > the process root, but you never actually encountered the process root,
> > so you don't notice.
> > 4. Your path walk follows the second ".." up to /. Again, this is
> > outside the process root, but you don't notice.
> > 5. Your path walk walks down to /etc/passwd, and the open completes
> > successfully. You now have an fd pointing outside your chroot.
> 
> I've been playing with this and I have the following patch, which
> according to my testing protects against attacks where ".." skips over
> nd->root. It abuses __d_path to figure out if nd->path can be resolved
> from nd->root (obviously a proper version of this patch would refactor
> __d_path so it could be used like this -- and would not return
> -EMULTIHOP).
> 
> I've also attached my reproducer. With it, I was seeing fairly constant
> breakouts before this patch and after it I didn't see a single breakout
> after running it overnight. Obviously this is not conclusive, but I'm
> hoping that it can show what my idea for protecting against ".." was.
> 
> Does this patch make sense? Or is there something wrong with it that I'm
> not seeing?

Interesting.
Apart from the abuse of __d_path() :) the question I'd have is whether
this just minimizes the race window or if you can provide a sound
argument that this actually can't happen anymore with this patch.

> 
> --8<---
> 
> There is a fairly easy-to-exploit race condition with chroot(2) (and
> thus by extension AT_THIS_ROOT and AT_BENEATH) where a rename(2) of a
> path can be used to "skip over" nd->root and thus escape to the
> filesystem above nd->root.
> 
>   thread1 [attacker]:
> for (;;)
>   renameat2(AT_FDCWD, "/a/b/c", AT_FDCWD, "/a/d", RENAME_EXCHANGE);
>   thread2 [victim]:
> for (;;)
>   openat(dirb, "b/c/../../etc/shadow", O_THISROOT);
> 
> With fairly significant regularity, thread2 will resolve to
> "/etc/shadow" rather than "/a/b/etc/shadow". With this patch, such cases
> will be detected during ".." resolution (which is the weak point of
> chroot(2) -- since walking *into* a subdirectory tautologically cannot
> result in you walking *outside* nd->root).
> 
> The use of __d_path here might seem suspect, however we don't mind if a
> path is moved from within the chroot to outside the chroot and we
> incorrectly decide it is safe (because at that point we are still within
> the set of files which were accessible at the beginning of resolution).
> However, we can fail resolution on the next path component if it remains
> outside of the root. A path which has always been outside nd->root
> during resolution will never be resolveable from nd->root and thus will
> always be blocked.
> 
> DO NOT MERGE: Currently this code returns -EMULTIHOP in this case,
> purely as a debugging measure (so that you can see that
> the protection actually does something). Obviously in the
> proper patch this will return -EXDEV.
> 
> Signed-off-by: Aleksa Sarai 
> ---
>  fs/namei.c | 32 ++--
>  1 file changed, 30 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/namei.c b/fs/namei.c
> index 6f995e6de6b1..c8349693d47b 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -53,8 +53,8 @@
>   * The new code replaces the old recursive symlink resolution with
>   * an iterative one (in case of non-nested symlink chains).  It does
>   * this with calls to _follow_link().
> - * As a side effect, dir_namei(), _namei() and follow_link() are now 
> - * replaced with a single function lookup_dentry() that can handle all 
> + * As a side effect, dir_namei(), _namei() and follow_link() are now
> + * replaced with a single function lookup_dentry() that can handle all
>   * the special cases of the former code.
>   *
>   * With the new dcache, the pathname is stored at each inode, at least as
> @@ -1375,6 +1375,20 @@ static int follow_dotdot_rcu(struct nameidata *nd)
>   return -EXDEV;
>   break;
>   }
> + if (unlikely(nd->flags & (LOOKUP_BENEATH | LOOKUP_CHROOT))) {
> + char *pathbuf, *pathptr;
> +
> + pathbuf = kmalloc(PATH_MAX, GFP_ATOMIC);
> + if (!pathbuf)
> + return -ECHILD;
> + pathptr = __d_path(>path, >root, pathbuf, 
> PATH_MAX);
> + kfree(pathbuf);
> + if (IS_ERR_OR_NULL(pathptr)) {
> +   

Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-04 Thread Christian Brauner
On Tue, Oct 02, 2018 at 02:18:33AM +1000, Aleksa Sarai wrote:
> On 2018-10-01, Jann Horn  wrote:
> > > If this is an issue for AT_THIS_ROOT, I believe this might also be an
> > > issue for AT_BENEATH since they are effectively both using the same
> > > nd->root trick (so you could similarly trick AT_BENEATH to not error
> > > out). So we'd need to figure out how to solve this problem in order for
> > > AT_BENEATH to be safe.
> > 
> > Oh, wait, what? I think I didn't notice that the semantics of
> > AT_BENEATH changed like that since the original posting of David
> > Drysdale's O_BENEATH_ONLY patch
> > (https://lore.kernel.org/lkml/1439458366-8223-2-git-send-email-drysd...@google.com/).
> > David's patch had nice, straightforward semantics, blocking any form
> > of upwards traversal. Why was that changed? Does anyone actually want
> > to use paths that contain ".." with AT_BENEATH? I would strongly
> > prefer something that blocks any use of "..".
> > 
> > @Al: It looks like this already changed back when you posted
> > https://lore.kernel.org/lkml/20170429220414.gt29...@zeniv.linux.org.uk/
> > ?
> 
> Yes, I copied the semantics from Al's patchset. I don't know why he felt
> strongly that this was the best idea, but in my opinion allowing paths
> like "a/../b/../c" seems like it's quite useful because otherwise you
> wouldn't be able to operate on most distribution root filesystems (many
> have symlinks that have ".." components). Looking at my own (openSUSE)
> machine there are something like 100k such symlinks (~37% of symlinks on
> my machine).
> 
> While I do understand that the easiest way of solving this problem is to
> disallow ".." entirely with AT_BENEATH, given that support ".." has
> utility, I would like to know whether it's actually not possible to have
> this work safely.
> 
> > > Speaking naively, doesn't it make sense to invalidate the walk if a path
> > > component was modified? Or is this something that would be far too
> > > costly with little benefit? What if we do more aggressive nd->root
> > > checks when resolving with AT_BENEATH or AT_THIS_ROOT (or if nd->root !=
> > > current->mnt_ns->root)?
> > 
> > It seems to me like doing that would basically require looking at each
> > node in the path walk twice? And it'd be difficult to guarantee
> > forward progress unless you're willing to do some fairly heavy
> > locking.
> 
> I had another idea since I wrote my previous mail -- since the issue (at
> least the way I understand it) is that ".." can "skip" over nd->root
> because of the rename, what if we had some sort of is_descendant() check
> within follow_dotdot()? (FWIW, ".." already has some pretty interesting
> "hand-over-hand" locking semantics.) This should be effectively similar
> to how prepend_path() deals with a path that is not reachable from @root
> (I'm not sure if the locking is acceptable for the namei path though).
> 
> Since ".." with AT_THIS_ROOT (or AT_BENEATH) is not going to be the most
> common component type (and we only need to do these checks for those
> flags), I would think that the extra cost would not be _that_ awful.
> 
> Would this work?
> 
> > > You're right about this -- for C runtimes. In Go we cannot do a raw
> > > clone() or fork() (if you do it manually with RawSyscall you'll end with
> > > broken runtime state). So you're forced to do fork+exec (which then
> > > means that you can't use CLONE_FILES and must use SCM_RIGHTS). Same goes
> > > for CLONE_VFORK.
> > 
> > If you insist on implementing every last bit of your code in Go, I guess.
> 
> Fair enough, though I believe this would affect most multi-threaded
> programs as well (regardless of the language they're written in).

(Depends on whether you do any explicit locking and have atfork handlers
for your locks and so on. If you do a clone syscall directly to avoid
having libc running any additional atfork handlers (flushing streams
etc.) it's doable though not ideal.)


Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-04 Thread Christian Brauner
On Tue, Oct 02, 2018 at 02:18:33AM +1000, Aleksa Sarai wrote:
> On 2018-10-01, Jann Horn  wrote:
> > > If this is an issue for AT_THIS_ROOT, I believe this might also be an
> > > issue for AT_BENEATH since they are effectively both using the same
> > > nd->root trick (so you could similarly trick AT_BENEATH to not error
> > > out). So we'd need to figure out how to solve this problem in order for
> > > AT_BENEATH to be safe.
> > 
> > Oh, wait, what? I think I didn't notice that the semantics of
> > AT_BENEATH changed like that since the original posting of David
> > Drysdale's O_BENEATH_ONLY patch
> > (https://lore.kernel.org/lkml/1439458366-8223-2-git-send-email-drysd...@google.com/).
> > David's patch had nice, straightforward semantics, blocking any form
> > of upwards traversal. Why was that changed? Does anyone actually want
> > to use paths that contain ".." with AT_BENEATH? I would strongly
> > prefer something that blocks any use of "..".
> > 
> > @Al: It looks like this already changed back when you posted
> > https://lore.kernel.org/lkml/20170429220414.gt29...@zeniv.linux.org.uk/
> > ?
> 
> Yes, I copied the semantics from Al's patchset. I don't know why he felt
> strongly that this was the best idea, but in my opinion allowing paths
> like "a/../b/../c" seems like it's quite useful because otherwise you
> wouldn't be able to operate on most distribution root filesystems (many
> have symlinks that have ".." components). Looking at my own (openSUSE)
> machine there are something like 100k such symlinks (~37% of symlinks on
> my machine).
> 
> While I do understand that the easiest way of solving this problem is to
> disallow ".." entirely with AT_BENEATH, given that support ".." has
> utility, I would like to know whether it's actually not possible to have
> this work safely.
> 
> > > Speaking naively, doesn't it make sense to invalidate the walk if a path
> > > component was modified? Or is this something that would be far too
> > > costly with little benefit? What if we do more aggressive nd->root
> > > checks when resolving with AT_BENEATH or AT_THIS_ROOT (or if nd->root !=
> > > current->mnt_ns->root)?
> > 
> > It seems to me like doing that would basically require looking at each
> > node in the path walk twice? And it'd be difficult to guarantee
> > forward progress unless you're willing to do some fairly heavy
> > locking.
> 
> I had another idea since I wrote my previous mail -- since the issue (at
> least the way I understand it) is that ".." can "skip" over nd->root
> because of the rename, what if we had some sort of is_descendant() check
> within follow_dotdot()? (FWIW, ".." already has some pretty interesting
> "hand-over-hand" locking semantics.) This should be effectively similar
> to how prepend_path() deals with a path that is not reachable from @root
> (I'm not sure if the locking is acceptable for the namei path though).
> 
> Since ".." with AT_THIS_ROOT (or AT_BENEATH) is not going to be the most
> common component type (and we only need to do these checks for those
> flags), I would think that the extra cost would not be _that_ awful.
> 
> Would this work?
> 
> > > You're right about this -- for C runtimes. In Go we cannot do a raw
> > > clone() or fork() (if you do it manually with RawSyscall you'll end with
> > > broken runtime state). So you're forced to do fork+exec (which then
> > > means that you can't use CLONE_FILES and must use SCM_RIGHTS). Same goes
> > > for CLONE_VFORK.
> > 
> > If you insist on implementing every last bit of your code in Go, I guess.
> 
> Fair enough, though I believe this would affect most multi-threaded
> programs as well (regardless of the language they're written in).

(Depends on whether you do any explicit locking and have atfork handlers
for your locks and so on. If you do a clone syscall directly to avoid
having libc running any additional atfork handlers (flushing streams
etc.) it's doable though not ideal.)


Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-04 Thread Aleksa Sarai
On 2018-09-29, Jann Horn  wrote:
> You attempt to open "C/../../etc/passwd" under the root "/A/B".
> Something else concurrently moves /A/B/C to /A/C. This can result in
> the following:
> 
> 1. You start the path walk and reach /A/B/C.
> 2. The other process moves /A/B/C to /A/C. Your path walk is now at /A/C.
> 3. Your path walk follows the first ".." up into /A. This is outside
> the process root, but you never actually encountered the process root,
> so you don't notice.
> 4. Your path walk follows the second ".." up to /. Again, this is
> outside the process root, but you don't notice.
> 5. Your path walk walks down to /etc/passwd, and the open completes
> successfully. You now have an fd pointing outside your chroot.

I've been playing with this and I have the following patch, which
according to my testing protects against attacks where ".." skips over
nd->root. It abuses __d_path to figure out if nd->path can be resolved
from nd->root (obviously a proper version of this patch would refactor
__d_path so it could be used like this -- and would not return
-EMULTIHOP).

I've also attached my reproducer. With it, I was seeing fairly constant
breakouts before this patch and after it I didn't see a single breakout
after running it overnight. Obviously this is not conclusive, but I'm
hoping that it can show what my idea for protecting against ".." was.

Does this patch make sense? Or is there something wrong with it that I'm
not seeing?

--8<---

There is a fairly easy-to-exploit race condition with chroot(2) (and
thus by extension AT_THIS_ROOT and AT_BENEATH) where a rename(2) of a
path can be used to "skip over" nd->root and thus escape to the
filesystem above nd->root.

  thread1 [attacker]:
for (;;)
  renameat2(AT_FDCWD, "/a/b/c", AT_FDCWD, "/a/d", RENAME_EXCHANGE);
  thread2 [victim]:
for (;;)
  openat(dirb, "b/c/../../etc/shadow", O_THISROOT);

With fairly significant regularity, thread2 will resolve to
"/etc/shadow" rather than "/a/b/etc/shadow". With this patch, such cases
will be detected during ".." resolution (which is the weak point of
chroot(2) -- since walking *into* a subdirectory tautologically cannot
result in you walking *outside* nd->root).

The use of __d_path here might seem suspect, however we don't mind if a
path is moved from within the chroot to outside the chroot and we
incorrectly decide it is safe (because at that point we are still within
the set of files which were accessible at the beginning of resolution).
However, we can fail resolution on the next path component if it remains
outside of the root. A path which has always been outside nd->root
during resolution will never be resolveable from nd->root and thus will
always be blocked.

DO NOT MERGE: Currently this code returns -EMULTIHOP in this case,
  purely as a debugging measure (so that you can see that
  the protection actually does something). Obviously in the
  proper patch this will return -EXDEV.

Signed-off-by: Aleksa Sarai 
---
 fs/namei.c | 32 ++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 6f995e6de6b1..c8349693d47b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -53,8 +53,8 @@
  * The new code replaces the old recursive symlink resolution with
  * an iterative one (in case of non-nested symlink chains).  It does
  * this with calls to _follow_link().
- * As a side effect, dir_namei(), _namei() and follow_link() are now 
- * replaced with a single function lookup_dentry() that can handle all 
+ * As a side effect, dir_namei(), _namei() and follow_link() are now
+ * replaced with a single function lookup_dentry() that can handle all
  * the special cases of the former code.
  *
  * With the new dcache, the pathname is stored at each inode, at least as
@@ -1375,6 +1375,20 @@ static int follow_dotdot_rcu(struct nameidata *nd)
return -EXDEV;
break;
}
+   if (unlikely(nd->flags & (LOOKUP_BENEATH | LOOKUP_CHROOT))) {
+   char *pathbuf, *pathptr;
+
+   pathbuf = kmalloc(PATH_MAX, GFP_ATOMIC);
+   if (!pathbuf)
+   return -ECHILD;
+   pathptr = __d_path(>path, >root, pathbuf, 
PATH_MAX);
+   kfree(pathbuf);
+   if (IS_ERR_OR_NULL(pathptr)) {
+   if (!pathptr)
+   pathptr = ERR_PTR(-EMULTIHOP);
+   return PTR_ERR(pathptr);
+   }
+   }
if (nd->path.dentry != nd->path.mnt->mnt_root) {
struct dentry *old = nd->path.dentry;
struct dentry *parent = old->d_parent;
@@ -1510,6 +1524,20 @@ static int follow_dotdot(struct nameidata 

Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-04 Thread Aleksa Sarai
On 2018-09-29, Jann Horn  wrote:
> You attempt to open "C/../../etc/passwd" under the root "/A/B".
> Something else concurrently moves /A/B/C to /A/C. This can result in
> the following:
> 
> 1. You start the path walk and reach /A/B/C.
> 2. The other process moves /A/B/C to /A/C. Your path walk is now at /A/C.
> 3. Your path walk follows the first ".." up into /A. This is outside
> the process root, but you never actually encountered the process root,
> so you don't notice.
> 4. Your path walk follows the second ".." up to /. Again, this is
> outside the process root, but you don't notice.
> 5. Your path walk walks down to /etc/passwd, and the open completes
> successfully. You now have an fd pointing outside your chroot.

I've been playing with this and I have the following patch, which
according to my testing protects against attacks where ".." skips over
nd->root. It abuses __d_path to figure out if nd->path can be resolved
from nd->root (obviously a proper version of this patch would refactor
__d_path so it could be used like this -- and would not return
-EMULTIHOP).

I've also attached my reproducer. With it, I was seeing fairly constant
breakouts before this patch and after it I didn't see a single breakout
after running it overnight. Obviously this is not conclusive, but I'm
hoping that it can show what my idea for protecting against ".." was.

Does this patch make sense? Or is there something wrong with it that I'm
not seeing?

--8<---

There is a fairly easy-to-exploit race condition with chroot(2) (and
thus by extension AT_THIS_ROOT and AT_BENEATH) where a rename(2) of a
path can be used to "skip over" nd->root and thus escape to the
filesystem above nd->root.

  thread1 [attacker]:
for (;;)
  renameat2(AT_FDCWD, "/a/b/c", AT_FDCWD, "/a/d", RENAME_EXCHANGE);
  thread2 [victim]:
for (;;)
  openat(dirb, "b/c/../../etc/shadow", O_THISROOT);

With fairly significant regularity, thread2 will resolve to
"/etc/shadow" rather than "/a/b/etc/shadow". With this patch, such cases
will be detected during ".." resolution (which is the weak point of
chroot(2) -- since walking *into* a subdirectory tautologically cannot
result in you walking *outside* nd->root).

The use of __d_path here might seem suspect, however we don't mind if a
path is moved from within the chroot to outside the chroot and we
incorrectly decide it is safe (because at that point we are still within
the set of files which were accessible at the beginning of resolution).
However, we can fail resolution on the next path component if it remains
outside of the root. A path which has always been outside nd->root
during resolution will never be resolveable from nd->root and thus will
always be blocked.

DO NOT MERGE: Currently this code returns -EMULTIHOP in this case,
  purely as a debugging measure (so that you can see that
  the protection actually does something). Obviously in the
  proper patch this will return -EXDEV.

Signed-off-by: Aleksa Sarai 
---
 fs/namei.c | 32 ++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 6f995e6de6b1..c8349693d47b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -53,8 +53,8 @@
  * The new code replaces the old recursive symlink resolution with
  * an iterative one (in case of non-nested symlink chains).  It does
  * this with calls to _follow_link().
- * As a side effect, dir_namei(), _namei() and follow_link() are now 
- * replaced with a single function lookup_dentry() that can handle all 
+ * As a side effect, dir_namei(), _namei() and follow_link() are now
+ * replaced with a single function lookup_dentry() that can handle all
  * the special cases of the former code.
  *
  * With the new dcache, the pathname is stored at each inode, at least as
@@ -1375,6 +1375,20 @@ static int follow_dotdot_rcu(struct nameidata *nd)
return -EXDEV;
break;
}
+   if (unlikely(nd->flags & (LOOKUP_BENEATH | LOOKUP_CHROOT))) {
+   char *pathbuf, *pathptr;
+
+   pathbuf = kmalloc(PATH_MAX, GFP_ATOMIC);
+   if (!pathbuf)
+   return -ECHILD;
+   pathptr = __d_path(>path, >root, pathbuf, 
PATH_MAX);
+   kfree(pathbuf);
+   if (IS_ERR_OR_NULL(pathptr)) {
+   if (!pathptr)
+   pathptr = ERR_PTR(-EMULTIHOP);
+   return PTR_ERR(pathptr);
+   }
+   }
if (nd->path.dentry != nd->path.mnt->mnt_root) {
struct dentry *old = nd->path.dentry;
struct dentry *parent = old->d_parent;
@@ -1510,6 +1524,20 @@ static int follow_dotdot(struct nameidata 

Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-03 Thread Andy Lutomirski
On Tue, Oct 2, 2018 at 12:32 AM Aleksa Sarai  wrote:
>
> On 2018-10-01, Andy Lutomirski  wrote:
> > >>> Currently most container runtimes try to do this resolution in
> > >>> userspace[1], causing many potential race conditions. In addition, the
> > >>> "obvious" alternative (actually performing a {ch,pivot_}root(2))
> > >>> requires a fork+exec which is *very* costly if necessary for every
> > >>> filesystem operation involving a container.
> > >>
> > >> Wait. fork() I understand, but why exec? And actually, you don't need
> > >> a full fork() either, clone() lets you do this with some process parts
> > >> shared. And then you also shouldn't need to use SCM_RIGHTS, just keep
> > >> the file descriptor table shared. And why chroot()/pivot_root(),
> > >> wouldn't you want to use setns()?
> > >
> > > You're right about this -- for C runtimes. In Go we cannot do a raw
> > > clone() or fork() (if you do it manually with RawSyscall you'll end with
> > > broken runtime state). So you're forced to do fork+exec (which then
> > > means that you can't use CLONE_FILES and must use SCM_RIGHTS). Same goes
> > > for CLONE_VFORK.
> >
> > I must admit that I’m not very sympathetic to the argument that “Go’s
> > runtime model is incompatible with the simpler solution.”
>
> Multi-threaded programs have a similar issue (though with Go it's much
> worse). If you fork a multi-threaded C program then you can only safely
> use AS-Safe glibc functions (those that are safe within a signal
> handler). But if you're just doing three syscalls this shouldn't be as
> big of a problem as Go where you can't even do said syscalls.
>
> > Anyway, it occurs to me that the real problem is that setns() and
> > chroot() are both overkill for this use case.
>
> I agree. My diversion to Go was to explain why it was particularly bad
> for cri-o/rkt/runc/Docker/etc.
>
> > What’s needed is to start your walk from /proc/pid-in-container/root,
> > with two twists:
> >
> > 1. Do the walk as though rooted at a directory. This is basically just
> > your AT_THIS_ROOT, but the footgun is avoided because the dirfd you
> > use is from a foreign namespace, and, except for symlinks to absolute
> > paths, no amount of .. racing is going to escape the *namespace*.
>
> This is quite clever and I'll admit I hadn't thought of this. This
> definitely fixes the ".." issue, but as you've said it won't handle
> absolute symlinks (which means userspace has the same races that we
> currently have even if you assume that you have a container process
> already running -- CVE-2018-15664 is one of many examples of this).
>
> (AT_THIS_ROOT using /proc/$container/root would in principle fix all of
> the mentioned issues -- but as I said before I'd like to see whether
> hardening ".." would be enough to solve the escape problem.)

Hmm.  Good point.


Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-03 Thread Andy Lutomirski
On Tue, Oct 2, 2018 at 12:32 AM Aleksa Sarai  wrote:
>
> On 2018-10-01, Andy Lutomirski  wrote:
> > >>> Currently most container runtimes try to do this resolution in
> > >>> userspace[1], causing many potential race conditions. In addition, the
> > >>> "obvious" alternative (actually performing a {ch,pivot_}root(2))
> > >>> requires a fork+exec which is *very* costly if necessary for every
> > >>> filesystem operation involving a container.
> > >>
> > >> Wait. fork() I understand, but why exec? And actually, you don't need
> > >> a full fork() either, clone() lets you do this with some process parts
> > >> shared. And then you also shouldn't need to use SCM_RIGHTS, just keep
> > >> the file descriptor table shared. And why chroot()/pivot_root(),
> > >> wouldn't you want to use setns()?
> > >
> > > You're right about this -- for C runtimes. In Go we cannot do a raw
> > > clone() or fork() (if you do it manually with RawSyscall you'll end with
> > > broken runtime state). So you're forced to do fork+exec (which then
> > > means that you can't use CLONE_FILES and must use SCM_RIGHTS). Same goes
> > > for CLONE_VFORK.
> >
> > I must admit that I’m not very sympathetic to the argument that “Go’s
> > runtime model is incompatible with the simpler solution.”
>
> Multi-threaded programs have a similar issue (though with Go it's much
> worse). If you fork a multi-threaded C program then you can only safely
> use AS-Safe glibc functions (those that are safe within a signal
> handler). But if you're just doing three syscalls this shouldn't be as
> big of a problem as Go where you can't even do said syscalls.
>
> > Anyway, it occurs to me that the real problem is that setns() and
> > chroot() are both overkill for this use case.
>
> I agree. My diversion to Go was to explain why it was particularly bad
> for cri-o/rkt/runc/Docker/etc.
>
> > What’s needed is to start your walk from /proc/pid-in-container/root,
> > with two twists:
> >
> > 1. Do the walk as though rooted at a directory. This is basically just
> > your AT_THIS_ROOT, but the footgun is avoided because the dirfd you
> > use is from a foreign namespace, and, except for symlinks to absolute
> > paths, no amount of .. racing is going to escape the *namespace*.
>
> This is quite clever and I'll admit I hadn't thought of this. This
> definitely fixes the ".." issue, but as you've said it won't handle
> absolute symlinks (which means userspace has the same races that we
> currently have even if you assume that you have a container process
> already running -- CVE-2018-15664 is one of many examples of this).
>
> (AT_THIS_ROOT using /proc/$container/root would in principle fix all of
> the mentioned issues -- but as I said before I'd like to see whether
> hardening ".." would be enough to solve the escape problem.)

Hmm.  Good point.


Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-02 Thread Aleksa Sarai
On 2018-10-01, Andy Lutomirski  wrote:
> >>> Currently most container runtimes try to do this resolution in
> >>> userspace[1], causing many potential race conditions. In addition, the
> >>> "obvious" alternative (actually performing a {ch,pivot_}root(2))
> >>> requires a fork+exec which is *very* costly if necessary for every
> >>> filesystem operation involving a container.
> >> 
> >> Wait. fork() I understand, but why exec? And actually, you don't need
> >> a full fork() either, clone() lets you do this with some process parts
> >> shared. And then you also shouldn't need to use SCM_RIGHTS, just keep
> >> the file descriptor table shared. And why chroot()/pivot_root(),
> >> wouldn't you want to use setns()?
> > 
> > You're right about this -- for C runtimes. In Go we cannot do a raw
> > clone() or fork() (if you do it manually with RawSyscall you'll end with
> > broken runtime state). So you're forced to do fork+exec (which then
> > means that you can't use CLONE_FILES and must use SCM_RIGHTS). Same goes
> > for CLONE_VFORK.
> 
> I must admit that I’m not very sympathetic to the argument that “Go’s
> runtime model is incompatible with the simpler solution.”

Multi-threaded programs have a similar issue (though with Go it's much
worse). If you fork a multi-threaded C program then you can only safely
use AS-Safe glibc functions (those that are safe within a signal
handler). But if you're just doing three syscalls this shouldn't be as
big of a problem as Go where you can't even do said syscalls.

> Anyway, it occurs to me that the real problem is that setns() and
> chroot() are both overkill for this use case.

I agree. My diversion to Go was to explain why it was particularly bad
for cri-o/rkt/runc/Docker/etc.

> What’s needed is to start your walk from /proc/pid-in-container/root,
> with two twists:
> 
> 1. Do the walk as though rooted at a directory. This is basically just
> your AT_THIS_ROOT, but the footgun is avoided because the dirfd you
> use is from a foreign namespace, and, except for symlinks to absolute
> paths, no amount of .. racing is going to escape the *namespace*.

This is quite clever and I'll admit I hadn't thought of this. This
definitely fixes the ".." issue, but as you've said it won't handle
absolute symlinks (which means userspace has the same races that we
currently have even if you assume that you have a container process
already running -- CVE-2018-15664 is one of many examples of this).

(AT_THIS_ROOT using /proc/$container/root would in principle fix all of
the mentioned issues -- but as I said before I'd like to see whether
hardening ".." would be enough to solve the escape problem.)

> 2. Avoid /proc. It’s not just the *links* — you really don’t want to
> walk into /proc/self. *Maybe* procfs is already careful enough when
> mounted in a namespace?

I just tried it and /proc/self gives you -ENOENT. I believe this is
because it does a check against the pid namespace that the procfs mount
has pinned.

-- 
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH



signature.asc
Description: PGP signature


Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-02 Thread Aleksa Sarai
On 2018-10-01, Andy Lutomirski  wrote:
> >>> Currently most container runtimes try to do this resolution in
> >>> userspace[1], causing many potential race conditions. In addition, the
> >>> "obvious" alternative (actually performing a {ch,pivot_}root(2))
> >>> requires a fork+exec which is *very* costly if necessary for every
> >>> filesystem operation involving a container.
> >> 
> >> Wait. fork() I understand, but why exec? And actually, you don't need
> >> a full fork() either, clone() lets you do this with some process parts
> >> shared. And then you also shouldn't need to use SCM_RIGHTS, just keep
> >> the file descriptor table shared. And why chroot()/pivot_root(),
> >> wouldn't you want to use setns()?
> > 
> > You're right about this -- for C runtimes. In Go we cannot do a raw
> > clone() or fork() (if you do it manually with RawSyscall you'll end with
> > broken runtime state). So you're forced to do fork+exec (which then
> > means that you can't use CLONE_FILES and must use SCM_RIGHTS). Same goes
> > for CLONE_VFORK.
> 
> I must admit that I’m not very sympathetic to the argument that “Go’s
> runtime model is incompatible with the simpler solution.”

Multi-threaded programs have a similar issue (though with Go it's much
worse). If you fork a multi-threaded C program then you can only safely
use AS-Safe glibc functions (those that are safe within a signal
handler). But if you're just doing three syscalls this shouldn't be as
big of a problem as Go where you can't even do said syscalls.

> Anyway, it occurs to me that the real problem is that setns() and
> chroot() are both overkill for this use case.

I agree. My diversion to Go was to explain why it was particularly bad
for cri-o/rkt/runc/Docker/etc.

> What’s needed is to start your walk from /proc/pid-in-container/root,
> with two twists:
> 
> 1. Do the walk as though rooted at a directory. This is basically just
> your AT_THIS_ROOT, but the footgun is avoided because the dirfd you
> use is from a foreign namespace, and, except for symlinks to absolute
> paths, no amount of .. racing is going to escape the *namespace*.

This is quite clever and I'll admit I hadn't thought of this. This
definitely fixes the ".." issue, but as you've said it won't handle
absolute symlinks (which means userspace has the same races that we
currently have even if you assume that you have a container process
already running -- CVE-2018-15664 is one of many examples of this).

(AT_THIS_ROOT using /proc/$container/root would in principle fix all of
the mentioned issues -- but as I said before I'd like to see whether
hardening ".." would be enough to solve the escape problem.)

> 2. Avoid /proc. It’s not just the *links* — you really don’t want to
> walk into /proc/self. *Maybe* procfs is already careful enough when
> mounted in a namespace?

I just tried it and /proc/self gives you -ENOENT. I believe this is
because it does a check against the pid namespace that the procfs mount
has pinned.

-- 
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH



signature.asc
Description: PGP signature


Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-01 Thread Aleksa Sarai
On 2018-10-01, Jann Horn  wrote:
> > If this is an issue for AT_THIS_ROOT, I believe this might also be an
> > issue for AT_BENEATH since they are effectively both using the same
> > nd->root trick (so you could similarly trick AT_BENEATH to not error
> > out). So we'd need to figure out how to solve this problem in order for
> > AT_BENEATH to be safe.
> 
> Oh, wait, what? I think I didn't notice that the semantics of
> AT_BENEATH changed like that since the original posting of David
> Drysdale's O_BENEATH_ONLY patch
> (https://lore.kernel.org/lkml/1439458366-8223-2-git-send-email-drysd...@google.com/).
> David's patch had nice, straightforward semantics, blocking any form
> of upwards traversal. Why was that changed? Does anyone actually want
> to use paths that contain ".." with AT_BENEATH? I would strongly
> prefer something that blocks any use of "..".
> 
> @Al: It looks like this already changed back when you posted
> https://lore.kernel.org/lkml/20170429220414.gt29...@zeniv.linux.org.uk/
> ?

Yes, I copied the semantics from Al's patchset. I don't know why he felt
strongly that this was the best idea, but in my opinion allowing paths
like "a/../b/../c" seems like it's quite useful because otherwise you
wouldn't be able to operate on most distribution root filesystems (many
have symlinks that have ".." components). Looking at my own (openSUSE)
machine there are something like 100k such symlinks (~37% of symlinks on
my machine).

While I do understand that the easiest way of solving this problem is to
disallow ".." entirely with AT_BENEATH, given that support ".." has
utility, I would like to know whether it's actually not possible to have
this work safely.

> > Speaking naively, doesn't it make sense to invalidate the walk if a path
> > component was modified? Or is this something that would be far too
> > costly with little benefit? What if we do more aggressive nd->root
> > checks when resolving with AT_BENEATH or AT_THIS_ROOT (or if nd->root !=
> > current->mnt_ns->root)?
> 
> It seems to me like doing that would basically require looking at each
> node in the path walk twice? And it'd be difficult to guarantee
> forward progress unless you're willing to do some fairly heavy
> locking.

I had another idea since I wrote my previous mail -- since the issue (at
least the way I understand it) is that ".." can "skip" over nd->root
because of the rename, what if we had some sort of is_descendant() check
within follow_dotdot()? (FWIW, ".." already has some pretty interesting
"hand-over-hand" locking semantics.) This should be effectively similar
to how prepend_path() deals with a path that is not reachable from @root
(I'm not sure if the locking is acceptable for the namei path though).

Since ".." with AT_THIS_ROOT (or AT_BENEATH) is not going to be the most
common component type (and we only need to do these checks for those
flags), I would think that the extra cost would not be _that_ awful.

Would this work?

> > You're right about this -- for C runtimes. In Go we cannot do a raw
> > clone() or fork() (if you do it manually with RawSyscall you'll end with
> > broken runtime state). So you're forced to do fork+exec (which then
> > means that you can't use CLONE_FILES and must use SCM_RIGHTS). Same goes
> > for CLONE_VFORK.
> 
> If you insist on implementing every last bit of your code in Go, I guess.

Fair enough, though I believe this would affect most multi-threaded
programs as well (regardless of the language they're written in).

-- 
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH



signature.asc
Description: PGP signature


Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-01 Thread Aleksa Sarai
On 2018-10-01, Jann Horn  wrote:
> > If this is an issue for AT_THIS_ROOT, I believe this might also be an
> > issue for AT_BENEATH since they are effectively both using the same
> > nd->root trick (so you could similarly trick AT_BENEATH to not error
> > out). So we'd need to figure out how to solve this problem in order for
> > AT_BENEATH to be safe.
> 
> Oh, wait, what? I think I didn't notice that the semantics of
> AT_BENEATH changed like that since the original posting of David
> Drysdale's O_BENEATH_ONLY patch
> (https://lore.kernel.org/lkml/1439458366-8223-2-git-send-email-drysd...@google.com/).
> David's patch had nice, straightforward semantics, blocking any form
> of upwards traversal. Why was that changed? Does anyone actually want
> to use paths that contain ".." with AT_BENEATH? I would strongly
> prefer something that blocks any use of "..".
> 
> @Al: It looks like this already changed back when you posted
> https://lore.kernel.org/lkml/20170429220414.gt29...@zeniv.linux.org.uk/
> ?

Yes, I copied the semantics from Al's patchset. I don't know why he felt
strongly that this was the best idea, but in my opinion allowing paths
like "a/../b/../c" seems like it's quite useful because otherwise you
wouldn't be able to operate on most distribution root filesystems (many
have symlinks that have ".." components). Looking at my own (openSUSE)
machine there are something like 100k such symlinks (~37% of symlinks on
my machine).

While I do understand that the easiest way of solving this problem is to
disallow ".." entirely with AT_BENEATH, given that support ".." has
utility, I would like to know whether it's actually not possible to have
this work safely.

> > Speaking naively, doesn't it make sense to invalidate the walk if a path
> > component was modified? Or is this something that would be far too
> > costly with little benefit? What if we do more aggressive nd->root
> > checks when resolving with AT_BENEATH or AT_THIS_ROOT (or if nd->root !=
> > current->mnt_ns->root)?
> 
> It seems to me like doing that would basically require looking at each
> node in the path walk twice? And it'd be difficult to guarantee
> forward progress unless you're willing to do some fairly heavy
> locking.

I had another idea since I wrote my previous mail -- since the issue (at
least the way I understand it) is that ".." can "skip" over nd->root
because of the rename, what if we had some sort of is_descendant() check
within follow_dotdot()? (FWIW, ".." already has some pretty interesting
"hand-over-hand" locking semantics.) This should be effectively similar
to how prepend_path() deals with a path that is not reachable from @root
(I'm not sure if the locking is acceptable for the namei path though).

Since ".." with AT_THIS_ROOT (or AT_BENEATH) is not going to be the most
common component type (and we only need to do these checks for those
flags), I would think that the extra cost would not be _that_ awful.

Would this work?

> > You're right about this -- for C runtimes. In Go we cannot do a raw
> > clone() or fork() (if you do it manually with RawSyscall you'll end with
> > broken runtime state). So you're forced to do fork+exec (which then
> > means that you can't use CLONE_FILES and must use SCM_RIGHTS). Same goes
> > for CLONE_VFORK.
> 
> If you insist on implementing every last bit of your code in Go, I guess.

Fair enough, though I believe this would affect most multi-threaded
programs as well (regardless of the language they're written in).

-- 
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH



signature.asc
Description: PGP signature


Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-01 Thread Andy Lutomirski


>>> Currently most container runtimes try to do this resolution in
>>> userspace[1], causing many potential race conditions. In addition, the
>>> "obvious" alternative (actually performing a {ch,pivot_}root(2))
>>> requires a fork+exec which is *very* costly if necessary for every
>>> filesystem operation involving a container.
>> 
>> Wait. fork() I understand, but why exec? And actually, you don't need
>> a full fork() either, clone() lets you do this with some process parts
>> shared. And then you also shouldn't need to use SCM_RIGHTS, just keep
>> the file descriptor table shared. And why chroot()/pivot_root(),
>> wouldn't you want to use setns()?
> 
> You're right about this -- for C runtimes. In Go we cannot do a raw
> clone() or fork() (if you do it manually with RawSyscall you'll end with
> broken runtime state). So you're forced to do fork+exec (which then
> means that you can't use CLONE_FILES and must use SCM_RIGHTS). Same goes
> for CLONE_VFORK.

I must admit that I’m not very sympathetic to the argument that “Go’s runtime 
model is incompatible with the simpler solution.”

Anyway, it occurs to me that the real problem is that setns() and chroot() are 
both overkill for this use case. What’s needed is to start your walk from 
/proc/pid-in-container/root, with two twists:

1. Do the walk as though rooted at a directory. This is basically just your 
AT_THIS_ROOT, but the footgun is avoided because the dirfd you use is from a 
foreign namespace, and, except for symlinks to absolute paths, no amount of .. 
racing is going to escape the *namespace*.

2. Avoid /proc. It’s not just the *links* — you really don’t want to walk into 
/proc/self. *Maybe* procfs is already careful enough when mounted in a 
namespace?


Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-01 Thread Andy Lutomirski


>>> Currently most container runtimes try to do this resolution in
>>> userspace[1], causing many potential race conditions. In addition, the
>>> "obvious" alternative (actually performing a {ch,pivot_}root(2))
>>> requires a fork+exec which is *very* costly if necessary for every
>>> filesystem operation involving a container.
>> 
>> Wait. fork() I understand, but why exec? And actually, you don't need
>> a full fork() either, clone() lets you do this with some process parts
>> shared. And then you also shouldn't need to use SCM_RIGHTS, just keep
>> the file descriptor table shared. And why chroot()/pivot_root(),
>> wouldn't you want to use setns()?
> 
> You're right about this -- for C runtimes. In Go we cannot do a raw
> clone() or fork() (if you do it manually with RawSyscall you'll end with
> broken runtime state). So you're forced to do fork+exec (which then
> means that you can't use CLONE_FILES and must use SCM_RIGHTS). Same goes
> for CLONE_VFORK.

I must admit that I’m not very sympathetic to the argument that “Go’s runtime 
model is incompatible with the simpler solution.”

Anyway, it occurs to me that the real problem is that setns() and chroot() are 
both overkill for this use case. What’s needed is to start your walk from 
/proc/pid-in-container/root, with two twists:

1. Do the walk as though rooted at a directory. This is basically just your 
AT_THIS_ROOT, but the footgun is avoided because the dirfd you use is from a 
foreign namespace, and, except for symlinks to absolute paths, no amount of .. 
racing is going to escape the *namespace*.

2. Avoid /proc. It’s not just the *links* — you really don’t want to walk into 
/proc/self. *Maybe* procfs is already careful enough when mounted in a 
namespace?


Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-01 Thread Christian Brauner
On Sat, Sep 29, 2018 at 06:35:17PM +0200, Jann Horn wrote:
> +cc linux-api; please keep them in CC for future versions of the patch
> 
> On Sat, Sep 29, 2018 at 4:29 PM Aleksa Sarai  wrote:
> > The primary motivation for the need for this flag is container runtimes
> > which have to interact with malicious root filesystems in the host
> > namespaces. One of the first requirements for a container runtime to be
> > secure against a malicious rootfs is that they correctly scope symlinks
> > (that is, they should be scoped as though they are chroot(2)ed into the
> > container's rootfs) and ".."-style paths. The already-existing AT_XDEV
> > and AT_NO_PROCLINKS help defend against other potential attacks in a
> > malicious rootfs scenario.
> 
> So, I really like the concept for patch 1 of this series (but haven't
> read the code yet); but I dislike this patch because of its footgun
> potential.
> 
> If this patch landed as-is, the manpage would need some big warning
> labels. chroot() basically provides no security guarantees at all; and
> yes, that includes that if you do `chroot(...); chdir("/");
> open(attacker_controlled_path, ...);`, you can potentially end up
> opening a file outside the chroot. See
> https://github.com/QubesOS/qubes-secpack/blob/master/QSBs/qsb-014-2015.txt
> for an example, where Qubes OS did pretty much that, and ended up with
> a potentially exploitable security bug because of that, where one VM,
> while performing a file transfer into another VM, could write outside
> of the transfer target directory.
> The problem is what happens if a folder you are walking through is
> concurrently moved out of the chroot. Consider the following scenario:
> 
> You attempt to open "C/../../etc/passwd" under the root "/A/B".
> Something else concurrently moves /A/B/C to /A/C. This can result in
> the following:
> 
> 1. You start the path walk and reach /A/B/C.
> 2. The other process moves /A/B/C to /A/C. Your path walk is now at /A/C.
> 3. Your path walk follows the first ".." up into /A. This is outside
> the process root, but you never actually encountered the process root,
> so you don't notice.
> 4. Your path walk follows the second ".." up to /. Again, this is
> outside the process root, but you don't notice.
> 5. Your path walk walks down to /etc/passwd, and the open completes
> successfully. You now have an fd pointing outside your chroot.
> 
> If the root of your walk is below an attacker-controlled directory,
> this of course means that you lose instantly. If you point the root of
> the walk at a directory out of which a process in the container
> wouldn't be able to move the file, you're probably kinda mostly fine -
> as long as you know, for certain, that nothing else on the system
> would ever do that. But I still wouldn't feel good about that.
> 
> (Yes, this means that if you run an SFTP server with OpenSSH's
> ChrootDirectory directive, you have to be very careful about these
> things.)
> 
> I believe that the only way to robustly use this would be to point the
> dirfd at a mount point, such that you know that being moved out of the
> chroot is impossible because the mount point limits movement of
> directories under it. (Well, technically, it doesn't, but it ensures
> that if a directory does dangerously move away, the syscall fails.) It
> might make sense to hardcode this constraint in the implementation of
> AT_THIS_ROOT, to keep people from shooting themselves in the foot.

I'm very much in favor of dropping AT_THIS_ROOT from this patch series
at least for now and only land the first patch. The first patch is
something that we really want and that it seems we can find a good
design for.
If AT_THIS_ROOT is a feature that still makes sense we can revisit it.

> 
> > Currently most container runtimes try to do this resolution in
> > userspace[1], causing many potential race conditions. In addition, the
> > "obvious" alternative (actually performing a {ch,pivot_}root(2))
> > requires a fork+exec which is *very* costly if necessary for every
> > filesystem operation involving a container.
> 
> Wait. fork() I understand, but why exec? And actually, you don't need
> a full fork() either, clone() lets you do this with some process parts
> shared. And then you also shouldn't need to use SCM_RIGHTS, just keep
> the file descriptor table shared. And why chroot()/pivot_root(),
> wouldn't you want to use setns()? I think something like this should
> work (except that you should add some error handling - omitted here
> because I'm lazy), assuming that the container runtime does NOT have
> CAP_SYS_ADMIN in the init namespace (otherwise it's easier). Of
> course, this is entirely untested, and probably won't compile because
> I screwed something up. :P But you should get the idea...
> 
> // Ensure that we are non-dumpable. Together with
> // commit bfedb589252c, this ensures that container root
> // can't trace our child once it enters the container.
> // My patch
> // 
> 

Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-01 Thread Christian Brauner
On Sat, Sep 29, 2018 at 06:35:17PM +0200, Jann Horn wrote:
> +cc linux-api; please keep them in CC for future versions of the patch
> 
> On Sat, Sep 29, 2018 at 4:29 PM Aleksa Sarai  wrote:
> > The primary motivation for the need for this flag is container runtimes
> > which have to interact with malicious root filesystems in the host
> > namespaces. One of the first requirements for a container runtime to be
> > secure against a malicious rootfs is that they correctly scope symlinks
> > (that is, they should be scoped as though they are chroot(2)ed into the
> > container's rootfs) and ".."-style paths. The already-existing AT_XDEV
> > and AT_NO_PROCLINKS help defend against other potential attacks in a
> > malicious rootfs scenario.
> 
> So, I really like the concept for patch 1 of this series (but haven't
> read the code yet); but I dislike this patch because of its footgun
> potential.
> 
> If this patch landed as-is, the manpage would need some big warning
> labels. chroot() basically provides no security guarantees at all; and
> yes, that includes that if you do `chroot(...); chdir("/");
> open(attacker_controlled_path, ...);`, you can potentially end up
> opening a file outside the chroot. See
> https://github.com/QubesOS/qubes-secpack/blob/master/QSBs/qsb-014-2015.txt
> for an example, where Qubes OS did pretty much that, and ended up with
> a potentially exploitable security bug because of that, where one VM,
> while performing a file transfer into another VM, could write outside
> of the transfer target directory.
> The problem is what happens if a folder you are walking through is
> concurrently moved out of the chroot. Consider the following scenario:
> 
> You attempt to open "C/../../etc/passwd" under the root "/A/B".
> Something else concurrently moves /A/B/C to /A/C. This can result in
> the following:
> 
> 1. You start the path walk and reach /A/B/C.
> 2. The other process moves /A/B/C to /A/C. Your path walk is now at /A/C.
> 3. Your path walk follows the first ".." up into /A. This is outside
> the process root, but you never actually encountered the process root,
> so you don't notice.
> 4. Your path walk follows the second ".." up to /. Again, this is
> outside the process root, but you don't notice.
> 5. Your path walk walks down to /etc/passwd, and the open completes
> successfully. You now have an fd pointing outside your chroot.
> 
> If the root of your walk is below an attacker-controlled directory,
> this of course means that you lose instantly. If you point the root of
> the walk at a directory out of which a process in the container
> wouldn't be able to move the file, you're probably kinda mostly fine -
> as long as you know, for certain, that nothing else on the system
> would ever do that. But I still wouldn't feel good about that.
> 
> (Yes, this means that if you run an SFTP server with OpenSSH's
> ChrootDirectory directive, you have to be very careful about these
> things.)
> 
> I believe that the only way to robustly use this would be to point the
> dirfd at a mount point, such that you know that being moved out of the
> chroot is impossible because the mount point limits movement of
> directories under it. (Well, technically, it doesn't, but it ensures
> that if a directory does dangerously move away, the syscall fails.) It
> might make sense to hardcode this constraint in the implementation of
> AT_THIS_ROOT, to keep people from shooting themselves in the foot.

I'm very much in favor of dropping AT_THIS_ROOT from this patch series
at least for now and only land the first patch. The first patch is
something that we really want and that it seems we can find a good
design for.
If AT_THIS_ROOT is a feature that still makes sense we can revisit it.

> 
> > Currently most container runtimes try to do this resolution in
> > userspace[1], causing many potential race conditions. In addition, the
> > "obvious" alternative (actually performing a {ch,pivot_}root(2))
> > requires a fork+exec which is *very* costly if necessary for every
> > filesystem operation involving a container.
> 
> Wait. fork() I understand, but why exec? And actually, you don't need
> a full fork() either, clone() lets you do this with some process parts
> shared. And then you also shouldn't need to use SCM_RIGHTS, just keep
> the file descriptor table shared. And why chroot()/pivot_root(),
> wouldn't you want to use setns()? I think something like this should
> work (except that you should add some error handling - omitted here
> because I'm lazy), assuming that the container runtime does NOT have
> CAP_SYS_ADMIN in the init namespace (otherwise it's easier). Of
> course, this is entirely untested, and probably won't compile because
> I screwed something up. :P But you should get the idea...
> 
> // Ensure that we are non-dumpable. Together with
> // commit bfedb589252c, this ensures that container root
> // can't trace our child once it enters the container.
> // My patch
> // 
> 

Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-01 Thread Bruce Fields
On Mon, Oct 01, 2018 at 03:44:28PM +1000, Aleksa Sarai wrote:
> On 2018-09-29, Jann Horn  wrote:
> > The problem is what happens if a folder you are walking through is
> > concurrently moved out of the chroot. Consider the following scenario:
> > 
> > You attempt to open "C/../../etc/passwd" under the root "/A/B".
> > Something else concurrently moves /A/B/C to /A/C. This can result in
> > the following:
> > 
> > 1. You start the path walk and reach /A/B/C.
> > 2. The other process moves /A/B/C to /A/C. Your path walk is now at /A/C.
> > 3. Your path walk follows the first ".." up into /A. This is outside
> > the process root, but you never actually encountered the process root,
> > so you don't notice.
> > 4. Your path walk follows the second ".." up to /. Again, this is
> > outside the process root, but you don't notice.
> > 5. Your path walk walks down to /etc/passwd, and the open completes
> > successfully. You now have an fd pointing outside your chroot.
> > 
> > If the root of your walk is below an attacker-controlled directory,
> > this of course means that you lose instantly. If you point the root of
> > the walk at a directory out of which a process in the container
> > wouldn't be able to move the file, you're probably kinda mostly fine -
> > as long as you know, for certain, that nothing else on the system
> > would ever do that. But I still wouldn't feel good about that.
> 
> Please correct me if I'm wrong here (this is the first patch I've
> written for VFS). Isn't the retry/LOOKUP_REVAL code meant to handle this

No.

...
> Speaking naively, doesn't it make sense to invalidate the walk if a path
> component was modified? Or is this something that would be far too
> costly with little benefit?

Lookups and renames can definitely proceed in parallel, and yes I
suspect it would be difficult to get good performance and guaranteed
forward progress if you required lookup of the full path to be atomic
with respect to renames.

--b.


Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-01 Thread Bruce Fields
On Mon, Oct 01, 2018 at 03:44:28PM +1000, Aleksa Sarai wrote:
> On 2018-09-29, Jann Horn  wrote:
> > The problem is what happens if a folder you are walking through is
> > concurrently moved out of the chroot. Consider the following scenario:
> > 
> > You attempt to open "C/../../etc/passwd" under the root "/A/B".
> > Something else concurrently moves /A/B/C to /A/C. This can result in
> > the following:
> > 
> > 1. You start the path walk and reach /A/B/C.
> > 2. The other process moves /A/B/C to /A/C. Your path walk is now at /A/C.
> > 3. Your path walk follows the first ".." up into /A. This is outside
> > the process root, but you never actually encountered the process root,
> > so you don't notice.
> > 4. Your path walk follows the second ".." up to /. Again, this is
> > outside the process root, but you don't notice.
> > 5. Your path walk walks down to /etc/passwd, and the open completes
> > successfully. You now have an fd pointing outside your chroot.
> > 
> > If the root of your walk is below an attacker-controlled directory,
> > this of course means that you lose instantly. If you point the root of
> > the walk at a directory out of which a process in the container
> > wouldn't be able to move the file, you're probably kinda mostly fine -
> > as long as you know, for certain, that nothing else on the system
> > would ever do that. But I still wouldn't feel good about that.
> 
> Please correct me if I'm wrong here (this is the first patch I've
> written for VFS). Isn't the retry/LOOKUP_REVAL code meant to handle this

No.

...
> Speaking naively, doesn't it make sense to invalidate the walk if a path
> component was modified? Or is this something that would be far too
> costly with little benefit?

Lookups and renames can definitely proceed in parallel, and yes I
suspect it would be difficult to get good performance and guaranteed
forward progress if you required lookup of the full path to be atomic
with respect to renames.

--b.


Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-01 Thread Christian Brauner
On Mon, Oct 01, 2018 at 01:29:16PM +0200, Jann Horn wrote:
> On Mon, Oct 1, 2018 at 12:42 PM Christian Brauner  
> wrote:
> > On Mon, Oct 01, 2018 at 03:44:28PM +1000, Aleksa Sarai wrote:
> > > On 2018-09-29, Jann Horn  wrote:
> > > > The problem is what happens if a folder you are walking through is
> > > > concurrently moved out of the chroot. Consider the following scenario:
> > > >
> > > > You attempt to open "C/../../etc/passwd" under the root "/A/B".
> > > > Something else concurrently moves /A/B/C to /A/C. This can result in
> > > > the following:
> > > >
> > > > 1. You start the path walk and reach /A/B/C.
> > > > 2. The other process moves /A/B/C to /A/C. Your path walk is now at 
> > > > /A/C.
> > > > 3. Your path walk follows the first ".." up into /A. This is outside
> > > > the process root, but you never actually encountered the process root,
> > > > so you don't notice.
> > > > 4. Your path walk follows the second ".." up to /. Again, this is
> > > > outside the process root, but you don't notice.
> > > > 5. Your path walk walks down to /etc/passwd, and the open completes
> > > > successfully. You now have an fd pointing outside your chroot.
> > > >
> > > > If the root of your walk is below an attacker-controlled directory,
> > > > this of course means that you lose instantly. If you point the root of
> > > > the walk at a directory out of which a process in the container
> > > > wouldn't be able to move the file, you're probably kinda mostly fine -
> > > > as long as you know, for certain, that nothing else on the system
> > > > would ever do that. But I still wouldn't feel good about that.
> > >
> > > Please correct me if I'm wrong here (this is the first patch I've
> > > written for VFS). Isn't the retry/LOOKUP_REVAL code meant to handle this
> > > -- or does that only handle if a particular path component changes
> > > *while* it's being walked through? Is it possible for a path walk to
> > > succeed after a path component was unmounted (obviously you can't delete
> > > a directory path component since you'd get -ENOTEMPTY)?
> > >
> > > If this is an issue for AT_THIS_ROOT, I believe this might also be an
> > > issue for AT_BENEATH since they are effectively both using the same
> > > nd->root trick (so you could similarly trick AT_BENEATH to not error
> > > out). So we'd need to figure out how to solve this problem in order for
> > > AT_BENEATH to be safe.
> > >
> > > Speaking naively, doesn't it make sense to invalidate the walk if a path
> > > component was modified? Or is this something that would be far too
> > > costly with little benefit? What if we do more aggressive nd->root
> > > checks when resolving with AT_BENEATH or AT_THIS_ROOT (or if nd->root !=
> > > current->mnt_ns->root)?
> > >
> > > Regarding chroot attacks, I was aware of the trivial
> > > chroot-open-chroot-fchdir attack but I was not aware that there was a
> > > rename attack for chroot. Thanks for bringing this up!
> > >
> > > > I believe that the only way to robustly use this would be to point the
> > > > dirfd at a mount point, such that you know that being moved out of the
> > > > chroot is impossible because the mount point limits movement of
> > > > directories under it. (Well, technically, it doesn't, but it ensures
> > > > that if a directory does dangerously move away, the syscall fails.) It
> > > > might make sense to hardcode this constraint in the implementation of
> > > > AT_THIS_ROOT, to keep people from shooting themselves in the foot.
> > >
> > > Unless I'm missing something, would this not also affect using a
> > > mountpoint as a dirfd-root (with MS_MOVE of an already-walked-through
> > > path component) -- or does MS_MOVE cause a rewalk in a way that rename
> > > does not?
> > >
> > > I wouldn't mind tying AT_THIS_ROOT to only work on mountpoints (I
> > > thought that bind-mounts would be an issue but you also get -EXDEV when
> > > trying to rename across bind-mounts even if they are on the same
> > > underlying filesystem). But AT_BENEATH might be a more bitter pill to
> > > swallow. I'm not sure.
> > >
> > > In the usecase of container runtimes, we wouldn't generally be doing
> > > resolution of attacker-controlled paths but it still definitely doesn't
> > > hurt to consider this part of the threat model -- to avoid foot-gunning
> > > as you've said. (There also might be some nested-container cases where
> > > you might want to do that.)
> > >
> > > > > Currently most container runtimes try to do this resolution in
> > > > > userspace[1], causing many potential race conditions. In addition, the
> > > > > "obvious" alternative (actually performing a {ch,pivot_}root(2))
> > > > > requires a fork+exec which is *very* costly if necessary for every
> > > > > filesystem operation involving a container.
> > > >
> > > > Wait. fork() I understand, but why exec? And actually, you don't need
> > > > a full fork() either, clone() lets you do this with some process parts
> > > > shared. And then you also 

Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-01 Thread Christian Brauner
On Mon, Oct 01, 2018 at 01:29:16PM +0200, Jann Horn wrote:
> On Mon, Oct 1, 2018 at 12:42 PM Christian Brauner  
> wrote:
> > On Mon, Oct 01, 2018 at 03:44:28PM +1000, Aleksa Sarai wrote:
> > > On 2018-09-29, Jann Horn  wrote:
> > > > The problem is what happens if a folder you are walking through is
> > > > concurrently moved out of the chroot. Consider the following scenario:
> > > >
> > > > You attempt to open "C/../../etc/passwd" under the root "/A/B".
> > > > Something else concurrently moves /A/B/C to /A/C. This can result in
> > > > the following:
> > > >
> > > > 1. You start the path walk and reach /A/B/C.
> > > > 2. The other process moves /A/B/C to /A/C. Your path walk is now at 
> > > > /A/C.
> > > > 3. Your path walk follows the first ".." up into /A. This is outside
> > > > the process root, but you never actually encountered the process root,
> > > > so you don't notice.
> > > > 4. Your path walk follows the second ".." up to /. Again, this is
> > > > outside the process root, but you don't notice.
> > > > 5. Your path walk walks down to /etc/passwd, and the open completes
> > > > successfully. You now have an fd pointing outside your chroot.
> > > >
> > > > If the root of your walk is below an attacker-controlled directory,
> > > > this of course means that you lose instantly. If you point the root of
> > > > the walk at a directory out of which a process in the container
> > > > wouldn't be able to move the file, you're probably kinda mostly fine -
> > > > as long as you know, for certain, that nothing else on the system
> > > > would ever do that. But I still wouldn't feel good about that.
> > >
> > > Please correct me if I'm wrong here (this is the first patch I've
> > > written for VFS). Isn't the retry/LOOKUP_REVAL code meant to handle this
> > > -- or does that only handle if a particular path component changes
> > > *while* it's being walked through? Is it possible for a path walk to
> > > succeed after a path component was unmounted (obviously you can't delete
> > > a directory path component since you'd get -ENOTEMPTY)?
> > >
> > > If this is an issue for AT_THIS_ROOT, I believe this might also be an
> > > issue for AT_BENEATH since they are effectively both using the same
> > > nd->root trick (so you could similarly trick AT_BENEATH to not error
> > > out). So we'd need to figure out how to solve this problem in order for
> > > AT_BENEATH to be safe.
> > >
> > > Speaking naively, doesn't it make sense to invalidate the walk if a path
> > > component was modified? Or is this something that would be far too
> > > costly with little benefit? What if we do more aggressive nd->root
> > > checks when resolving with AT_BENEATH or AT_THIS_ROOT (or if nd->root !=
> > > current->mnt_ns->root)?
> > >
> > > Regarding chroot attacks, I was aware of the trivial
> > > chroot-open-chroot-fchdir attack but I was not aware that there was a
> > > rename attack for chroot. Thanks for bringing this up!
> > >
> > > > I believe that the only way to robustly use this would be to point the
> > > > dirfd at a mount point, such that you know that being moved out of the
> > > > chroot is impossible because the mount point limits movement of
> > > > directories under it. (Well, technically, it doesn't, but it ensures
> > > > that if a directory does dangerously move away, the syscall fails.) It
> > > > might make sense to hardcode this constraint in the implementation of
> > > > AT_THIS_ROOT, to keep people from shooting themselves in the foot.
> > >
> > > Unless I'm missing something, would this not also affect using a
> > > mountpoint as a dirfd-root (with MS_MOVE of an already-walked-through
> > > path component) -- or does MS_MOVE cause a rewalk in a way that rename
> > > does not?
> > >
> > > I wouldn't mind tying AT_THIS_ROOT to only work on mountpoints (I
> > > thought that bind-mounts would be an issue but you also get -EXDEV when
> > > trying to rename across bind-mounts even if they are on the same
> > > underlying filesystem). But AT_BENEATH might be a more bitter pill to
> > > swallow. I'm not sure.
> > >
> > > In the usecase of container runtimes, we wouldn't generally be doing
> > > resolution of attacker-controlled paths but it still definitely doesn't
> > > hurt to consider this part of the threat model -- to avoid foot-gunning
> > > as you've said. (There also might be some nested-container cases where
> > > you might want to do that.)
> > >
> > > > > Currently most container runtimes try to do this resolution in
> > > > > userspace[1], causing many potential race conditions. In addition, the
> > > > > "obvious" alternative (actually performing a {ch,pivot_}root(2))
> > > > > requires a fork+exec which is *very* costly if necessary for every
> > > > > filesystem operation involving a container.
> > > >
> > > > Wait. fork() I understand, but why exec? And actually, you don't need
> > > > a full fork() either, clone() lets you do this with some process parts
> > > > shared. And then you also 

Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-01 Thread Jann Horn
On Mon, Oct 1, 2018 at 12:42 PM Christian Brauner  wrote:
> On Mon, Oct 01, 2018 at 03:44:28PM +1000, Aleksa Sarai wrote:
> > On 2018-09-29, Jann Horn  wrote:
> > > The problem is what happens if a folder you are walking through is
> > > concurrently moved out of the chroot. Consider the following scenario:
> > >
> > > You attempt to open "C/../../etc/passwd" under the root "/A/B".
> > > Something else concurrently moves /A/B/C to /A/C. This can result in
> > > the following:
> > >
> > > 1. You start the path walk and reach /A/B/C.
> > > 2. The other process moves /A/B/C to /A/C. Your path walk is now at /A/C.
> > > 3. Your path walk follows the first ".." up into /A. This is outside
> > > the process root, but you never actually encountered the process root,
> > > so you don't notice.
> > > 4. Your path walk follows the second ".." up to /. Again, this is
> > > outside the process root, but you don't notice.
> > > 5. Your path walk walks down to /etc/passwd, and the open completes
> > > successfully. You now have an fd pointing outside your chroot.
> > >
> > > If the root of your walk is below an attacker-controlled directory,
> > > this of course means that you lose instantly. If you point the root of
> > > the walk at a directory out of which a process in the container
> > > wouldn't be able to move the file, you're probably kinda mostly fine -
> > > as long as you know, for certain, that nothing else on the system
> > > would ever do that. But I still wouldn't feel good about that.
> >
> > Please correct me if I'm wrong here (this is the first patch I've
> > written for VFS). Isn't the retry/LOOKUP_REVAL code meant to handle this
> > -- or does that only handle if a particular path component changes
> > *while* it's being walked through? Is it possible for a path walk to
> > succeed after a path component was unmounted (obviously you can't delete
> > a directory path component since you'd get -ENOTEMPTY)?
> >
> > If this is an issue for AT_THIS_ROOT, I believe this might also be an
> > issue for AT_BENEATH since they are effectively both using the same
> > nd->root trick (so you could similarly trick AT_BENEATH to not error
> > out). So we'd need to figure out how to solve this problem in order for
> > AT_BENEATH to be safe.
> >
> > Speaking naively, doesn't it make sense to invalidate the walk if a path
> > component was modified? Or is this something that would be far too
> > costly with little benefit? What if we do more aggressive nd->root
> > checks when resolving with AT_BENEATH or AT_THIS_ROOT (or if nd->root !=
> > current->mnt_ns->root)?
> >
> > Regarding chroot attacks, I was aware of the trivial
> > chroot-open-chroot-fchdir attack but I was not aware that there was a
> > rename attack for chroot. Thanks for bringing this up!
> >
> > > I believe that the only way to robustly use this would be to point the
> > > dirfd at a mount point, such that you know that being moved out of the
> > > chroot is impossible because the mount point limits movement of
> > > directories under it. (Well, technically, it doesn't, but it ensures
> > > that if a directory does dangerously move away, the syscall fails.) It
> > > might make sense to hardcode this constraint in the implementation of
> > > AT_THIS_ROOT, to keep people from shooting themselves in the foot.
> >
> > Unless I'm missing something, would this not also affect using a
> > mountpoint as a dirfd-root (with MS_MOVE of an already-walked-through
> > path component) -- or does MS_MOVE cause a rewalk in a way that rename
> > does not?
> >
> > I wouldn't mind tying AT_THIS_ROOT to only work on mountpoints (I
> > thought that bind-mounts would be an issue but you also get -EXDEV when
> > trying to rename across bind-mounts even if they are on the same
> > underlying filesystem). But AT_BENEATH might be a more bitter pill to
> > swallow. I'm not sure.
> >
> > In the usecase of container runtimes, we wouldn't generally be doing
> > resolution of attacker-controlled paths but it still definitely doesn't
> > hurt to consider this part of the threat model -- to avoid foot-gunning
> > as you've said. (There also might be some nested-container cases where
> > you might want to do that.)
> >
> > > > Currently most container runtimes try to do this resolution in
> > > > userspace[1], causing many potential race conditions. In addition, the
> > > > "obvious" alternative (actually performing a {ch,pivot_}root(2))
> > > > requires a fork+exec which is *very* costly if necessary for every
> > > > filesystem operation involving a container.
> > >
> > > Wait. fork() I understand, but why exec? And actually, you don't need
> > > a full fork() either, clone() lets you do this with some process parts
> > > shared. And then you also shouldn't need to use SCM_RIGHTS, just keep
> > > the file descriptor table shared. And why chroot()/pivot_root(),
> > > wouldn't you want to use setns()?
> >
> > You're right about this -- for C runtimes. In Go we cannot do a raw
> > 

Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-01 Thread Jann Horn
On Mon, Oct 1, 2018 at 12:42 PM Christian Brauner  wrote:
> On Mon, Oct 01, 2018 at 03:44:28PM +1000, Aleksa Sarai wrote:
> > On 2018-09-29, Jann Horn  wrote:
> > > The problem is what happens if a folder you are walking through is
> > > concurrently moved out of the chroot. Consider the following scenario:
> > >
> > > You attempt to open "C/../../etc/passwd" under the root "/A/B".
> > > Something else concurrently moves /A/B/C to /A/C. This can result in
> > > the following:
> > >
> > > 1. You start the path walk and reach /A/B/C.
> > > 2. The other process moves /A/B/C to /A/C. Your path walk is now at /A/C.
> > > 3. Your path walk follows the first ".." up into /A. This is outside
> > > the process root, but you never actually encountered the process root,
> > > so you don't notice.
> > > 4. Your path walk follows the second ".." up to /. Again, this is
> > > outside the process root, but you don't notice.
> > > 5. Your path walk walks down to /etc/passwd, and the open completes
> > > successfully. You now have an fd pointing outside your chroot.
> > >
> > > If the root of your walk is below an attacker-controlled directory,
> > > this of course means that you lose instantly. If you point the root of
> > > the walk at a directory out of which a process in the container
> > > wouldn't be able to move the file, you're probably kinda mostly fine -
> > > as long as you know, for certain, that nothing else on the system
> > > would ever do that. But I still wouldn't feel good about that.
> >
> > Please correct me if I'm wrong here (this is the first patch I've
> > written for VFS). Isn't the retry/LOOKUP_REVAL code meant to handle this
> > -- or does that only handle if a particular path component changes
> > *while* it's being walked through? Is it possible for a path walk to
> > succeed after a path component was unmounted (obviously you can't delete
> > a directory path component since you'd get -ENOTEMPTY)?
> >
> > If this is an issue for AT_THIS_ROOT, I believe this might also be an
> > issue for AT_BENEATH since they are effectively both using the same
> > nd->root trick (so you could similarly trick AT_BENEATH to not error
> > out). So we'd need to figure out how to solve this problem in order for
> > AT_BENEATH to be safe.
> >
> > Speaking naively, doesn't it make sense to invalidate the walk if a path
> > component was modified? Or is this something that would be far too
> > costly with little benefit? What if we do more aggressive nd->root
> > checks when resolving with AT_BENEATH or AT_THIS_ROOT (or if nd->root !=
> > current->mnt_ns->root)?
> >
> > Regarding chroot attacks, I was aware of the trivial
> > chroot-open-chroot-fchdir attack but I was not aware that there was a
> > rename attack for chroot. Thanks for bringing this up!
> >
> > > I believe that the only way to robustly use this would be to point the
> > > dirfd at a mount point, such that you know that being moved out of the
> > > chroot is impossible because the mount point limits movement of
> > > directories under it. (Well, technically, it doesn't, but it ensures
> > > that if a directory does dangerously move away, the syscall fails.) It
> > > might make sense to hardcode this constraint in the implementation of
> > > AT_THIS_ROOT, to keep people from shooting themselves in the foot.
> >
> > Unless I'm missing something, would this not also affect using a
> > mountpoint as a dirfd-root (with MS_MOVE of an already-walked-through
> > path component) -- or does MS_MOVE cause a rewalk in a way that rename
> > does not?
> >
> > I wouldn't mind tying AT_THIS_ROOT to only work on mountpoints (I
> > thought that bind-mounts would be an issue but you also get -EXDEV when
> > trying to rename across bind-mounts even if they are on the same
> > underlying filesystem). But AT_BENEATH might be a more bitter pill to
> > swallow. I'm not sure.
> >
> > In the usecase of container runtimes, we wouldn't generally be doing
> > resolution of attacker-controlled paths but it still definitely doesn't
> > hurt to consider this part of the threat model -- to avoid foot-gunning
> > as you've said. (There also might be some nested-container cases where
> > you might want to do that.)
> >
> > > > Currently most container runtimes try to do this resolution in
> > > > userspace[1], causing many potential race conditions. In addition, the
> > > > "obvious" alternative (actually performing a {ch,pivot_}root(2))
> > > > requires a fork+exec which is *very* costly if necessary for every
> > > > filesystem operation involving a container.
> > >
> > > Wait. fork() I understand, but why exec? And actually, you don't need
> > > a full fork() either, clone() lets you do this with some process parts
> > > shared. And then you also shouldn't need to use SCM_RIGHTS, just keep
> > > the file descriptor table shared. And why chroot()/pivot_root(),
> > > wouldn't you want to use setns()?
> >
> > You're right about this -- for C runtimes. In Go we cannot do a raw
> > 

Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-01 Thread Christian Brauner
On Mon, Oct 01, 2018 at 03:44:28PM +1000, Aleksa Sarai wrote:
> On 2018-09-29, Jann Horn  wrote:
> > The problem is what happens if a folder you are walking through is
> > concurrently moved out of the chroot. Consider the following scenario:
> > 
> > You attempt to open "C/../../etc/passwd" under the root "/A/B".
> > Something else concurrently moves /A/B/C to /A/C. This can result in
> > the following:
> > 
> > 1. You start the path walk and reach /A/B/C.
> > 2. The other process moves /A/B/C to /A/C. Your path walk is now at /A/C.
> > 3. Your path walk follows the first ".." up into /A. This is outside
> > the process root, but you never actually encountered the process root,
> > so you don't notice.
> > 4. Your path walk follows the second ".." up to /. Again, this is
> > outside the process root, but you don't notice.
> > 5. Your path walk walks down to /etc/passwd, and the open completes
> > successfully. You now have an fd pointing outside your chroot.
> > 
> > If the root of your walk is below an attacker-controlled directory,
> > this of course means that you lose instantly. If you point the root of
> > the walk at a directory out of which a process in the container
> > wouldn't be able to move the file, you're probably kinda mostly fine -
> > as long as you know, for certain, that nothing else on the system
> > would ever do that. But I still wouldn't feel good about that.
> 
> Please correct me if I'm wrong here (this is the first patch I've
> written for VFS). Isn't the retry/LOOKUP_REVAL code meant to handle this
> -- or does that only handle if a particular path component changes
> *while* it's being walked through? Is it possible for a path walk to
> succeed after a path component was unmounted (obviously you can't delete
> a directory path component since you'd get -ENOTEMPTY)?
> 
> If this is an issue for AT_THIS_ROOT, I believe this might also be an
> issue for AT_BENEATH since they are effectively both using the same
> nd->root trick (so you could similarly trick AT_BENEATH to not error
> out). So we'd need to figure out how to solve this problem in order for
> AT_BENEATH to be safe.
> 
> Speaking naively, doesn't it make sense to invalidate the walk if a path
> component was modified? Or is this something that would be far too
> costly with little benefit? What if we do more aggressive nd->root
> checks when resolving with AT_BENEATH or AT_THIS_ROOT (or if nd->root !=
> current->mnt_ns->root)?
> 
> Regarding chroot attacks, I was aware of the trivial
> chroot-open-chroot-fchdir attack but I was not aware that there was a
> rename attack for chroot. Thanks for bringing this up!
> 
> > I believe that the only way to robustly use this would be to point the
> > dirfd at a mount point, such that you know that being moved out of the
> > chroot is impossible because the mount point limits movement of
> > directories under it. (Well, technically, it doesn't, but it ensures
> > that if a directory does dangerously move away, the syscall fails.) It
> > might make sense to hardcode this constraint in the implementation of
> > AT_THIS_ROOT, to keep people from shooting themselves in the foot.
> 
> Unless I'm missing something, would this not also affect using a
> mountpoint as a dirfd-root (with MS_MOVE of an already-walked-through
> path component) -- or does MS_MOVE cause a rewalk in a way that rename
> does not?
> 
> I wouldn't mind tying AT_THIS_ROOT to only work on mountpoints (I
> thought that bind-mounts would be an issue but you also get -EXDEV when
> trying to rename across bind-mounts even if they are on the same
> underlying filesystem). But AT_BENEATH might be a more bitter pill to
> swallow. I'm not sure.
> 
> In the usecase of container runtimes, we wouldn't generally be doing
> resolution of attacker-controlled paths but it still definitely doesn't
> hurt to consider this part of the threat model -- to avoid foot-gunning
> as you've said. (There also might be some nested-container cases where
> you might want to do that.)
> 
> > > Currently most container runtimes try to do this resolution in
> > > userspace[1], causing many potential race conditions. In addition, the
> > > "obvious" alternative (actually performing a {ch,pivot_}root(2))
> > > requires a fork+exec which is *very* costly if necessary for every
> > > filesystem operation involving a container.
> > 
> > Wait. fork() I understand, but why exec? And actually, you don't need
> > a full fork() either, clone() lets you do this with some process parts
> > shared. And then you also shouldn't need to use SCM_RIGHTS, just keep
> > the file descriptor table shared. And why chroot()/pivot_root(),
> > wouldn't you want to use setns()?
> 
> You're right about this -- for C runtimes. In Go we cannot do a raw
> clone() or fork() (if you do it manually with RawSyscall you'll end with
> broken runtime state). So you're forced to do fork+exec (which then
> means that you can't use CLONE_FILES and must use SCM_RIGHTS). Same goes
> for 

Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-01 Thread Christian Brauner
On Mon, Oct 01, 2018 at 03:44:28PM +1000, Aleksa Sarai wrote:
> On 2018-09-29, Jann Horn  wrote:
> > The problem is what happens if a folder you are walking through is
> > concurrently moved out of the chroot. Consider the following scenario:
> > 
> > You attempt to open "C/../../etc/passwd" under the root "/A/B".
> > Something else concurrently moves /A/B/C to /A/C. This can result in
> > the following:
> > 
> > 1. You start the path walk and reach /A/B/C.
> > 2. The other process moves /A/B/C to /A/C. Your path walk is now at /A/C.
> > 3. Your path walk follows the first ".." up into /A. This is outside
> > the process root, but you never actually encountered the process root,
> > so you don't notice.
> > 4. Your path walk follows the second ".." up to /. Again, this is
> > outside the process root, but you don't notice.
> > 5. Your path walk walks down to /etc/passwd, and the open completes
> > successfully. You now have an fd pointing outside your chroot.
> > 
> > If the root of your walk is below an attacker-controlled directory,
> > this of course means that you lose instantly. If you point the root of
> > the walk at a directory out of which a process in the container
> > wouldn't be able to move the file, you're probably kinda mostly fine -
> > as long as you know, for certain, that nothing else on the system
> > would ever do that. But I still wouldn't feel good about that.
> 
> Please correct me if I'm wrong here (this is the first patch I've
> written for VFS). Isn't the retry/LOOKUP_REVAL code meant to handle this
> -- or does that only handle if a particular path component changes
> *while* it's being walked through? Is it possible for a path walk to
> succeed after a path component was unmounted (obviously you can't delete
> a directory path component since you'd get -ENOTEMPTY)?
> 
> If this is an issue for AT_THIS_ROOT, I believe this might also be an
> issue for AT_BENEATH since they are effectively both using the same
> nd->root trick (so you could similarly trick AT_BENEATH to not error
> out). So we'd need to figure out how to solve this problem in order for
> AT_BENEATH to be safe.
> 
> Speaking naively, doesn't it make sense to invalidate the walk if a path
> component was modified? Or is this something that would be far too
> costly with little benefit? What if we do more aggressive nd->root
> checks when resolving with AT_BENEATH or AT_THIS_ROOT (or if nd->root !=
> current->mnt_ns->root)?
> 
> Regarding chroot attacks, I was aware of the trivial
> chroot-open-chroot-fchdir attack but I was not aware that there was a
> rename attack for chroot. Thanks for bringing this up!
> 
> > I believe that the only way to robustly use this would be to point the
> > dirfd at a mount point, such that you know that being moved out of the
> > chroot is impossible because the mount point limits movement of
> > directories under it. (Well, technically, it doesn't, but it ensures
> > that if a directory does dangerously move away, the syscall fails.) It
> > might make sense to hardcode this constraint in the implementation of
> > AT_THIS_ROOT, to keep people from shooting themselves in the foot.
> 
> Unless I'm missing something, would this not also affect using a
> mountpoint as a dirfd-root (with MS_MOVE of an already-walked-through
> path component) -- or does MS_MOVE cause a rewalk in a way that rename
> does not?
> 
> I wouldn't mind tying AT_THIS_ROOT to only work on mountpoints (I
> thought that bind-mounts would be an issue but you also get -EXDEV when
> trying to rename across bind-mounts even if they are on the same
> underlying filesystem). But AT_BENEATH might be a more bitter pill to
> swallow. I'm not sure.
> 
> In the usecase of container runtimes, we wouldn't generally be doing
> resolution of attacker-controlled paths but it still definitely doesn't
> hurt to consider this part of the threat model -- to avoid foot-gunning
> as you've said. (There also might be some nested-container cases where
> you might want to do that.)
> 
> > > Currently most container runtimes try to do this resolution in
> > > userspace[1], causing many potential race conditions. In addition, the
> > > "obvious" alternative (actually performing a {ch,pivot_}root(2))
> > > requires a fork+exec which is *very* costly if necessary for every
> > > filesystem operation involving a container.
> > 
> > Wait. fork() I understand, but why exec? And actually, you don't need
> > a full fork() either, clone() lets you do this with some process parts
> > shared. And then you also shouldn't need to use SCM_RIGHTS, just keep
> > the file descriptor table shared. And why chroot()/pivot_root(),
> > wouldn't you want to use setns()?
> 
> You're right about this -- for C runtimes. In Go we cannot do a raw
> clone() or fork() (if you do it manually with RawSyscall you'll end with
> broken runtime state). So you're forced to do fork+exec (which then
> means that you can't use CLONE_FILES and must use SCM_RIGHTS). Same goes
> for 

Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-01 Thread Jann Horn
On Mon, Oct 1, 2018 at 7:44 AM Aleksa Sarai  wrote:
> On 2018-09-29, Jann Horn  wrote:
> > The problem is what happens if a folder you are walking through is
> > concurrently moved out of the chroot. Consider the following scenario:
> >
> > You attempt to open "C/../../etc/passwd" under the root "/A/B".
> > Something else concurrently moves /A/B/C to /A/C. This can result in
> > the following:
> >
> > 1. You start the path walk and reach /A/B/C.
> > 2. The other process moves /A/B/C to /A/C. Your path walk is now at /A/C.
> > 3. Your path walk follows the first ".." up into /A. This is outside
> > the process root, but you never actually encountered the process root,
> > so you don't notice.
> > 4. Your path walk follows the second ".." up to /. Again, this is
> > outside the process root, but you don't notice.
> > 5. Your path walk walks down to /etc/passwd, and the open completes
> > successfully. You now have an fd pointing outside your chroot.
> >
> > If the root of your walk is below an attacker-controlled directory,
> > this of course means that you lose instantly. If you point the root of
> > the walk at a directory out of which a process in the container
> > wouldn't be able to move the file, you're probably kinda mostly fine -
> > as long as you know, for certain, that nothing else on the system
> > would ever do that. But I still wouldn't feel good about that.
>
> Please correct me if I'm wrong here (this is the first patch I've
> written for VFS). Isn't the retry/LOOKUP_REVAL code meant to handle this
> -- or does that only handle if a particular path component changes
> *while* it's being walked through?

Eric Biederman should be able to talk about all this much better than
me, but as far as I know, the LOOKUP_REVAL path is only for dealing
with some special filesystems like procfs.

> Is it possible for a path walk to
> succeed after a path component was unmounted (obviously you can't delete
> a directory path component since you'd get -ENOTEMPTY)?

I don't think so, but I'm not exactly a VFS expert.

> If this is an issue for AT_THIS_ROOT, I believe this might also be an
> issue for AT_BENEATH since they are effectively both using the same
> nd->root trick (so you could similarly trick AT_BENEATH to not error
> out). So we'd need to figure out how to solve this problem in order for
> AT_BENEATH to be safe.

Oh, wait, what? I think I didn't notice that the semantics of
AT_BENEATH changed like that since the original posting of David
Drysdale's O_BENEATH_ONLY patch
(https://lore.kernel.org/lkml/1439458366-8223-2-git-send-email-drysd...@google.com/).
David's patch had nice, straightforward semantics, blocking any form
of upwards traversal. Why was that changed? Does anyone actually want
to use paths that contain ".." with AT_BENEATH? I would strongly
prefer something that blocks any use of "..".

@Al: It looks like this already changed back when you posted
https://lore.kernel.org/lkml/20170429220414.gt29...@zeniv.linux.org.uk/
?

> Speaking naively, doesn't it make sense to invalidate the walk if a path
> component was modified? Or is this something that would be far too
> costly with little benefit? What if we do more aggressive nd->root
> checks when resolving with AT_BENEATH or AT_THIS_ROOT (or if nd->root !=
> current->mnt_ns->root)?

It seems to me like doing that would basically require looking at each
node in the path walk twice? And it'd be difficult to guarantee
forward progress unless you're willing to do some fairly heavy
locking.

> Regarding chroot attacks, I was aware of the trivial
> chroot-open-chroot-fchdir attack but I was not aware that there was a
> rename attack for chroot. Thanks for bringing this up!
>
> > I believe that the only way to robustly use this would be to point the
> > dirfd at a mount point, such that you know that being moved out of the
> > chroot is impossible because the mount point limits movement of
> > directories under it. (Well, technically, it doesn't, but it ensures
> > that if a directory does dangerously move away, the syscall fails.) It
> > might make sense to hardcode this constraint in the implementation of
> > AT_THIS_ROOT, to keep people from shooting themselves in the foot.
>
> Unless I'm missing something, would this not also affect using a
> mountpoint as a dirfd-root (with MS_MOVE of an already-walked-through
> path component) -- or does MS_MOVE cause a rewalk in a way that rename
> does not?

Hmm. Good point.

It looks to me like you probably wouldn't be able to walk up through a
mountpoint in RCU mode after the mount hierarchy has changed (see
follow_dotdot_rcu()), but it might work in refwalk mode.

Eric?

> I wouldn't mind tying AT_THIS_ROOT to only work on mountpoints (I
> thought that bind-mounts would be an issue but you also get -EXDEV when
> trying to rename across bind-mounts even if they are on the same
> underlying filesystem). But AT_BENEATH might be a more bitter pill to
> swallow. I'm not sure.

Which is part of why I 

Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-01 Thread Jann Horn
On Mon, Oct 1, 2018 at 7:44 AM Aleksa Sarai  wrote:
> On 2018-09-29, Jann Horn  wrote:
> > The problem is what happens if a folder you are walking through is
> > concurrently moved out of the chroot. Consider the following scenario:
> >
> > You attempt to open "C/../../etc/passwd" under the root "/A/B".
> > Something else concurrently moves /A/B/C to /A/C. This can result in
> > the following:
> >
> > 1. You start the path walk and reach /A/B/C.
> > 2. The other process moves /A/B/C to /A/C. Your path walk is now at /A/C.
> > 3. Your path walk follows the first ".." up into /A. This is outside
> > the process root, but you never actually encountered the process root,
> > so you don't notice.
> > 4. Your path walk follows the second ".." up to /. Again, this is
> > outside the process root, but you don't notice.
> > 5. Your path walk walks down to /etc/passwd, and the open completes
> > successfully. You now have an fd pointing outside your chroot.
> >
> > If the root of your walk is below an attacker-controlled directory,
> > this of course means that you lose instantly. If you point the root of
> > the walk at a directory out of which a process in the container
> > wouldn't be able to move the file, you're probably kinda mostly fine -
> > as long as you know, for certain, that nothing else on the system
> > would ever do that. But I still wouldn't feel good about that.
>
> Please correct me if I'm wrong here (this is the first patch I've
> written for VFS). Isn't the retry/LOOKUP_REVAL code meant to handle this
> -- or does that only handle if a particular path component changes
> *while* it's being walked through?

Eric Biederman should be able to talk about all this much better than
me, but as far as I know, the LOOKUP_REVAL path is only for dealing
with some special filesystems like procfs.

> Is it possible for a path walk to
> succeed after a path component was unmounted (obviously you can't delete
> a directory path component since you'd get -ENOTEMPTY)?

I don't think so, but I'm not exactly a VFS expert.

> If this is an issue for AT_THIS_ROOT, I believe this might also be an
> issue for AT_BENEATH since they are effectively both using the same
> nd->root trick (so you could similarly trick AT_BENEATH to not error
> out). So we'd need to figure out how to solve this problem in order for
> AT_BENEATH to be safe.

Oh, wait, what? I think I didn't notice that the semantics of
AT_BENEATH changed like that since the original posting of David
Drysdale's O_BENEATH_ONLY patch
(https://lore.kernel.org/lkml/1439458366-8223-2-git-send-email-drysd...@google.com/).
David's patch had nice, straightforward semantics, blocking any form
of upwards traversal. Why was that changed? Does anyone actually want
to use paths that contain ".." with AT_BENEATH? I would strongly
prefer something that blocks any use of "..".

@Al: It looks like this already changed back when you posted
https://lore.kernel.org/lkml/20170429220414.gt29...@zeniv.linux.org.uk/
?

> Speaking naively, doesn't it make sense to invalidate the walk if a path
> component was modified? Or is this something that would be far too
> costly with little benefit? What if we do more aggressive nd->root
> checks when resolving with AT_BENEATH or AT_THIS_ROOT (or if nd->root !=
> current->mnt_ns->root)?

It seems to me like doing that would basically require looking at each
node in the path walk twice? And it'd be difficult to guarantee
forward progress unless you're willing to do some fairly heavy
locking.

> Regarding chroot attacks, I was aware of the trivial
> chroot-open-chroot-fchdir attack but I was not aware that there was a
> rename attack for chroot. Thanks for bringing this up!
>
> > I believe that the only way to robustly use this would be to point the
> > dirfd at a mount point, such that you know that being moved out of the
> > chroot is impossible because the mount point limits movement of
> > directories under it. (Well, technically, it doesn't, but it ensures
> > that if a directory does dangerously move away, the syscall fails.) It
> > might make sense to hardcode this constraint in the implementation of
> > AT_THIS_ROOT, to keep people from shooting themselves in the foot.
>
> Unless I'm missing something, would this not also affect using a
> mountpoint as a dirfd-root (with MS_MOVE of an already-walked-through
> path component) -- or does MS_MOVE cause a rewalk in a way that rename
> does not?

Hmm. Good point.

It looks to me like you probably wouldn't be able to walk up through a
mountpoint in RCU mode after the mount hierarchy has changed (see
follow_dotdot_rcu()), but it might work in refwalk mode.

Eric?

> I wouldn't mind tying AT_THIS_ROOT to only work on mountpoints (I
> thought that bind-mounts would be an issue but you also get -EXDEV when
> trying to rename across bind-mounts even if they are on the same
> underlying filesystem). But AT_BENEATH might be a more bitter pill to
> swallow. I'm not sure.

Which is part of why I 

Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-01 Thread Aleksa Sarai
On 2018-09-29, Andy Lutomirski  wrote:
> >> On Sat, Sep 29, 2018 at 4:29 PM Aleksa Sarai  wrote:
> >> The primary motivation for the need for this flag is container runtimes
> >> which have to interact with malicious root filesystems in the host
> >> namespaces. One of the first requirements for a container runtime to be
> >> secure against a malicious rootfs is that they correctly scope symlinks
> >> (that is, they should be scoped as though they are chroot(2)ed into the
> >> container's rootfs) and ".."-style paths. The already-existing AT_XDEV
> >> and AT_NO_PROCLINKS help defend against other potential attacks in a
> >> malicious rootfs scenario.
> > 
> > So, I really like the concept for patch 1 of this series (but haven't
> > read the code yet); but I dislike this patch because of its footgun
> > potential.
> > 
> 
> The code could do it differently: do the path walk and then, before
> accepting the result, walk back up and make sure the result is under
> the starting point.
> 
> This is *not* a full solution, though, since a walk above the root gas
> side effects on timing, various caches, and possibly network traffic,
> so it’s open to Spectre-like attacks in which a malicious container
> could use a runtime-initiated AT_THIS_ROOT to infer the existence of
> directories outside the container.

I think that one way to solve this problem might be to have more strict
checks on nd->root in follow_dotdot(). The problem here (as far as I can
tell) is that ".." could end up skipping past the root because of a
rename, however walking *down* into a path shouldn't be a problem (even
absolute symlinks shouldn't be a problem because they will nd_jump_root
and will land back in the root).

However, I'm not entirely sure what happens to nd->root if it gets
renamed -- can you still safely do checks against it (we'd need to do
some sort of is_descendant() check on the current path before we handle
".." in follow_dotdot).

That way, we wouldn't shouldn't have the spectre-like attack problem
(since the attack would be halted at the ".." stage -- before the path
walk can proceed into host paths). Would this be sufficient or is there
a more serious issue I'm missing?

> But what’s the container usecase?  Any sane container is based on
> pivot_root or similar, so the runtime can just do the walk in the
> container context. IOW I’m a bit confused as to the exact intended use
> of the whole series. Can you elaborate?

I went into this in my response to Jann.

-- 
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH



signature.asc
Description: PGP signature


Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-10-01 Thread Aleksa Sarai
On 2018-09-29, Andy Lutomirski  wrote:
> >> On Sat, Sep 29, 2018 at 4:29 PM Aleksa Sarai  wrote:
> >> The primary motivation for the need for this flag is container runtimes
> >> which have to interact with malicious root filesystems in the host
> >> namespaces. One of the first requirements for a container runtime to be
> >> secure against a malicious rootfs is that they correctly scope symlinks
> >> (that is, they should be scoped as though they are chroot(2)ed into the
> >> container's rootfs) and ".."-style paths. The already-existing AT_XDEV
> >> and AT_NO_PROCLINKS help defend against other potential attacks in a
> >> malicious rootfs scenario.
> > 
> > So, I really like the concept for patch 1 of this series (but haven't
> > read the code yet); but I dislike this patch because of its footgun
> > potential.
> > 
> 
> The code could do it differently: do the path walk and then, before
> accepting the result, walk back up and make sure the result is under
> the starting point.
> 
> This is *not* a full solution, though, since a walk above the root gas
> side effects on timing, various caches, and possibly network traffic,
> so it’s open to Spectre-like attacks in which a malicious container
> could use a runtime-initiated AT_THIS_ROOT to infer the existence of
> directories outside the container.

I think that one way to solve this problem might be to have more strict
checks on nd->root in follow_dotdot(). The problem here (as far as I can
tell) is that ".." could end up skipping past the root because of a
rename, however walking *down* into a path shouldn't be a problem (even
absolute symlinks shouldn't be a problem because they will nd_jump_root
and will land back in the root).

However, I'm not entirely sure what happens to nd->root if it gets
renamed -- can you still safely do checks against it (we'd need to do
some sort of is_descendant() check on the current path before we handle
".." in follow_dotdot).

That way, we wouldn't shouldn't have the spectre-like attack problem
(since the attack would be halted at the ".." stage -- before the path
walk can proceed into host paths). Would this be sufficient or is there
a more serious issue I'm missing?

> But what’s the container usecase?  Any sane container is based on
> pivot_root or similar, so the runtime can just do the walk in the
> container context. IOW I’m a bit confused as to the exact intended use
> of the whole series. Can you elaborate?

I went into this in my response to Jann.

-- 
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH



signature.asc
Description: PGP signature


Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-09-30 Thread Aleksa Sarai
On 2018-09-29, Jann Horn  wrote:
> The problem is what happens if a folder you are walking through is
> concurrently moved out of the chroot. Consider the following scenario:
> 
> You attempt to open "C/../../etc/passwd" under the root "/A/B".
> Something else concurrently moves /A/B/C to /A/C. This can result in
> the following:
> 
> 1. You start the path walk and reach /A/B/C.
> 2. The other process moves /A/B/C to /A/C. Your path walk is now at /A/C.
> 3. Your path walk follows the first ".." up into /A. This is outside
> the process root, but you never actually encountered the process root,
> so you don't notice.
> 4. Your path walk follows the second ".." up to /. Again, this is
> outside the process root, but you don't notice.
> 5. Your path walk walks down to /etc/passwd, and the open completes
> successfully. You now have an fd pointing outside your chroot.
> 
> If the root of your walk is below an attacker-controlled directory,
> this of course means that you lose instantly. If you point the root of
> the walk at a directory out of which a process in the container
> wouldn't be able to move the file, you're probably kinda mostly fine -
> as long as you know, for certain, that nothing else on the system
> would ever do that. But I still wouldn't feel good about that.

Please correct me if I'm wrong here (this is the first patch I've
written for VFS). Isn't the retry/LOOKUP_REVAL code meant to handle this
-- or does that only handle if a particular path component changes
*while* it's being walked through? Is it possible for a path walk to
succeed after a path component was unmounted (obviously you can't delete
a directory path component since you'd get -ENOTEMPTY)?

If this is an issue for AT_THIS_ROOT, I believe this might also be an
issue for AT_BENEATH since they are effectively both using the same
nd->root trick (so you could similarly trick AT_BENEATH to not error
out). So we'd need to figure out how to solve this problem in order for
AT_BENEATH to be safe.

Speaking naively, doesn't it make sense to invalidate the walk if a path
component was modified? Or is this something that would be far too
costly with little benefit? What if we do more aggressive nd->root
checks when resolving with AT_BENEATH or AT_THIS_ROOT (or if nd->root !=
current->mnt_ns->root)?

Regarding chroot attacks, I was aware of the trivial
chroot-open-chroot-fchdir attack but I was not aware that there was a
rename attack for chroot. Thanks for bringing this up!

> I believe that the only way to robustly use this would be to point the
> dirfd at a mount point, such that you know that being moved out of the
> chroot is impossible because the mount point limits movement of
> directories under it. (Well, technically, it doesn't, but it ensures
> that if a directory does dangerously move away, the syscall fails.) It
> might make sense to hardcode this constraint in the implementation of
> AT_THIS_ROOT, to keep people from shooting themselves in the foot.

Unless I'm missing something, would this not also affect using a
mountpoint as a dirfd-root (with MS_MOVE of an already-walked-through
path component) -- or does MS_MOVE cause a rewalk in a way that rename
does not?

I wouldn't mind tying AT_THIS_ROOT to only work on mountpoints (I
thought that bind-mounts would be an issue but you also get -EXDEV when
trying to rename across bind-mounts even if they are on the same
underlying filesystem). But AT_BENEATH might be a more bitter pill to
swallow. I'm not sure.

In the usecase of container runtimes, we wouldn't generally be doing
resolution of attacker-controlled paths but it still definitely doesn't
hurt to consider this part of the threat model -- to avoid foot-gunning
as you've said. (There also might be some nested-container cases where
you might want to do that.)

> > Currently most container runtimes try to do this resolution in
> > userspace[1], causing many potential race conditions. In addition, the
> > "obvious" alternative (actually performing a {ch,pivot_}root(2))
> > requires a fork+exec which is *very* costly if necessary for every
> > filesystem operation involving a container.
> 
> Wait. fork() I understand, but why exec? And actually, you don't need
> a full fork() either, clone() lets you do this with some process parts
> shared. And then you also shouldn't need to use SCM_RIGHTS, just keep
> the file descriptor table shared. And why chroot()/pivot_root(),
> wouldn't you want to use setns()?

You're right about this -- for C runtimes. In Go we cannot do a raw
clone() or fork() (if you do it manually with RawSyscall you'll end with
broken runtime state). So you're forced to do fork+exec (which then
means that you can't use CLONE_FILES and must use SCM_RIGHTS). Same goes
for CLONE_VFORK.

(It should be noted that multi-threaded C runtimes have somewhat similar
issues -- AFAIK you can technically only use AS-Safe glibc functions
after a fork() but that's more of a theoretical concern here. If you
just use raw 

Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-09-30 Thread Aleksa Sarai
On 2018-09-29, Jann Horn  wrote:
> The problem is what happens if a folder you are walking through is
> concurrently moved out of the chroot. Consider the following scenario:
> 
> You attempt to open "C/../../etc/passwd" under the root "/A/B".
> Something else concurrently moves /A/B/C to /A/C. This can result in
> the following:
> 
> 1. You start the path walk and reach /A/B/C.
> 2. The other process moves /A/B/C to /A/C. Your path walk is now at /A/C.
> 3. Your path walk follows the first ".." up into /A. This is outside
> the process root, but you never actually encountered the process root,
> so you don't notice.
> 4. Your path walk follows the second ".." up to /. Again, this is
> outside the process root, but you don't notice.
> 5. Your path walk walks down to /etc/passwd, and the open completes
> successfully. You now have an fd pointing outside your chroot.
> 
> If the root of your walk is below an attacker-controlled directory,
> this of course means that you lose instantly. If you point the root of
> the walk at a directory out of which a process in the container
> wouldn't be able to move the file, you're probably kinda mostly fine -
> as long as you know, for certain, that nothing else on the system
> would ever do that. But I still wouldn't feel good about that.

Please correct me if I'm wrong here (this is the first patch I've
written for VFS). Isn't the retry/LOOKUP_REVAL code meant to handle this
-- or does that only handle if a particular path component changes
*while* it's being walked through? Is it possible for a path walk to
succeed after a path component was unmounted (obviously you can't delete
a directory path component since you'd get -ENOTEMPTY)?

If this is an issue for AT_THIS_ROOT, I believe this might also be an
issue for AT_BENEATH since they are effectively both using the same
nd->root trick (so you could similarly trick AT_BENEATH to not error
out). So we'd need to figure out how to solve this problem in order for
AT_BENEATH to be safe.

Speaking naively, doesn't it make sense to invalidate the walk if a path
component was modified? Or is this something that would be far too
costly with little benefit? What if we do more aggressive nd->root
checks when resolving with AT_BENEATH or AT_THIS_ROOT (or if nd->root !=
current->mnt_ns->root)?

Regarding chroot attacks, I was aware of the trivial
chroot-open-chroot-fchdir attack but I was not aware that there was a
rename attack for chroot. Thanks for bringing this up!

> I believe that the only way to robustly use this would be to point the
> dirfd at a mount point, such that you know that being moved out of the
> chroot is impossible because the mount point limits movement of
> directories under it. (Well, technically, it doesn't, but it ensures
> that if a directory does dangerously move away, the syscall fails.) It
> might make sense to hardcode this constraint in the implementation of
> AT_THIS_ROOT, to keep people from shooting themselves in the foot.

Unless I'm missing something, would this not also affect using a
mountpoint as a dirfd-root (with MS_MOVE of an already-walked-through
path component) -- or does MS_MOVE cause a rewalk in a way that rename
does not?

I wouldn't mind tying AT_THIS_ROOT to only work on mountpoints (I
thought that bind-mounts would be an issue but you also get -EXDEV when
trying to rename across bind-mounts even if they are on the same
underlying filesystem). But AT_BENEATH might be a more bitter pill to
swallow. I'm not sure.

In the usecase of container runtimes, we wouldn't generally be doing
resolution of attacker-controlled paths but it still definitely doesn't
hurt to consider this part of the threat model -- to avoid foot-gunning
as you've said. (There also might be some nested-container cases where
you might want to do that.)

> > Currently most container runtimes try to do this resolution in
> > userspace[1], causing many potential race conditions. In addition, the
> > "obvious" alternative (actually performing a {ch,pivot_}root(2))
> > requires a fork+exec which is *very* costly if necessary for every
> > filesystem operation involving a container.
> 
> Wait. fork() I understand, but why exec? And actually, you don't need
> a full fork() either, clone() lets you do this with some process parts
> shared. And then you also shouldn't need to use SCM_RIGHTS, just keep
> the file descriptor table shared. And why chroot()/pivot_root(),
> wouldn't you want to use setns()?

You're right about this -- for C runtimes. In Go we cannot do a raw
clone() or fork() (if you do it manually with RawSyscall you'll end with
broken runtime state). So you're forced to do fork+exec (which then
means that you can't use CLONE_FILES and must use SCM_RIGHTS). Same goes
for CLONE_VFORK.

(It should be noted that multi-threaded C runtimes have somewhat similar
issues -- AFAIK you can technically only use AS-Safe glibc functions
after a fork() but that's more of a theoretical concern here. If you
just use raw 

Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-09-29 Thread Andy Lutomirski



> On Sep 29, 2018, at 9:35 AM, Jann Horn  wrote:
> 
> +cc linux-api; please keep them in CC for future versions of the patch
> 
>> On Sat, Sep 29, 2018 at 4:29 PM Aleksa Sarai  wrote:
>> The primary motivation for the need for this flag is container runtimes
>> which have to interact with malicious root filesystems in the host
>> namespaces. One of the first requirements for a container runtime to be
>> secure against a malicious rootfs is that they correctly scope symlinks
>> (that is, they should be scoped as though they are chroot(2)ed into the
>> container's rootfs) and ".."-style paths. The already-existing AT_XDEV
>> and AT_NO_PROCLINKS help defend against other potential attacks in a
>> malicious rootfs scenario.
> 
> So, I really like the concept for patch 1 of this series (but haven't
> read the code yet); but I dislike this patch because of its footgun
> potential.
> 

The code could do it differently: do the path walk and then, before accepting 
the result, walk back up and make sure the result is under the starting point.

This is *not* a full solution, though, since a walk above the root gas side 
effects on timing, various caches, and possibly network traffic, so it’s open 
to Spectre-like attacks in which a malicious container could use a 
runtime-initiated AT_THIS_ROOT to infer the existence of directories outside 
the container.

But what’s the container usecase?  Any sane container is based on pivot_root or 
similar, so the runtime can just do the walk in the container context. IOW I’m 
a bit confused as to the exact intended use of the whole series. Can you 
elaborate?

Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-09-29 Thread Andy Lutomirski



> On Sep 29, 2018, at 9:35 AM, Jann Horn  wrote:
> 
> +cc linux-api; please keep them in CC for future versions of the patch
> 
>> On Sat, Sep 29, 2018 at 4:29 PM Aleksa Sarai  wrote:
>> The primary motivation for the need for this flag is container runtimes
>> which have to interact with malicious root filesystems in the host
>> namespaces. One of the first requirements for a container runtime to be
>> secure against a malicious rootfs is that they correctly scope symlinks
>> (that is, they should be scoped as though they are chroot(2)ed into the
>> container's rootfs) and ".."-style paths. The already-existing AT_XDEV
>> and AT_NO_PROCLINKS help defend against other potential attacks in a
>> malicious rootfs scenario.
> 
> So, I really like the concept for patch 1 of this series (but haven't
> read the code yet); but I dislike this patch because of its footgun
> potential.
> 

The code could do it differently: do the path walk and then, before accepting 
the result, walk back up and make sure the result is under the starting point.

This is *not* a full solution, though, since a walk above the root gas side 
effects on timing, various caches, and possibly network traffic, so it’s open 
to Spectre-like attacks in which a malicious container could use a 
runtime-initiated AT_THIS_ROOT to infer the existence of directories outside 
the container.

But what’s the container usecase?  Any sane container is based on pivot_root or 
similar, so the runtime can just do the walk in the container context. IOW I’m 
a bit confused as to the exact intended use of the whole series. Can you 
elaborate?

Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-09-29 Thread Jann Horn
+cc linux-api; please keep them in CC for future versions of the patch

On Sat, Sep 29, 2018 at 4:29 PM Aleksa Sarai  wrote:
> The primary motivation for the need for this flag is container runtimes
> which have to interact with malicious root filesystems in the host
> namespaces. One of the first requirements for a container runtime to be
> secure against a malicious rootfs is that they correctly scope symlinks
> (that is, they should be scoped as though they are chroot(2)ed into the
> container's rootfs) and ".."-style paths. The already-existing AT_XDEV
> and AT_NO_PROCLINKS help defend against other potential attacks in a
> malicious rootfs scenario.

So, I really like the concept for patch 1 of this series (but haven't
read the code yet); but I dislike this patch because of its footgun
potential.

If this patch landed as-is, the manpage would need some big warning
labels. chroot() basically provides no security guarantees at all; and
yes, that includes that if you do `chroot(...); chdir("/");
open(attacker_controlled_path, ...);`, you can potentially end up
opening a file outside the chroot. See
https://github.com/QubesOS/qubes-secpack/blob/master/QSBs/qsb-014-2015.txt
for an example, where Qubes OS did pretty much that, and ended up with
a potentially exploitable security bug because of that, where one VM,
while performing a file transfer into another VM, could write outside
of the transfer target directory.
The problem is what happens if a folder you are walking through is
concurrently moved out of the chroot. Consider the following scenario:

You attempt to open "C/../../etc/passwd" under the root "/A/B".
Something else concurrently moves /A/B/C to /A/C. This can result in
the following:

1. You start the path walk and reach /A/B/C.
2. The other process moves /A/B/C to /A/C. Your path walk is now at /A/C.
3. Your path walk follows the first ".." up into /A. This is outside
the process root, but you never actually encountered the process root,
so you don't notice.
4. Your path walk follows the second ".." up to /. Again, this is
outside the process root, but you don't notice.
5. Your path walk walks down to /etc/passwd, and the open completes
successfully. You now have an fd pointing outside your chroot.

If the root of your walk is below an attacker-controlled directory,
this of course means that you lose instantly. If you point the root of
the walk at a directory out of which a process in the container
wouldn't be able to move the file, you're probably kinda mostly fine -
as long as you know, for certain, that nothing else on the system
would ever do that. But I still wouldn't feel good about that.

(Yes, this means that if you run an SFTP server with OpenSSH's
ChrootDirectory directive, you have to be very careful about these
things.)

I believe that the only way to robustly use this would be to point the
dirfd at a mount point, such that you know that being moved out of the
chroot is impossible because the mount point limits movement of
directories under it. (Well, technically, it doesn't, but it ensures
that if a directory does dangerously move away, the syscall fails.) It
might make sense to hardcode this constraint in the implementation of
AT_THIS_ROOT, to keep people from shooting themselves in the foot.

> Currently most container runtimes try to do this resolution in
> userspace[1], causing many potential race conditions. In addition, the
> "obvious" alternative (actually performing a {ch,pivot_}root(2))
> requires a fork+exec which is *very* costly if necessary for every
> filesystem operation involving a container.

Wait. fork() I understand, but why exec? And actually, you don't need
a full fork() either, clone() lets you do this with some process parts
shared. And then you also shouldn't need to use SCM_RIGHTS, just keep
the file descriptor table shared. And why chroot()/pivot_root(),
wouldn't you want to use setns()? I think something like this should
work (except that you should add some error handling - omitted here
because I'm lazy), assuming that the container runtime does NOT have
CAP_SYS_ADMIN in the init namespace (otherwise it's easier). Of
course, this is entirely untested, and probably won't compile because
I screwed something up. :P But you should get the idea...

// Ensure that we are non-dumpable. Together with
// commit bfedb589252c, this ensures that container root
// can't trace our child once it enters the container.
// My patch
// https://lore.kernel.org/lkml/1451098351-8917-1-git-send-email-j...@thejh.net/
// would make this unnecessary, but that patch didn't
// land because Eric nacked it (for political reasons,
// because people incorrectly claimed that this was a
// security fix):
// https://lore.kernel.org/lkml/8760z7fope@x220.int.ebiederm.org/
// Note that dumpability is per-mm, not per-process,
// so this hack has the unfortunate side effect of preventing
// unprivileged debugging of the container runtime.
// Oh well.
prctl(PR_SET_DUMPABLE, 

Re: [PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-09-29 Thread Jann Horn
+cc linux-api; please keep them in CC for future versions of the patch

On Sat, Sep 29, 2018 at 4:29 PM Aleksa Sarai  wrote:
> The primary motivation for the need for this flag is container runtimes
> which have to interact with malicious root filesystems in the host
> namespaces. One of the first requirements for a container runtime to be
> secure against a malicious rootfs is that they correctly scope symlinks
> (that is, they should be scoped as though they are chroot(2)ed into the
> container's rootfs) and ".."-style paths. The already-existing AT_XDEV
> and AT_NO_PROCLINKS help defend against other potential attacks in a
> malicious rootfs scenario.

So, I really like the concept for patch 1 of this series (but haven't
read the code yet); but I dislike this patch because of its footgun
potential.

If this patch landed as-is, the manpage would need some big warning
labels. chroot() basically provides no security guarantees at all; and
yes, that includes that if you do `chroot(...); chdir("/");
open(attacker_controlled_path, ...);`, you can potentially end up
opening a file outside the chroot. See
https://github.com/QubesOS/qubes-secpack/blob/master/QSBs/qsb-014-2015.txt
for an example, where Qubes OS did pretty much that, and ended up with
a potentially exploitable security bug because of that, where one VM,
while performing a file transfer into another VM, could write outside
of the transfer target directory.
The problem is what happens if a folder you are walking through is
concurrently moved out of the chroot. Consider the following scenario:

You attempt to open "C/../../etc/passwd" under the root "/A/B".
Something else concurrently moves /A/B/C to /A/C. This can result in
the following:

1. You start the path walk and reach /A/B/C.
2. The other process moves /A/B/C to /A/C. Your path walk is now at /A/C.
3. Your path walk follows the first ".." up into /A. This is outside
the process root, but you never actually encountered the process root,
so you don't notice.
4. Your path walk follows the second ".." up to /. Again, this is
outside the process root, but you don't notice.
5. Your path walk walks down to /etc/passwd, and the open completes
successfully. You now have an fd pointing outside your chroot.

If the root of your walk is below an attacker-controlled directory,
this of course means that you lose instantly. If you point the root of
the walk at a directory out of which a process in the container
wouldn't be able to move the file, you're probably kinda mostly fine -
as long as you know, for certain, that nothing else on the system
would ever do that. But I still wouldn't feel good about that.

(Yes, this means that if you run an SFTP server with OpenSSH's
ChrootDirectory directive, you have to be very careful about these
things.)

I believe that the only way to robustly use this would be to point the
dirfd at a mount point, such that you know that being moved out of the
chroot is impossible because the mount point limits movement of
directories under it. (Well, technically, it doesn't, but it ensures
that if a directory does dangerously move away, the syscall fails.) It
might make sense to hardcode this constraint in the implementation of
AT_THIS_ROOT, to keep people from shooting themselves in the foot.

> Currently most container runtimes try to do this resolution in
> userspace[1], causing many potential race conditions. In addition, the
> "obvious" alternative (actually performing a {ch,pivot_}root(2))
> requires a fork+exec which is *very* costly if necessary for every
> filesystem operation involving a container.

Wait. fork() I understand, but why exec? And actually, you don't need
a full fork() either, clone() lets you do this with some process parts
shared. And then you also shouldn't need to use SCM_RIGHTS, just keep
the file descriptor table shared. And why chroot()/pivot_root(),
wouldn't you want to use setns()? I think something like this should
work (except that you should add some error handling - omitted here
because I'm lazy), assuming that the container runtime does NOT have
CAP_SYS_ADMIN in the init namespace (otherwise it's easier). Of
course, this is entirely untested, and probably won't compile because
I screwed something up. :P But you should get the idea...

// Ensure that we are non-dumpable. Together with
// commit bfedb589252c, this ensures that container root
// can't trace our child once it enters the container.
// My patch
// https://lore.kernel.org/lkml/1451098351-8917-1-git-send-email-j...@thejh.net/
// would make this unnecessary, but that patch didn't
// land because Eric nacked it (for political reasons,
// because people incorrectly claimed that this was a
// security fix):
// https://lore.kernel.org/lkml/8760z7fope@x220.int.ebiederm.org/
// Note that dumpability is per-mm, not per-process,
// so this hack has the unfortunate side effect of preventing
// unprivileged debugging of the container runtime.
// Oh well.
prctl(PR_SET_DUMPABLE, 

[PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-09-29 Thread Aleksa Sarai
The primary motivation for the need for this flag is container runtimes
which have to interact with malicious root filesystems in the host
namespaces. One of the first requirements for a container runtime to be
secure against a malicious rootfs is that they correctly scope symlinks
(that is, they should be scoped as though they are chroot(2)ed into the
container's rootfs) and ".."-style paths. The already-existing AT_XDEV
and AT_NO_PROCLINKS help defend against other potential attacks in a
malicious rootfs scenario.

Currently most container runtimes try to do this resolution in
userspace[1], causing many potential race conditions. In addition, the
"obvious" alternative (actually performing a {ch,pivot_}root(2))
requires a fork+exec which is *very* costly if necessary for every
filesystem operation involving a container.

The most significant change in semantics with AT_THIS_ROOT is that
*at(2) syscalls now no longer have the property that an absolute
pathname causes the dirfd to be ignored completely (if LOOKUP_CHROOT is
specified). The reasoning behind this is that AT_THIS_ROOT necessarily
has to chroot-scope symlinks with absolute paths to dirfd, and so doing
it for the base path seems to be the most consistent behaviour (and also
avoids foot-gunning users who want to chroot-scope paths that might be
absolute).

Currently this is only enabled for the stat(2) and openat(2) family (the
latter has its own flag O_THISROOT with the same semantics). Ideally
this flag would be supported by all *at(2) syscalls, but this will
require adding flags arguments to many of them (and will be done in a
separate patchset).

[1]: https://github.com/cyphar/filepath-securejoin

Cc: Eric Biederman 
Cc: Christian Brauner 
Signed-off-by: Aleksa Sarai 
---
 fs/fcntl.c   |   2 +-
 fs/namei.c   | 121 +--
 fs/open.c|   2 +
 fs/stat.c|   4 +-
 include/linux/fcntl.h|   2 +-
 include/linux/namei.h|   1 +
 include/uapi/asm-generic/fcntl.h |   3 +
 include/uapi/linux/fcntl.h   |   2 +
 8 files changed, 81 insertions(+), 56 deletions(-)

diff --git a/fs/fcntl.c b/fs/fcntl.c
index e343618736f7..4c36c5b9fdb9 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -1031,7 +1031,7 @@ static int __init fcntl_init(void)
 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
 * is defined as O_NONBLOCK on some platforms and not on others.
 */
-   BUILD_BUG_ON(25 - 1 /* for O_RDONLY being 0 */ !=
+   BUILD_BUG_ON(26 - 1 /* for O_RDONLY being 0 */ !=
HWEIGHT32(
(VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
__FMODE_EXEC | __FMODE_NONOTIFY));
diff --git a/fs/namei.c b/fs/namei.c
index 757dd783771c..1b984f0dbbb4 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2193,9 +2193,64 @@ static int link_path_walk(const char *name, struct 
nameidata *nd)
}
 }
 
+/*
+ * Configure nd->path based on the nd->dfd. This is only used as part of
+ * path_init().
+ */
+static inline int dirfd_path_init(struct nameidata *nd)
+{
+   if (nd->dfd == AT_FDCWD) {
+   if (nd->flags & LOOKUP_RCU) {
+   struct fs_struct *fs = current->fs;
+   unsigned seq;
+
+   do {
+   seq = read_seqcount_begin(>seq);
+   nd->path = fs->pwd;
+   nd->inode = nd->path.dentry->d_inode;
+   nd->seq = 
__read_seqcount_begin(>path.dentry->d_seq);
+   } while (read_seqcount_retry(>seq, seq));
+   } else {
+   get_fs_pwd(current->fs, >path);
+   nd->inode = nd->path.dentry->d_inode;
+   }
+   } else {
+   /* Caller must check execute permissions on the starting path 
component */
+   struct fd f = fdget_raw(nd->dfd);
+   struct dentry *dentry;
+
+   if (!f.file)
+   return -EBADF;
+
+   dentry = f.file->f_path.dentry;
+
+   if (*nd->name->name && unlikely(!d_can_lookup(dentry))) {
+   fdput(f);
+   return -ENOTDIR;
+   }
+
+   nd->path = f.file->f_path;
+   if (nd->flags & LOOKUP_RCU) {
+   nd->inode = nd->path.dentry->d_inode;
+   nd->seq = read_seqcount_begin(>path.dentry->d_seq);
+   } else {
+   path_get(>path);
+   nd->inode = nd->path.dentry->d_inode;
+   }
+   fdput(f);
+   }
+   if (unlikely(nd->flags & (LOOKUP_CHROOT | LOOKUP_BENEATH))) {
+   nd->root = nd->path;
+   if (!(nd->flags & LOOKUP_RCU))
+   path_get(>root);
+   }
+   return 0;
+}

[PATCH 2/3] namei: implement AT_THIS_ROOT chroot-like path resolution

2018-09-29 Thread Aleksa Sarai
The primary motivation for the need for this flag is container runtimes
which have to interact with malicious root filesystems in the host
namespaces. One of the first requirements for a container runtime to be
secure against a malicious rootfs is that they correctly scope symlinks
(that is, they should be scoped as though they are chroot(2)ed into the
container's rootfs) and ".."-style paths. The already-existing AT_XDEV
and AT_NO_PROCLINKS help defend against other potential attacks in a
malicious rootfs scenario.

Currently most container runtimes try to do this resolution in
userspace[1], causing many potential race conditions. In addition, the
"obvious" alternative (actually performing a {ch,pivot_}root(2))
requires a fork+exec which is *very* costly if necessary for every
filesystem operation involving a container.

The most significant change in semantics with AT_THIS_ROOT is that
*at(2) syscalls now no longer have the property that an absolute
pathname causes the dirfd to be ignored completely (if LOOKUP_CHROOT is
specified). The reasoning behind this is that AT_THIS_ROOT necessarily
has to chroot-scope symlinks with absolute paths to dirfd, and so doing
it for the base path seems to be the most consistent behaviour (and also
avoids foot-gunning users who want to chroot-scope paths that might be
absolute).

Currently this is only enabled for the stat(2) and openat(2) family (the
latter has its own flag O_THISROOT with the same semantics). Ideally
this flag would be supported by all *at(2) syscalls, but this will
require adding flags arguments to many of them (and will be done in a
separate patchset).

[1]: https://github.com/cyphar/filepath-securejoin

Cc: Eric Biederman 
Cc: Christian Brauner 
Signed-off-by: Aleksa Sarai 
---
 fs/fcntl.c   |   2 +-
 fs/namei.c   | 121 +--
 fs/open.c|   2 +
 fs/stat.c|   4 +-
 include/linux/fcntl.h|   2 +-
 include/linux/namei.h|   1 +
 include/uapi/asm-generic/fcntl.h |   3 +
 include/uapi/linux/fcntl.h   |   2 +
 8 files changed, 81 insertions(+), 56 deletions(-)

diff --git a/fs/fcntl.c b/fs/fcntl.c
index e343618736f7..4c36c5b9fdb9 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -1031,7 +1031,7 @@ static int __init fcntl_init(void)
 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
 * is defined as O_NONBLOCK on some platforms and not on others.
 */
-   BUILD_BUG_ON(25 - 1 /* for O_RDONLY being 0 */ !=
+   BUILD_BUG_ON(26 - 1 /* for O_RDONLY being 0 */ !=
HWEIGHT32(
(VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
__FMODE_EXEC | __FMODE_NONOTIFY));
diff --git a/fs/namei.c b/fs/namei.c
index 757dd783771c..1b984f0dbbb4 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2193,9 +2193,64 @@ static int link_path_walk(const char *name, struct 
nameidata *nd)
}
 }
 
+/*
+ * Configure nd->path based on the nd->dfd. This is only used as part of
+ * path_init().
+ */
+static inline int dirfd_path_init(struct nameidata *nd)
+{
+   if (nd->dfd == AT_FDCWD) {
+   if (nd->flags & LOOKUP_RCU) {
+   struct fs_struct *fs = current->fs;
+   unsigned seq;
+
+   do {
+   seq = read_seqcount_begin(>seq);
+   nd->path = fs->pwd;
+   nd->inode = nd->path.dentry->d_inode;
+   nd->seq = 
__read_seqcount_begin(>path.dentry->d_seq);
+   } while (read_seqcount_retry(>seq, seq));
+   } else {
+   get_fs_pwd(current->fs, >path);
+   nd->inode = nd->path.dentry->d_inode;
+   }
+   } else {
+   /* Caller must check execute permissions on the starting path 
component */
+   struct fd f = fdget_raw(nd->dfd);
+   struct dentry *dentry;
+
+   if (!f.file)
+   return -EBADF;
+
+   dentry = f.file->f_path.dentry;
+
+   if (*nd->name->name && unlikely(!d_can_lookup(dentry))) {
+   fdput(f);
+   return -ENOTDIR;
+   }
+
+   nd->path = f.file->f_path;
+   if (nd->flags & LOOKUP_RCU) {
+   nd->inode = nd->path.dentry->d_inode;
+   nd->seq = read_seqcount_begin(>path.dentry->d_seq);
+   } else {
+   path_get(>path);
+   nd->inode = nd->path.dentry->d_inode;
+   }
+   fdput(f);
+   }
+   if (unlikely(nd->flags & (LOOKUP_CHROOT | LOOKUP_BENEATH))) {
+   nd->root = nd->path;
+   if (!(nd->flags & LOOKUP_RCU))
+   path_get(>root);
+   }
+   return 0;
+}