from:"Christian Brauner"

Re: [RFC PATCH 2/2] fs/xattr: wire up syscalls

2022-08-30 Thread Christian Brauner

On Tue, Aug 30, 2022 at 05:28:38PM +0200, Christian Göttsche wrote:
> Enable the newly added extended attribute related syscalls.
> 
> Signed-off-by: Christian Göttsche 
> ---

Fwiw, I think a while ago it was pointed out that for most syscall
additions you can just fold the hookup patch in. It probably also
wouldn't hurt to trim that Cc list significantly down to mostly the
lists...

--
Linux-audit mailing list
Linux-audit@redhat.com
https://listman.redhat.com/mailman/listinfo/linux-audit

Re: [PATCH v4 3/3] audit: add OPENAT2 record to list how

2021-05-20 Thread Christian Brauner

> #define AUDIT_SELINUX_ERR 1401 /* Internal SE Linux Errors */
> diff --git a/kernel/audit.h b/kernel/audit.h
> index 1522e100fd17..c5af17905976 100644
> --- a/kernel/audit.h
> +++ b/kernel/audit.h
> @@ -11,6 +11,7 @@
>  #include 
>  #include 
>  #include 
> +#include  // struct open_how
>  
>  /* AUDIT_NAMES is the number of slots we reserve in the audit_context
>   * for saving names from getname().  If we get more names we will allocate
> @@ -185,6 +186,7 @@ struct audit_context {
>   int fd;
>   int flags;
>   } mmap;
> + struct open_how openat2;
>   struct {
>   int argc;
>   } execve;
> diff --git a/kernel/auditsc.c b/kernel/auditsc.c
> index 3f59ab209dfd..faf2485323a9 100644
> --- a/kernel/auditsc.c
> +++ b/kernel/auditsc.c
> @@ -76,7 +76,7 @@
>  #include 
>  #include 
>  #include 
> -#include 
> +#include  // struct open_how
>  
>  #include "audit.h"
>  
> @@ -1319,6 +1319,12 @@ static void show_special(struct audit_context 
> *context, int *call_panic)
>   audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,
>context->mmap.flags);
>   break;
> + case AUDIT_OPENAT2:
> + audit_log_format(ab, "oflag=0%llo mode=0%llo resolve=0x%llx",

Hm, should we maybe follow the struct member names for all entries, i.e.
replace s/oflag/flags? 

Otherwise
Acked-by: Christian Brauner 

> +  context->openat2.flags,
> +  context->openat2.mode,
> +  context->openat2.resolve);
> + break;
>   case AUDIT_EXECVE:
>   audit_log_execve_info(context, &ab);
>   break;
> @@ -2549,6 +2555,16 @@ void __audit_mmap_fd(int fd, int flags)
>   context->type = AUDIT_MMAP;
>  }
>  
> +void __audit_openat2_how(struct open_how *how)
> +{
> + struct audit_context *context = audit_context();
> +
> + context->openat2.flags = how->flags;
> + context->openat2.mode = how->mode;
> + context->openat2.resolve = how->resolve;
> + context->type = AUDIT_OPENAT2;
> +}
> +
>  void __audit_log_kern_module(char *name)
>  {
>   struct audit_context *context = audit_context();
> -- 
> 2.27.0
> 

--
Linux-audit mailing list
Linux-audit@redhat.com
https://listman.redhat.com/mailman/listinfo/linux-audit

Re: [PATCH v4 2/3] audit: add support for the openat2 syscall

2021-05-20 Thread Christian Brauner

> + case __NR_openat2:
> + return AUDITSC_OPENAT2;
>   default:
>   return AUDITSC_NATIVE;
>   }
> diff --git a/arch/s390/kernel/compat_audit.c b/arch/s390/kernel/compat_audit.c
> index fc3d1c7ad21c..4b3d463e7d97 100644
> --- a/arch/s390/kernel/compat_audit.c
> +++ b/arch/s390/kernel/compat_audit.c
> @@ -40,6 +40,8 @@ int s390_classify_syscall(unsigned syscall)
>   return AUDITSC_SOCKETCALL;
>   case __NR_execve:
>   return AUDITSC_EXECVE;
> + case __NR_openat2:
> + return AUDITSC_OPENAT2;
>   default:
>   return AUDITSC_COMPAT;
>   }
> diff --git a/arch/sparc/kernel/audit.c b/arch/sparc/kernel/audit.c
> index 50fab35bdaba..b092274eca79 100644
> --- a/arch/sparc/kernel/audit.c
> +++ b/arch/sparc/kernel/audit.c
> @@ -55,6 +55,8 @@ int audit_classify_syscall(int abi, unsigned int syscall)
>   return AUDITSC_SOCKETCALL;
>   case __NR_execve:
>   return AUDITSC_EXECVE;
> + case __NR_openat2:
> + return AUDITSC_OPENAT2;
>   default:
>   return AUDITSC_NATIVE;
>   }
> diff --git a/arch/sparc/kernel/compat_audit.c 
> b/arch/sparc/kernel/compat_audit.c
> index 1c1b6d075421..2a3f71206fc5 100644
> --- a/arch/sparc/kernel/compat_audit.c
> +++ b/arch/sparc/kernel/compat_audit.c
> @@ -40,6 +40,8 @@ int sparc32_classify_syscall(unsigned int syscall)
>   return AUDITSC_SOCKETCALL;
>   case __NR_execve:
>   return AUDITSC_EXECVE;
> + case __NR_openat2:
> + return AUDITSC_OPENAT2;
>   default:
>   return AUDITSC_COMPAT;
>   }
> diff --git a/arch/x86/ia32/audit.c b/arch/x86/ia32/audit.c
> index eedc37a1ee13..efc7d832fefb 100644
> --- a/arch/x86/ia32/audit.c
> +++ b/arch/x86/ia32/audit.c
> @@ -40,6 +40,8 @@ int ia32_classify_syscall(unsigned syscall)
>   case __NR_execve:
>   case __NR_execveat:
>   return AUDITSC_EXECVE;
> + case __NR_openat2:
> + return AUDITSC_OPENAT2;
>   default:
>   return AUDITSC_COMPAT;
>   }
> diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c
> index 2a6cc9c9c881..44c3601cfdc4 100644
> --- a/arch/x86/kernel/audit_64.c
> +++ b/arch/x86/kernel/audit_64.c
> @@ -53,6 +53,8 @@ int audit_classify_syscall(int abi, unsigned syscall)
>   case __NR_execve:
>   case __NR_execveat:
>   return AUDITSC_EXECVE;
> + case __NR_openat2:
> + return AUDITSC_OPENAT2;
>   default:
>   return AUDITSC_NATIVE;
>   }
> diff --git a/include/linux/auditsc_classmacros.h 
> b/include/linux/auditsc_classmacros.h
> index 18757d270961..dc8e72536dbd 100644
> --- a/include/linux/auditsc_classmacros.h
> +++ b/include/linux/auditsc_classmacros.h
> @@ -16,6 +16,7 @@ enum auditsc_class_t {
>   AUDITSC_OPENAT,
>   AUDITSC_SOCKETCALL,
>   AUDITSC_EXECVE,
> + AUDITSC_OPENAT2,
>  
>   AUDITSC_NVALS /* count */
>  };
> diff --git a/kernel/auditsc.c b/kernel/auditsc.c
> index d775ea16505b..3f59ab209dfd 100644
> --- a/kernel/auditsc.c
> +++ b/kernel/auditsc.c
> @@ -76,6 +76,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include "audit.h"
>  
> @@ -196,6 +197,8 @@ static int audit_match_perm(struct audit_context *ctx, 
> int mask)
>   return ((mask & AUDIT_PERM_WRITE) && ctx->argv[0] == SYS_BIND);
>   case AUDITSC_EXECVE:
>   return mask & AUDIT_PERM_EXEC;
> + case AUDITSC_OPENAT2:
> + return mask & ACC_MODE((u32)((struct open_how 
> *)ctx->argv[2])->flags);

That's a lot of dereferencing, casting and masking all at once. Maybe a
small static inline helper would be good for the sake of legibility?
Something like:

static inline u32 audit_openat2_acc(struct open_how *how, int mask)
{
u32 flags = how->flags;
return mask & ACC_MODE(flags);
}

but not sure. Just seems more legible to me.
Otherwise.
Acked-by: Christian Brauner 

--
Linux-audit mailing list
Linux-audit@redhat.com
https://listman.redhat.com/mailman/listinfo/linux-audit

Re: [PATCH v4 1/3] audit: replace magic audit syscall class numbers with macros

2021-05-20 Thread Christian Brauner

On Wed, May 19, 2021 at 04:00:20PM -0400, Richard Guy Briggs wrote:
> Replace audit syscall class magic numbers with macros.
> 
> This required putting the macros into new header file
> include/linux/auditsc_classmacros.h since the syscall macros were
> included for both 64 bit and 32 bit in any compat code, causing
> redefinition warnings.
> 
> Signed-off-by: Richard Guy Briggs 
> Link: 
> https://lore.kernel.org/r/2300b1083a32aade7ae7efb95826e8f3f260b1df.1621363275.git@redhat.com

Looks good.
Acked-by: Christian Brauner 

Fwiw, I would explicitly number all enum values in auditsc_class_t not
just the first one.

> ---
>  MAINTAINERS |  1 +
>  arch/alpha/kernel/audit.c   |  8 
>  arch/ia64/kernel/audit.c|  8 
>  arch/parisc/kernel/audit.c  |  8 
>  arch/parisc/kernel/compat_audit.c   |  9 +
>  arch/powerpc/kernel/audit.c | 10 +-
>  arch/powerpc/kernel/compat_audit.c  | 11 ++-
>  arch/s390/kernel/audit.c| 10 +-
>  arch/s390/kernel/compat_audit.c | 11 ++-
>  arch/sparc/kernel/audit.c   | 10 +-
>  arch/sparc/kernel/compat_audit.c| 11 ++-
>  arch/x86/ia32/audit.c   | 11 ++-
>  arch/x86/kernel/audit_64.c  |  8 
>  include/linux/audit.h   |  1 +
>  include/linux/auditsc_classmacros.h | 23 +++
>  kernel/auditsc.c| 12 ++--
>  lib/audit.c | 10 +-
>  lib/compat_audit.c  | 11 ++-
>  18 files changed, 102 insertions(+), 71 deletions(-)
>  create mode 100644 include/linux/auditsc_classmacros.h
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index bd7aff0c120f..3348d12019f9 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -3036,6 +3036,7 @@ W:  https://github.com/linux-audit
>  T:   git git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/audit.git
>  F:   include/asm-generic/audit_*.h
>  F:   include/linux/audit.h
> +F:   include/linux/auditsc_classmacros.h
>  F:   include/uapi/linux/audit.h
>  F:   kernel/audit*
>  F:   lib/*audit.c
> diff --git a/arch/alpha/kernel/audit.c b/arch/alpha/kernel/audit.c
> index 96a9d18ff4c4..81cbd804e375 100644
> --- a/arch/alpha/kernel/audit.c
> +++ b/arch/alpha/kernel/audit.c
> @@ -37,13 +37,13 @@ int audit_classify_syscall(int abi, unsigned syscall)
>  {
>   switch(syscall) {
>   case __NR_open:
> - return 2;
> + return AUDITSC_OPEN;
>   case __NR_openat:
> - return 3;
> + return AUDITSC_OPENAT;
>   case __NR_execve:
> - return 5;
> + return AUDITSC_EXECVE;
>   default:
> - return 0;
> + return AUDITSC_NATIVE;
>   }
>  }
>  
> diff --git a/arch/ia64/kernel/audit.c b/arch/ia64/kernel/audit.c
> index 5192ca899fe6..dba6a74c9ab3 100644
> --- a/arch/ia64/kernel/audit.c
> +++ b/arch/ia64/kernel/audit.c
> @@ -38,13 +38,13 @@ int audit_classify_syscall(int abi, unsigned syscall)
>  {
>   switch(syscall) {
>   case __NR_open:
> - return 2;
> + return AUDITSC_OPEN;
>   case __NR_openat:
> - return 3;
> + return AUDITSC_OPENAT;
>   case __NR_execve:
> - return 5;
> + return AUDITSC_EXECVE;
>   default:
> - return 0;
> + return AUDITSC_NATIVE;
>   }
>  }
>  
> diff --git a/arch/parisc/kernel/audit.c b/arch/parisc/kernel/audit.c
> index 9eb47b2225d2..14244e83db75 100644
> --- a/arch/parisc/kernel/audit.c
> +++ b/arch/parisc/kernel/audit.c
> @@ -47,13 +47,13 @@ int audit_classify_syscall(int abi, unsigned syscall)
>  #endif
>   switch (syscall) {
>   case __NR_open:
> - return 2;
> + return AUDITSC_OPEN;
>   case __NR_openat:
> - return 3;
> + return AUDITSC_OPENAT;
>   case __NR_execve:
> - return 5;
> + return AUDITSC_EXECVE;
>   default:
> - return 0;
> + return AUDITSC_NATIVE;
>   }
>  }
>  
> diff --git a/arch/parisc/kernel/compat_audit.c 
> b/arch/parisc/kernel/compat_audit.c
> index 20c39c9d86a9..1d6347d37d92 100644
> --- a/arch/parisc/kernel/compat_audit.c
> +++ b/arch/parisc/kernel/compat_audit.c
> @@ -1,4 +1,5 @@
>  // SPDX-License-Identifier: GPL-2.0
> +#include 
>  #include 
>  
>  unsigned int parisc32_dir_class[] = {
> @@ -30,12 +31,12 @@ int parisc32_classify_syscall(unsigned syscall)
>  {
>   switch (syscall) {
>   case __NR_

Re: [PATCH 0/2] audit: add support for openat2

2021-03-18 Thread Christian Brauner

On Wed, Mar 17, 2021 at 09:47:16PM -0400, Richard Guy Briggs wrote:
> The openat2(2) syscall was added in v5.6.  Add support for openat2 to the
> audit syscall classifier and for recording openat2 parameters that cannot
> be captured in the syscall parameters of the SYSCALL record.
> 
> Supporting userspace code can be found in
> https://github.com/rgbriggs/audit-userspace/tree/ghau-openat2
> 
> Supporting test case can be found in
> https://github.com/linux-audit/audit-testsuite/pull/103

Seems sensible, thank you.

--
Linux-audit mailing list
Linux-audit@redhat.com
https://listman.redhat.com/mailman/listinfo/linux-audit

Re: [PATCH 1/2] audit: add support for the openat2 syscall

2021-03-18 Thread Christian Brauner

[+Cc Aleksa, the author of openat2()]

and a comment below. :)

On Wed, Mar 17, 2021 at 09:47:17PM -0400, Richard Guy Briggs wrote:
> The openat2(2) syscall was added in kernel v5.6 with commit fddb5d430ad9
> ("open: introduce openat2(2) syscall")
> 
> Add the openat2(2) syscall to the audit syscall classifier.
> 
> See the github issue
> https://github.com/linux-audit/audit-kernel/issues/67
> 
> Signed-off-by: Richard Guy Briggs 
> ---
>  arch/alpha/kernel/audit.c  | 2 ++
>  arch/ia64/kernel/audit.c   | 2 ++
>  arch/parisc/kernel/audit.c | 2 ++
>  arch/parisc/kernel/compat_audit.c  | 2 ++
>  arch/powerpc/kernel/audit.c| 2 ++
>  arch/powerpc/kernel/compat_audit.c | 2 ++
>  arch/s390/kernel/audit.c   | 2 ++
>  arch/s390/kernel/compat_audit.c| 2 ++
>  arch/sparc/kernel/audit.c  | 2 ++
>  arch/sparc/kernel/compat_audit.c   | 2 ++
>  arch/x86/ia32/audit.c  | 2 ++
>  arch/x86/kernel/audit_64.c | 2 ++
>  kernel/auditsc.c   | 3 +++
>  lib/audit.c| 4 
>  lib/compat_audit.c | 4 
>  15 files changed, 35 insertions(+)
> 
> diff --git a/arch/alpha/kernel/audit.c b/arch/alpha/kernel/audit.c
> index 96a9d18ff4c4..06a911b685d1 100644
> --- a/arch/alpha/kernel/audit.c
> +++ b/arch/alpha/kernel/audit.c
> @@ -42,6 +42,8 @@ int audit_classify_syscall(int abi, unsigned syscall)
>   return 3;
>   case __NR_execve:
>   return 5;
> + case __NR_openat2:
> + return 6;
>   default:
>   return 0;
>   }
> diff --git a/arch/ia64/kernel/audit.c b/arch/ia64/kernel/audit.c
> index 5192ca899fe6..5eaa888c8fd3 100644
> --- a/arch/ia64/kernel/audit.c
> +++ b/arch/ia64/kernel/audit.c
> @@ -43,6 +43,8 @@ int audit_classify_syscall(int abi, unsigned syscall)
>   return 3;
>   case __NR_execve:
>   return 5;
> + case __NR_openat2:
> + return 6;
>   default:
>   return 0;
>   }
> diff --git a/arch/parisc/kernel/audit.c b/arch/parisc/kernel/audit.c
> index 9eb47b2225d2..fc721a7727ba 100644
> --- a/arch/parisc/kernel/audit.c
> +++ b/arch/parisc/kernel/audit.c
> @@ -52,6 +52,8 @@ int audit_classify_syscall(int abi, unsigned syscall)
>   return 3;
>   case __NR_execve:
>   return 5;
> + case __NR_openat2:
> + return 6;
>   default:
>   return 0;
>   }
> diff --git a/arch/parisc/kernel/compat_audit.c 
> b/arch/parisc/kernel/compat_audit.c
> index 20c39c9d86a9..fc6d35918c44 100644
> --- a/arch/parisc/kernel/compat_audit.c
> +++ b/arch/parisc/kernel/compat_audit.c
> @@ -35,6 +35,8 @@ int parisc32_classify_syscall(unsigned syscall)
>   return 3;
>   case __NR_execve:
>   return 5;
> + case __NR_openat2:
> + return 6;
>   default:
>   return 1;
>   }
> diff --git a/arch/powerpc/kernel/audit.c b/arch/powerpc/kernel/audit.c
> index a27f3d09..8f32700b0baa 100644
> --- a/arch/powerpc/kernel/audit.c
> +++ b/arch/powerpc/kernel/audit.c
> @@ -54,6 +54,8 @@ int audit_classify_syscall(int abi, unsigned syscall)
>   return 4;
>   case __NR_execve:
>   return 5;
> + case __NR_openat2:
> + return 6;
>   default:
>   return 0;
>   }
> diff --git a/arch/powerpc/kernel/compat_audit.c 
> b/arch/powerpc/kernel/compat_audit.c
> index 55c6ccda0a85..ebe45534b1c9 100644
> --- a/arch/powerpc/kernel/compat_audit.c
> +++ b/arch/powerpc/kernel/compat_audit.c
> @@ -38,6 +38,8 @@ int ppc32_classify_syscall(unsigned syscall)
>   return 4;
>   case __NR_execve:
>   return 5;
> + case __NR_openat2:
> + return 6;
>   default:
>   return 1;
>   }
> diff --git a/arch/s390/kernel/audit.c b/arch/s390/kernel/audit.c
> index d395c6c9944c..d964cb94cfaf 100644
> --- a/arch/s390/kernel/audit.c
> +++ b/arch/s390/kernel/audit.c
> @@ -54,6 +54,8 @@ int audit_classify_syscall(int abi, unsigned syscall)
>   return 4;
>   case __NR_execve:
>   return 5;
> + case __NR_openat2:
> + return 6;
>   default:
>   return 0;
>   }
> diff --git a/arch/s390/kernel/compat_audit.c b/arch/s390/kernel/compat_audit.c
> index 444fb1f66944..f7b32933ce0e 100644
> --- a/arch/s390/kernel/compat_audit.c
> +++ b/arch/s390/kernel/compat_audit.c
> @@ -39,6 +39,8 @@ int s390_classify_syscall(unsigned syscall)
>   return 4;
>   case __NR_execve:
>   return 5;
> + case __NR_openat2:
> + return 6;
>   default:
>   return 1;
>   }
> diff --git a/arch/sparc/kernel/audit.c b/arch/sparc/kernel/audit.c
> index a6e91bf34d48..b6dcca9c6520 100644
> --- a/arch/sparc/kernel/audit.c
> +++ b/arch/sparc/kernel/audit.c
> @@ -55,6 +55,8 @@ int audit_classify_syscall(int abi, unsigned int syscall)
>

Re: [PATCH 1/2] audit: add support for the openat2 syscall

2021-03-18 Thread Christian Brauner

On Thu, Mar 18, 2021 at 11:48:45AM +0100, Christian Brauner wrote:
> [+Cc Aleksa, the author of openat2()]
> 
> and a comment below. :)
> 
> On Wed, Mar 17, 2021 at 09:47:17PM -0400, Richard Guy Briggs wrote:
> > The openat2(2) syscall was added in kernel v5.6 with commit fddb5d430ad9
> > ("open: introduce openat2(2) syscall")
> > 
> > Add the openat2(2) syscall to the audit syscall classifier.
> > 
> > See the github issue
> > https://github.com/linux-audit/audit-kernel/issues/67
> > 
> > Signed-off-by: Richard Guy Briggs 
> > ---
> >  arch/alpha/kernel/audit.c  | 2 ++
> >  arch/ia64/kernel/audit.c   | 2 ++
> >  arch/parisc/kernel/audit.c | 2 ++
> >  arch/parisc/kernel/compat_audit.c  | 2 ++
> >  arch/powerpc/kernel/audit.c| 2 ++
> >  arch/powerpc/kernel/compat_audit.c | 2 ++
> >  arch/s390/kernel/audit.c   | 2 ++
> >  arch/s390/kernel/compat_audit.c| 2 ++
> >  arch/sparc/kernel/audit.c  | 2 ++
> >  arch/sparc/kernel/compat_audit.c   | 2 ++
> >  arch/x86/ia32/audit.c  | 2 ++
> >  arch/x86/kernel/audit_64.c | 2 ++
> >  kernel/auditsc.c   | 3 +++
> >  lib/audit.c| 4 
> >  lib/compat_audit.c | 4 
> >  15 files changed, 35 insertions(+)
> > 
> > diff --git a/arch/alpha/kernel/audit.c b/arch/alpha/kernel/audit.c
> > index 96a9d18ff4c4..06a911b685d1 100644
> > --- a/arch/alpha/kernel/audit.c
> > +++ b/arch/alpha/kernel/audit.c
> > @@ -42,6 +42,8 @@ int audit_classify_syscall(int abi, unsigned syscall)
> > return 3;
> > case __NR_execve:
> > return 5;
> > +   case __NR_openat2:
> > +   return 6;
> > default:
> > return 0;
> > }
> > diff --git a/arch/ia64/kernel/audit.c b/arch/ia64/kernel/audit.c
> > index 5192ca899fe6..5eaa888c8fd3 100644
> > --- a/arch/ia64/kernel/audit.c
> > +++ b/arch/ia64/kernel/audit.c
> > @@ -43,6 +43,8 @@ int audit_classify_syscall(int abi, unsigned syscall)
> > return 3;
> > case __NR_execve:
> > return 5;
> > +   case __NR_openat2:
> > +   return 6;
> > default:
> > return 0;
> > }
> > diff --git a/arch/parisc/kernel/audit.c b/arch/parisc/kernel/audit.c
> > index 9eb47b2225d2..fc721a7727ba 100644
> > --- a/arch/parisc/kernel/audit.c
> > +++ b/arch/parisc/kernel/audit.c
> > @@ -52,6 +52,8 @@ int audit_classify_syscall(int abi, unsigned syscall)
> > return 3;
> > case __NR_execve:
> > return 5;
> > +   case __NR_openat2:
> > +   return 6;
> > default:
> > return 0;
> > }
> > diff --git a/arch/parisc/kernel/compat_audit.c 
> > b/arch/parisc/kernel/compat_audit.c
> > index 20c39c9d86a9..fc6d35918c44 100644
> > --- a/arch/parisc/kernel/compat_audit.c
> > +++ b/arch/parisc/kernel/compat_audit.c
> > @@ -35,6 +35,8 @@ int parisc32_classify_syscall(unsigned syscall)
> > return 3;
> > case __NR_execve:
> > return 5;
> > +   case __NR_openat2:
> > +   return 6;
> > default:
> > return 1;
> > }
> > diff --git a/arch/powerpc/kernel/audit.c b/arch/powerpc/kernel/audit.c
> > index a27f3d09..8f32700b0baa 100644
> > --- a/arch/powerpc/kernel/audit.c
> > +++ b/arch/powerpc/kernel/audit.c
> > @@ -54,6 +54,8 @@ int audit_classify_syscall(int abi, unsigned syscall)
> > return 4;
> > case __NR_execve:
> > return 5;
> > +   case __NR_openat2:
> > +   return 6;
> > default:
> > return 0;
> > }
> > diff --git a/arch/powerpc/kernel/compat_audit.c 
> > b/arch/powerpc/kernel/compat_audit.c
> > index 55c6ccda0a85..ebe45534b1c9 100644
> > --- a/arch/powerpc/kernel/compat_audit.c
> > +++ b/arch/powerpc/kernel/compat_audit.c
> > @@ -38,6 +38,8 @@ int ppc32_classify_syscall(unsigned syscall)
> > return 4;
> > case __NR_execve:
> > return 5;
> > +   case __NR_openat2:
> > +   return 6;
> > default:
> > return 1;
> > }
> > diff --git a/arch/s390/kernel/audit.c b/arch/s390/kernel/audit.c
> > index d395c6c9944c..d964cb94cfaf 100644
> > --- a/arch/s390/kernel/audit.c
> > +++ b/arch/s390/kernel/audit.c
> > @@ -54,6 +54,8 @@ int audit_classify_syscall(int abi, unsigned sysc

Re: [PATCH v2 07/39] mount: attach mappings to mounts

2020-11-24 Thread Christian Brauner

On Tue, Nov 24, 2020 at 08:37:40AM -0500, Tycho Andersen wrote:
> On Tue, Nov 24, 2020 at 01:30:35PM +0100, Christian Brauner wrote:
> > On Mon, Nov 23, 2020 at 11:24:28AM -0500, Tycho Andersen wrote:
> > > On Mon, Nov 23, 2020 at 10:47:19AM -0500, Tycho Andersen wrote:
> > > > On Sun, Nov 15, 2020 at 11:36:46AM +0100, Christian Brauner wrote:
> > > > > +static inline struct user_namespace *mnt_user_ns(const struct 
> > > > > vfsmount *mnt)
> > > > > +{
> > > > > + return mnt->mnt_user_ns;
> > > > > +}
> > > > 
> > > > I think you might want a READ_ONCE() here. Right now it seems ok, since 
> > > > the
> > > > mnt_user_ns can't change, but if we ever allow it to change (and I see 
> > > > you have
> > > > a idmapped_mounts_wip_v2_allow_to_change_idmapping branch on your 
> > > > public tree
> > > > :D), the pattern of,
> > > > 
> > > > user_ns = mnt_user_ns(path->mnt);
> > > > if (mnt_idmapped(path->mnt)) {
> > > > uid = kuid_from_mnt(user_ns, uid);
> > > > gid = kgid_from_mnt(user_ns, gid);
> > > > }
> > > > 
> > > > could race.
> > > 
> > > Actually, isn't a race possible now?
> > > 
> > > kuid_from_mnt(mnt_user_ns(path->mnt) /* &init_user_ns */);
> > > WRITE_ONCE(mnt->mnt.mnt_user_ns, user_ns);
> > > WRITE_ONCE(m->mnt.mnt_flags, flags);
> > > kgid_from_mnt(mnt_user_ns(path->mnt) /* the right user ns */);
> > > 
> > > So maybe it should be:
> > > 
> > >  if (mnt_idmapped(path->mnt)) {
> > >  barrier();
> > >  user_ns = mnt_user_ns(path->mnt);
> > >  uid = kuid_from_mnt(user_ns, uid);
> > >  gid = kgid_from_mnt(user_ns, gid);
> > >  }
> > > 
> > > since there's no data dependency between mnt_idmapped() and
> > > mnt_user_ns()?
> > 
> > I think I had something to handle this case in another branch of mine.
> > The READ_ONCE() you mentioned in another patch I had originally dropped
> > because I wasn't sure whether it works on pointers but after talking to
> > Jann and David it seems that it handles pointers fine.
> > Let me take a look and fix it in the next version. I just finished
> > porting the test suite to xfstests as Christoph requested and I'm
> > looking at this now.
> 
> Another way would be to just have mnt_idmapped() test
> mnt_user_ns() != &init_user_ns instead of the flags; then I think you
> get the data dependency and thus correct ordering for free.

I indeed dropped mnt_idmapped() which is unnecessary. :)
I think we should still use smp_store_release() in mnt_user_ns() paired
with smp_load_acquire() in do_idmap_mount() though.

Christian

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

Re: [PATCH v2 07/39] mount: attach mappings to mounts

2020-11-24 Thread Christian Brauner

On Mon, Nov 23, 2020 at 11:24:28AM -0500, Tycho Andersen wrote:
> On Mon, Nov 23, 2020 at 10:47:19AM -0500, Tycho Andersen wrote:
> > On Sun, Nov 15, 2020 at 11:36:46AM +0100, Christian Brauner wrote:
> > > +static inline struct user_namespace *mnt_user_ns(const struct vfsmount 
> > > *mnt)
> > > +{
> > > + return mnt->mnt_user_ns;
> > > +}
> > 
> > I think you might want a READ_ONCE() here. Right now it seems ok, since the
> > mnt_user_ns can't change, but if we ever allow it to change (and I see you 
> > have
> > a idmapped_mounts_wip_v2_allow_to_change_idmapping branch on your public 
> > tree
> > :D), the pattern of,
> > 
> > user_ns = mnt_user_ns(path->mnt);
> > if (mnt_idmapped(path->mnt)) {
> > uid = kuid_from_mnt(user_ns, uid);
> > gid = kgid_from_mnt(user_ns, gid);
> > }
> > 
> > could race.
> 
> Actually, isn't a race possible now?
> 
> kuid_from_mnt(mnt_user_ns(path->mnt) /* &init_user_ns */);
> WRITE_ONCE(mnt->mnt.mnt_user_ns, user_ns);
> WRITE_ONCE(m->mnt.mnt_flags, flags);
> kgid_from_mnt(mnt_user_ns(path->mnt) /* the right user ns */);
> 
> So maybe it should be:
> 
>  if (mnt_idmapped(path->mnt)) {
>  barrier();
>  user_ns = mnt_user_ns(path->mnt);
>  uid = kuid_from_mnt(user_ns, uid);
>  gid = kgid_from_mnt(user_ns, gid);
>  }
> 
> since there's no data dependency between mnt_idmapped() and
> mnt_user_ns()?

I think I had something to handle this case in another branch of mine.
The READ_ONCE() you mentioned in another patch I had originally dropped
because I wasn't sure whether it works on pointers but after talking to
Jann and David it seems that it handles pointers fine.
Let me take a look and fix it in the next version. I just finished
porting the test suite to xfstests as Christoph requested and I'm
looking at this now.

Thanks!
Christian

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

Re: [PATCH v2 07/39] mount: attach mappings to mounts

2020-11-24 Thread Christian Brauner

On Tue, Nov 24, 2020 at 08:44:59AM -0500, Tycho Andersen wrote:
> On Tue, Nov 24, 2020 at 02:40:35PM +0100, Christian Brauner wrote:
> > On Tue, Nov 24, 2020 at 08:37:40AM -0500, Tycho Andersen wrote:
> > > On Tue, Nov 24, 2020 at 01:30:35PM +0100, Christian Brauner wrote:
> > > > On Mon, Nov 23, 2020 at 11:24:28AM -0500, Tycho Andersen wrote:
> > > > > On Mon, Nov 23, 2020 at 10:47:19AM -0500, Tycho Andersen wrote:
> > > > > > On Sun, Nov 15, 2020 at 11:36:46AM +0100, Christian Brauner wrote:
> > > > > > > +static inline struct user_namespace *mnt_user_ns(const struct 
> > > > > > > vfsmount *mnt)
> > > > > > > +{
> > > > > > > + return mnt->mnt_user_ns;
> > > > > > > +}
> > > > > > 
> > > > > > I think you might want a READ_ONCE() here. Right now it seems ok, 
> > > > > > since the
> > > > > > mnt_user_ns can't change, but if we ever allow it to change (and I 
> > > > > > see you have
> > > > > > a idmapped_mounts_wip_v2_allow_to_change_idmapping branch on your 
> > > > > > public tree
> > > > > > :D), the pattern of,
> > > > > > 
> > > > > > user_ns = mnt_user_ns(path->mnt);
> > > > > > if (mnt_idmapped(path->mnt)) {
> > > > > > uid = kuid_from_mnt(user_ns, uid);
> > > > > > gid = kgid_from_mnt(user_ns, gid);
> > > > > > }
> > > > > > 
> > > > > > could race.
> > > > > 
> > > > > Actually, isn't a race possible now?
> > > > > 
> > > > > kuid_from_mnt(mnt_user_ns(path->mnt) /* &init_user_ns */);
> > > > > WRITE_ONCE(mnt->mnt.mnt_user_ns, user_ns);
> > > > > WRITE_ONCE(m->mnt.mnt_flags, flags);
> > > > > kgid_from_mnt(mnt_user_ns(path->mnt) /* the right user ns */);
> > > > > 
> > > > > So maybe it should be:
> > > > > 
> > > > >  if (mnt_idmapped(path->mnt)) {
> > > > >  barrier();
> > > > >  user_ns = mnt_user_ns(path->mnt);
> > > > >  uid = kuid_from_mnt(user_ns, uid);
> > > > >  gid = kgid_from_mnt(user_ns, gid);
> > > > >  }
> > > > > 
> > > > > since there's no data dependency between mnt_idmapped() and
> > > > > mnt_user_ns()?
> > > > 
> > > > I think I had something to handle this case in another branch of mine.
> > > > The READ_ONCE() you mentioned in another patch I had originally dropped
> > > > because I wasn't sure whether it works on pointers but after talking to
> > > > Jann and David it seems that it handles pointers fine.
> > > > Let me take a look and fix it in the next version. I just finished
> > > > porting the test suite to xfstests as Christoph requested and I'm
> > > > looking at this now.
> > > 
> > > Another way would be to just have mnt_idmapped() test
> > > mnt_user_ns() != &init_user_ns instead of the flags; then I think you
> > > get the data dependency and thus correct ordering for free.
> > 
> > I indeed dropped mnt_idmapped() which is unnecessary. :)
> 
> It still might be a nice helper to prevent people from checking the
> flags and forgetting that there's a memory ordering issue, though.

I just mentioned this offline but for the record: the flag is gone since
we can rely on the pointer alone. :)

Christian

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

Re: [PATCH v2 31/39] audit: handle idmapped mounts

2020-11-23 Thread Christian Brauner

On Sun, Nov 22, 2020 at 05:17:39PM -0500, Paul Moore wrote:
> On Sun, Nov 15, 2020 at 5:43 AM Christian Brauner
>  wrote:
> >
> > Audit will sometimes log the inode's i_uid and i_gid. Enable audit to log 
> > the
> > mapped inode when it is accessed from an idmapped mount.
> 
> I mentioned this in an earlier patch in this patchset, but it is worth

I did not receive that message.

> repeating here: audit currently records information in the context of
> the initial/host namespace and I believe it should probably stay that
> way until the rest of the namespace smarts that Richard is working on

Ah, that's good to know and makes the patchset simpler so I'm all for
it.

Christian

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

Re: [PATCH v2 14/39] commoncap: handle idmapped mounts

2020-11-23 Thread Christian Brauner

On Sun, Nov 22, 2020 at 04:18:55PM -0500, Paul Moore wrote:
> On Sun, Nov 15, 2020 at 5:39 AM Christian Brauner
>  wrote:
> > When interacting with user namespace and non-user namespace aware
> > filesystem capabilities the vfs will perform various security checks to
> > determine whether or not the filesystem capabilities can be used by the
> > caller (e.g. during exec), or even whether they need to be removed. The
> > main infrastructure for this resides in the capability codepaths but they
> > are called through the LSM security infrastructure even though they are not
> > technically an LSM or optional. This extends the existing security hooks
> > security_inode_removexattr(), security_inode_killpriv(),
> > security_inode_getsecurity() to pass down the mount's user namespace and
> > makes them aware of idmapped mounts.
> > In order to actually get filesystem capabilities from disk the capability
> > infrastructure exposes the get_vfs_caps_from_disk() helper. For user
> > namespace aware filesystem capabilities a root uid is stored alongside the
> > capabilities.
> > In order to determine whether the caller can make use of the filesystem
> > capability or whether it needs to be ignored it is translated according to
> > the superblock's user namespace. If it can be translated to uid 0 according
> > to that id mapping the caller can use the filesystem capabilities stored on
> > disk. If we are accessing the inode that holds the filesystem capabilities
> > through an idmapped mount we need to map the root uid according to the
> > mount's user namespace.
> > Afterwards the checks are identical to non-idmapped mounts. Reading
> > filesystem caps from disk enforces that the root uid associated with the
> > filesystem capability must have a mapping in the superblock's user
> > namespace and that the caller is either in the same user namespace or is a
> > descendant of the superblock's user namespace. For filesystems that are
> > mountable inside user namespace the container can just mount the filesystem
> > and won't usually need to idmap it. If it does create an idmapped mount it
> > can mark it with a user namespace it has created and which is therefore a
> > descendant of the s_user_ns. For filesystems that are not mountable inside
> > user namespaces the descendant rule is trivially true because the s_user_ns
> > will be the initial user namespace.
> >
> > If the initial user namespace is passed all operations are a nop so
> > non-idmapped mounts will not see a change in behavior and will also not see
> > any performance impact.
> >
> > Cc: Christoph Hellwig 
> > Cc: David Howells 
> > Cc: Al Viro 
> > Cc: linux-fsde...@vger.kernel.org
> > Signed-off-by: Christian Brauner 
> 
> ...
> 
> > diff --git a/kernel/auditsc.c b/kernel/auditsc.c
> > index 8dba8f0983b5..ddb9213a3e81 100644
> > --- a/kernel/auditsc.c
> > +++ b/kernel/auditsc.c
> > @@ -1944,7 +1944,7 @@ static inline int audit_copy_fcaps(struct audit_names 
> > *name,
> > if (!dentry)
> > return 0;
> >
> > -   rc = get_vfs_caps_from_disk(dentry, &caps);
> > +   rc = get_vfs_caps_from_disk(&init_user_ns, dentry, &caps);
> > if (rc)
> > return rc;
> >
> > @@ -2495,7 +2495,8 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
> > ax->d.next = context->aux;
> > context->aux = (void *)ax;
> >
> > -   get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
> > +   get_vfs_caps_from_disk(mnt_user_ns(bprm->file->f_path.mnt),
> > +  bprm->file->f_path.dentry, &vcaps);
> 
> As audit currently records information in the context of the
> initial/host namespace I'm guessing we don't want the mnt_user_ns()
> > call above; it seems like &init_user_ns would be the right choice
> (similar to audit_copy_fcaps()), yes?

Ok, sounds good. It also makes the patchset simpler.
Note that I'm currently not on the audit mailing list so this is likely
not going to show up there.

(Fwiw, I responded to you in your other mail too.)

Christian

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

Re: [PATCH v2 00/39] fs: idmapped mounts

2020-11-20 Thread Christian Brauner

On Fri, Nov 20, 2020 at 09:12:47AM +, Christoph Hellwig wrote:
> On Fri, Nov 20, 2020 at 10:10:44AM +0100, Christian Brauner wrote:
> > Maybe you didn't see this or you're referring to xfstests but this
> > series contains a >=4000 lines long test-suite that validates all core
> > features with and without idmapped mounts. It's the last patch in this
> > version of the series and it's located in:
> > tools/testing/selftests/idmap_mounts.
> > 
> > > Every time a filesystem is added this test-suite will be updated. We
> > > would prefer if this test would be shipped with the kernel itself and
> > not in a separate test-suite such as xfstests. But we're happy to add
> > patches for the latter at some point too.
> 
> > selftests is a complete pain to use, partially because it is not
> integrated with the framework we file system developers use (xfstests)
> and partially because having the test suite in the kernel tree really
> breaks a lot of the typical use cases.  So I think we'll need to wire
> this up in the proper place instead.

Ok, I think I can basically port the test-suite at the end of this patch
series so that it can be carried in xfstests/src/idmapped_mounts.c

I'll start doing that now.
It would make it a bit easier if we could carry it as a single file for
now.

Christian

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

Re: [PATCH v2 00/39] fs: idmapped mounts

2020-11-20 Thread Christian Brauner

On Thu, Nov 19, 2020 at 06:33:09PM -0800, Darrick J. Wong wrote:
> On Sun, Nov 15, 2020 at 11:36:39AM +0100, Christian Brauner wrote:
> > Hey everyone,
> > 
> > This is v2. It is reworked according to the reviews coming from
> > Christoph and others to adapt all relevant helpers and inode_operations
> > methods to account for idmapped mounts instead of introducing new
> > helpers and methods specific to idmapped mounts like we did before.
> > We've also moved the overlayfs conversion to handle idmapped mounts into
> > a separate patchset that will be sent out separately after the core
> > > changes. The converted filesystems in this series include fat and ext4.
> > > The config option to disable idmapped mounts has been moved from a vfs
> > > config option to a per-filesystem option. They default to off. Having a
> > config option allows us to gain some confidence in the patchset over
> > multiple kernel releases.
> 
> So uh I noticed that the new user_ns parameter passed to the xfs
> functions don't actually get used for anything.  Did I miss something
> when I pulled the branch, or does this simply reflect xfs not having any
> idmap support?  And how would one add such a thing?  Replace the
> &init_user_ns with the passed-in user_ns parameter?

Sorry, maybe you missed it but that's mentioned further below in the
commit messages:

"In order to support idmapped mounts, filesystems need to be changed and
mark themselves with the FS_ALLOW_IDMAP flag in fs_flags. The initial
version contains fat and ext4 including a list of examples. But patches
for other filesystems are actively worked on but will be sent out
separately. We are here to see this through and there are multiple
people involved in converting filesystems. So filesystem developers are
not left alone with this."

> 
> > > There are two notable things about this version. First, that it comes
> > with a really large test-suite to test current vfs behavior and
> > idmapped mounts behavior. We intend this test-suite to grow over time
> > and at some point cover most basic core vfs functionality that isn't
> > covered in xfstests and have it be part of the selftests.
> 
> Please put enough of a test in fstests to do basic validation that we
> filesystem developers didn't accidentally screw things up.

Maybe you didn't see this or you're referring to xfstests but this
series contains a >=4000 lines long test-suite that validates all core
features with and without idmapped mounts. It's the last patch in this
version of the series and it's located in:
tools/testing/selftests/idmap_mounts.

Every time a filesystem is added this test-suite will be updated. We
would prefer if this test would be shipped with the kernel itself and
not in a separate test-suite such as xfstests. But we're happy to add
patches for the latter at some point too.

Christian

> 
> --D
> 
> > > Second, while working on adapting this patchset to the requested
> > changes, the runC and containerd crowd was nice enough to adapt
> > containerd to this patchset to make use of idmapped mounts in one of the
> > most widely used container runtimes:
> > https://github.com/containerd/containerd/pull/4734
> > 
> > With this patchset we make it possible to attach idmappings to bind
> > mounts. This handles several common use-cases. Here are just a few:
> > - Shifting of a container rootfs or base image without having to mangle
> >   every file (runc, Docker, containerd, k8s, LXD, systemd ...)
> > - Sharing of data between host or privileged containers with
> >   underprivileged containers (runc, Docker, containerd, k8s, LXD, ...)
> > - Shifting of subset of ownership-less filesystems (vfat) for use by
> >   multiple users, effectively allowing for DAC on such devices (systemd,
> >   Android, ...)
> > - Data sharing between multiple user namespaces with incompatible maps
> >   (LXD, k8s, ...)
> > Making it possible to share directories and mounts between users with
> > different uids and gids is itself quite an important use-case in
> > distributed systems environments. It's of course especially useful in
> > general for portable usb sticks, sharing data between multiple users in
> > general, and sharing home directories between multiple users. The last
> > example is now elegantly expressed in systemd's homed concept for
> > portable home directories. As mentioned above, idmapped mounts also
> > allow data from the host to be shared with unprivileged containers,
> > between privileged and unprivileged containers simultaneously and in
> > addition also between unprivileged containers with different idmappings
> > whenever they are used to isolate one container completely from

Re: [PATCH v2 00/39] fs: idmapped mounts

2020-11-18 Thread Christian Brauner

On Tue, Nov 17, 2020 at 04:54:33PM -0700, Jonathan Corbet wrote:
> On Sun, 15 Nov 2020 11:36:39 +0100
> Christian Brauner  wrote:
> 
> One quick question...
> 
> > I have written a simple tool available at
> > https://github.com/brauner/mount-idmapped that allows to create idmapped
> > mounts so people can play with this patch series.
> 
> I spent a while looking at that tool.  When actually setting the namespace
> for the mapping, it uses MOUNT_ATTR_SHIFT rather than MOUNT_ATTR_IDMAP.
> The value is the same, so I expect it works...:)  But did that perhaps not
> get updated to reflect a name change?

Yep, that was my mistake. I'll fix it up in the repo for that tool now
and maybe improve it a little too! :)

Christian

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

[PATCH v2 25/39] init: handle idmapped mounts

2020-11-15 Thread Christian Brauner

Enable the init helpers to handle idmapped mounts by passing down the mount's
user namespace.

Cc: Christoph Hellwig 
Cc: David Howells 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
/* v2 */
patch introduced
---
 fs/init.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/init.c b/fs/init.c
index 76f493600030..334e4c9c07eb 100644
--- a/fs/init.c
+++ b/fs/init.c
@@ -49,7 +49,7 @@ int __init init_chdir(const char *filename)
error = kern_path(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, );
if (error)
return error;
-   error = inode_permission(_user_ns, path.dentry->d_inode,
+   error = inode_permission(mnt_user_ns(path.mnt), path.dentry->d_inode,
 MAY_EXEC | MAY_CHDIR);
if (!error)
set_fs_pwd(current->fs, );
@@ -65,7 +65,7 @@ int __init init_chroot(const char *filename)
error = kern_path(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, );
if (error)
return error;
-   error = inode_permission(_user_ns, path.dentry->d_inode,
+   error = inode_permission(mnt_user_ns(path.mnt), path.dentry->d_inode,
 MAY_EXEC | MAY_CHDIR);
if (error)
goto dput_and_out;
@@ -120,7 +120,7 @@ int __init init_eaccess(const char *filename)
error = kern_path(filename, LOOKUP_FOLLOW, );
if (error)
return error;
-   error = inode_permission(_user_ns, d_inode(path.dentry),
+   error = inode_permission(mnt_user_ns(path.mnt), d_inode(path.dentry),
 MAY_ACCESS);
path_put();
return error;
@@ -190,7 +190,7 @@ int __init init_link(const char *oldname, const char 
*newname)
error = security_path_link(old_path.dentry, _path, new_dentry);
if (error)
goto out_dput;
-   error = vfs_link(old_path.dentry, _user_ns, 
+   error = vfs_link(old_path.dentry, _user_ns,
 new_path.dentry->d_inode, new_dentry, NULL);
 out_dput:
done_path_create(_path, new_dentry);
-- 
2.29.2

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

[PATCH v2 23/39] fcntl: handle idmapped mounts

2020-11-15 Thread Christian Brauner

Enable the setfl() helper to handle idmapped mounts by passing down the
mount's user namespace.

Cc: Christoph Hellwig 
Cc: David Howells 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
/* v2 */
patch introduced
---
 fs/fcntl.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/fcntl.c b/fs/fcntl.c
index df091d435603..ed330fa91438 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -46,7 +47,7 @@ static int setfl(int fd, struct file * filp, unsigned long 
arg)
 
/* O_NOATIME can only be set by the owner or superuser */
if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME))
-   if (!inode_owner_or_capable(_user_ns, inode))
+   if (!inode_owner_or_capable(mnt_user_ns(filp->f_path.mnt), 
inode))
return -EPERM;
 
/* required for strict SunOS emulation */
-- 
2.29.2

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

[PATCH v2 15/39] stat: handle idmapped mounts

2020-11-15 Thread Christian Brauner

The generic_fillattr() helper fills in the basic attributes associated with
an inode. Enable it to handle idmapped mounts. If the inode is accessed
through an idmapped mount we need to map it according to the mount's user
namespace. If the initial user namespace is passed all operations are a nop
so non-idmapped mounts will not see a change in behavior and will also not
see any performance impact.

Cc: Christoph Hellwig 
Cc: David Howells 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
/* v2 */
- Christoph Hellwig:
  - Don't pollute the vfs with additional helpers simply extend the existing
helpers with an additional argument and switch all callers.
---
 fs/9p/vfs_inode.c  |  4 ++--
 fs/9p/vfs_inode_dotl.c |  4 ++--
 fs/afs/inode.c |  2 +-
 fs/btrfs/inode.c   |  2 +-
 fs/ceph/inode.c|  2 +-
 fs/cifs/inode.c|  2 +-
 fs/coda/inode.c|  2 +-
 fs/ecryptfs/inode.c|  4 ++--
 fs/erofs/inode.c   |  2 +-
 fs/exfat/file.c|  2 +-
 fs/ext2/inode.c|  2 +-
 fs/ext4/inode.c|  2 +-
 fs/f2fs/file.c |  2 +-
 fs/fat/file.c  |  2 +-
 fs/fuse/dir.c  |  2 +-
 fs/gfs2/inode.c|  2 +-
 fs/hfsplus/inode.c |  2 +-
 fs/kernfs/inode.c  |  2 +-
 fs/libfs.c |  4 ++--
 fs/minix/inode.c   |  2 +-
 fs/nfs/inode.c |  2 +-
 fs/nfs/namespace.c |  2 +-
 fs/ocfs2/file.c|  2 +-
 fs/orangefs/inode.c|  2 +-
 fs/proc/base.c |  8 
 fs/proc/generic.c  |  2 +-
 fs/proc/proc_net.c |  2 +-
 fs/proc/proc_sysctl.c  |  2 +-
 fs/proc/root.c |  2 +-
 fs/stat.c  | 10 ++
 fs/sysv/itree.c|  2 +-
 fs/ubifs/dir.c |  2 +-
 fs/udf/symlink.c   |  2 +-
 fs/vboxsf/utils.c  |  2 +-
 include/linux/fs.h |  2 +-
 mm/shmem.c |  2 +-
 36 files changed, 48 insertions(+), 46 deletions(-)

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 404526499c94..0a5c022c1c70 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1006,7 +1006,7 @@ v9fs_vfs_getattr(const struct path *path, struct kstat 
*stat,
p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
v9ses = v9fs_dentry2v9ses(dentry);
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
-   generic_fillattr(d_inode(dentry), stat);
+   generic_fillattr(_user_ns, d_inode(dentry), stat);
return 0;
}
fid = v9fs_fid_lookup(dentry);
@@ -1018,7 +1018,7 @@ v9fs_vfs_getattr(const struct path *path, struct kstat 
*stat,
return PTR_ERR(st);
 
v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb, 0);
-   generic_fillattr(d_inode(dentry), stat);
+   generic_fillattr(_user_ns, d_inode(dentry), stat);
 
p9stat_free(st);
kfree(st);
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 282ec5cb45dc..8f3c1daf72ba 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -466,7 +466,7 @@ v9fs_vfs_getattr_dotl(const struct path *path, struct kstat 
*stat,
p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
v9ses = v9fs_dentry2v9ses(dentry);
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
-   generic_fillattr(d_inode(dentry), stat);
+   generic_fillattr(_user_ns, d_inode(dentry), stat);
return 0;
}
fid = v9fs_fid_lookup(dentry);
@@ -482,7 +482,7 @@ v9fs_vfs_getattr_dotl(const struct path *path, struct kstat 
*stat,
return PTR_ERR(st);
 
v9fs_stat2inode_dotl(st, d_inode(dentry), 0);
-   generic_fillattr(d_inode(dentry), stat);
+   generic_fillattr(_user_ns, d_inode(dentry), stat);
/* Change block size to what the server returned */
stat->blksize = st->st_blksize;
 
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 0fe8844b4bee..17ecdee404eb 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -737,7 +737,7 @@ int afs_getattr(const struct path *path, struct kstat *stat,
 
do {
read_seqbegin_or_lock(>cb_lock, );
-   generic_fillattr(inode, stat);
+   generic_fillattr(_user_ns, inode, stat);
if (test_bit(AFS_VNODE_SILLY_DELETED, >flags) &&
stat->nlink > 0)
stat->nlink -= 1;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e6f4aed0d311..99b4fd66681d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8783,7 +8783,7 @@ static int btrfs_getattr(const struct path *path, struct 
kstat *stat,
  STATX_ATTR_IMMUTABLE |
  STATX_ATTR_NODUMP);
 
-   generic_fillattr(inode, stat);
+   generic_fillattr(_user_ns, inode, stat);
stat->dev = BTRFS_I(inode)->root->anon_dev;
 
spin_lock(_I(inode)-

[PATCH v2 26/39] ioctl: handle idmapped mounts

2020-11-15 Thread Christian Brauner

Enable generic ioctls to handle idmapped mounts by passing down the mount's
user namespace.

Cc: Christoph Hellwig 
Cc: David Howells 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
/* v2 */
patch introduced
---
 fs/remap_range.c   | 7 +--
 fs/verity/enable.c | 2 +-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/fs/remap_range.c b/fs/remap_range.c
index 9e5b27641756..fe7f07228462 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -432,13 +432,16 @@ EXPORT_SYMBOL(vfs_clone_file_range);
 /* Check whether we are allowed to dedupe the destination file */
 static bool allow_file_dedupe(struct file *file)
 {
+   struct user_namespace *user_ns = mnt_user_ns(file->f_path.mnt);
+   struct inode *inode = file_inode(file);
+
if (capable(CAP_SYS_ADMIN))
return true;
if (file->f_mode & FMODE_WRITE)
return true;
-   if (uid_eq(current_fsuid(), file_inode(file)->i_uid))
+   if (uid_eq(current_fsuid(), i_uid_into_mnt(user_ns, inode)))
return true;
-   if (!inode_permission(_user_ns, file_inode(file), MAY_WRITE))
+   if (!inode_permission(user_ns, inode, MAY_WRITE))
return true;
return false;
 }
diff --git a/fs/verity/enable.c b/fs/verity/enable.c
index 7449ef0050f4..8b9ea0f0850f 100644
--- a/fs/verity/enable.c
+++ b/fs/verity/enable.c
@@ -369,7 +369,7 @@ int fsverity_ioctl_enable(struct file *filp, const void 
__user *uarg)
 * has verity enabled, and to stabilize the data being hashed.
 */
 
-   err = inode_permission(_user_ns, inode, MAY_WRITE);
+   err = inode_permission(mnt_user_ns(filp->f_path.mnt), inode, MAY_WRITE);
if (err)
return err;
 
-- 
2.29.2

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

[PATCH v2 20/39] open: handle idmapped mounts

2020-11-15 Thread Christian Brauner

For core file operations such as changing directories or chrooting,
determining file access, changing mode or ownership the vfs will verify
that the caller is privileged over the inode. Extend the various helpers to
handle idmapped mounts. If the inode is accessed through an idmapped mount
it is mapped according to the mount's user namespace.  Afterwards the
permissions checks are identical to non-idmapped mounts.  When changing
file ownership we need to map the uid and gid from the mount's user namespace. If
the initial user namespace is passed all mapping operations are a nop so
non-idmapped mounts will not see a change in behavior and will also not see
any performance impact.

Cc: Christoph Hellwig 
Cc: David Howells 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
/* v2 */
unchanged
---
 fs/open.c | 31 ---
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/fs/open.c b/fs/open.c
index 137dcc52d2f8..2e2eb55976b1 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -401,6 +401,7 @@ static const struct cred *access_override_creds(void)
 
 static long do_faccessat(int dfd, const char __user *filename, int mode, int 
flags)
 {
+   struct user_namespace *user_ns;
struct path path;
struct inode *inode;
int res;
@@ -441,7 +442,8 @@ static long do_faccessat(int dfd, const char __user 
*filename, int mode, int fla
goto out_path_release;
}
 
-   res = inode_permission(_user_ns, inode, mode | MAY_ACCESS);
+   user_ns = mnt_user_ns(path.mnt);
+   res = inode_permission(user_ns, inode, mode | MAY_ACCESS);
/* SuS v2 requires we report a read only fs too */
if (res || !(mode & S_IWOTH) || special_file(inode->i_mode))
goto out_path_release;
@@ -489,6 +491,7 @@ SYSCALL_DEFINE2(access, const char __user *, filename, int, 
mode)
 
 SYSCALL_DEFINE1(chdir, const char __user *, filename)
 {
+   struct user_namespace *user_ns;
struct path path;
int error;
unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
@@ -497,7 +500,8 @@ SYSCALL_DEFINE1(chdir, const char __user *, filename)
if (error)
goto out;
 
-   error = inode_permission(_user_ns, path.dentry->d_inode, MAY_EXEC 
| MAY_CHDIR);
+   user_ns = mnt_user_ns(path.mnt);
+   error = inode_permission(user_ns, path.dentry->d_inode, MAY_EXEC | 
MAY_CHDIR);
if (error)
goto dput_and_out;
 
@@ -515,6 +519,7 @@ SYSCALL_DEFINE1(chdir, const char __user *, filename)
 
 SYSCALL_DEFINE1(fchdir, unsigned int, fd)
 {
+   struct user_namespace *user_ns;
struct fd f = fdget_raw(fd);
int error;
 
@@ -526,7 +531,8 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd)
if (!d_can_lookup(f.file->f_path.dentry))
goto out_putf;
 
-   error = inode_permission(_user_ns, file_inode(f.file), MAY_EXEC | 
MAY_CHDIR);
+   user_ns = mnt_user_ns(f.file->f_path.mnt);
+   error = inode_permission(user_ns, file_inode(f.file), MAY_EXEC | 
MAY_CHDIR);
if (!error)
set_fs_pwd(current->fs, >f_path);
 out_putf:
@@ -537,6 +543,7 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd)
 
 SYSCALL_DEFINE1(chroot, const char __user *, filename)
 {
+   struct user_namespace *user_ns;
struct path path;
int error;
unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
@@ -545,7 +552,8 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
if (error)
goto out;
 
-   error = inode_permission(_user_ns, path.dentry->d_inode, MAY_EXEC 
| MAY_CHDIR);
+   user_ns = mnt_user_ns(path.mnt);
+   error = inode_permission(user_ns, path.dentry->d_inode, MAY_EXEC | 
MAY_CHDIR);
if (error)
goto dput_and_out;
 
@@ -570,6 +578,7 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
 
 int chmod_common(const struct path *path, umode_t mode)
 {
+   struct user_namespace *user_ns;
struct inode *inode = path->dentry->d_inode;
struct inode *delegated_inode = NULL;
struct iattr newattrs;
@@ -585,7 +594,8 @@ int chmod_common(const struct path *path, umode_t mode)
goto out_unlock;
newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
-   error = notify_change(_user_ns, path->dentry, , 
_inode);
+   user_ns = mnt_user_ns(path->mnt);
+   error = notify_change(user_ns, path->dentry, , 
_inode);
 out_unlock:
inode_unlock(inode);
if (delegated_inode) {
@@ -646,6 +656,7 @@ SYSCALL_DEFINE2(chmod, const char __user *, filename, 
umode_t, mode)
 
 int chown_common(const struct path *path, uid_t user, gid_t group)
 {
+   struct user_namespace *user_ns;
struct inode *inode = path->dentry->d_inode;
struct inode *dele

[PATCH v2 32/39] ima: handle idmapped mounts

2020-11-15 Thread Christian Brauner

IMA does sometimes access the inode's i_uid and compares it against the rules'
fowner. Enable IMA to handle idmapped mounts by passing down the mount's user
namespace. We simply make use of the helpers we introduced before.

Signed-off-by: Christian Brauner 
---
/* v2 */
unchanged
---
 fs/attr.c|  2 +-
 fs/namei.c   |  4 +--
 include/linux/ima.h  | 15 ++-
 security/integrity/ima/ima.h | 19 -
 security/integrity/ima/ima_api.c | 10 ---
 security/integrity/ima/ima_appraise.c| 14 +-
 security/integrity/ima/ima_asymmetric_keys.c |  2 +-
 security/integrity/ima/ima_main.c| 28 
 security/integrity/ima/ima_policy.c  | 17 ++--
 security/integrity/ima/ima_queue_keys.c  |  2 +-
 10 files changed, 66 insertions(+), 47 deletions(-)

diff --git a/fs/attr.c b/fs/attr.c
index 36383cd3a986..2d55b0c36544 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -357,7 +357,7 @@ int notify_change(struct user_namespace *user_ns, struct 
dentry *dentry,
 
if (!error) {
fsnotify_change(dentry, ia_valid);
-   ima_inode_post_setattr(dentry);
+   ima_inode_post_setattr(user_ns, dentry);
evm_inode_post_setattr(dentry, ia_valid);
}
 
diff --git a/fs/namei.c b/fs/namei.c
index 976ee05c5027..4cebcb002c4f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -,7 +,7 @@ struct dentry *vfs_tmpfile(struct user_namespace *user_ns,
inode->i_state |= I_LINKABLE;
spin_unlock(>i_lock);
}
-   ima_post_create_tmpfile(inode);
+   ima_post_create_tmpfile(user_ns, inode);
return child;
 
 out_err:
@@ -3645,7 +3645,7 @@ static long do_mknodat(int dfd, const char __user 
*filename, umode_t mode,
error = vfs_create(user_ns, path.dentry->d_inode,
   dentry, mode, true);
if (!error)
-   ima_post_path_mknod(dentry);
+   ima_post_path_mknod(user_ns, dentry);
break;
case S_IFCHR: case S_IFBLK:
error = vfs_mknod(user_ns, path.dentry->d_inode, dentry,
diff --git a/include/linux/ima.h b/include/linux/ima.h
index 8fa7bcfb2da2..c3e3c260ad40 100644
--- a/include/linux/ima.h
+++ b/include/linux/ima.h
@@ -16,7 +16,7 @@ struct linux_binprm;
 #ifdef CONFIG_IMA
 extern int ima_bprm_check(struct linux_binprm *bprm);
 extern int ima_file_check(struct file *file, int mask);
-extern void ima_post_create_tmpfile(struct inode *inode);
+extern void ima_post_create_tmpfile(struct user_namespace *user_ns, struct 
inode *inode);
 extern void ima_file_free(struct file *file);
 extern int ima_file_mmap(struct file *file, unsigned long prot);
 extern int ima_file_mprotect(struct vm_area_struct *vma, unsigned long prot);
@@ -27,7 +27,8 @@ extern int ima_read_file(struct file *file, enum 
kernel_read_file_id id,
 bool contents);
 extern int ima_post_read_file(struct file *file, void *buf, loff_t size,
  enum kernel_read_file_id id);
-extern void ima_post_path_mknod(struct dentry *dentry);
+extern void ima_post_path_mknod(struct user_namespace *user_ns,
+   struct dentry *dentry);
 extern int ima_file_hash(struct file *file, char *buf, size_t buf_size);
 extern void ima_kexec_cmdline(int kernel_fd, const void *buf, int size);
 
@@ -61,7 +62,8 @@ static inline int ima_file_check(struct file *file, int mask)
return 0;
 }
 
-static inline void ima_post_create_tmpfile(struct inode *inode)
+static inline void ima_post_create_tmpfile(struct user_namespace *user_ns,
+  struct inode *inode)
 {
 }
 
@@ -105,7 +107,8 @@ static inline int ima_post_read_file(struct file *file, 
void *buf, loff_t size,
return 0;
 }
 
-static inline void ima_post_path_mknod(struct dentry *dentry)
+static inline void ima_post_path_mknod(struct user_namespace *user_ns,
+  struct dentry *dentry)
 {
return;
 }
@@ -141,7 +144,7 @@ static inline void ima_post_key_create_or_update(struct key 
*keyring,
 
 #ifdef CONFIG_IMA_APPRAISE
 extern bool is_ima_appraise_enabled(void);
-extern void ima_inode_post_setattr(struct dentry *dentry);
+extern void ima_inode_post_setattr(struct user_namespace *user_ns, struct 
dentry *dentry);
 extern int ima_inode_setxattr(struct dentry *dentry, const char *xattr_name,
   const void *xattr_value, size_t xattr_value_len);
 extern int ima_inode_removexattr(struct dentry *dentry, const char 
*xattr_name);
@@ -151,7 +154,7 @@ static inline bool is_ima_appraise_enabled(void)
return 0;
 }
 
-static inline void ima_inode_post_setattr(struct dentry *den

[PATCH v2 27/39] would_dump: handle idmapped mounts

2020-11-15 Thread Christian Brauner

When determining whether or not to create a coredump the vfs will verify that
the caller is privileged over the inode. Make the would_dump() helper handle
idmapped mounts by passing down the mount's user namespace of the exec file.

Cc: Christoph Hellwig 
Cc: David Howells 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
/* v2 */
unchanged
---
 fs/exec.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index b499a1a03934..10c06fdf78a7 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1391,14 +1391,15 @@ EXPORT_SYMBOL(begin_new_exec);
 void would_dump(struct linux_binprm *bprm, struct file *file)
 {
struct inode *inode = file_inode(file);
-   if (inode_permission(_user_ns, inode, MAY_READ) < 0) {
+   struct user_namespace *ns = mnt_user_ns(file->f_path.mnt);
+   if (inode_permission(ns, inode, MAY_READ) < 0) {
struct user_namespace *old, *user_ns;
bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
 
/* Ensure mm->user_ns contains the executable */
user_ns = old = bprm->mm->user_ns;
while ((user_ns != _user_ns) &&
-  !privileged_wrt_inode_uidgid(user_ns, _user_ns, 
inode))
+  !privileged_wrt_inode_uidgid(user_ns, ns, inode))
user_ns = user_ns->parent;
 
if (old != user_ns) {
-- 
2.29.2

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

[PATCH v2 31/39] audit: handle idmapped mounts

2020-11-15 Thread Christian Brauner

Audit will sometimes log the inode's i_uid and i_gid. Enable audit to log the
mapped inode when it is accessed from an idmapped mount.

Signed-off-by: Christian Brauner 
---
/* v2 */
unchanged
---
 fs/namei.c| 14 +++---
 include/linux/audit.h | 10 ++
 ipc/mqueue.c  |  8 
 kernel/auditsc.c  | 26 ++
 4 files changed, 31 insertions(+), 27 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 1d6a0da8bf81..976ee05c5027 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -986,7 +986,7 @@ static inline int may_follow_link(struct nameidata *nd, 
const struct inode *inod
if (nd->flags & LOOKUP_RCU)
return -ECHILD;
 
-   audit_inode(nd->name, nd->stack[0].link.dentry, 0);
+   audit_inode(nd->name, user_ns, nd->stack[0].link.dentry, 0);
audit_log_path_denied(AUDIT_ANOM_LINK, "follow_link");
return -EACCES;
 }
@@ -2398,7 +2398,7 @@ int filename_lookup(int dfd, struct filename *name, 
unsigned flags,
retval = path_lookupat(, flags | LOOKUP_REVAL, path);
 
if (likely(!retval))
-   audit_inode(name, path->dentry,
+   audit_inode(name, mnt_user_ns(path->mnt), path->dentry,
flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : 0);
restore_nameidata();
putname(name);
@@ -2440,7 +2440,7 @@ static struct filename *filename_parentat(int dfd, struct 
filename *name,
if (likely(!retval)) {
*last = nd.last;
*type = nd.last_type;
-   audit_inode(name, parent->dentry, AUDIT_INODE_PARENT);
+   audit_inode(name, mnt_user_ns(parent->mnt), parent->dentry, 
AUDIT_INODE_PARENT);
} else {
putname(name);
name = ERR_PTR(retval);
@@ -3194,7 +3194,7 @@ static const char *open_last_lookups(struct nameidata *nd,
if (unlikely(error))
return ERR_PTR(error);
}
-   audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
+   audit_inode(nd->name, mnt_user_ns(nd->path.mnt), dir, 
AUDIT_INODE_PARENT);
/* trailing slashes? */
if (unlikely(nd->last.name[nd->last.len]))
return ERR_PTR(-EISDIR);
@@ -3260,7 +3260,7 @@ static int do_open(struct nameidata *nd,
return error;
}
if (!(file->f_mode & FMODE_CREATED))
-   audit_inode(nd->name, nd->path.dentry, 0);
+   audit_inode(nd->name, mnt_user_ns(nd->path.mnt), 
nd->path.dentry, 0);
if (open_flag & O_CREAT) {
if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
return -EEXIST;
@@ -3362,7 +3362,7 @@ static int do_tmpfile(struct nameidata *nd, unsigned 
flags,
goto out2;
dput(path.dentry);
path.dentry = child;
-   audit_inode(nd->name, child, 0);
+   audit_inode(nd->name, user_ns, child, 0);
/* Don't check for other permissions, the inode was just created */
error = may_open(, 0, op->open_flag);
if (error)
@@ -3381,7 +3381,7 @@ static int do_o_path(struct nameidata *nd, unsigned 
flags, struct file *file)
struct path path;
int error = path_lookupat(nd, flags, );
if (!error) {
-   audit_inode(nd->name, path.dentry, 0);
+   audit_inode(nd->name, mnt_user_ns(path.mnt), path.dentry, 0);
error = vfs_open(, file);
path_put();
}
diff --git a/include/linux/audit.h b/include/linux/audit.h
index b3d859831a31..217d2b0c273e 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -293,8 +293,8 @@ extern void __audit_syscall_exit(int ret_success, long 
ret_value);
 extern struct filename *__audit_reusename(const __user char *uptr);
 extern void __audit_getname(struct filename *name);
 extern void __audit_getcwd(void);
-extern void __audit_inode(struct filename *name, const struct dentry *dentry,
-   unsigned int flags);
+extern void __audit_inode(struct filename *name, struct user_namespace 
*user_ns,
+ const struct dentry *dentry, unsigned int flags);
 extern void __audit_file(const struct file *);
 extern void __audit_inode_child(struct inode *parent,
const struct dentry *dentry,
@@ -357,10 +357,11 @@ static inline void audit_getcwd(void)
__audit_getcwd();
 }
 static inline void audit_inode(struct filename *name,
+   struct user_namespace *user_ns,
const struct dentry *dentry,
unsigned int aflags) {
if (unlikely(!audit_dummy_context()))
-   __audit_inode(name, dent

[PATCH v2 09/39] namei: add idmapped mount aware permission helpers

2020-11-15 Thread Christian Brauner

The two helpers inode_permission() and generic_permission() are used by the
vfs to perform basic permission checking by verifying that the caller is
privileged over an inode. In order to handle idmapped mounts we extend the
two helpers with an additional user namespace argument. On idmapped mounts
the two helpers will make sure to map the inode according to the mount's
user namespace and then perform identical permission checks to
inode_permission() and generic_permission(). If the initial user namespace
is passed nothing changes so there will be no performance impact on
non-idmapped mounts.

Cc: Christoph Hellwig 
Cc: David Howells 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
/* v2 */
- Christoph Hellwig:
  - Don't pollute the vfs with additional helpers simply extend the existing
helpers with an additional argument and switch all callers.
---
 fs/attr.c  |  3 +-
 fs/btrfs/inode.c   |  2 +-
 fs/btrfs/ioctl.c   | 10 ++---
 fs/ceph/inode.c|  2 +-
 fs/cifs/cifsfs.c   |  2 +-
 fs/configfs/symlink.c  |  2 +-
 fs/ecryptfs/inode.c|  2 +-
 fs/exec.c  |  2 +-
 fs/fuse/dir.c  |  4 +-
 fs/gfs2/inode.c|  2 +-
 fs/hostfs/hostfs_kern.c|  2 +-
 fs/init.c  |  9 ++--
 fs/kernfs/inode.c  |  2 +-
 fs/libfs.c |  7 ++-
 fs/namei.c | 68 +-
 fs/nfs/dir.c   |  2 +-
 fs/nfsd/nfsfh.c|  2 +-
 fs/nfsd/vfs.c  |  4 +-
 fs/nilfs2/inode.c  |  2 +-
 fs/notify/fanotify/fanotify_user.c |  2 +-
 fs/notify/inotify/inotify_user.c   |  2 +-
 fs/ocfs2/file.c|  2 +-
 fs/ocfs2/refcounttree.c|  4 +-
 fs/open.c  | 10 ++---
 fs/orangefs/inode.c|  2 +-
 fs/overlayfs/file.c|  2 +-
 fs/overlayfs/inode.c   |  4 +-
 fs/overlayfs/util.c|  2 +-
 fs/posix_acl.c | 17 +---
 fs/proc/base.c |  4 +-
 fs/proc/fd.c   |  2 +-
 fs/reiserfs/xattr.c|  2 +-
 fs/remap_range.c   |  2 +-
 fs/udf/file.c  |  2 +-
 fs/verity/enable.c |  2 +-
 fs/xattr.c |  2 +-
 include/linux/fs.h |  4 +-
 include/linux/posix_acl.h  |  4 +-
 ipc/mqueue.c   |  2 +-
 kernel/bpf/inode.c |  4 +-
 kernel/cgroup/cgroup.c |  2 +-
 kernel/sys.c   |  2 +-
 mm/madvise.c   |  2 +-
 mm/memcontrol.c|  2 +-
 mm/mincore.c   |  2 +-
 net/unix/af_unix.c |  2 +-
 46 files changed, 122 insertions(+), 96 deletions(-)

diff --git a/fs/attr.c b/fs/attr.c
index d270f640a192..c9e29e589cec 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -244,7 +244,8 @@ int notify_change(struct dentry * dentry, struct iattr * 
attr, struct inode **de
return -EPERM;
 
if (!inode_owner_or_capable(inode)) {
-   error = inode_permission(inode, MAY_WRITE);
+   error = inode_permission(_user_ns, inode,
+MAY_WRITE);
if (error)
return error;
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index da58c58ef9aa..32e3bf88d4f7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9794,7 +9794,7 @@ static int btrfs_permission(struct inode *inode, int mask)
if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
return -EACCES;
}
-   return generic_permission(inode, mask);
+   return generic_permission(_user_ns, inode, mask);
 }
 
 static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t 
mode)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index ab408a23ba32..771ee08920ed 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -910,7 +910,7 @@ static int btrfs_may_delete(struct inode *dir, struct 
dentry *victim, int isdir)
BUG_ON(d_inode(victim->d_parent) != dir);
audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
 
-   error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+   error = inode_permission(_user_ns, dir, MAY_WRITE | MAY_EXEC);
if (error)
return error;
if (IS_APPEND(dir))
@@ -939,7 +939,7 @@ static inline int btrfs_may_create(struct inode *dir, 
struct dentry *child)
return -EEXIST;
if (IS_DEADDIR(dir))
return -ENOENT;
-   return inode_permission(dir, MAY_WRITE | MAY_EXEC);
+   return inode_

[PATCH v2 00/39] fs: idmapped mounts

2020-11-15 Thread Christian Brauner

he direction to avoid inode alias issues during
lookup. David for various discussions around this. Christoph for providing
a first proper review and for being involved in the original idea. Tycho
for helping with this series and on future patches to convert
filesystems. Alban Crequy and the Kinvolk located just a few streets
away from me in Berlin for providing use-case discussions and writing
patches for containerd! Stéphane for his invaluable input on many things
and level head and enabling me to work on this. Amir for explaining and
discussing aspects of overlayfs with me. I'd like to especially thank
Seth Forshee because he provided a lot of good analysis, suggestions,
and participated in short-notice discussions in both chat and video for
some nitty-gritty technical details.

This series can be found and pulled from the three usual locations:
https://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git/log/?h=idmapped_mounts
https://github.com/brauner/linux/tree/idmapped_mounts
https://gitlab.com/brauner/linux/-/commits/idmapped_mounts

Thanks!
Christian

Christian Brauner (37):
  namespace: take lock_mount_hash() directly when changing flags
  mount: make {lock,unlock}_mount_hash() static
  namespace: only take read lock in do_reconfigure_mnt()
  fs: add mount_setattr()
  tests: add mount_setattr() selftests
  fs: add id translation helpers
  mount: attach mappings to mounts
  capability: handle idmapped mounts
  namei: add idmapped mount aware permission helpers
  inode: add idmapped mount aware init and permission helpers
  attr: handle idmapped mounts
  acl: handle idmapped mounts
  commoncap: handle idmapped mounts
  stat: handle idmapped mounts
  namei: handle idmapped mounts in may_*() helpers
  namei: introduce struct renamedata
  namei: prepare for idmapped mounts
  open: handle idmapped mounts in do_truncate()
  open: handle idmapped mounts
  af_unix: handle idmapped mounts
  utimes: handle idmapped mounts
  fcntl: handle idmapped mounts
  notify: handle idmapped mounts
  init: handle idmapped mounts
  ioctl: handle idmapped mounts
  would_dump: handle idmapped mounts
  exec: handle idmapped mounts
  fs: add helpers for idmap mounts
  apparmor: handle idmapped mounts
  audit: handle idmapped mounts
  ima: handle idmapped mounts
  fat: handle idmapped mounts
  ext4: support idmapped mounts
  ecryptfs: do not mount on top of idmapped mounts
  overlayfs: do not mount on top of idmapped mounts
  fs: introduce MOUNT_ATTR_IDMAP
  tests: add vfs/idmapped mounts test suite

Tycho Andersen (2):
  xattr: handle idmapped mounts
  selftests: add idmapped mounts xattr selftest

 Documentation/filesystems/locking.rst |6 +-
 Documentation/filesystems/porting.rst |2 +
 Documentation/filesystems/vfs.rst |   17 +-
 arch/alpha/kernel/syscalls/syscall.tbl|1 +
 arch/arm/tools/syscall.tbl|1 +
 arch/arm64/include/asm/unistd32.h |2 +
 arch/ia64/kernel/syscalls/syscall.tbl |1 +
 arch/m68k/kernel/syscalls/syscall.tbl |1 +
 arch/microblaze/kernel/syscalls/syscall.tbl   |1 +
 arch/mips/kernel/syscalls/syscall_n32.tbl |1 +
 arch/mips/kernel/syscalls/syscall_n64.tbl |1 +
 arch/mips/kernel/syscalls/syscall_o32.tbl |1 +
 arch/parisc/kernel/syscalls/syscall.tbl   |1 +
 arch/powerpc/kernel/syscalls/syscall.tbl  |1 +
 arch/powerpc/platforms/cell/spufs/inode.c |5 +-
 arch/s390/kernel/syscalls/syscall.tbl |1 +
 arch/sh/kernel/syscalls/syscall.tbl   |1 +
 arch/sparc/kernel/syscalls/syscall.tbl|1 +
 arch/x86/entry/syscalls/syscall_32.tbl|1 +
 arch/x86/entry/syscalls/syscall_64.tbl|1 +
 arch/xtensa/kernel/syscalls/syscall.tbl   |1 +
 drivers/android/binderfs.c|3 +-
 drivers/base/devtmpfs.c   |   12 +-
 fs/9p/acl.c   |7 +-
 fs/9p/v9fs.h  |3 +-
 fs/9p/v9fs_vfs.h  |2 +-
 fs/9p/vfs_inode.c |   32 +-
 fs/9p/vfs_inode_dotl.c|   34 +-
 fs/9p/xattr.c |1 +
 fs/adfs/adfs.h|3 +-
 fs/adfs/inode.c   |5 +-
 fs/affs/affs.h|   10 +-
 fs/affs/inode.c   |7 +-
 fs/affs/namei.c   |   15 +-
 fs/afs/dir.c  |   34 +-
 fs/afs/inode.c|5 +-
 fs/afs/internal.h |4 +-
 fs/afs/security.c |2 +-
 fs/afs/xattr.c|2 +
 fs/attr.c |   78 +-
 fs/autofs/root.c  |   13 +-
 fs/bad_inode.c

[PATCH v2 21/39] af_unix: handle idmapped mounts

2020-11-15 Thread Christian Brauner

When binding a non-abstract AF_UNIX socket it will gain a representation in the
filesystem. Enable the socket infrastructure to handle idmapped mounts by
passing down the user namespace of the mount the socket will be created
from. Non-idmapped mounts will not see any altered behavior.

Cc: Christoph Hellwig 
Cc: David Howells 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
/* v2 */
unchanged
---
 net/unix/af_unix.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index b4987805e5e5..4be33240e9cc 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -996,7 +996,7 @@ static int unix_mknod(const char *sun_path, umode_t mode, 
struct path *res)
 */
err = security_path_mknod(, dentry, mode, 0);
if (!err) {
-   err = vfs_mknod(_user_ns, d_inode(path.dentry), dentry, 
mode, 0);
+   err = vfs_mknod(mnt_user_ns(path.mnt), d_inode(path.dentry), 
dentry, mode, 0);
if (!err) {
res->mnt = mntget(path.mnt);
res->dentry = dget(dentry);
-- 
2.29.2

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

[PATCH v2 05/39] tests: add mount_setattr() selftests

2020-11-15 Thread Christian Brauner

Add a range of selftests for the new mount_setattr() syscall to verify
that it works as expected. This tests that:
- no invalid flags can be specified
- changing properties of a single mount works and leaves other mounts in
  the mount tree unchanged
- changing a mount tree to read-only when one of the mounts has writers
  fails and leaves the whole mount tree unchanged
- changing mount properties from multiple threads works
- changing atime settings works
- changing mount propagation works
- changing the mount options of a mount tree where the individual mounts
  in the tree have different mount options only changes the flags that
  were requested to change
- changing mount options from another mount namespace fails
- changing mount options from another user namespace fails

[==] Running 9 tests from 2 test cases.
[ RUN  ] mount_setattr.invalid_attributes
[   OK ] mount_setattr.invalid_attributes
[ RUN  ] mount_setattr.basic
[   OK ] mount_setattr.basic
[ RUN  ] mount_setattr.basic_recursive
[   OK ] mount_setattr.basic_recursive
[ RUN  ] mount_setattr.mount_has_writers
[   OK ] mount_setattr.mount_has_writers
[ RUN  ] mount_setattr.mixed_mount_options
[   OK ] mount_setattr.mixed_mount_options
[ RUN  ] mount_setattr.time_changes
[   OK ] mount_setattr.time_changes
[ RUN  ] mount_setattr.multi_threaded
[   OK ] mount_setattr.multi_threaded
[ RUN  ] mount_setattr.wrong_user_namespace
[   OK ] mount_setattr.wrong_user_namespace
[ RUN  ] mount_setattr.wrong_mount_namespace
[   OK ] mount_setattr.wrong_mount_namespace
[==] 9 / 9 tests passed.
[  PASSED  ]

Cc: Christoph Hellwig 
Cc: David Howells 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
/* v2 */
unchanged
---
 tools/testing/selftests/Makefile  |   1 +
 .../selftests/mount_setattr/.gitignore|   1 +
 .../testing/selftests/mount_setattr/Makefile  |   7 +
 tools/testing/selftests/mount_setattr/config  |   1 +
 .../mount_setattr/mount_setattr_test.c| 889 ++
 5 files changed, 899 insertions(+)
 create mode 100644 tools/testing/selftests/mount_setattr/.gitignore
 create mode 100644 tools/testing/selftests/mount_setattr/Makefile
 create mode 100644 tools/testing/selftests/mount_setattr/config
 create mode 100644 tools/testing/selftests/mount_setattr/mount_setattr_test.c

diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index d9c283503159..87b7107dd9a6 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -34,6 +34,7 @@ TARGETS += memfd
 TARGETS += memory-hotplug
 TARGETS += mincore
 TARGETS += mount
+TARGETS += mount_setattr
 TARGETS += mqueue
 TARGETS += net
 TARGETS += net/forwarding
diff --git a/tools/testing/selftests/mount_setattr/.gitignore 
b/tools/testing/selftests/mount_setattr/.gitignore
new file mode 100644
index ..5f74d8488472
--- /dev/null
+++ b/tools/testing/selftests/mount_setattr/.gitignore
@@ -0,0 +1 @@
+mount_setattr_test
diff --git a/tools/testing/selftests/mount_setattr/Makefile 
b/tools/testing/selftests/mount_setattr/Makefile
new file mode 100644
index ..2250f7dcb81e
--- /dev/null
+++ b/tools/testing/selftests/mount_setattr/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+# Makefile for mount selftests.
+CFLAGS = -g -I../../../../usr/include/ -Wall -O2 -pthread
+
+TEST_GEN_FILES += mount_setattr_test
+
+include ../lib.mk
diff --git a/tools/testing/selftests/mount_setattr/config 
b/tools/testing/selftests/mount_setattr/config
new file mode 100644
index ..416bd53ce982
--- /dev/null
+++ b/tools/testing/selftests/mount_setattr/config
@@ -0,0 +1 @@
+CONFIG_USER_NS=y
diff --git a/tools/testing/selftests/mount_setattr/mount_setattr_test.c 
b/tools/testing/selftests/mount_setattr/mount_setattr_test.c
new file mode 100644
index ..d6e2555c8cac
--- /dev/null
+++ b/tools/testing/selftests/mount_setattr/mount_setattr_test.c
@@ -0,0 +1,889 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "../kselftest_harness.h"
+
+#ifndef CLONE_NEWNS
+#define CLONE_NEWNS 0x0002
+#endif
+
+#ifndef CLONE_NEWUSER
+#define CLONE_NEWUSER 0x1000
+#endif
+
+#ifndef MS_REC
+#define MS_REC 16384
+#endif
+
+#ifndef MS_RELATIME
+#define MS_RELATIME (1 << 21)
+#endif
+
+#ifndef MS_STRICTATIME
+#define MS_STRICTATIME (1 << 24)
+#endif
+
+#ifndef MOUNT_ATTR_RDONLY
+#define MOUNT_ATTR_RDONLY 0x0001
+#endif
+
+#ifndef MOUNT_ATTR_NOSUID
+#define MOUNT_ATTR_NOSUID 0x0002
+#endif
+
+#ifndef MOUNT_ATTR_NOEXEC
+#define MOUNT_ATTR_NOEXEC 0x0008
+#endif
+
+#ifndef MOUNT_ATTR_NODIRATIME
+#define MOUNT_ATTR_NODIRATIME 0x0080
+#endif
+
+#if

[PATCH v2 04/39] fs: add mount_setattr()

2020-11-15 Thread Christian Brauner

This implements the mount_setattr() syscall. While the new mount api
allows to change the properties of a superblock there is currently no
way to change the mount properties of a mount or mount tree using file
descriptors which the new mount api is based on. In addition the old mount api
has the restriction that mount options cannot be applied recursively. This
hasn't changed since changing mount options on a per-mount basis was implemented
in [1] and has been a frequent request.
The legacy mount is currently unable to accommodate this behavior without
introducing a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost mount.
Changing MS_REC to apply to the whole mount tree would mean introducing a
significant uapi change and would likely cause significant regressions.

The new mount_setattr() syscall allows to recursively clear and set mount
options in one shot. Multiple calls to change mount options requesting the same
changes are idempotent:

int mount_setattr(int dfd, const char *path, unsigned flags,
  struct mount_attr *uattr, size_t usize);

Flags to modify path resolution behavior are specified in the @flags argument.
Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW, and AT_NO_AUTOMOUNT
are supported. If useful, additional lookup flags to restrict path resolution as
introduced with openat2() might be supported in the future.

mount_setattr() can be expected to grow over time and is designed with
extensibility in mind. It follows the extensible syscall pattern we have used
with other syscalls such as openat2(), clone3(), sched_{set,get}attr(), and
others.
The set of mount options is passed in the uapi struct mount_attr which currently
has the following layout:

struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u32 propagation;
};

The @attr_set and @attr_clr members are used to clear and set mount options.
This way a user can e.g. request that a set of flags is to be raised such as
turning mounts readonly by raising MOUNT_ATTR_RDONLY in @attr_set while at the
same time requesting that another set of flags is to be lowered such as removing
noexec from a mount tree by specifying MOUNT_ATTR_NOEXEC in @attr_clr.

Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0, not a
bitmap, users wanting to transition to a different atime setting cannot simply
specify the atime setting in @attr_set, but must also specify MOUNT_ATTR__ATIME
in the @attr_clr field. So we ensure that MOUNT_ATTR__ATIME can't be partially
set in @attr_clr and that @attr_set can't have any atime bits set if
MOUNT_ATTR__ATIME isn't set in @attr_clr.

The @propagation field lets callers specify the propagation type of a mount
tree. Propagation is a single property that has four different settings and as
such is not really a flag argument but an enum.  Specifically, it would be
unclear what setting and clearing propagation settings in combination would
amount to. The legacy mount() syscall thus forbids the combination of multiple
propagation settings too. The goal is to keep the semantics of mount propagation
somewhat simple as they are overly complex as it is.

[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts 
at remount")
Cc: Christoph Hellwig 
Cc: David Howells 
Cc: Aleksa Sarai 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Cc: linux-...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
/* v2 */
- Christoph Hellwig:
  - Split into multiple helpers.
---
 arch/alpha/kernel/syscalls/syscall.tbl  |   1 +
 arch/arm/tools/syscall.tbl  |   1 +
 arch/arm64/include/asm/unistd32.h   |   2 +
 arch/ia64/kernel/syscalls/syscall.tbl   |   1 +
 arch/m68k/kernel/syscalls/syscall.tbl   |   1 +
 arch/microblaze/kernel/syscalls/syscall.tbl |   1 +
 arch/mips/kernel/syscalls/syscall_n32.tbl   |   1 +
 arch/mips/kernel/syscalls/syscall_n64.tbl   |   1 +
 arch/mips/kernel/syscalls/syscall_o32.tbl   |   1 +
 arch/parisc/kernel/syscalls/syscall.tbl |   1 +
 arch/powerpc/kernel/syscalls/syscall.tbl|   1 +
 arch/s390/kernel/syscalls/syscall.tbl   |   1 +
 arch/sh/kernel/syscalls/syscall.tbl |   1 +
 arch/sparc/kernel/syscalls/syscall.tbl  |   1 +
 arch/x86/entry/syscalls/syscall_32.tbl  |   1 +
 arch/x86/entry/syscalls/syscall_64.tbl  |   1 +
 arch/xtensa/kernel/syscalls/syscall.tbl |   1 +
 fs/internal.h   |   8 +
 fs/namespace.c  | 325 ++--
 include/linux/syscalls.h|   3 +
 include/uapi/asm-generic/unistd.h   |   4 +-
 include/uapi/linux/mount.h  |  22 ++
 tools/include/uapi/asm-generic/unistd.h |   4 +-
 23 files changed, 358 insertions(+), 26 deletions(-)

diff --git a/arch/alpha/kernel/syscalls/syscall.tbl 
b/arch/alpha/kernel/syscalls/syscall.tbl
index ee7b01bb7346..24d870962

[PATCH v2 18/39] namei: prepare for idmapped mounts

2020-11-15 Thread Christian Brauner

The various vfs_*() helpers are called by filesystems or by the vfs itself
to perform core operations create, link, mkdir, mknod, rename, rmdir,
tmpfile and unlink. Enable them to handle idmapped mounts. If the inode is
accessed through an idmapped mount it is mapped according to the mount's
user namespace.
Afterwards the checks and operations are identical to non-idmapped mounts.
If the initial user namespace is passed all mapping operations are a nop so
non-idmapped mounts will not see a change in behavior and will also not see
any performance impact.

Cc: Christoph Hellwig 
Cc: David Howells 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
/* v2 */
- Christoph Hellwig:
  - Don't pollute the vfs with additional helpers simply extend the existing
helpers with an additional argument and switch all callers.
---
 drivers/base/devtmpfs.c  |   8 +--
 fs/cachefiles/namei.c|  10 ++--
 fs/ecryptfs/inode.c  |  22 
 fs/init.c|  12 ++---
 fs/namei.c   | 108 +--
 fs/nfsd/nfs4recover.c|   6 +--
 fs/nfsd/vfs.c|  19 ---
 fs/overlayfs/dir.c   |   4 +-
 fs/overlayfs/overlayfs.h |  20 
 include/linux/fs.h   |  25 +
 ipc/mqueue.c |   2 +-
 net/unix/af_unix.c   |   2 +-
 12 files changed, 144 insertions(+), 94 deletions(-)

diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c
index 2e0c3cdb4184..fd4e86c58111 100644
--- a/drivers/base/devtmpfs.c
+++ b/drivers/base/devtmpfs.c
@@ -162,7 +162,7 @@ static int dev_mkdir(const char *name, umode_t mode)
if (IS_ERR(dentry))
return PTR_ERR(dentry);
 
-   err = vfs_mkdir(d_inode(path.dentry), dentry, mode);
+   err = vfs_mkdir(_user_ns, d_inode(path.dentry), dentry, mode);
if (!err)
/* mark as kernel-created inode */
d_inode(dentry)->i_private = 
@@ -212,7 +212,7 @@ static int handle_create(const char *nodename, umode_t 
mode, kuid_t uid,
if (IS_ERR(dentry))
return PTR_ERR(dentry);
 
-   err = vfs_mknod(d_inode(path.dentry), dentry, mode, dev->devt);
+   err = vfs_mknod(_user_ns, d_inode(path.dentry), dentry, mode, 
dev->devt);
if (!err) {
struct iattr newattrs;
 
@@ -242,7 +242,7 @@ static int dev_rmdir(const char *name)
return PTR_ERR(dentry);
if (d_really_is_positive(dentry)) {
if (d_inode(dentry)->i_private == )
-   err = vfs_rmdir(d_inode(parent.dentry), dentry);
+   err = vfs_rmdir(_user_ns, d_inode(parent.dentry), 
dentry);
else
err = -EPERM;
} else {
@@ -330,7 +330,7 @@ static int handle_remove(const char *nodename, struct 
device *dev)
inode_lock(d_inode(dentry));
notify_change(_user_ns, dentry, , NULL);
inode_unlock(d_inode(dentry));
-   err = vfs_unlink(d_inode(parent.dentry), dentry, NULL);
+   err = vfs_unlink(_user_ns, d_inode(parent.dentry), 
dentry, NULL);
if (!err || err == -ENOENT)
deleted = 1;
}
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 7b987de0babe..ae01fac0a80c 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -311,7 +311,7 @@ static int cachefiles_bury_object(struct cachefiles_cache 
*cache,
cachefiles_io_error(cache, "Unlink security error");
} else {
trace_cachefiles_unlink(object, rep, why);
-   ret = vfs_unlink(d_inode(dir), rep, NULL);
+   ret = vfs_unlink(_user_ns, d_inode(dir), rep, 
NULL);
 
if (preemptive)
cachefiles_mark_object_buried(cache, rep, why);
@@ -413,8 +413,10 @@ static int cachefiles_bury_object(struct cachefiles_cache 
*cache,
cachefiles_io_error(cache, "Rename security error %d", ret);
} else {
struct renamedata rd = {
+   .old_user_ns= _user_ns,
.old_dir= d_inode(dir),
.old_dentry = rep,
+   .new_user_ns= _user_ns,
.new_dir= d_inode(cache->graveyard),
.new_dentry = grave,
};
@@ -566,7 +568,7 @@ int cachefiles_walk_to_object(struct cachefiles_object 
*parent,
if (ret < 0)
goto create_error;
start = jiffies;
-   ret = vfs_mkdir(d_inode(dir), next, 0);
+   ret = vfs_mkdir(_user_ns, d_inode(dir), next, 0);
cachefiles_

[PATCH v2 24/39] notify: handle idmapped mounts

2020-11-15 Thread Christian Brauner

Enable notify implementations to handle idmapped mounts by passing down the
mount's user namespace.

Cc: Christoph Hellwig 
Cc: David Howells 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
/* v2 */
patch introduced
---
 fs/notify/fanotify/fanotify_user.c | 2 +-
 fs/notify/inotify/inotify_user.c   | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/notify/fanotify/fanotify_user.c 
b/fs/notify/fanotify/fanotify_user.c
index de4d01bb1d8d..e3b2cb6a9d81 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -702,7 +702,7 @@ static int fanotify_find_path(int dfd, const char __user 
*filename,
}
 
/* you can only watch an inode if you have read permissions on it */
-   ret = inode_permission(_user_ns, path->dentry->d_inode, MAY_READ);
+   ret = inode_permission(mnt_user_ns(path->mnt), path->dentry->d_inode, 
MAY_READ);
if (ret) {
path_put(path);
goto out;
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index e995fd4e4e53..f39f5b81f2b3 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -31,6 +31,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "inotify.h"
 #include "../fdinfo.h"
@@ -343,7 +344,7 @@ static int inotify_find_inode(const char __user *dirname, 
struct path *path,
if (error)
return error;
/* you can only watch an inode if you have read permissions on it */
-   error = inode_permission(_user_ns, path->dentry->d_inode, 
MAY_READ);
+   error = inode_permission(mnt_user_ns(path->mnt), path->dentry->d_inode, 
MAY_READ);
if (error) {
path_put(path);
return error;
-- 
2.29.2

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

[PATCH v2 39/39] tests: add vfs/idmapped mounts test suite

2020-11-15 Thread Christian Brauner

 8 core.detached_mount_outside_current_mount_namespace
 #  RUN   core.change_idmapping ...
 #OK  core.change_idmapping
 ok 9 core.change_idmapping
 #  RUN   core.create_delete_rename ...
 #OK  core.create_delete_rename
 ok 10 core.create_delete_rename
 #  RUN   core.create_delete_rename_userns ...
 #OK  core.create_delete_rename_userns
 ok 11 core.create_delete_rename_userns
 #  RUN   core.hardlinks ...
 #OK  core.hardlinks
 ok 12 core.hardlinks
 #  RUN   core.rename ...
 #OK  core.rename
 ok 13 core.rename
 #  RUN   core.create_userns ...
 #OK  core.create_userns
 ok 14 core.create_userns
 #  RUN   core.create_userns_device_node ...
 #OK  core.create_userns_device_node
 ok 15 core.create_userns_device_node
 #  RUN   core.expected_uid_gid ...
 #OK  core.expected_uid_gid
 ok 16 core.expected_uid_gid
 #  RUN   core.expected_uid_gid_userns ...
 #OK  core.expected_uid_gid_userns
 ok 17 core.expected_uid_gid_userns
 #  RUN   core.expected_fscaps_userns ...
 #OK  core.expected_fscaps_userns
 ok 18 core.expected_fscaps_userns
 #  RUN   core.expected_fscaps_reverse ...
 #OK  core.expected_fscaps_reverse
 ok 19 core.expected_fscaps_reverse
 #  RUN   core.setid_binaries ...
 #OK  core.setid_binaries
 ok 20 core.setid_binaries
 #  RUN   core.setid_binaries_reverse ...
 #OK  core.setid_binaries_reverse
 ok 21 core.setid_binaries_reverse
 #  RUN   core.setid_binaries_userns ...
 #OK  core.setid_binaries_userns
 ok 22 core.setid_binaries_userns
 #  RUN   core.idmap_mount_tree ...
 #OK  core.idmap_mount_tree
 ok 23 core.idmap_mount_tree
 #  RUN   core.idmap_mount_tree_invalid ...
 #OK  core.idmap_mount_tree_invalid
 ok 24 core.idmap_mount_tree_invalid
 #  RUN   core.sticky_bit_unlink ...
 #OK  core.sticky_bit_unlink
 ok 25 core.sticky_bit_unlink
 #  RUN   core.sticky_bit_unlink_idmapped ...
 #OK  core.sticky_bit_unlink_idmapped
 ok 26 core.sticky_bit_unlink_idmapped
 #  RUN   core.sticky_bit_unlink_idmapped_userns ...
 #OK  core.sticky_bit_unlink_idmapped_userns
 ok 27 core.sticky_bit_unlink_idmapped_userns
 #  RUN   core.sticky_bit_rename ...
 #OK  core.sticky_bit_rename
 ok 28 core.sticky_bit_rename
 #  RUN   core.sticky_bit_rename_idmapped ...
 #OK  core.sticky_bit_rename_idmapped
 ok 29 core.sticky_bit_rename_idmapped
 #  RUN   core.sticky_bit_rename_idmapped_userns ...
 #OK  core.sticky_bit_rename_idmapped_userns
 ok 30 core.sticky_bit_rename_idmapped_userns
 #  RUN   core.follow_symlinks ...
 #OK  core.follow_symlinks
 ok 31 core.follow_symlinks
 #  RUN   core.follow_symlinks_idmapped ...
 #OK  core.follow_symlinks_idmapped
 ok 32 core.follow_symlinks_idmapped
 #  RUN   core.follow_symlinks_idmapped_userns ...
 #OK  core.follow_symlinks_idmapped_userns
 ok 33 core.follow_symlinks_idmapped_userns
 # PASSED: 33 / 33 tests passed.
 # Totals: pass:33 fail:0 xfail:0 xpass:0 skip:0 error:0

Cc: Christoph Hellwig 
Cc: David Howells 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
/* v2 */
patch introduced
---
 .../testing/selftests/idmap_mounts/.gitignore |1 +
 tools/testing/selftests/idmap_mounts/Makefile |7 +-
 tools/testing/selftests/idmap_mounts/core.c   | 3476 +
 .../testing/selftests/idmap_mounts/internal.h |   33 +-
 tools/testing/selftests/idmap_mounts/utils.c  |  136 +
 tools/testing/selftests/idmap_mounts/utils.h  |   17 +
 tools/testing/selftests/idmap_mounts/xattr.c  |  126 +-
 7 files changed, 3664 insertions(+), 132 deletions(-)
 create mode 100644 tools/testing/selftests/idmap_mounts/core.c
 create mode 100644 tools/testing/selftests/idmap_mounts/utils.c
 create mode 100644 tools/testing/selftests/idmap_mounts/utils.h

diff --git a/tools/testing/selftests/idmap_mounts/.gitignore 
b/tools/testing/selftests/idmap_mounts/.gitignore
index 18c5e90522ad..03c7198482d2 100644
--- a/tools/testing/selftests/idmap_mounts/.gitignore
+++ b/tools/testing/selftests/idmap_mounts/.gitignore
@@ -1 +1,2 @@
+core
 xattr
diff --git a/tools/testing/selftests/idmap_mounts/Makefile 
b/tools/testing/selftests/idmap_mounts/Makefile
index 1d495c99d924..67697b788353 100644
--- a/tools/testing/selftests/idmap_mounts/Makefile
+++ b/tools/testing/selftests/idmap_mounts/Makefile
@@ -1,9 +1,12 @@
 # SPDX-License-Identifier: GPL-2.0
 # Makefile for mount selftests.
 CFLAGS = -g -I../../../../usr/include/ -Wall -O2 -pthread
+LDLIBS += -lcap
 
-TEST_GEN_FILES += xattr
+TEST_GEN_FILES = xattr
+TEST_GEN_FILES += core
 
 include ../lib.mk
 
-$(OUTPUT)/xattr: xattr.c internal.h
+$(OUTPUT)/xattr

[PATCH v2 30/39] apparmor: handle idmapped mounts

2020-11-15 Thread Christian Brauner

The i_uid and i_gid are only ever used when logging for AppArmor. This is
already broken in a bunch of places where the global root id is reported
instead of the i_uid or i_gid of the file. Nonetheless, be kind and log the
mapped inode if we're coming from an idmapped mount.

Signed-off-by: Christian Brauner 
---
/* v2 */
unchanged
---
 security/apparmor/domain.c |  9 ++---
 security/apparmor/file.c   |  5 -
 security/apparmor/lsm.c| 12 
 3 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c
index 16f184bc48de..4f997dba4573 100644
--- a/security/apparmor/domain.c
+++ b/security/apparmor/domain.c
@@ -10,12 +10,14 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
 #include 
 #include 
 #include 
+#include 
 
 #include "include/audit.h"
 #include "include/apparmorfs.h"
@@ -858,8 +860,10 @@ int apparmor_bprm_creds_for_exec(struct linux_binprm *bprm)
const char *info = NULL;
int error = 0;
bool unsafe = false;
+   struct user_namespace *user_ns = mnt_user_ns(bprm->file->f_path.mnt);
+   kuid_t i_uid = i_uid_into_mnt(user_ns, file_inode(bprm->file));
struct path_cond cond = {
-   file_inode(bprm->file)->i_uid,
+   i_uid,
file_inode(bprm->file)->i_mode
};
 
@@ -967,8 +971,7 @@ int apparmor_bprm_creds_for_exec(struct linux_binprm *bprm)
error = fn_for_each(label, profile,
aa_audit_file(profile, , OP_EXEC, MAY_EXEC,
  bprm->filename, NULL, new,
- file_inode(bprm->file)->i_uid, info,
- error));
+ i_uid, info, error));
aa_put_label(new);
goto done;
 }
diff --git a/security/apparmor/file.c b/security/apparmor/file.c
index 92acf9a49405..d6d9e71f1900 100644
--- a/security/apparmor/file.c
+++ b/security/apparmor/file.c
@@ -11,6 +11,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include "include/apparmor.h"
 #include "include/audit.h"
@@ -508,8 +510,9 @@ static int __file_path_perm(const char *op, struct aa_label 
*label,
 {
struct aa_profile *profile;
struct aa_perms perms = {};
+   struct user_namespace *user_ns = mnt_user_ns(file->f_path.mnt);
struct path_cond cond = {
-   .uid = file_inode(file)->i_uid,
+   .uid = i_uid_into_mnt(user_ns, file_inode(file)),
.mode = file_inode(file)->i_mode
};
char *buffer;
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index ffeaee5ed968..ece9afc3994f 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -224,7 +224,8 @@ static int common_perm(const char *op, const struct path 
*path, u32 mask,
  */
 static int common_perm_cond(const char *op, const struct path *path, u32 mask)
 {
-   struct path_cond cond = { d_backing_inode(path->dentry)->i_uid,
+   struct user_namespace *user_ns = mnt_user_ns(path->mnt);
+   struct path_cond cond = { i_uid_into_mnt(user_ns, 
d_backing_inode(path->dentry)),
  d_backing_inode(path->dentry)->i_mode
};
 
@@ -266,12 +267,13 @@ static int common_perm_rm(const char *op, const struct 
path *dir,
  struct dentry *dentry, u32 mask)
 {
struct inode *inode = d_backing_inode(dentry);
+   struct user_namespace *user_ns = mnt_user_ns(dir->mnt);
struct path_cond cond = { };
 
if (!inode || !path_mediated_fs(dentry))
return 0;
 
-   cond.uid = inode->i_uid;
+   cond.uid = i_uid_into_mnt(user_ns, inode);
cond.mode = inode->i_mode;
 
return common_perm_dir_dentry(op, dir, dentry, mask, );
@@ -361,11 +363,12 @@ static int apparmor_path_rename(const struct path 
*old_dir, struct dentry *old_d
 
label = begin_current_label_crit_section();
if (!unconfined(label)) {
+   struct user_namespace *user_ns = mnt_user_ns(old_dir->mnt);
struct path old_path = { .mnt = old_dir->mnt,
 .dentry = old_dentry };
struct path new_path = { .mnt = new_dir->mnt,
 .dentry = new_dentry };
-   struct path_cond cond = { d_backing_inode(old_dentry)->i_uid,
+   struct path_cond cond = { i_uid_into_mnt(user_ns, 
d_backing_inode(old_dentry)),
  d_backing_inode(old_dentry)->i_mode
};
 
@@ -420,8 +423,9 @@ static int apparmor_file_open(struct file *file)
 
label = aa_get_newest_cred_label(file->f_cred);
if (!unconfined(label)) {
+   struct user_namespace *user_ns = mnt_user_n

[PATCH v2 03/39] namespace: only take read lock in do_reconfigure_mnt()

2020-11-15 Thread Christian Brauner

do_reconfigure_mnt() used to take the down_write(&sb->s_umount) lock
which seems unnecessary since we're not changing the superblock. We're
only checking whether it is already read-only. Setting other mount
attributes is protected by lock_mount_hash() afaict and not by s_umount.

So I think the history of down_write(&sb->s_umount) lock being taken
when setting mount attributes dates back to the introduction of
MNT_READONLY in [2]. Afaict, this introduced the concept of having
read-only mounts in contrast to just having a read-only superblock. When
it got introduced it was simply plumbed into do_remount() which already
took down_write(&sb->s_umount) because it was only used to actually
change the superblock before [2]. Afaict, it would've already been
possible back then to only use down_read(&sb->s_umount) for
MS_BIND | MS_REMOUNT since actual mount options were protected by
the vfsmount lock already. But that would've meant special casing the
locking for MS_BIND | MS_REMOUNT in do_remount() which people might not
have considered worth it.
Then in [1] MS_BIND | MS_REMOUNT mount option changes were split out of
do_remount() into do_reconfigure_mnt() but the down_write(&sb->s_umount)
lock was simply copied over.
Now that we have this be a separate helper only take
the down_read(&sb->s_umount) lock since we're only interested in
checking whether the super block is currently read-only and blocking any
writers from changing it. Essentially, checking that the super block is
read-only has the advantage that we can avoid having to go into the
slowpath and through MNT_WRITE_HOLD and can simply set the read-only
flag on the mount in set_mount_attributes().

[1]: commit 43f5e655eff7 ("vfs: Separate changing mount flags full remount")
[2]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts 
at remount")
Cc: Christoph Hellwig 
Cc: David Howells 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
/* v2 */
unchanged
---
 fs/namespace.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 798bbf4f48ad..8497d149ecaa 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2512,10 +2512,6 @@ static int change_mount_ro_state(struct mount *mnt, 
unsigned int mnt_flags)
return 0;
 }
 
-/*
- * Update the user-settable attributes on a mount.  The caller must hold
- * sb->s_umount for writing.
- */
 static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags)
 {
mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
@@ -2565,13 +2561,17 @@ static int do_reconfigure_mnt(struct path *path, 
unsigned int mnt_flags)
if (!can_change_locked_flags(mnt, mnt_flags))
return -EPERM;
 
-   down_write(>s_umount);
+   /*
+* We're only checking whether the superblock is read-only not changing
+* it, so only take down_read(>s_umount).
+*/
+   down_read(>s_umount);
lock_mount_hash();
ret = change_mount_ro_state(mnt, mnt_flags);
if (ret == 0)
set_mount_attributes(mnt, mnt_flags);
unlock_mount_hash();
-   up_write(>s_umount);
+   up_read(>s_umount);
 
mnt_warn_timestamp_expiry(path, >mnt);
 
-- 
2.29.2

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

[PATCH v2 02/39] mount: make {lock,unlock}_mount_hash() static

2020-11-15 Thread Christian Brauner

The helpers are only called in fs/namespace.c functions so there's no need to
have them exposed in a header as Christoph pointed out.

Suggested-by: Christoph Hellwig 
Cc: David Howells 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
/* v2 */
- Christoph Hellwig:
  - Add a patch to make {lock,unlock}_mount_hash() static.
---
 fs/mount.h | 10 --
 fs/namespace.c | 10 ++
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/fs/mount.h b/fs/mount.h
index c7abb7b394d8..562d96d57bad 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -125,16 +125,6 @@ static inline void get_mnt_ns(struct mnt_namespace *ns)
 
 extern seqlock_t mount_lock;
 
-static inline void lock_mount_hash(void)
-{
-   write_seqlock(_lock);
-}
-
-static inline void unlock_mount_hash(void)
-{
-   write_sequnlock(_lock);
-}
-
 struct proc_mounts {
struct mnt_namespace *ns;
struct path root;
diff --git a/fs/namespace.c b/fs/namespace.c
index f183161833ad..798bbf4f48ad 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -87,6 +87,16 @@ EXPORT_SYMBOL_GPL(fs_kobj);
  */
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
 
+static inline void lock_mount_hash(void)
+{
+   write_seqlock(_lock);
+}
+
+static inline void unlock_mount_hash(void)
+{
+   write_sequnlock(_lock);
+}
+
 static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry 
*dentry)
 {
unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
-- 
2.29.2

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

[PATCH v2 07/39] mount: attach mappings to mounts

2020-11-15 Thread Christian Brauner

In order to support per-mount idmappings vfsmounts will be marked with user
namespaces. The idmapping associated with that user namespace will be used to
map the ids of vfs objects when they are accessed through that mount.
By default all vfsmounts will be marked with the initial user namespace. The
initial user namespace is used to indicate that a mount is not idmapped. All
operations behave as before and no performance impact is seen.

Based on prior discussions we want to attach the whole user namespace and not
just a dedicated idmapping struct. This allows us to reuse all the helpers that
already exist for dealing with idmappings instead of introducing a whole new
range of helpers. In addition, if we decide in the future that we are confident
enough to enable unprivileged users to set up idmapped mounts we can allow
already idmapped mounts to be marked with another user namespace. For now, we
will enforce in later patches that once a mount has been idmapped it can't be
remapped. This keeps permission checking and life-cycle management simple
especially since users can always create a new mount with a different idmapping
anyway.

The idea to attach user namespaces to vfsmounts has been floated around in
various forms at Linux Plumbers in ~2018 with the original idea tracing
back to a discussion during a conference in St. Petersburg between Christoph,
Tycho, and myself.

Cc: Christoph Hellwig 
Cc: David Howells 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
/* v2 */
patch introduced
- Christoph Hellwig:
  - Split internal implementation into separate patch and move syscall
implementation later.
---
 fs/namespace.c|  6 ++
 include/linux/fs.h|  1 +
 include/linux/mount.h | 12 
 3 files changed, 19 insertions(+)

diff --git a/fs/namespace.c b/fs/namespace.c
index 9fc8b22dba26..15fb0ae3f01f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -220,6 +220,7 @@ static struct mount *alloc_vfsmnt(const char *name)
INIT_HLIST_NODE(>mnt_mp_list);
INIT_LIST_HEAD(>mnt_umounting);
INIT_HLIST_HEAD(>mnt_stuck_children);
+   mnt->mnt.mnt_user_ns = _user_ns;
}
return mnt;
 
@@ -559,6 +560,8 @@ int sb_prepare_remount_readonly(struct super_block *sb)
 
 static void free_vfsmnt(struct mount *mnt)
 {
+   if (mnt_idmapped(>mnt) && mnt_user_ns(>mnt) != _user_ns)
+   put_user_ns(mnt_user_ns(>mnt));
kfree_const(mnt->mnt_devname);
 #ifdef CONFIG_SMP
free_percpu(mnt->mnt_pcp);
@@ -1067,6 +1070,9 @@ static struct mount *clone_mnt(struct mount *old, struct 
dentry *root,
mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
 
atomic_inc(>s_active);
+   mnt->mnt.mnt_user_ns = old->mnt.mnt_user_ns;
+   if (mnt_user_ns(>mnt) != _user_ns)
+   mnt->mnt.mnt_user_ns = get_user_ns(mnt->mnt.mnt_user_ns);
mnt->mnt.mnt_sb = sb;
mnt->mnt.mnt_root = dget(root);
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9e487cbf0f5c..9e05fb4f997c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2260,6 +2260,7 @@ struct file_system_type {
 #define FS_HAS_SUBTYPE 4
 #define FS_USERNS_MOUNT8   /* Can be mounted by userns 
root */
 #define FS_DISALLOW_NOTIFY_PERM16  /* Disable fanotify permission 
events */
+#define FS_ALLOW_IDMAP 32  /* FS has been updated to handle vfs 
idmappings. */
 #define FS_THP_SUPPORT 8192/* Remove once all fs converted */
 #define FS_RENAME_DOES_D_MOVE  32768   /* FS will handle d_move() during 
rename() internally. */
int (*init_fs_context)(struct fs_context *);
diff --git a/include/linux/mount.h b/include/linux/mount.h
index aaf343b38671..3c7ba1bd4a21 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -31,6 +31,7 @@ struct fs_context;
 #define MNT_RELATIME   0x20
 #define MNT_READONLY   0x40/* does the user want this to be r/o? */
 #define MNT_NOSYMFOLLOW0x80
+#define MNT_IDMAPPED   0x400
 
 #define MNT_SHRINKABLE 0x100
 #define MNT_WRITE_HOLD 0x200
@@ -72,8 +73,19 @@ struct vfsmount {
struct dentry *mnt_root;/* root of the mounted tree */
struct super_block *mnt_sb; /* pointer to superblock */
int mnt_flags;
+   struct user_namespace *mnt_user_ns;
 } __randomize_layout;
 
+static inline bool mnt_idmapped(const struct vfsmount *mnt)
+{
+   return READ_ONCE(mnt->mnt_flags) & MNT_IDMAPPED;
+}
+
+static inline struct user_namespace *mnt_user_ns(const struct vfsmount *mnt)
+{
+   return mnt->mnt_user_ns;
+}
+
 struct file; /* forward dec */
 struct path;
 
-- 
2.29.2

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

[PATCH v2 19/39] open: handle idmapped mounts in do_truncate()

2020-11-15 Thread Christian Brauner

When truncating files the vfs will verify that the caller is privileged
over the inode. Since the do_truncate() helper is only used in a few places
in the vfs code extend it to handle idmapped mounts instead of adding a new
helper.  If the inode is accessed through an idmapped mount it is mapped
according to the mount's user namespace. Afterwards the permissions checks
are identical to non-idmapped mounts. If the initial user namespace is
passed all mapping operations are a nop so non-idmapped mounts will not see
a change in behavior and will also not see any performance impact.

Cc: Christoph Hellwig 
Cc: David Howells 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
/* v2 */
unchanged
---
 fs/coredump.c  | 12 +---
 fs/inode.c | 13 +
 fs/namei.c |  6 +++---
 fs/open.c  | 21 +
 include/linux/fs.h |  4 ++--
 5 files changed, 36 insertions(+), 20 deletions(-)

diff --git a/fs/coredump.c b/fs/coredump.c
index 0cd9056d79cc..25beac7230ff 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -703,6 +703,7 @@ void do_coredump(const kernel_siginfo_t *siginfo)
goto close_fail;
}
} else {
+   struct user_namespace *user_ns;
struct inode *inode;
int open_flags = O_CREAT | O_RDWR | O_NOFOLLOW |
 O_LARGEFILE | O_EXCL;
@@ -786,7 +787,8 @@ void do_coredump(const kernel_siginfo_t *siginfo)
goto close_fail;
if (!(cprm.file->f_mode & FMODE_CAN_WRITE))
goto close_fail;
-   if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
+   user_ns = mnt_user_ns(cprm.file->f_path.mnt);
+   if (do_truncate(user_ns, cprm.file->f_path.dentry, 0, 0, 
cprm.file))
goto close_fail;
}
 
@@ -931,8 +933,12 @@ void dump_truncate(struct coredump_params *cprm)
 
if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
offset = file->f_op->llseek(file, 0, SEEK_CUR);
-   if (i_size_read(file->f_mapping->host) < offset)
-   do_truncate(file->f_path.dentry, offset, 0, file);
+   if (i_size_read(file->f_mapping->host) < offset) {
+   struct user_namespace *user_ns;
+
+   user_ns = mnt_user_ns(file->f_path.mnt);
+   do_truncate(user_ns, file->f_path.dentry, offset, 0, 
file);
+   }
}
 }
 EXPORT_SYMBOL(dump_truncate);
diff --git a/fs/inode.c b/fs/inode.c
index 75c64f003c45..0ccdd673636d 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1904,7 +1904,8 @@ int dentry_needs_remove_privs(struct dentry *dentry)
return mask;
 }
 
-static int __remove_privs(struct dentry *dentry, int kill)
+static int __remove_privs(struct user_namespace *user_ns, struct dentry 
*dentry,
+ int kill)
 {
struct iattr newattrs;
 
@@ -1913,7 +1914,7 @@ static int __remove_privs(struct dentry *dentry, int kill)
 * Note we call this on write, so notify_change will not
 * encounter any conflicting delegations:
 */
-   return notify_change(_user_ns, dentry, , NULL);
+   return notify_change(user_ns, dentry, , NULL);
 }
 
 /*
@@ -1939,8 +1940,12 @@ int file_remove_privs(struct file *file)
kill = dentry_needs_remove_privs(dentry);
if (kill < 0)
return kill;
-   if (kill)
-   error = __remove_privs(dentry, kill);
+   if (kill) {
+   struct user_namespace *user_ns;
+
+   user_ns = mnt_user_ns(file->f_path.mnt);
+   error = __remove_privs(user_ns, dentry, kill);
+   }
if (!error)
inode_has_no_xattr(inode);
 
diff --git a/fs/namei.c b/fs/namei.c
index b91bf923d22c..5601b6680d4c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2940,9 +2940,9 @@ static int handle_truncate(struct file *filp)
if (!error)
error = security_path_truncate(path);
if (!error) {
-   error = do_truncate(path->dentry, 0,
-   ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
-   filp);
+   error = do_truncate(mnt_user_ns(filp->f_path.mnt),
+   path->dentry, 0,
+   ATTR_MTIME | ATTR_CTIME | ATTR_OPEN, filp);
}
put_write_access(inode);
return error;
diff --git a/fs/open.c b/fs/open.c
index 2dc94689a7dc..137dcc52d2f8 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -35,8 +35,8 @@
 
 #include "internal.h"
 
-int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
-   struct file *filp)
+int do_truncate(struct user_namespace *user_ns, st

[PATCH v2 36/39] overlayfs: do not mount on top of idmapped mounts

2020-11-15 Thread Christian Brauner

Prevent overlayfs from being mounted on top of idmapped mounts until we
have ported it to handle this case and added proper testing for it.

Cc: Christoph Hellwig 
Cc: David Howells 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
/* v2 */
patch introduced
---
 fs/overlayfs/super.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 0d4f2baf6836..3cacc3d3fb65 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1708,6 +1708,12 @@ static struct ovl_entry *ovl_get_lowerstack(struct 
super_block *sb,
if (err)
goto out_err;
 
+   if (mnt_idmapped(stack[i].mnt)) {
+   err = -EINVAL;
+   pr_err("idmapped lower layers are currently 
unsupported\n");
+   goto out_err;
+   }
+
lower = strchr(lower, '\0') + 1;
}
 
@@ -1939,6 +1945,12 @@ static int ovl_fill_super(struct super_block *sb, void 
*data, int silent)
if (err)
goto out_err;
 
+   if (mnt_idmapped(upperpath.mnt)) {
+   err = -EINVAL;
+   pr_err("idmapped lower layers are currently 
unsupported\n");
+   goto out_err;
+   }
+
err = ovl_get_workdir(sb, ofs, );
if (err)
goto out_err;
-- 
2.29.2

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

[PATCH v2 37/39] fs: introduce MOUNT_ATTR_IDMAP

2020-11-15 Thread Christian Brauner

Introduce a new bind mount property to allow idmapping mounts. The
MOUNT_ATTR_IDMAP flag can be set via the new mount_setattr() syscall
together with a file descriptor referring to a user namespace.

The user namespace referenced by the namespace file descriptor will be
attached to the bind mount. All interactions with the filesystem going
through that mount will be idmapped according to the mapping specified in
the user namespace attached to it.

Using user namespaces to mark mounts means we can reuse all the existing
infrastructure in the kernel that already exists to handle idmappings and can
also use this for permission checking to allow unprivileged users to create
idmapped mounts in the future.

Idmapping a mount is decoupled from the caller's user and mount namespace.
This means idmapped mounts can be created in the initial user namespace
which is an important use-case for systemd-homed, portable usb-sticks
between systems, sharing data between the initial user namespace and
unprivileged containers, and other use-cases that have been brought up. For
example, assume a home directory where all files are owned by uid and gid
1000 and the home directory is brought to a new laptop where the user has
id 12345. The system administrator can simply create a mount of this home
directory with a mapping of 1000:12345:1 and other mappings to indicate the ids
should be kept. (With this it is e.g. also possible to create idmapped
mounts on the host with an identity mapping 1:1:10 where the root user
is not mapped. A user with root access that e.g. has been pivot rooted into
such a mount on the host will not be able to execute, read, write, or
create files as root.)

Given that idmapping a mount is decoupled from the caller's user namespace
a sufficiently privileged process such as a container manager can set up a
shifted mount for the container and the container can simply pivot root to
it. There's no need for the container to do anything. The mount will appear
correctly mapped independent of the user namespace the container uses. This
means we don't need to mark a mount as idmappable.

In order to create an idmapped mount the caller must currently be privileged in
the user namespace of the superblock the mount belongs to. Currently, once a
mount has been idmapped we don't allow it to change its mapping. This can be
changed in the future if the use-case arises.

Cc: Christoph Hellwig 
Cc: David Howells 
Cc: Mauricio Vásquez Bernal 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
/* v2 */
- Christoph Hellwig :
  - Drop kconfig option to make vfs idmappings unconditional.
  - Move introduction of MOUNT_ATTR_IDMAP to the end of the series after all
internal changes have been done.
  - Move MOUNT_ATTR_IDMAP handling from build_mount_kattr() to separate
build_mount_idmapped() helper.
  - Move MNT_IDMAPPED handling from do_mount_setattr() into separate
do_mount_idmap() helper.
  - Use more helpers instead of one big function for mount attribute changes.
- Mauricio Vásquez Bernal :
  - Recalculate flags before checking can_change_locked_flags().
---
 fs/internal.h  |   1 +
 fs/namespace.c | 122 -
 fs/proc_namespace.c|   1 +
 include/linux/fs.h |   2 +-
 include/linux/mount.h  |   2 +-
 include/uapi/linux/mount.h |   5 +-
 6 files changed, 128 insertions(+), 5 deletions(-)

diff --git a/fs/internal.h b/fs/internal.h
index a5a6c470dc07..b0978274155f 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -88,6 +88,7 @@ struct mount_kattr {
unsigned int propagation;
unsigned int lookup_flags;
bool recurse;
+   struct user_namespace *mnt_user_ns;
 };
 
 extern struct vfsmount *lookup_mnt(const struct path *);
diff --git a/fs/namespace.c b/fs/namespace.c
index 15fb0ae3f01f..f76292294dbd 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -3465,7 +3466,8 @@ static int build_attr_flags(unsigned int attr_flags, 
unsigned int *flags)
   MOUNT_ATTR_NODEV |
   MOUNT_ATTR_NOEXEC |
   MOUNT_ATTR__ATIME |
-  MOUNT_ATTR_NODIRATIME))
+  MOUNT_ATTR_NODIRATIME |
+  MOUNT_ATTR_IDMAP))
return -EINVAL;
 
if (attr_flags & MOUNT_ATTR_RDONLY)
@@ -3478,6 +3480,8 @@ static int build_attr_flags(unsigned int attr_flags, 
unsigned int *flags)
aflags |= MNT_NOEXEC;
if (attr_flags & MOUNT_ATTR_NODIRATIME)
aflags |= MNT_NODIRATIME;
+   if (attr_flags & MOUNT_ATTR_IDMAP)
+   aflags |= MNT_IDMAPPED;
 
*flags = aflags;
return 0;
@@ -3505,6 +3509,14 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
if ((flags & ~(FSMOUN

[PATCH v2 10/39] inode: add idmapped mount aware init and permission helpers

2020-11-15 Thread Christian Brauner

The inode_owner_or_capable() helper determines whether the caller is the
owner of the inode or is capable with respect to that inode. Allow it to
handle idmapped mounts. If the inode is accessed through an idmapped mount
we first need to map it according to the mount's user namespace.
Afterwards the checks are identical to non-idmapped mounts. If the initial
user namespace is passed all operations are a nop so non-idmapped mounts
will not see a change in behavior and will not see any performance impact.

Similarly, we allow the inode_init_owner() helper to handle idmapped
mounts. It initializes a new inode on idmapped mounts by mapping the fsuid
and fsgid of the caller from the mount's user namespace. If the initial
user namespace is passed all operations are a nop so non-idmapped mounts
will not see a change in behavior and will also not see any performance
impact.

Cc: Christoph Hellwig 
Cc: David Howells 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
/* v2 */
- Christoph Hellwig:
  - Don't pollute the vfs with additional helpers; simply extend the existing
helpers with an additional argument and switch all callers.
---
 fs/9p/acl.c  |  2 +-
 fs/9p/vfs_inode.c|  2 +-
 fs/attr.c|  6 +++---
 fs/bfs/dir.c |  2 +-
 fs/btrfs/inode.c |  2 +-
 fs/btrfs/ioctl.c | 10 +-
 fs/btrfs/tests/btrfs-tests.c |  2 +-
 fs/crypto/policy.c   |  2 +-
 fs/efivarfs/file.c   |  2 +-
 fs/ext2/ialloc.c |  2 +-
 fs/ext2/ioctl.c  |  6 +++---
 fs/ext4/ialloc.c |  2 +-
 fs/ext4/ioctl.c  | 14 +++---
 fs/f2fs/file.c   | 14 +++---
 fs/f2fs/namei.c  |  2 +-
 fs/f2fs/xattr.c  |  2 +-
 fs/fcntl.c   |  2 +-
 fs/gfs2/file.c   |  2 +-
 fs/hfsplus/inode.c   |  2 +-
 fs/hfsplus/ioctl.c   |  2 +-
 fs/hugetlbfs/inode.c |  2 +-
 fs/inode.c   | 23 ++-
 fs/jfs/ioctl.c   |  2 +-
 fs/jfs/jfs_inode.c   |  2 +-
 fs/minix/bitmap.c|  2 +-
 fs/namei.c   |  4 ++--
 fs/nilfs2/inode.c|  2 +-
 fs/nilfs2/ioctl.c|  2 +-
 fs/ocfs2/dlmfs/dlmfs.c   |  4 ++--
 fs/ocfs2/ioctl.c |  2 +-
 fs/ocfs2/namei.c |  2 +-
 fs/omfs/inode.c  |  2 +-
 fs/overlayfs/dir.c   |  2 +-
 fs/overlayfs/file.c  |  4 ++--
 fs/overlayfs/super.c |  2 +-
 fs/overlayfs/util.c  |  2 +-
 fs/posix_acl.c   |  2 +-
 fs/ramfs/inode.c |  2 +-
 fs/reiserfs/ioctl.c  |  4 ++--
 fs/reiserfs/namei.c  |  2 +-
 fs/sysv/ialloc.c |  2 +-
 fs/ubifs/dir.c   |  2 +-
 fs/ubifs/ioctl.c |  2 +-
 fs/udf/ialloc.c  |  2 +-
 fs/ufs/ialloc.c  |  2 +-
 fs/xattr.c   |  2 +-
 fs/xfs/xfs_ioctl.c   |  2 +-
 fs/zonefs/super.c|  2 +-
 include/linux/fs.h   |  7 ---
 kernel/bpf/inode.c   |  2 +-
 mm/madvise.c |  2 +-
 mm/mincore.c |  2 +-
 mm/shmem.c   |  2 +-
 security/selinux/hooks.c |  4 ++--
 54 files changed, 95 insertions(+), 89 deletions(-)

diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 6261719f6f2a..d77b28e8d57a 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -258,7 +258,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler 
*handler,
 
if (S_ISLNK(inode->i_mode))
return -EOPNOTSUPP;
-   if (!inode_owner_or_capable(inode))
+   if (!inode_owner_or_capable(_user_ns, inode))
return -EPERM;
if (value) {
/* update the cached acl value */
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index ae0c38ad1fcb..f058e89df30f 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -251,7 +251,7 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
 {
int err = 0;
 
-   inode_init_owner(inode, NULL, mode);
+   inode_init_owner(inode, _user_ns, NULL, mode);
inode->i_blocks = 0;
inode->i_rdev = rdev;
inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
diff --git a/fs/attr.c b/fs/attr.c
index c9e29e589cec..00ae0b000146 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -87,7 +87,7 @@ int setattr_prepare(struct dentry *dentry, struct iattr *attr)
 
/* Make sure a caller can chmod. */
if (ia_valid & ATTR_MODE) {
-   if (!inode_owner_or_capable(inode))
+   if (!inode_owner_or_capable(_user_ns, inode))
return -EPERM;
/* Also check the setgid bit! */
if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
@@ -98,7 +98,7 @@ int setattr_prepare(struct dentry *dentry, struct iattr *attr)
 
/* Che

[PATCH v2 34/39] ext4: support idmapped mounts

2020-11-15 Thread Christian Brauner

Hellwig 
Cc: David Howells 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
/* v2 */
unchanged
---
 fs/ext4/Kconfig  |  9 +
 fs/ext4/acl.c|  2 +-
 fs/ext4/ext4.h   | 13 +++--
 fs/ext4/ialloc.c |  7 ---
 fs/ext4/inode.c  | 11 +++
 fs/ext4/ioctl.c  | 18 ++
 fs/ext4/namei.c  | 30 --
 fs/ext4/super.c  |  6 +-
 8 files changed, 59 insertions(+), 37 deletions(-)

diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 619dd35ddd48..5918c05cfe5b 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -118,3 +118,12 @@ config EXT4_KUNIT_TESTS
  to the KUnit documentation in Documentation/dev-tools/kunit/.
 
  If unsure, say N.
+
+config EXT4_IDMAP_MOUNTS
+   bool "Support vfs idmapped mounts in ext4"
+   depends on EXT4_FS
+   default n
+   help
+ The vfs allows to expose a filesystem at different mountpoints with
+ differnet idmappings. Allow ext4 to be exposed through idmapped
+ mounts.
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 3ab0a69b974b..7c413bd4d2f2 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -246,7 +246,7 @@ ext4_set_acl(struct user_namespace *user_ns, struct inode 
*inode,
ext4_fc_start_update(inode);
 
if ((type == ACL_TYPE_ACCESS) && acl) {
-   error = posix_acl_update_mode(_user_ns, inode, , 
);
+   error = posix_acl_update_mode(user_ns, inode, , );
if (error)
goto out_stop;
if (mode != inode->i_mode)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4c8bdcea0a0c..aa0ddbc0d8c2 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2702,18 +2702,19 @@ extern int ext4fs_dirhash(const struct inode *dir, 
const char *name, int len,
 
 /* ialloc.c */
 extern int ext4_mark_inode_used(struct super_block *sb, int ino);
-extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t,
+extern struct inode *__ext4_new_inode(struct user_namespace *, handle_t *,
+ struct inode *, umode_t,
  const struct qstr *qstr, __u32 goal,
  uid_t *owner, __u32 i_flags,
  int handle_type, unsigned int line_no,
  int nblocks);
 
-#define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags) \
-   __ext4_new_inode((handle), (dir), (mode), (qstr), (goal), (owner), \
-i_flags, 0, 0, 0)
-#define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner, \
+#define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags)  
\
+   __ext4_new_inode(_user_ns, (handle), (dir), (mode), (qstr),   \
+(goal), (owner), i_flags, 0, 0, 0)
+#define ext4_new_inode_start_handle(user_ns, dir, mode, qstr, goal, owner, \
type, nblocks)  \
-   __ext4_new_inode(NULL, (dir), (mode), (qstr), (goal), (owner), \
+   __ext4_new_inode((user_ns), NULL, (dir), (mode), (qstr), (goal), 
(owner), \
 0, (type), __LINE__, (nblocks))
 
 
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index d91f69282311..93251687c6a2 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -919,7 +919,8 @@ static int ext4_xattr_credits_for_new_inode(struct inode 
*dir, mode_t mode,
  * For other inodes, search forward from the parent directory's block
  * group to find a free inode.
  */
-struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
+struct inode *__ext4_new_inode(struct user_namespace *user_ns,
+  handle_t *handle, struct inode *dir,
   umode_t mode, const struct qstr *qstr,
   __u32 goal, uid_t *owner, __u32 i_flags,
   int handle_type, unsigned int line_no,
@@ -969,10 +970,10 @@ struct inode *__ext4_new_inode(handle_t *handle, struct 
inode *dir,
i_gid_write(inode, owner[1]);
} else if (test_opt(sb, GRPID)) {
inode->i_mode = mode;
-   inode->i_uid = current_fsuid();
+   inode->i_uid = fsuid_into_mnt(user_ns);
inode->i_gid = dir->i_gid;
} else
-   inode_init_owner(inode, _user_ns, dir, mode);
+   inode_init_owner(inode, user_ns, dir, mode);
 
if (ext4_has_feature_project(sb) &&
ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT))
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 90a8c2f29616..c00ceb15fb1b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -20,6 +20,7 @@
  */
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -5324,7 +5325,7 @@ int ext4_setattr(struct user_namespace *user_ns, struct 
dentry *dentry,

[PATCH v2 16/39] namei: handle idmapped mounts in may_*() helpers

2020-11-15 Thread Christian Brauner

The may_follow_link(), may_linkat(), may_lookup(), may_open(), may_o_create(),
may_create_in_sticky(), may_delete(), and may_create() helpers determine
whether the caller is privileged enough to perform the associated operations.
Let them handle idmapped mounts by mapping the inode and fsids according to
the mount's user namespace. Afterwards the checks are identical to non-idmapped
inodes. If the initial user namespace is passed all operations are a nop so
non-idmapped mounts will not see a change in behavior and will also not see any
performance impact.
Since the may_*() helpers are not exposed to other parts of the kernel we can
simply extend them with an additional argument in case they don't already have
access to the mount's user namespace.

Cc: Christoph Hellwig 
Cc: David Howells 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
/* v2 */
unchanged
---
 fs/btrfs/ioctl.c   |   5 +-
 fs/inode.c |   2 +-
 fs/namei.c | 121 +++--
 fs/xattr.c |   2 +-
 include/linux/fs.h |  13 +++--
 5 files changed, 86 insertions(+), 57 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 39f25b5d06ed..ccac53bb2a1c 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -915,8 +915,9 @@ static int btrfs_may_delete(struct inode *dir, struct 
dentry *victim, int isdir)
return error;
if (IS_APPEND(dir))
return -EPERM;
-   if (check_sticky(dir, d_inode(victim)) || IS_APPEND(d_inode(victim)) ||
-   IS_IMMUTABLE(d_inode(victim)) || IS_SWAPFILE(d_inode(victim)))
+   if (check_sticky(_user_ns, dir, d_inode(victim)) ||
+   IS_APPEND(d_inode(victim)) || IS_IMMUTABLE(d_inode(victim)) ||
+   IS_SWAPFILE(d_inode(victim)))
return -EPERM;
if (isdir) {
if (!d_is_dir(victim))
diff --git a/fs/inode.c b/fs/inode.c
index 66d3f7397d86..75c64f003c45 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1797,7 +1797,7 @@ bool atime_needs_update(const struct path *path, struct 
inode *inode)
/* Atime updates will likely cause i_uid and i_gid to be written
 * back improprely if their true value is unknown to the vfs.
 */
-   if (HAS_UNMAPPED_ID(inode))
+   if (HAS_UNMAPPED_ID(mnt_user_ns(mnt), inode))
return false;
 
if (IS_NOATIME(inode))
diff --git a/fs/namei.c b/fs/namei.c
index 35952c28ee29..4dc842d1cd3a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -465,7 +465,7 @@ int inode_permission(struct user_namespace *user_ns,
 * written back improperly if their true value is unknown
 * to the vfs.
 */
-   if (HAS_UNMAPPED_ID(inode))
+   if (HAS_UNMAPPED_ID(user_ns, inode))
return -EACCES;
}
 
@@ -963,11 +963,16 @@ int sysctl_protected_regular __read_mostly;
  */
 static inline int may_follow_link(struct nameidata *nd, const struct inode 
*inode)
 {
+   struct user_namespace *user_ns;
+   kuid_t i_uid;
+
if (!sysctl_protected_symlinks)
return 0;
 
+   user_ns = mnt_user_ns(nd->path.mnt);
+   i_uid = i_uid_into_mnt(user_ns, inode);
/* Allowed if owner and follower match. */
-   if (uid_eq(current_cred()->fsuid, inode->i_uid))
+   if (uid_eq(current_cred()->fsuid, i_uid))
return 0;
 
/* Allowed if parent directory not sticky and world-writable. */
@@ -975,7 +980,7 @@ static inline int may_follow_link(struct nameidata *nd, 
const struct inode *inod
return 0;
 
/* Allowed if parent directory and link owner match. */
-   if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, inode->i_uid))
+   if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, i_uid))
return 0;
 
if (nd->flags & LOOKUP_RCU)
@@ -998,7 +1003,7 @@ static inline int may_follow_link(struct nameidata *nd, 
const struct inode *inod
  *
  * Otherwise returns true.
  */
-static bool safe_hardlink_source(struct inode *inode)
+static bool safe_hardlink_source(struct user_namespace *user_ns, struct inode 
*inode)
 {
umode_t mode = inode->i_mode;
 
@@ -1015,7 +1020,7 @@ static bool safe_hardlink_source(struct inode *inode)
return false;
 
/* Hardlinking to unreadable or unwritable sources is dangerous. */
-   if (inode_permission(_user_ns, inode, MAY_READ | MAY_WRITE))
+   if (inode_permission(user_ns, inode, MAY_READ | MAY_WRITE))
return false;
 
return true;
@@ -1036,9 +1041,12 @@ static bool safe_hardlink_source(struct inode *inode)
 int may_linkat(struct path *link)
 {
struct inode *inode = link->dentry->d_inode;
+   struct user_namespace *user_ns;
 
/* Inode writeback is not safe when the uid or gid are invalid. */
-   if (!uid_valid(inode->i_uid) |

[PATCH v2 01/39] namespace: take lock_mount_hash() directly when changing flags

2020-11-15 Thread Christian Brauner

Changing mount options always ends up taking lock_mount_hash() but when
MNT_READONLY is requested and neither the mount nor the superblock is
already MNT_READONLY we end up taking the lock, dropping it, and
retaking it to change the other mount attributes. Instead of this,
acquire the lock once when changing mount properties. This simplifies
the locking in these codepaths, makes them easier to reason about and
avoids having to reacquire the lock right after dropping it.

Cc: Christoph Hellwig 
Cc: David Howells 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
/* v2 */
- Christoph Hellwig:
  - Remove pointless __mnt_unmake_readonly() helper.
  - Even though Christoph suggested adding lockdep_assert_held() to places that
require {lock,unlock}_mount_hash(), it seems that seqlocks don't support
it.
---
 fs/namespace.c | 22 --
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index cebaa3e81794..f183161833ad 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -463,7 +463,6 @@ static int mnt_make_readonly(struct mount *mnt)
 {
int ret = 0;
 
-   lock_mount_hash();
mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
/*
 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
@@ -497,18 +496,9 @@ static int mnt_make_readonly(struct mount *mnt)
 */
smp_wmb();
mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
-   unlock_mount_hash();
return ret;
 }
 
-static int __mnt_unmake_readonly(struct mount *mnt)
-{
-   lock_mount_hash();
-   mnt->mnt.mnt_flags &= ~MNT_READONLY;
-   unlock_mount_hash();
-   return 0;
-}
-
 int sb_prepare_remount_readonly(struct super_block *sb)
 {
struct mount *mnt;
@@ -2508,7 +2498,8 @@ static int change_mount_ro_state(struct mount *mnt, 
unsigned int mnt_flags)
if (readonly_request)
return mnt_make_readonly(mnt);
 
-   return __mnt_unmake_readonly(mnt);
+   mnt->mnt.mnt_flags &= ~MNT_READONLY;
+   return 0;
 }
 
 /*
@@ -2517,11 +2508,9 @@ static int change_mount_ro_state(struct mount *mnt, 
unsigned int mnt_flags)
  */
 static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags)
 {
-   lock_mount_hash();
mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
mnt->mnt.mnt_flags = mnt_flags;
touch_mnt_namespace(mnt->mnt_ns);
-   unlock_mount_hash();
 }
 
 static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount 
*mnt)
@@ -2567,9 +2556,11 @@ static int do_reconfigure_mnt(struct path *path, 
unsigned int mnt_flags)
return -EPERM;
 
down_write(>s_umount);
+   lock_mount_hash();
ret = change_mount_ro_state(mnt, mnt_flags);
if (ret == 0)
set_mount_attributes(mnt, mnt_flags);
+   unlock_mount_hash();
up_write(>s_umount);
 
mnt_warn_timestamp_expiry(path, >mnt);
@@ -2610,8 +2601,11 @@ static int do_remount(struct path *path, int ms_flags, 
int sb_flags,
err = -EPERM;
if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
err = reconfigure_super(fc);
-   if (!err)
+   if (!err) {
+   lock_mount_hash();
set_mount_attributes(mnt, mnt_flags);
+   unlock_mount_hash();
+   }
}
up_write(>s_umount);
}
-- 
2.29.2

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

[PATCH v2 33/39] fat: handle idmapped mounts

2020-11-15 Thread Christian Brauner

Let fat handle idmapped mounts. This allows the same fat mount to appear
in multiple locations with different id mappings. This makes it possible to
expose a vfat formatted USB stick to multiple users with different ids on the
host or in user
namespaces:

mount -o uid=1000,gid=1000 /dev/sdb /mnt

u1001@f2-vm:/lower1$ ls -ln /mnt/
total 4
-rwxr-xr-x 1 1000 1000 4 Oct 28 03:44 aaa
-rwxr-xr-x 1 1000 1000 0 Oct 28 01:09 bbb
-rwxr-xr-x 1 1000 1000 0 Oct 28 01:10 ccc
-rwxr-xr-x 1 1000 1000 0 Oct 28 03:46 ddd
-rwxr-xr-x 1 1000 1000 0 Oct 28 04:01 eee

mount2 --idmap both:1000:1001:1

u1001@f2-vm:/lower1$ ls -ln /lower1/
total 4
-rwxr-xr-x 1 1001 1001 4 Oct 28 03:44 aaa
-rwxr-xr-x 1 1001 1001 0 Oct 28 01:09 bbb
-rwxr-xr-x 1 1001 1001 0 Oct 28 01:10 ccc
-rwxr-xr-x 1 1001 1001 0 Oct 28 03:46 ddd
-rwxr-xr-x 1 1001 1001 0 Oct 28 04:01 eee

u1001@f2-vm:/lower1$ touch /lower1/fff

u1001@f2-vm:/lower1$ ls -ln /lower1/fff
-rwxr-xr-x 1 1001 1001 0 Oct 28 04:03 /lower1/fff

u1001@f2-vm:/lower1$ ls -ln /mnt/fff
-rwxr-xr-x 1 1000 1000 0 Oct 28 04:03 /mnt/fff

Cc: Christoph Hellwig 
Cc: David Howells 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
/* v2 */
unchanged
---
 fs/fat/file.c| 15 ---
 fs/fat/namei_msdos.c |  2 +-
 fs/fat/namei_vfat.c  |  2 +-
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/fs/fat/file.c b/fs/fat/file.c
index 5b12cf209801..15dc8d27aa72 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -398,7 +398,7 @@ int fat_getattr(const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int flags)
 {
struct inode *inode = d_inode(path->dentry);
-   generic_fillattr(_user_ns, inode, stat);
+   generic_fillattr(mnt_user_ns(path->mnt), inode, stat);
stat->blksize = MSDOS_SB(inode->i_sb)->cluster_size;
 
if (MSDOS_SB(inode->i_sb)->options.nfs == FAT_NFS_NOSTALE_RO) {
@@ -447,12 +447,13 @@ static int fat_sanitize_mode(const struct msdos_sb_info 
*sbi,
return 0;
 }
 
-static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode)
+static int fat_allow_set_time(struct user_namespace *user_ns,
+ struct msdos_sb_info *sbi, struct inode *inode)
 {
umode_t allow_utime = sbi->options.allow_utime;
 
-   if (!uid_eq(current_fsuid(), inode->i_uid)) {
-   if (in_group_p(inode->i_gid))
+   if (!uid_eq(current_fsuid(), i_uid_into_mnt(user_ns, inode))) {
+   if (in_group_p(i_gid_into_mnt(user_ns, inode)))
allow_utime >>= 3;
if (allow_utime & MAY_WRITE)
return 1;
@@ -477,11 +478,11 @@ int fat_setattr(struct user_namespace *user_ns, struct 
dentry *dentry,
/* Check for setting the inode time. */
ia_valid = attr->ia_valid;
if (ia_valid & TIMES_SET_FLAGS) {
-   if (fat_allow_set_time(sbi, inode))
+   if (fat_allow_set_time(user_ns, sbi, inode))
attr->ia_valid &= ~TIMES_SET_FLAGS;
}
 
-   error = setattr_prepare(_user_ns, dentry, attr);
+   error = setattr_prepare(user_ns, dentry, attr);
attr->ia_valid = ia_valid;
if (error) {
if (sbi->options.quiet)
@@ -551,7 +552,7 @@ int fat_setattr(struct user_namespace *user_ns, struct 
dentry *dentry,
fat_truncate_time(inode, >ia_mtime, S_MTIME);
attr->ia_valid &= ~(ATTR_ATIME|ATTR_CTIME|ATTR_MTIME);
 
-   setattr_copy(_user_ns, inode, attr);
+   setattr_copy(user_ns, inode, attr);
mark_inode_dirty(inode);
 out:
return error;
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 608b0606f3ca..0f871a1b6620 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -666,7 +666,7 @@ static struct file_system_type msdos_fs_type = {
.name   = "msdos",
.mount  = msdos_mount,
.kill_sb= kill_block_super,
-   .fs_flags   = FS_REQUIRES_DEV,
+   .fs_flags   = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
 };
 MODULE_ALIAS_FS("msdos");
 
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 34903d14d6a6..177d939b95da 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -1063,7 +1063,7 @@ static struct file_system_type vfat_fs_type = {
.name   = "vfat",
.mount  = vfat_mount,
.kill_sb= kill_block_super,
-   .fs_flags   = FS_REQUIRES_DEV,
+   .fs_flags   = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
 };
 MODULE_ALIAS_FS("vfat");
 
-- 
2.29.2

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

[PATCH v2 12/39] acl: handle idmapped mounts

2020-11-15 Thread Christian Brauner

The posix acl permission checking helpers determine whether a caller is
privileged over an inode according to the acls associated with the inode.
Add helpers that make it possible to handle acls on idmapped mounts.

The vfs and the filesystems targeted by this first iteration make use of
posix_acl_fix_xattr_from_user() and posix_acl_fix_xattr_to_user() to
translate basic posix access and default permissions such as the ACL_USER
and ACL_GROUP type according to the initial user namespace (or the
superblock's user namespace) to and from the caller's current user
namespace. Adapt these two helpers to handle idmapped mounts whereby we
either shift from or into the mount's user namespace depending on in which
direction we're translating.
Similarly, cap_convert_nscap() is used by the vfs to translate user
namespace and non-user namespace aware filesystem capabilities from the
superblock's user namespace to the caller's user namespace. Enable it to
handle idmapped mounts by accounting for the mount's user namespace.

In addition the filesystems targeted in the first iteration of this patch
series make use of the posix_acl_chmod() and posix_acl_update_mode()
helpers. Both helpers perform permission checks on the target inode. Let
them handle idmapped mounts. These two helpers are called when posix acls
are set by the respective filesystems. To handle this case we extend the
->set() method to take an additional user namespace argument to pass the
mount's user namespace down.

Cc: Christoph Hellwig 
Cc: David Howells 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
/* v2 */
- Christoph Hellwig:
  - Don't pollute the vfs with additional helpers simply extend the existing
helpers with an additional argument and switch all callers.
---
 Documentation/filesystems/locking.rst |  6 +--
 Documentation/filesystems/porting.rst |  2 +
 fs/9p/acl.c   |  3 +-
 fs/9p/xattr.c |  1 +
 fs/afs/xattr.c|  2 +
 fs/btrfs/acl.c|  2 +-
 fs/btrfs/inode.c  |  2 +-
 fs/btrfs/xattr.c  |  2 +
 fs/ceph/acl.c |  2 +-
 fs/ceph/inode.c   |  2 +-
 fs/ceph/xattr.c   |  1 +
 fs/cifs/xattr.c   |  1 +
 fs/ecryptfs/inode.c   |  1 +
 fs/ext2/acl.c |  2 +-
 fs/ext2/inode.c   |  2 +-
 fs/ext2/xattr_security.c  |  1 +
 fs/ext2/xattr_trusted.c   |  1 +
 fs/ext2/xattr_user.c  |  1 +
 fs/ext4/acl.c |  2 +-
 fs/ext4/inode.c   |  2 +-
 fs/ext4/xattr_hurd.c  |  1 +
 fs/ext4/xattr_security.c  |  1 +
 fs/ext4/xattr_trusted.c   |  1 +
 fs/ext4/xattr_user.c  |  1 +
 fs/f2fs/acl.c |  2 +-
 fs/f2fs/file.c|  2 +-
 fs/f2fs/xattr.c   |  2 +
 fs/fuse/xattr.c   |  2 +
 fs/gfs2/acl.c |  2 +-
 fs/gfs2/inode.c   |  2 +-
 fs/gfs2/xattr.c   |  1 +
 fs/hfs/attr.c |  1 +
 fs/hfsplus/xattr.c|  1 +
 fs/hfsplus/xattr_security.c   |  1 +
 fs/hfsplus/xattr_trusted.c|  1 +
 fs/hfsplus/xattr_user.c   |  1 +
 fs/jffs2/acl.c|  2 +-
 fs/jffs2/fs.c |  2 +-
 fs/jffs2/security.c   |  1 +
 fs/jffs2/xattr_trusted.c  |  1 +
 fs/jffs2/xattr_user.c |  1 +
 fs/jfs/acl.c  |  2 +-
 fs/jfs/file.c |  2 +-
 fs/jfs/xattr.c|  2 +
 fs/kernfs/inode.c |  2 +
 fs/nfs/nfs4proc.c |  3 ++
 fs/nfsd/nfs2acl.c |  4 +-
 fs/nfsd/nfs3acl.c |  4 +-
 fs/nfsd/nfs4acl.c |  4 +-
 fs/ocfs2/acl.c|  2 +-
 fs/ocfs2/xattr.c  |  3 ++
 fs/orangefs/acl.c |  2 +-
 fs/orangefs/inode.c   |  2 +-
 fs/orangefs/xattr.c   |  1 +
 fs/overlayfs/super.c  |  3 ++
 fs/posix_acl.c| 54 +--
 fs/reiserfs/xattr_acl.c   |  4 +-
 fs/reiserfs/xattr_security.c  |  3 +-
 fs/reiserfs/xattr_trusted.c   |  3 +-
 fs/reiserfs/xattr_user.c  |  3 +-
 fs/ubifs/xattr.c  |  1 +
 fs/xattr.c| 10 ++---
 fs/xfs/xfs_acl.c  |  2 +-
 fs/xfs/xfs_iops.c |  2 +-
 fs/xfs/xfs_xattr.c|  3 +-
 include/linux/capability.h|  3 +-
 include/linux/posix_acl.h |  9 +++--
 include/linux/posix_acl_xattr.h   |

[PATCH v2 17/39] namei: introduce struct renamedata

2020-11-15 Thread Christian Brauner

In order to handle idmapped mounts we will extend the vfs rename helper to
take two new arguments in follow up patches. Since this operation already
takes a bunch of arguments add a simple struct renamedata (based on struct
nameidata) and make the current helper use it before we extend it.

Cc: Christoph Hellwig 
Cc: David Howells 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
/* v2 */
unchanged
---
 fs/cachefiles/namei.c|  9 +++--
 fs/ecryptfs/inode.c  | 10 +++---
 fs/namei.c   | 21 +++--
 fs/nfsd/vfs.c|  8 +++-
 fs/overlayfs/overlayfs.h |  9 -
 include/linux/fs.h   | 12 +++-
 6 files changed, 55 insertions(+), 14 deletions(-)

diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index ecc8ecbbfa5a..7b987de0babe 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -412,9 +412,14 @@ static int cachefiles_bury_object(struct cachefiles_cache 
*cache,
if (ret < 0) {
cachefiles_io_error(cache, "Rename security error %d", ret);
} else {
+   struct renamedata rd = {
+   .old_dir= d_inode(dir),
+   .old_dentry = rep,
+   .new_dir= d_inode(cache->graveyard),
+   .new_dentry = grave,
+   };
trace_cachefiles_rename(object, rep, grave, why);
-   ret = vfs_rename(d_inode(dir), rep,
-d_inode(cache->graveyard), grave, NULL, 0);
+   ret = vfs_rename();
if (ret != 0 && ret != -ENOMEM)
cachefiles_io_error(cache,
"Rename failed with error %d", ret);
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index d98448c75051..838949ede439 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -590,6 +590,7 @@ ecryptfs_rename(struct inode *old_dir, struct dentry 
*old_dentry,
struct dentry *lower_new_dir_dentry;
struct dentry *trap;
struct inode *target_inode;
+   struct renamedata rd = {};
 
if (flags)
return -EINVAL;
@@ -619,9 +620,12 @@ ecryptfs_rename(struct inode *old_dir, struct dentry 
*old_dentry,
rc = -ENOTEMPTY;
goto out_lock;
}
-   rc = vfs_rename(d_inode(lower_old_dir_dentry), lower_old_dentry,
-   d_inode(lower_new_dir_dentry), lower_new_dentry,
-   NULL, 0);
+
+   rd.old_dir  = d_inode(lower_old_dir_dentry);
+   rd.old_dentry   = lower_old_dentry;
+   rd.new_dir  = d_inode(lower_new_dir_dentry);
+   rd.new_dentry   = lower_new_dentry;
+   rc = vfs_rename();
if (rc)
goto out_lock;
if (target_inode)
diff --git a/fs/namei.c b/fs/namei.c
index 4dc842d1cd3a..0a2450de83bb 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4256,12 +4256,15 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, 
const char __user *, newname
  *->i_mutex on parents, which works but leads to some truly excessive
  *locking].
  */
-int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-  struct inode *new_dir, struct dentry *new_dentry,
-  struct inode **delegated_inode, unsigned int flags)
+int vfs_rename(struct renamedata *rd)
 {
int error;
struct user_namespace *user_ns = _user_ns;
+   struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir;
+   struct dentry *old_dentry = rd->old_dentry,
+ *new_dentry = rd->new_dentry;
+   struct inode **delegated_inode = rd->delegated_inode;
+   unsigned int flags = rd->flags;
bool is_dir = d_is_dir(old_dentry);
struct inode *source = old_dentry->d_inode;
struct inode *target = new_dentry->d_inode;
@@ -4385,6 +4388,7 @@ EXPORT_SYMBOL(vfs_rename);
 static int do_renameat2(int olddfd, const char __user *oldname, int newdfd,
const char __user *newname, unsigned int flags)
 {
+   struct renamedata rd;
struct dentry *old_dentry, *new_dentry;
struct dentry *trap;
struct path old_path, new_path;
@@ -4490,9 +4494,14 @@ static int do_renameat2(int olddfd, const char __user 
*oldname, int newdfd,
 _path, new_dentry, flags);
if (error)
goto exit5;
-   error = vfs_rename(old_path.dentry->d_inode, old_dentry,
-  new_path.dentry->d_inode, new_dentry,
-  _inode, flags);
+
+   rd.old_dir = old_path.dentry->d_inode;
+   rd.old_dentry  = old_dentry;
+   rd.new_dir = new_path.dentry->d_inode;
+   rd.new_dentry  = new_dentry;
+   rd.delegated_inode = _inode;
+   rd

[PATCH v2 06/39] fs: add id translation helpers

2020-11-15 Thread Christian Brauner

Add simple helpers to make it easy to map kuids into and from idmapped
mounts. We provide simple wrappers that filesystems can use to
e.g. initialize inodes similar to i_{uid,gid}_read() and
i_{uid,gid}_write(). Accessing an inode through an idmapped mount will
require the inode to be mapped according to the mount's user namespace.
If the fsids are used to compare against inodes or to initialize inodes
they are required to be shifted from the mount's user namespace. Passing
the initial user namespace to these helpers makes them a nop and so any
non-idmapped paths will not be impacted.

Cc: Christoph Hellwig 
Cc: David Howells 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
/* v2 */
- Christoph Hellwig :
  - Get rid of the ifdefs and the config option that hid idmapped mounts.
---
 include/linux/fs.h | 43 +++
 1 file changed, 43 insertions(+)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 21cc971fd960..9e487cbf0f5c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -39,6 +39,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -1574,6 +1575,48 @@ static inline void i_gid_write(struct inode *inode, 
gid_t gid)
inode->i_gid = make_kgid(inode->i_sb->s_user_ns, gid);
 }
 
+static inline kuid_t kuid_into_mnt(struct user_namespace *to, kuid_t kuid)
+{
+   return make_kuid(to, __kuid_val(kuid));
+}
+
+static inline kgid_t kgid_into_mnt(struct user_namespace *to, kgid_t kgid)
+{
+   return make_kgid(to, __kgid_val(kgid));
+}
+
+static inline kuid_t i_uid_into_mnt(struct user_namespace *to,
+   const struct inode *inode)
+{
+   return kuid_into_mnt(to, inode->i_uid);
+}
+
+static inline kgid_t i_gid_into_mnt(struct user_namespace *to,
+   const struct inode *inode)
+{
+   return kgid_into_mnt(to, inode->i_gid);
+}
+
+static inline kuid_t kuid_from_mnt(struct user_namespace *to, kuid_t kuid)
+{
+   return KUIDT_INIT(from_kuid(to, kuid));
+}
+
+static inline kgid_t kgid_from_mnt(struct user_namespace *to, kgid_t kgid)
+{
+   return KGIDT_INIT(from_kgid(to, kgid));
+}
+
+static inline kuid_t fsuid_into_mnt(struct user_namespace *to)
+{
+   return kuid_from_mnt(to, current_fsuid());
+}
+
+static inline kgid_t fsgid_into_mnt(struct user_namespace *to)
+{
+   return kgid_from_mnt(to, current_fsgid());
+}
+
 extern struct timespec64 current_time(struct inode *inode);
 
 /*
-- 
2.29.2

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

[PATCH v2 35/39] ecryptfs: do not mount on top of idmapped mounts

2020-11-15 Thread Christian Brauner

Prevent ecryptfs from being mounted on top of idmapped mounts until we have
ported it to handle this case and added proper testing for it.

Cc: Christoph Hellwig 
Cc: David Howells 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
/* v2 */
patch introduced
---
 fs/ecryptfs/main.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index e63259fdef28..c739f42157db 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -531,6 +531,12 @@ static struct dentry *ecryptfs_mount(struct 
file_system_type *fs_type, int flags
goto out_free;
}
 
+   if (mnt_idmapped(path.mnt)) {
+   rc = -EINVAL;
+   printk(KERN_ERR "Mounting on idmapped mounts currently 
disallowed\n");
+   goto out_free;
+   }
+
if (check_ruid && !uid_eq(d_inode(path.dentry)->i_uid, current_uid())) {
rc = -EPERM;
printk(KERN_ERR "Mount of device (uid: %d) not owned by "
-- 
2.29.2

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

[PATCH v2 14/39] commoncap: handle idmapped mounts

2020-11-15 Thread Christian Brauner

When interacting with user namespace and non-user namespace aware
filesystem capabilities the vfs will perform various security checks to
determine whether or not the filesystem capabilities can be used by the
caller (e.g. during exec), or even whether they need to be removed. The
main infrastructure for this resides in the capability codepaths but they
are called through the LSM security infrastructure even though they are not
technically an LSM or optional. This extends the existing security hooks
security_inode_removexattr(), security_inode_killpriv(),
security_inode_getsecurity() to pass down the mount's user namespace and
makes them aware of idmapped mounts.
In order to actually get filesystem capabilities from disk the capability
infrastructure exposes the get_vfs_caps_from_disk() helper. For user
namespace aware filesystem capabilities a root uid is stored alongside the
capabilities.
In order to determine whether the caller can make use of the filesystem
capability or whether it needs to be ignored it is translated according to
the superblock's user namespace. If it can be translated to uid 0 according
to that id mapping the caller can use the filesystem capabilities stored on
disk. If we are accessing the inode that holds the filesystem capabilities
through an idmapped mount we need to map the root uid according to the
mount's user namespace.
Afterwards the checks are identical to non-idmapped mounts. Reading
filesystem caps from disk enforces that the root uid associated with the
filesystem capability must have a mapping in the superblock's user
namespace and that the caller is either in the same user namespace or is a
descendant of the superblock's user namespace. For filesystems that are
mountable inside user namespaces the container can just mount the filesystem
and won't usually need to idmap it. If it does create an idmapped mount it
can mark it with a user namespace it has created and which is therefore a
descendant of the s_user_ns. For filesystems that are not mountable inside
user namespaces the descendant rule is trivially true because the s_user_ns
will be the initial user namespace.

If the initial user namespace is passed all operations are a nop so
non-idmapped mounts will not see a change in behavior and will also not see
any performance impact.

Cc: Christoph Hellwig 
Cc: David Howells 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
/* v2 */
- Christoph Hellwig:
  - Don't pollute the vfs with additional helpers simply extend the existing
helpers with an additional argument and switch all callers.
---
 fs/attr.c |  2 +-
 fs/xattr.c| 14 +--
 include/linux/capability.h|  4 +++-
 include/linux/lsm_hook_defs.h | 15 +++-
 include/linux/lsm_hooks.h |  1 +
 include/linux/security.h  | 44 +++
 kernel/auditsc.c  |  5 ++--
 security/commoncap.c  | 30 +++-
 security/security.c   | 25 
 security/selinux/hooks.c  | 20 +---
 security/smack/smack_lsm.c| 14 ++-
 11 files changed, 107 insertions(+), 67 deletions(-)

diff --git a/fs/attr.c b/fs/attr.c
index 4b36440236d4..e990cda1ea6f 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -113,7 +113,7 @@ int setattr_prepare(struct user_namespace *user_ns, struct 
dentry *dentry,
if (ia_valid & ATTR_KILL_PRIV) {
int error;
 
-   error = security_inode_killpriv(dentry);
+   error = security_inode_killpriv(user_ns, dentry);
if (error)
return error;
}
diff --git a/fs/xattr.c b/fs/xattr.c
index 438fedfcd402..8c50b2a935e4 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -259,7 +259,7 @@ __vfs_setxattr_locked(struct user_namespace *user_ns, 
struct dentry *dentry,
if (error)
return error;
 
-   error = security_inode_setxattr(dentry, name, value, size, flags);
+   error = security_inode_setxattr(user_ns, dentry, name, value, size, 
flags);
if (error)
goto out;
 
@@ -298,18 +298,18 @@ vfs_setxattr(struct user_namespace *user_ns, struct 
dentry *dentry,
 EXPORT_SYMBOL_GPL(vfs_setxattr);
 
 static ssize_t
-xattr_getsecurity(struct inode *inode, const char *name, void *value,
-   size_t size)
+xattr_getsecurity(struct user_namespace *user_ns, struct inode *inode,
+ const char *name, void *value, size_t size)
 {
void *buffer = NULL;
ssize_t len;
 
if (!value || !size) {
-   len = security_inode_getsecurity(inode, name, , false);
+   len = security_inode_getsecurity(user_ns, inode, name, , 
false);
goto out_noalloc;
}
 
-   len = security_inode_getsecurity(inode, name, , true);
+   len = security_inode_getsecurity(user_ns, inode, name, , true);
if (len

[PATCH v2 28/39] exec: handle idmapped mounts

2020-11-15 Thread Christian Brauner

When executing a setuid binary the kernel will verify in bprm_fill_uid() that
the inode has a mapping in the caller's user namespace before setting the
caller's uid and gid. Let bprm_fill_uid() handle idmapped mounts. If the inode
is accessed through an idmapped mount it is mapped according to the mount's
user namespace. Afterwards the checks are identical to non-idmapped mounts. On
regular mounts this is a nop.

Cc: Christoph Hellwig 
Cc: David Howells 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
/* v2 */
unchanged
---
 fs/exec.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 10c06fdf78a7..7d6d3dd17e84 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1567,6 +1567,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
 static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
 {
/* Handle suid and sgid on files */
+   struct user_namespace *user_ns;
struct inode *inode;
unsigned int mode;
kuid_t uid;
@@ -1583,13 +1584,15 @@ static void bprm_fill_uid(struct linux_binprm *bprm, 
struct file *file)
if (!(mode & (S_ISUID|S_ISGID)))
return;
 
+   user_ns = mnt_user_ns(file->f_path.mnt);
+
/* Be careful if suid/sgid is set */
inode_lock(inode);
 
/* reload atomically mode/uid/gid now that lock held */
mode = inode->i_mode;
-   uid = inode->i_uid;
-   gid = inode->i_gid;
+   uid = i_uid_into_mnt(user_ns, inode);
+   gid = i_gid_into_mnt(user_ns, inode);
inode_unlock(inode);
 
/* We ignore suid/sgid if there are no mappings for them in the ns */
-- 
2.29.2

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

[PATCH v2 13/39] xattr: handle idmapped mounts

2020-11-15 Thread Christian Brauner

From: Tycho Andersen 

When interacting with extended attributes the vfs verifies that the caller
is privileged over the inode with which the extended attribute is
associated. For posix access and posix default extended attributes a uid or
gid can be stored on-disk. Let the functions handle posix extended
attributes on idmapped mounts. If the inode is accessed through an idmapped
mount we need to map it according to the mount's user namespace. Afterwards
the checks are identical to non-idmapped mounts. This has no effect for
e.g. security xattrs that are set since no filesystem performs checks based
on the uid and gid of the inode as the vfs will have already done.

Cc: Christoph Hellwig 
Cc: David Howells 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Tycho Andersen 
Signed-off-by: Christian Brauner 
---
/* v2 */
- Christoph Hellwig:
  - Don't pollute the vfs with additional helpers simply extend the existing
helpers with an additional argument and switch all callers.
---
 fs/cachefiles/xattr.c |  16 ++--
 fs/ecryptfs/crypto.c  |   4 +-
 fs/ecryptfs/inode.c   |   4 +-
 fs/ecryptfs/mmap.c|   4 +-
 fs/nfsd/vfs.c |  12 +--
 fs/overlayfs/copy_up.c|  12 +--
 fs/overlayfs/dir.c|   2 +-
 fs/overlayfs/inode.c  |   8 +-
 fs/overlayfs/overlayfs.h  |   6 +-
 fs/overlayfs/super.c  |   4 +-
 fs/xattr.c| 120 +++---
 include/linux/xattr.h |  24 --
 security/apparmor/domain.c|   4 +-
 security/commoncap.c  |   6 +-
 security/integrity/evm/evm_crypto.c   |  11 +--
 security/integrity/evm/evm_main.c |   4 +-
 security/integrity/ima/ima_appraise.c |   8 +-
 security/selinux/hooks.c  |   2 +-
 security/smack/smack_lsm.c|   4 +-
 19 files changed, 143 insertions(+), 112 deletions(-)

diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index 72e42438f3d7..3b6a3f1610f4 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -39,7 +39,7 @@ int cachefiles_check_object_type(struct cachefiles_object 
*object)
_enter("%p{%s}", object, type);
 
/* attempt to install a type label directly */
-   ret = vfs_setxattr(dentry, cachefiles_xattr_cache, type, 2,
+   ret = vfs_setxattr(_user_ns, dentry, cachefiles_xattr_cache, type, 
2,
   XATTR_CREATE);
if (ret == 0) {
_debug("SET"); /* we succeeded */
@@ -54,7 +54,7 @@ int cachefiles_check_object_type(struct cachefiles_object 
*object)
}
 
/* read the current type label */
-   ret = vfs_getxattr(dentry, cachefiles_xattr_cache, xtype, 3);
+   ret = vfs_getxattr(_user_ns, dentry, cachefiles_xattr_cache, 
xtype, 3);
if (ret < 0) {
if (ret == -ERANGE)
goto bad_type_length;
@@ -110,7 +110,7 @@ int cachefiles_set_object_xattr(struct cachefiles_object 
*object,
_debug("SET #%u", auxdata->len);
 
clear_bit(FSCACHE_COOKIE_AUX_UPDATED, >fscache.cookie->flags);
-   ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
+   ret = vfs_setxattr(_user_ns, dentry, cachefiles_xattr_cache,
   >type, auxdata->len,
   XATTR_CREATE);
if (ret < 0 && ret != -ENOMEM)
@@ -140,7 +140,7 @@ int cachefiles_update_object_xattr(struct cachefiles_object 
*object,
_debug("SET #%u", auxdata->len);
 
clear_bit(FSCACHE_COOKIE_AUX_UPDATED, >fscache.cookie->flags);
-   ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
+   ret = vfs_setxattr(_user_ns, dentry, cachefiles_xattr_cache,
   >type, auxdata->len,
   XATTR_REPLACE);
if (ret < 0 && ret != -ENOMEM)
@@ -171,7 +171,7 @@ int cachefiles_check_auxdata(struct cachefiles_object 
*object)
if (!auxbuf)
return -ENOMEM;
 
-   xlen = vfs_getxattr(dentry, cachefiles_xattr_cache,
+   xlen = vfs_getxattr(_user_ns, dentry, cachefiles_xattr_cache,
>type, 512 + 1);
ret = -ESTALE;
if (xlen < 1 ||
@@ -213,7 +213,7 @@ int cachefiles_check_object_xattr(struct cachefiles_object 
*object,
}
 
/* read the current type label */
-   ret = vfs_getxattr(dentry, cachefiles_xattr_cache,
+   ret = vfs_getxattr(_user_ns, dentry, cachefiles_xattr_cache,
   >type, 512 + 1);
if (ret < 0) {
if (ret == -ENODATA)
@@ -270,7 +270,7 @@ int cachefiles_check_object_xattr(struct cachefiles_object 
*object,
}
 
/* update the current label */
-   ret = vfs_setxattr(d

[PATCH v2 38/39] selftests: add idmapped mounts xattr selftest

2020-11-15 Thread Christian Brauner

From: Tycho Andersen 

Add some tests for setting extended attributes on idmapped mounts.

Cc: Christoph Hellwig 
Cc: David Howells 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Tycho Andersen 
Signed-off-by: Christian Brauner 
---
/* v2 */
patch introduced
---
 .../testing/selftests/idmap_mounts/.gitignore |   1 +
 tools/testing/selftests/idmap_mounts/Makefile |   9 +
 tools/testing/selftests/idmap_mounts/config   |   1 +
 .../testing/selftests/idmap_mounts/internal.h | 116 +++
 tools/testing/selftests/idmap_mounts/xattr.c  | 284 ++
 5 files changed, 411 insertions(+)
 create mode 100644 tools/testing/selftests/idmap_mounts/.gitignore
 create mode 100644 tools/testing/selftests/idmap_mounts/Makefile
 create mode 100644 tools/testing/selftests/idmap_mounts/config
 create mode 100644 tools/testing/selftests/idmap_mounts/internal.h
 create mode 100644 tools/testing/selftests/idmap_mounts/xattr.c

diff --git a/tools/testing/selftests/idmap_mounts/.gitignore 
b/tools/testing/selftests/idmap_mounts/.gitignore
new file mode 100644
index ..18c5e90522ad
--- /dev/null
+++ b/tools/testing/selftests/idmap_mounts/.gitignore
@@ -0,0 +1 @@
+xattr
diff --git a/tools/testing/selftests/idmap_mounts/Makefile 
b/tools/testing/selftests/idmap_mounts/Makefile
new file mode 100644
index ..1d495c99d924
--- /dev/null
+++ b/tools/testing/selftests/idmap_mounts/Makefile
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0
+# Makefile for mount selftests.
+CFLAGS = -g -I../../../../usr/include/ -Wall -O2 -pthread
+
+TEST_GEN_FILES += xattr
+
+include ../lib.mk
+
+$(OUTPUT)/xattr: xattr.c internal.h
diff --git a/tools/testing/selftests/idmap_mounts/config 
b/tools/testing/selftests/idmap_mounts/config
new file mode 100644
index ..80730abc534b
--- /dev/null
+++ b/tools/testing/selftests/idmap_mounts/config
@@ -0,0 +1 @@
+CONFIG_IDMAP_MOUNTS=y
diff --git a/tools/testing/selftests/idmap_mounts/internal.h 
b/tools/testing/selftests/idmap_mounts/internal.h
new file mode 100644
index ..252803f35d71
--- /dev/null
+++ b/tools/testing/selftests/idmap_mounts/internal.h
@@ -0,0 +1,116 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __IDMAP_INTERNAL_H
+#define __IDMAP_INTERNAL_H
+
+#define _GNU_SOURCE
+
+#include "../kselftest_harness.h"
+
+#ifndef __NR_mount_setattr
+   #if defined __alpha__
+   #define __NR_mount_setattr 551
+   #elif defined _MIPS_SIM
+   #if _MIPS_SIM == _MIPS_SIM_ABI32/* o32 */
+   #define __NR_mount_setattr 4441
+   #endif
+   #if _MIPS_SIM == _MIPS_SIM_NABI32   /* n32 */
+   #define __NR_mount_setattr 6441
+   #endif
+   #if _MIPS_SIM == _MIPS_SIM_ABI64/* n64 */
+   #define __NR_mount_setattr 5441
+   #endif
+   #elif defined __ia64__
+   #define __NR_mount_setattr (441 + 1024)
+   #else
+   #define __NR_mount_setattr 441
+   #endif
+
+#ifndef __NR_open_tree
+   #if defined __alpha__
+   #define __NR_open_tree 538
+   #elif defined _MIPS_SIM
+   #if _MIPS_SIM == _MIPS_SIM_ABI32/* o32 */
+   #define __NR_open_tree 4428
+   #endif
+   #if _MIPS_SIM == _MIPS_SIM_NABI32   /* n32 */
+   #define __NR_open_tree 6428
+   #endif
+   #if _MIPS_SIM == _MIPS_SIM_ABI64/* n64 */
+   #define __NR_open_tree 5428
+   #endif
+   #elif defined __ia64__
+   #define __NR_open_tree (428 + 1024)
+   #else
+   #define __NR_open_tree 428
+   #endif
+#endif
+
+#ifndef __NR_move_mount
+   #if defined __alpha__
+   #define __NR_move_mount 539
+   #elif defined _MIPS_SIM
+   #if _MIPS_SIM == _MIPS_SIM_ABI32/* o32 */
+   #define __NR_move_mount 4429
+   #endif
+   #if _MIPS_SIM == _MIPS_SIM_NABI32   /* n32 */
+   #define __NR_move_mount 6429
+   #endif
+   #if _MIPS_SIM == _MIPS_SIM_ABI64/* n64 */
+   #define __NR_move_mount 5429
+   #endif
+   #elif defined __ia64__
+   #define __NR_move_mount (428 + 1024)
+   #else
+   #define __NR_move_mount 429
+   #endif
+#endif
+
+
+struct mount_attr {
+   __u64 attr_set;
+   __u64 attr_clr;
+   __u64 propagation;
+   __u64 userns;
+};
+#endif
+
+#ifndef MOVE_MOUNT_F_EMPTY_PATH
+#define MOVE_MOUNT_F_EMPTY_PATH 0x0004 /* Empty from path permitted */
+#endif
+
+#ifndef MOUNT_ATTR_IDMAP
+#define MOUNT_ATTR_IDMAP 0x0010
+#endif
+
+#ifndef OPEN_TREE_CLONE
+#define OPEN_TREE_CLONE 1
+#endif
+
+#ifndef OPEN_TREE_CLOEXEC
+#define OPEN_TREE_CLOEXEC O_CLOEXEC
+#endif
+
+#ifndef AT

[PATCH v2 08/39] capability: handle idmapped mounts

2020-11-15 Thread Christian Brauner

In order to determine whether a caller holds privilege over a given
inode the capability framework exposes the two helpers
privileged_wrt_inode_uidgid() and capable_wrt_inode_uidgid(). The former
verifies that the inode has a mapping in the caller's user namespace and
the latter additionally verifies that the caller has the requested
capability in their current user namespace. If the inode is accessed
through an idmapped mount we first need to map it according to the
mount's user namespace. Afterwards the checks are identical to
non-idmapped inodes. If the initial user namespace is passed all
operations are a nop so non-idmapped mounts will not see a change in
behavior and will also not see any performance impact.

Cc: Christoph Hellwig 
Cc: David Howells 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
/* v2 */
- Christoph Hellwig:
  - Don't pollute the vfs with additional helpers simply extend the existing
helpers with an additional argument and switch all callers.
---
 fs/attr.c  |  8 
 fs/exec.c  |  2 +-
 fs/inode.c |  2 +-
 fs/namei.c | 13 -
 fs/overlayfs/super.c   |  2 +-
 fs/posix_acl.c |  2 +-
 fs/xfs/xfs_ioctl.c |  2 +-
 include/linux/capability.h |  7 +--
 kernel/capability.c| 14 +-
 security/commoncap.c   |  5 +++--
 10 files changed, 34 insertions(+), 23 deletions(-)

diff --git a/fs/attr.c b/fs/attr.c
index b4bbdbd4c8ca..d270f640a192 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -23,7 +23,7 @@ static bool chown_ok(const struct inode *inode, kuid_t uid)
if (uid_eq(current_fsuid(), inode->i_uid) &&
uid_eq(uid, inode->i_uid))
return true;
-   if (capable_wrt_inode_uidgid(inode, CAP_CHOWN))
+   if (capable_wrt_inode_uidgid(_user_ns, inode, CAP_CHOWN))
return true;
if (uid_eq(inode->i_uid, INVALID_UID) &&
ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN))
@@ -36,7 +36,7 @@ static bool chgrp_ok(const struct inode *inode, kgid_t gid)
if (uid_eq(current_fsuid(), inode->i_uid) &&
(in_group_p(gid) || gid_eq(gid, inode->i_gid)))
return true;
-   if (capable_wrt_inode_uidgid(inode, CAP_CHOWN))
+   if (capable_wrt_inode_uidgid(_user_ns, inode, CAP_CHOWN))
return true;
if (gid_eq(inode->i_gid, INVALID_GID) &&
ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN))
@@ -92,7 +92,7 @@ int setattr_prepare(struct dentry *dentry, struct iattr *attr)
/* Also check the setgid bit! */
if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
inode->i_gid) &&
-   !capable_wrt_inode_uidgid(inode, CAP_FSETID))
+   !capable_wrt_inode_uidgid(_user_ns, inode, CAP_FSETID))
attr->ia_mode &= ~S_ISGID;
}
 
@@ -193,7 +193,7 @@ void setattr_copy(struct inode *inode, const struct iattr 
*attr)
umode_t mode = attr->ia_mode;
 
if (!in_group_p(inode->i_gid) &&
-   !capable_wrt_inode_uidgid(inode, CAP_FSETID))
+   !capable_wrt_inode_uidgid(_user_ns, inode, CAP_FSETID))
mode &= ~S_ISGID;
inode->i_mode = mode;
}
diff --git a/fs/exec.c b/fs/exec.c
index 547a2390baf5..8e75d7a33514 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1398,7 +1398,7 @@ void would_dump(struct linux_binprm *bprm, struct file 
*file)
/* Ensure mm->user_ns contains the executable */
user_ns = old = bprm->mm->user_ns;
while ((user_ns != _user_ns) &&
-  !privileged_wrt_inode_uidgid(user_ns, inode))
+  !privileged_wrt_inode_uidgid(user_ns, _user_ns, 
inode))
user_ns = user_ns->parent;
 
if (old != user_ns) {
diff --git a/fs/inode.c b/fs/inode.c
index 9d78c37b00b8..7a15372d9c2d 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2147,7 +2147,7 @@ void inode_init_owner(struct inode *inode, const struct 
inode *dir,
mode |= S_ISGID;
else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) &&
 !in_group_p(inode->i_gid) &&
-!capable_wrt_inode_uidgid(dir, CAP_FSETID))
+!capable_wrt_inode_uidgid(_user_ns, dir, 
CAP_FSETID))
mode &= ~S_ISGID;
} else
inode->i_gid = current_fsgid();
diff --git a/fs/namei.c b/fs/namei.c
index d4a6dd772303..3f52730af6c5 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -357,10 +357,11 @@ int generic_permission(struct inode *inode, int mask)
if (S_ISDIR(inode->i_

[PATCH v2 11/39] attr: handle idmapped mounts

2020-11-15 Thread Christian Brauner

When file attributes are changed filesystems mostly rely on the
setattr_prepare(), setattr_copy(), and notify_change() helpers for
initialization and permission checking. Let them handle idmapped mounts. If
the inode is accessed through an idmapped mount we need to map it according
to the mount's user namespace. Afterwards the checks are identical to
non-idmapped mounts. If the initial user namespace is passed all operations
are a nop so non-idmapped mounts will not see a change in behavior and will
also not see any performance impact. Helpers that perform checks on the
ia_uid and ia_gid fields in struct iattr assume that ia_uid and ia_gid are
intended values and so they won't be mapped according to the mount's user
namespace. This is more transparent to the caller.
If the initial user namespace is passed all operations are a nop so
non-idmapped mounts will not see a change in behavior and will not see any
performance impact.

Cc: Christoph Hellwig 
Cc: David Howells 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
/* v2 */
- Christoph Hellwig:
  - Don't pollute the vfs with additional helpers simply extend the existing
helpers with an additional argument and switch all callers.
---
 arch/powerpc/platforms/cell/spufs/inode.c |  2 +-
 drivers/base/devtmpfs.c   |  4 +-
 fs/9p/vfs_inode.c |  4 +-
 fs/9p/vfs_inode_dotl.c|  4 +-
 fs/adfs/inode.c   |  2 +-
 fs/affs/inode.c   |  4 +-
 fs/attr.c | 71 ++-
 fs/btrfs/inode.c  |  4 +-
 fs/cachefiles/interface.c |  4 +-
 fs/ceph/inode.c   |  2 +-
 fs/cifs/inode.c   |  8 +--
 fs/ecryptfs/inode.c   |  6 +-
 fs/exfat/file.c   |  4 +-
 fs/ext2/inode.c   |  4 +-
 fs/ext4/inode.c   |  4 +-
 fs/f2fs/file.c|  2 +-
 fs/fat/file.c |  4 +-
 fs/fuse/dir.c |  2 +-
 fs/gfs2/inode.c   |  4 +-
 fs/hfs/inode.c|  4 +-
 fs/hfsplus/inode.c|  4 +-
 fs/hostfs/hostfs_kern.c   |  4 +-
 fs/hpfs/inode.c   |  4 +-
 fs/hugetlbfs/inode.c  |  4 +-
 fs/inode.c|  2 +-
 fs/jffs2/fs.c |  2 +-
 fs/jfs/file.c |  4 +-
 fs/kernfs/inode.c |  4 +-
 fs/libfs.c|  4 +-
 fs/minix/file.c   |  4 +-
 fs/nfsd/nfsproc.c |  2 +-
 fs/nfsd/vfs.c |  4 +-
 fs/nilfs2/inode.c |  4 +-
 fs/ntfs/inode.c   |  2 +-
 fs/ocfs2/dlmfs/dlmfs.c|  4 +-
 fs/ocfs2/file.c   |  4 +-
 fs/omfs/file.c|  4 +-
 fs/open.c |  6 +-
 fs/orangefs/inode.c   |  4 +-
 fs/overlayfs/copy_up.c|  8 +--
 fs/overlayfs/dir.c|  2 +-
 fs/overlayfs/inode.c  |  4 +-
 fs/overlayfs/super.c  |  2 +-
 fs/proc/base.c|  4 +-
 fs/proc/generic.c |  4 +-
 fs/proc/proc_sysctl.c |  4 +-
 fs/ramfs/file-nommu.c |  4 +-
 fs/reiserfs/inode.c   |  4 +-
 fs/sysv/file.c|  4 +-
 fs/ubifs/file.c   |  2 +-
 fs/udf/file.c |  4 +-
 fs/ufs/inode.c|  4 +-
 fs/utimes.c   |  2 +-
 fs/xfs/xfs_iops.c |  2 +-
 fs/zonefs/super.c |  4 +-
 include/linux/fs.h|  8 ++-
 mm/shmem.c|  4 +-
 57 files changed, 151 insertions(+), 132 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spufs/inode.c 
b/arch/powerpc/platforms/cell/spufs/inode.c
index 25390569e24c..3de526eb2275 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -98,7 +98,7 @@ spufs_setattr(struct dentry *dentry, struct iattr *attr)
if ((attr->ia_valid & ATTR_SIZE) &&
(attr->ia_size != inode->i_size))
return -EINVAL;
-   setattr_copy(inode, attr);
+   setattr_copy(_user_ns, inode, attr);
mark_inode_dirty(inode);
return 0;
 }
diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c
index eac184e6d657..2e0c3cdb4184 100644
--- a/drivers/base/devtmpfs.c
+++ b/drivers/base/devtmpfs.c
@@ -221,7

[PATCH v2 22/39] utimes: handle idmapped mounts

2020-11-15 Thread Christian Brauner

Enable the vfs_utimes() helper to handle idmapped mounts by passing down
the mount's user namespace.

Cc: Christoph Hellwig 
Cc: David Howells 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
/* v2 */
unchanged
---
 fs/utimes.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/utimes.c b/fs/utimes.c
index 1a4130bee157..ead17d623aaa 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -22,6 +22,7 @@ int vfs_utimes(const struct path *path, struct timespec64 
*times)
struct iattr newattrs;
struct inode *inode = path->dentry->d_inode;
struct inode *delegated_inode = NULL;
+   struct user_namespace *user_ns;
 
if (times) {
if (!nsec_valid(times[0].tv_nsec) ||
@@ -61,8 +62,9 @@ int vfs_utimes(const struct path *path, struct timespec64 
*times)
newattrs.ia_valid |= ATTR_TOUCH;
}
 retry_deleg:
+   user_ns = mnt_user_ns(path->mnt);
inode_lock(inode);
-   error = notify_change(_user_ns, path->dentry, , 
_inode);
+   error = notify_change(user_ns, path->dentry, , 
_inode);
inode_unlock(inode);
if (delegated_inode) {
error = break_deleg_wait(_inode);
-- 
2.29.2

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

Re: [PATCH 03/34] fs: add mount_setattr()

2020-11-03 Thread Christian Brauner

On Sun, Nov 01, 2020 at 02:42:13PM +, Christoph Hellwig wrote:
> This has a bunch of crazy long lines.

Ok, will stick to 80 columns instead of the accepted but more lax 100.

> 
> Also some of the refactoring might be worth splitting into prep patches.

Ok, will try to do that.

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

Re: [PATCH 01/34] namespace: take lock_mount_hash() directly when changing flags

2020-11-03 Thread Christian Brauner

On Sun, Nov 01, 2020 at 02:41:08PM +, Christoph Hellwig wrote:
> > index cebaa3e81794..20ee291a7af4 100644
> > --- a/fs/namespace.c
> > +++ b/fs/namespace.c
> > @@ -463,7 +463,6 @@ static int mnt_make_readonly(struct mount *mnt)
> >  {
> > int ret = 0;
> >  
> > -   lock_mount_hash();
> 
> What about adding a lockdep_assert_lock_held in all the functions
> that used to take the lock to document the assumptions?

Good idea and will do. I wanted to do this but then didn't because I
haven't seen widespread use of lockdep assert in fs/namespace.c.

> 
> >  static int __mnt_unmake_readonly(struct mount *mnt)
> >  {
> > -   lock_mount_hash();
> > mnt->mnt.mnt_flags &= ~MNT_READONLY;
> > -   unlock_mount_hash();
> > return 0;
> 
> This helper is rather pointless now.

Ok, will remove.

> 
> >  static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags)
> >  {
> > -   lock_mount_hash();
> > mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
> > mnt->mnt.mnt_flags = mnt_flags;
> > touch_mnt_namespace(mnt->mnt_ns);
> > -   unlock_mount_hash();
> 
> In linux-next there is an additional notify_mount after the unlock here.

Thanks! I can try rebasing on -next.

> 
> Also while you touch this lock_mount_hash/unlock_mount_hash could be
> moved to namespace.c and made static now.

Ok, will try to do that.

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

Re: [PATCH 05/34] fs: introduce MOUNT_ATTR_IDMAP

2020-11-03 Thread Christian Brauner

On Sun, Nov 01, 2020 at 02:45:44PM +, Christoph Hellwig wrote:
> On Thu, Oct 29, 2020 at 01:32:23AM +0100, Christian Brauner wrote:
> > Introduce a new bind mount property to allow idmapping mounts. The
> > MOUNT_ATTR_IDMAP flag can be set via the new mount_setattr() syscall
> > together with a file descriptor referring to a user namespace.
> 
> Shouldn't this go to the end of the series once all the infrastructure
> is in place?

Yeah, good idea. (I mostly did it to keep compile-times short when
rebasing.)

> 
> > +config IDMAP_MOUNTS
> > +   bool "Support id mappings per mount"
> > +   default n
> 
> n is the default default.

Ah, thanks.

> 
> But why do we need a config option here anyway?

My main concern was people complaining about code they want to compile
out. I've been burnt by that before but I'm happy to remove the config
option and make this unconditional.

> 
> > +#ifdef CONFIG_IDMAP_MOUNTS
> > +   if (kattr->attr_set & MNT_IDMAPPED) {
> > +   struct user_namespace *user_ns;
> > +   struct vfsmount *vmnt;
> 
> All the code here looks like it should go into a helper.

Will do.

> 
> > +   struct user_namespace *user_ns = 
> > READ_ONCE(m->mnt.mnt_user_ns);
> > +   WRITE_ONCE(m->mnt.mnt_user_ns, 
> > get_user_ns(kattr->userns));
> 
> More unreadable long lines.

Will wrap. I have somewhat adapted to the more lax 100 limit but I'm
happy to stick to 80.

> 
> > +   if (attr->attr_set & MOUNT_ATTR_IDMAP) {
> > +   struct ns_common *ns;
> > +   struct user_namespace *user_ns;
> > +   struct file *file;
> > +
> > +   file = fget(attr->userns);
> 
> The code here looks like another candidate for a self contained helper.

Noted.

> 
> > +
> > +static inline struct user_namespace *mnt_user_ns(const struct vfsmount 
> > *mnt)
> > +{
> > +#ifdef CONFIG_IDMAP_MOUNTS
> > +   return READ_ONCE(mnt->mnt_user_ns);
> > +#else
> > +   return _user_ns;
> > +#endif
> 
> How is the READ_ONCE on a pointer going to work?

Honestly, this is me following a pattern I've seen in other places and
it's mostly about visually indicating concurrency but I'll drop it.

Christian

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

Re: [PATCH 07/34] capability: handle idmapped mounts

2020-11-03 Thread Christian Brauner

On Sun, Nov 01, 2020 at 02:48:09PM +, Christoph Hellwig wrote:
> >  /**
> >   * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
> >   * @inode: The inode in question
> > @@ -501,9 +513,7 @@ bool privileged_wrt_inode_uidgid(struct user_namespace 
> > *ns, const struct inode *
> >   */
> >  bool capable_wrt_inode_uidgid(const struct inode *inode, int cap)
> >  {
> > +   return capable_wrt_mapped_inode_uidgid(_user_ns, inode, cap);
> >  }
> >  EXPORT_SYMBOL(capable_wrt_inode_uidgid);
> 
> Please avoid these silly wrappers and just switch all callers to pass
> the namespaces instead of creating boilerplate code.  Same for the other
> functions where you do this and, even worse, the method calls.

Christoph,

Thanks for the review!  

Ok, so I'll switch:
- all helpers to take an additional argument
  (capable_wrt_inode_uidgid()/inode_permission()/vfs_*() etc.)
- all inode method calls to take an additional argument (I assume that's
  what you're referring to: ->create()/->mknod()/->mkdir() etc.)
  I've always assumed that this is what we'd be doing in the end anyway
  (I've mentioned it in the commit message for the inode_operations
  methods. This will be a bit of work but we can get that done!)

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

Re: [PATCH 06/34] fs: add id translation helpers

2020-11-03 Thread Christian Brauner

On Sun, Nov 01, 2020 at 02:46:32PM +, Christoph Hellwig wrote:
> > +static inline kuid_t kuid_into_mnt(struct user_namespace *to, kuid_t kuid)
> > +{
> > +#ifdef CONFIG_IDMAP_MOUNTS
> > +   return make_kuid(to, __kuid_val(kuid));
> > +#else
> > +   return kuid;
> > +#endif
> > +}
> > +
> > +static inline kgid_t kgid_into_mnt(struct user_namespace *to, kgid_t kgid)
> > +{
> > +#ifdef CONFIG_IDMAP_MOUNTS
> > +   return make_kgid(to, __kgid_val(kgid));
> > +#else
> > +   return kgid;
> > +#endif
> 
> If you want to keep the config option please at least have one
> #ifdef/#else/#endif instead of this mess.

Understood.

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

Re: [PATCH 00/34] fs: idmapped mounts

2020-10-30 Thread Christian Brauner

On Thu, Oct 29, 2020 at 02:58:55PM -0700, Andy Lutomirski wrote:
> 
> 
> > On Oct 28, 2020, at 5:35 PM, Christian Brauner 
> >  wrote:
> > 
> > Hey everyone,
> > 
> > I vanished for a little while to focus on this work here so sorry for
> > not being available by mail for a while.
> > 
> > Since quite a long time we have issues with sharing mounts between
> > multiple unprivileged containers with different id mappings, sharing a
> > rootfs between multiple containers with different id mappings, and also
> > sharing regular directories and filesystems between users with different
> > uids and gids. The latter use-cases have become even more important with
> > the availability and adoption of systemd-homed (cf. [1]) to implement
> > portable home directories.
> > 
> > The solutions we have tried and proposed so far include the introduction
> > of fsid mappings, a tiny overlay based filesystem, and an approach to
> > call override creds in the vfs. None of these solutions have covered all
> > of the above use-cases.
> > 
> > The solution proposed here has its origins in multiple discussions
> > during Linux Plumbers 2017 during and after the end of the containers
> > microconference.
> > To the best of my knowledge this involved Aleksa, Stéphane, Eric, David,
> > James, and myself. A variant of the solution proposed here has also been
> > discussed, again to the best of my knowledge, after a Linux conference
> > in St. Petersburg in Russia between Christoph, Tycho, and myself in 2017
> > after Linux Plumbers.
> > I've taken the time to finally implement a working version of this
> > solution over the last weeks to the best of my abilities. Tycho has
> > signed up for this slightly crazy endeavour as well and he has helped
> > with the conversion of the xattr codepaths.
> > 
> > The core idea is to make idmappings a property of struct vfsmount
> > instead of tying it to a process being inside of a user namespace which
> > has been the case for all other proposed approaches.
> > It means that idmappings become a property of bind-mounts, i.e. each
> > bind-mount can have a separate idmapping. This has the obvious advantage
> > that idmapped mounts can be created inside of the initial user
> > namespace, i.e. on the host itself instead of requiring the caller to be
> > located inside of a user namespace. This enables such use-cases as e.g.
> > making a usb stick available in multiple locations with different
> > idmappings (see the vfat port that is part of this patch series).
> > 
> > The vfsmount struct gains a new struct user_namespace member. The
> > idmapping of the user namespace becomes the idmapping of the mount. A
> > caller that is either privileged with respect to the user namespace of
> > the superblock of the underlying filesystem or a caller that is
> > privileged with respect to the user namespace a mount has been idmapped
> > with can create a new bind-mount and mark it with a user namespace.
> 
> So one way of thinking about this is that a user namespace that has an 
> idmapped mount can, effectively, create or chown files with *any* on-disk uid 
> or gid by doing it directly (if that uid exists in-namespace, which is likely 
> for interesting ids like 0) or by creating a new userns with that id inside.
> 
> For a file system that is private to a container, this seems moderately safe, 
> although this may depend on what exactly “private” means. We probably want a 
> mechanism such that, if you are outside the namespace, a reference to a file 
> with the namespace’s vfsmnt does not confer suid privilege.
> 
> Imagine the following attack: user creates a namespace with a root user and 
> arranges to get an idmapped fs, e.g. by inserting an ext4 usb stick or using 
> whatever container management tool does this.  Inside the namespace, the user 
> creates a suid-root file.
> 
> Now, outside the namespace, the user has privilege over the namespace.  (I’m 
> assuming there is some tool that will idmap things in a namespace owned by an 
> unprivileged user, which seems likely.) So the user makes a new bind mount 
> and idmaps it to the init namespace. Game over.
> 
> So I think we need to have some control to mitigate this in a comprehensible 
> way. A big hammer would be to require nosuid. A smaller hammer might be to 
> say that you can’t create a new idmapped mount unless you have privilege over 
> the userns that you want to use for the idmap and to say that a vfsmnt’s 
> paths don’t do suid outside the idmap namespace.  We already do the latter 
> for the vfsmnt’s mntns’s userns.

With this series, in order

Re: [PATCH 00/34] fs: idmapped mounts

2020-10-29 Thread Christian Brauner

On Thu, Oct 29, 2020 at 01:27:33PM +1100, Dave Chinner wrote:
> On Thu, Oct 29, 2020 at 01:32:18AM +0100, Christian Brauner wrote:
> > Hey everyone,
> > 
> > I vanished for a little while to focus on this work here so sorry for
> > not being available by mail for a while.
> > 
> > Since quite a long time we have issues with sharing mounts between
> > multiple unprivileged containers with different id mappings, sharing a
> > rootfs between multiple containers with different id mappings, and also
> > sharing regular directories and filesystems between users with different
> > uids and gids. The latter use-cases have become even more important with
> > the availability and adoption of systemd-homed (cf. [1]) to implement
> > portable home directories.
> > 
> > The solutions we have tried and proposed so far include the introduction
> > of fsid mappings, a tiny overlay based filesystem, and an approach to
> > call override creds in the vfs. None of these solutions have covered all
> > of the above use-cases.
> > 
> > The solution proposed here has its origins in multiple discussions
> > during Linux Plumbers 2017 during and after the end of the containers
> > microconference.
> > To the best of my knowledge this involved Aleksa, Stéphane, Eric, David,
> > James, and myself. A variant of the solution proposed here has also been
> > discussed, again to the best of my knowledge, after a Linux conference
> > in St. Petersburg in Russia between Christoph, Tycho, and myself in 2017
> > after Linux Plumbers.
> > I've taken the time to finally implement a working version of this
> > solution over the last weeks to the best of my abilities. Tycho has
> > signed up for this slightly crazy endeavour as well and he has helped
> > with the conversion of the xattr codepaths.
> > 
> > The core idea is to make idmappings a property of struct vfsmount
> > instead of tying it to a process being inside of a user namespace which
> > has been the case for all other proposed approaches.
> > It means that idmappings become a property of bind-mounts, i.e. each
> > bind-mount can have a separate idmapping. This has the obvious advantage
> > that idmapped mounts can be created inside of the initial user
> > namespace, i.e. on the host itself instead of requiring the caller to be
> > located inside of a user namespace. This enables such use-cases as e.g.
> > making a usb stick available in multiple locations with different
> > idmappings (see the vfat port that is part of this patch series).
> > 
> > The vfsmount struct gains a new struct user_namespace member. The
> > idmapping of the user namespace becomes the idmapping of the mount. A
> > caller that is either privileged with respect to the user namespace of
> > the superblock of the underlying filesystem or a caller that is
> > privileged with respect to the user namespace a mount has been idmapped
> > with can create a new bind-mount and mark it with a user namespace. The
> > user namespace the mount will be marked with can be specified by passing
> > a file descriptor referring to the user namespace as an argument to the
> > new mount_setattr() syscall together with the new MOUNT_ATTR_IDMAP flag.
> > By default vfsmounts are marked with the initial user namespace and no
> > behavioral or performance changes should be observed. All mapping
> > operations are nops for the initial user namespace.
> > 
> > When a file/inode is accessed through an idmapped mount the i_uid and
> > i_gid of the inode will be remapped according to the user namespace the
> > mount has been marked with. When a new object is created based on the
> > fsuid and fsgid of the caller they will similarly be remapped according
> > to the user namespace of the mount they are created from.
> > 
> > This means the user namespace of the mount needs to be passed down into
> > a few relevant inode_operations. This mostly includes inode operations
> > that create filesystem objects or change file attributes.
> 
> That's really quite ... messy.

I don't agree. It's changes all across the vfs but it's not hacky in any
way since it cleanly passes down an additional argument (I'm biased of
course.). 

> 
> Maybe I'm missing something, but if you have the user_ns to be used
> for the VFS operation we are about to execute then why can't we use
> the same model as current_fsuid/current_fsgid() for passing the
> filesystem credentials down to the filesystem operations?  i.e.
> attach it to the current->cred->fs_userns, and then the filesystem
> code that actually needs to know the current userns can call

[PATCH 16/34] namei: handle idmapped mounts in may_*() helpers

2020-10-29 Thread Christian Brauner

The may_follow_link(), may_linkat(), may_lookup(), may_open(), may_o_create(),
may_create_in_sticky(), may_delete(), and may_create() helpers determine
whether the caller is privileged enough to perform the associated operations.
Let them handle idmapped mounts by mapping the inode and fsids according to
the mount's user namespace. Afterwards the checks are identical to non-idmapped
inodes. If the initial user namespace is passed all operations are a nop so
non-idmapped mounts will not see a change in behavior and will also not see any
performance impact.
Since the may_*() helpers are not exposed to other parts of the kernel we can
simply extend them with an additional argument in case they don't already have
access to the mount's user namespace.

Signed-off-by: Christian Brauner 
---
 fs/namei.c | 106 +++--
 1 file changed, 63 insertions(+), 43 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 2635f6a57de5..76ee4d52bd5e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -985,11 +985,14 @@ int sysctl_protected_regular __read_mostly;
  */
 static inline int may_follow_link(struct nameidata *nd, const struct inode 
*inode)
 {
+   struct user_namespace *user_ns;
+
if (!sysctl_protected_symlinks)
return 0;
 
+   user_ns = mnt_user_ns(nd->path.mnt);
/* Allowed if owner and follower match. */
-   if (uid_eq(current_cred()->fsuid, inode->i_uid))
+   if (uid_eq(current_cred()->fsuid, i_uid_into_mnt(user_ns, inode)))
return 0;
 
/* Allowed if parent directory not sticky and world-writable. */
@@ -1020,7 +1023,7 @@ static inline int may_follow_link(struct nameidata *nd, 
const struct inode *inod
  *
  * Otherwise returns true.
  */
-static bool safe_hardlink_source(struct inode *inode)
+static bool safe_hardlink_source(struct user_namespace *user_ns, struct inode 
*inode)
 {
umode_t mode = inode->i_mode;
 
@@ -1037,7 +1040,7 @@ static bool safe_hardlink_source(struct inode *inode)
return false;
 
/* Hardlinking to unreadable or unwritable sources is dangerous. */
-   if (inode_permission(inode, MAY_READ | MAY_WRITE))
+   if (mapped_inode_permission(user_ns, inode, MAY_READ | MAY_WRITE))
return false;
 
return true;
@@ -1058,6 +1061,7 @@ static bool safe_hardlink_source(struct inode *inode)
 int may_linkat(struct path *link)
 {
struct inode *inode = link->dentry->d_inode;
+   struct user_namespace *user_ns;
 
/* Inode writeback is not safe when the uid or gid are invalid. */
if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
@@ -1069,7 +1073,9 @@ int may_linkat(struct path *link)
/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
 * otherwise, it must be a safe source.
 */
-   if (safe_hardlink_source(inode) || inode_owner_or_capable(inode))
+   user_ns = mnt_user_ns(link->mnt);
+   if (safe_hardlink_source(user_ns, inode) ||
+   mapped_inode_owner_or_capable(user_ns, inode))
return 0;
 
audit_log_path_denied(AUDIT_ANOM_LINK, "linkat");
@@ -1097,14 +1103,17 @@ int may_linkat(struct path *link)
  *
  * Returns 0 if the open is allowed, -ve on error.
  */
-static int may_create_in_sticky(umode_t dir_mode, kuid_t dir_uid,
-   struct inode * const inode)
+static int may_create_in_sticky(struct nameidata *nd, struct inode *const 
inode)
 {
+   struct user_namespace *user_ns;
+   umode_t dir_mode = nd->dir_mode;
+   kuid_t dir_uid = nd->dir_uid;
+
+   user_ns = mnt_user_ns(nd->path.mnt);
if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) ||
(!sysctl_protected_regular && S_ISREG(inode->i_mode)) ||
-   likely(!(dir_mode & S_ISVTX)) ||
-   uid_eq(inode->i_uid, dir_uid) ||
-   uid_eq(current_fsuid(), inode->i_uid))
+   likely(!(dir_mode & S_ISVTX)) || uid_eq(inode->i_uid, dir_uid) ||
+   uid_eq(current_fsuid(), i_uid_into_mnt(user_ns, inode)))
return 0;
 
if (likely(dir_mode & 0002) ||
@@ -1596,14 +1605,16 @@ static struct dentry *lookup_slow(const struct qstr 
*name,
 
 static inline int may_lookup(struct nameidata *nd)
 {
+   struct user_namespace *user_ns = mnt_user_ns(nd->path.mnt);
+
if (nd->flags & LOOKUP_RCU) {
-   int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
+   int err = mapped_inode_permission(user_ns, nd->inode, 
MAY_EXEC|MAY_NOT_BLOCK);
if (err != -ECHILD)
return err;
if (unlazy_walk(nd))
return -ECHILD;
}
-   return inode_permission(nd->inode, MAY_EXEC);
+   return mapped_inode_permission(user_ns, nd->inode, MAY_EXEC);

[PATCH 33/34] overlayfs: handle idmapped merged mounts

2020-10-29 Thread Christian Brauner

tu ubuntu0 Oct 28 13:25 A-FILE

 # Let's remove the idmapped /upper mount (overlayfs will have its own private 
mount anyway)
 umount /upper

 # Let's look at these files in our upper directory with the idmapped mount 
removed
 ubuntu@f2-vm:/$ ls -al /upper/upper/
 root@f2-vm:~# ls -al /upper/upper/
 total 12
 drwxr-xr-x 3 10000 10000 4096 Oct 28 13:26 .
 drwxr-xr-x 4 10000 10000 4096 Oct 21 13:48 ..
 drwxr-xr-x 2 11000 11000 4096 Oct 28 13:26 A-DIR
 -rw-r--r-- 1 11000 11000    0 Oct 28 13:25 A-FILE

 # Let's create a few acls from the /merged directory on an already existing 
file
 # triggering a copy-up operation
 root@f2-vm:/merged# setfacl -m u:1000:rwx /merged/asdf
 root@f2-vm:/merged# getfacl /merged/asdf
 getfacl: Removing leading '/' from absolute path names
 # file: merged/asdf
 # owner: root
 # group: root
 user::rw-
 user:ubuntu:rwx
 group::r--
 mask::rwx
 other::r--

 # Let's look at this file from our upper directory from the initial user 
namespace
 root@f2-vm:/merged# getfacl /upper/upper/asdf
 getfacl: Removing leading '/' from absolute path names
 # file: upper/upper/asdf
 # owner: 10000
 # group: 10000
 user::rw-
 user:11000:rwx
 group::r--
 mask::rwx
 other::r--

Cc: Seth Forshee 
Cc: Amir Goldstein 
Signed-off-by: Christian Brauner 
---
 fs/overlayfs/copy_up.c   |  4 +--
 fs/overlayfs/dir.c   | 68 
 fs/overlayfs/inode.c | 40 +--
 fs/overlayfs/overlayfs.h |  9 +-
 fs/overlayfs/super.c |  3 ++
 5 files changed, 99 insertions(+), 25 deletions(-)

diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 1b8721796fd4..91134203c511 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -268,8 +268,8 @@ int ovl_set_attr(struct user_namespace *user_ns, struct 
dentry *upperdentry,
if (!err) {
struct iattr attr = {
.ia_valid = ATTR_UID | ATTR_GID,
-   .ia_uid = stat->uid,
-   .ia_gid = stat->gid,
+   .ia_uid = kuid_from_mnt(user_ns, stat->uid),
+   .ia_gid = kgid_from_mnt(user_ns, stat->gid),
};
err = notify_mapped_change(user_ns, upperdentry, , NULL);
}
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 23d09de00957..1c0153b1ad6d 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -625,7 +625,8 @@ static int ovl_create_or_link(struct dentry *dentry, struct 
inode *inode,
return err;
 }
 
-static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
+static int ovl_create_object(struct user_namespace *user_ns,
+struct dentry *dentry, int mode, dev_t rdev,
 const char *link)
 {
int err;
@@ -649,7 +650,7 @@ static int ovl_create_object(struct dentry *dentry, int 
mode, dev_t rdev,
inode->i_state |= I_CREATING;
spin_unlock(>i_lock);
 
-   inode_init_owner(inode, dentry->d_parent->d_inode, mode);
+   mapped_inode_init_owner(inode, user_ns, dentry->d_parent->d_inode, 
mode);
attr.mode = inode->i_mode;
 
err = ovl_create_or_link(dentry, inode, , false);
@@ -663,31 +664,55 @@ static int ovl_create_object(struct dentry *dentry, int 
mode, dev_t rdev,
return err;
 }
 
-static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- bool excl)
+static int ovl_create_mapped(struct user_namespace *user_ns, struct inode *dir,
+struct dentry *dentry, umode_t mode, bool excl)
 {
-   return ovl_create_object(dentry, (mode & 0) | S_IFREG, 0, NULL);
+   return ovl_create_object(user_ns, dentry, (mode & 0) | S_IFREG, 0, 
NULL);
+}
+
+static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode, 
bool excl)
+{
+   return ovl_create_mapped(_user_ns, dir, dentry, mode, excl);
+}
+
+static int ovl_mkdir_mapped(struct user_namespace *user_ns, struct inode *dir,
+   struct dentry *dentry, umode_t mode)
+{
+   return ovl_create_object(user_ns, dentry,
+(mode & 0) | S_IFDIR, 0, NULL);
 }
 
 static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
-   return ovl_create_object(dentry, (mode & 0) | S_IFDIR, 0, NULL);
+   return ovl_mkdir_mapped(_user_ns, dir, dentry, mode);
 }
 
-static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
-dev_t rdev)
+static int ovl_mknod_mapped(struct user_namespace *user_ns, struct inode *dir,
+   struct dentry *dentry, umode_t mode, dev_t rdev)
 {
/* Don't allow creation of "whiteout" on overlay */
if (S_ISCHR(mode) && rdev == WHITEOUT_DEV)
return -EPERM;
 
-   return ovl_create_object(dentry, mode, rdev, NULL);
+   r

[PATCH 01/34] namespace: take lock_mount_hash() directly when changing flags

2020-10-29 Thread Christian Brauner

Changing mount options always ends up taking lock_mount_hash() but when
MNT_READONLY is requested and neither the mount nor the superblock is
already MNT_READONLY we end up taking the lock, dropping it, and
retaking it to change the other mount attributes. Instead of this,
acquire the lock once when changing mount properties. This simplifies
the locking in these codepaths, makes them easier to reason about and
avoids having to reacquire the lock right after dropping it.

Cc: Al Viro 
Cc: David Howells 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
 fs/namespace.c | 13 ++---
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index cebaa3e81794..20ee291a7af4 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -463,7 +463,6 @@ static int mnt_make_readonly(struct mount *mnt)
 {
int ret = 0;
 
-   lock_mount_hash();
mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
/*
 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
@@ -497,15 +496,12 @@ static int mnt_make_readonly(struct mount *mnt)
 */
smp_wmb();
mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
-   unlock_mount_hash();
return ret;
 }
 
 static int __mnt_unmake_readonly(struct mount *mnt)
 {
-   lock_mount_hash();
mnt->mnt.mnt_flags &= ~MNT_READONLY;
-   unlock_mount_hash();
return 0;
 }
 
@@ -2517,11 +2513,9 @@ static int change_mount_ro_state(struct mount *mnt, 
unsigned int mnt_flags)
  */
 static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags)
 {
-   lock_mount_hash();
mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
mnt->mnt.mnt_flags = mnt_flags;
touch_mnt_namespace(mnt->mnt_ns);
-   unlock_mount_hash();
 }
 
 static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount 
*mnt)
@@ -2567,9 +2561,11 @@ static int do_reconfigure_mnt(struct path *path, 
unsigned int mnt_flags)
return -EPERM;
 
down_write(>s_umount);
+   lock_mount_hash();
ret = change_mount_ro_state(mnt, mnt_flags);
if (ret == 0)
set_mount_attributes(mnt, mnt_flags);
+   unlock_mount_hash();
up_write(>s_umount);
 
mnt_warn_timestamp_expiry(path, >mnt);
@@ -2610,8 +2606,11 @@ static int do_remount(struct path *path, int ms_flags, 
int sb_flags,
err = -EPERM;
if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
err = reconfigure_super(fc);
-   if (!err)
+   if (!err) {
+   lock_mount_hash();
set_mount_attributes(mnt, mnt_flags);
+   unlock_mount_hash();
+   }
}
up_write(>s_umount);
}
-- 
2.29.0

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

[PATCH 30/34] ext4: support idmapped mounts

2020-10-29 Thread Christian Brauner

This enables ext4 to support idmapped mounts. All dedicated helpers we need for
this exist. The vfs will have already made sure that the fsids can be
translated after having been shifted if they are on an idmapped mount.

This implements helpers for the new inode operations that we've added. The core
change is the allocation of a new inode based on the mount's user namespace.
Code duplication is virtually non-existent because we can implement the
non-idmapped mount aware inode methods on top of the idmapped mount aware inode
methods. When the initial user namespace is passed the idmapped mount helpers
are nops and all mounts are marked with the initial user namespace by default.

It is also noteworthy that the idmapped mount implementation allows us to
cleanly handle ioctls() too. I've kept this as a single patch for now since the
change is overall fairly mechanical but I'm happy to split this.

Let's create a simple example where we idmap an ext4 filesystem:

 root@f2-vm:~# truncate -s 5G ext4.img

 root@f2-vm:~# mkfs.ext4 ./ext4.img
 mke2fs 1.45.5 (07-Jan-2020)
 Discarding device blocks: done
 Creating filesystem with 1310720 4k blocks and 327680 inodes
 Filesystem UUID: 3fd91794-c6ca-4b0f-9964-289a000919cf
 Superblock backups stored on blocks:
 32768, 98304, 163840, 229376, 294912, 819200, 884736

 Allocating group tables: done
 Writing inode tables: done
 Creating journal (16384 blocks): done
 Writing superblocks and filesystem accounting information: done

 root@f2-vm:~# losetup -f --show ./ext4.img
 /dev/loop0

 root@f2-vm:~# mount /dev/loop0 /mnt

 root@f2-vm:~# ls -al /mnt/
 total 24
 drwxr-xr-x  3 root root  4096 Oct 28 13:34 .
 drwxr-xr-x 30 root root  4096 Oct 28 13:22 ..
 drwx------  2 root root 16384 Oct 28 13:34 lost+found

 # Let's create an idmapped mount at /idmapped1 where we map uid and gid 0 to
 # uid and gid 1000
 root@f2-vm:/# ./mount2 -mb:0:1000:1 /mnt/ /idmapped1/

 root@f2-vm:/# ls -al /idmapped1/
 total 24
 drwxr-xr-x  3 ubuntu ubuntu  4096 Oct 28 13:34 .
 drwxr-xr-x 30 root   root4096 Oct 28 13:22 ..
 drwx------  2 ubuntu ubuntu 16384 Oct 28 13:34 lost+found

 # Let's create an idmapped mount at /idmapped2 where we map uid and gid 0 to
 # uid and gid 2000
 root@f2-vm:/# ./mount2 -mb:0:2000:1 /mnt/ /idmapped2/

 root@f2-vm:/# ls -al /idmapped2/
 total 24
 drwxr-xr-x  3 2000 2000  4096 Oct 28 13:34 .
 drwxr-xr-x 31 root root  4096 Oct 28 13:39 ..
 drwx------  2 2000 2000 16384 Oct 28 13:34 lost+found

Let's create another example where we idmap the rootfs filesystem without a
mapping for uid 0 and gid 0:

 # Create an idmapped mount for a full POSIX range of rootfs under /mnt
 # but without a mapping for uid 0 to reduce attack surface

 root@f2-vm:/# ./mount2 -mb:1:1:65536 / /mnt/

 # Since we don't have a mapping for uid and gid 0 all files owned by uid and
 # gid 0 should show up as uid and gid 65534:
 root@f2-vm:/# ls -al /mnt/
 total 664
 drwxr-xr-x 31 nobody nogroup   4096 Oct 28 13:39 .
 drwxr-xr-x 31 root   root  4096 Oct 28 13:39 ..
 lrwxrwxrwx  1 nobody nogroup  7 Aug 25 07:44 bin -> usr/bin
 drwxr-xr-x  4 nobody nogroup   4096 Oct 28 13:17 boot
 drwxr-xr-x  2 nobody nogroup   4096 Aug 25 07:48 dev
 drwxr-xr-x 81 nobody nogroup   4096 Oct 28 04:00 etc
 drwxr-xr-x  4 nobody nogroup   4096 Oct 28 04:00 home
 lrwxrwxrwx  1 nobody nogroup  7 Aug 25 07:44 lib -> usr/lib
 lrwxrwxrwx  1 nobody nogroup  9 Aug 25 07:44 lib32 -> usr/lib32
 lrwxrwxrwx  1 nobody nogroup  9 Aug 25 07:44 lib64 -> usr/lib64
 lrwxrwxrwx  1 nobody nogroup 10 Aug 25 07:44 libx32 -> usr/libx32
 drwx------  2 nobody nogroup  16384 Aug 25 07:47 lost+found
 drwxr-xr-x  2 nobody nogroup   4096 Aug 25 07:44 media
 drwxr-xr-x 31 nobody nogroup   4096 Oct 28 13:39 mnt
 drwxr-xr-x  2 nobody nogroup   4096 Aug 25 07:44 opt
 drwxr-xr-x  2 nobody nogroup   4096 Apr 15  2020 proc
 drwx--x--x  6 nobody nogroup   4096 Oct 28 13:34 root
 drwxr-xr-x  2 nobody nogroup   4096 Aug 25 07:46 run
 lrwxrwxrwx  1 nobody nogroup  8 Aug 25 07:44 sbin -> usr/sbin
 drwxr-xr-x  2 nobody nogroup   4096 Aug 25 07:44 srv
 drwxr-xr-x  2 nobody nogroup   4096 Apr 15  2020 sys
 drwxrwxrwt 10 nobody nogroup   4096 Oct 28 13:19 tmp
 drwxr-xr-x 14 nobody nogroup   4096 Oct 20 13:00 usr
 drwxr-xr-x 12 nobody nogroup   4096 Aug 25 07:45 var

 # Since we do have a mapping for uid and gid 1000 all files owned by uid and
 # gid 1000 should simply show up as uid and gid 1000:
 root@f2-vm:/# ls -al /mnt/home/ubuntu/
 total 40
 drwxr-xr-x 3 ubuntu ubuntu  4096 Oct 28 00:43 .
 drwxr-xr-x 4 nobody nogroup 4096 Oct 28 04:00 ..
 -rw------- 1 ubuntu ubuntu  2936 Oct 28 12:26 .bash_history
 -rw-r--r-- 1 ubuntu ubuntu   220 Feb 25  2020 .bash_logout
 -rw-r--r-- 1 ubuntu ubuntu  3771 Feb 25  2020 .bashrc
 -rw-r--r-- 1 ubuntu ubuntu   807 Feb 25  2020 .profile
 -rw-r--r-- 1 ubuntu ubuntu 0 Oct 16 16:11 .sudo_as_admin_successful
 -rw------- 1 ubuntu ubuntu  1144 Oct 28 00:43 .viminfo

Signed-off-by:

[PATCH 05/34] fs: introduce MOUNT_ATTR_IDMAP

2020-10-29 Thread Christian Brauner

Introduce a new bind mount property to allow idmapping mounts. The
MOUNT_ATTR_IDMAP flag can be set via the new mount_setattr() syscall
together with a file descriptor referring to a user namespace.

The user namespace referenced by the namespace file descriptor will be
attached to the bind mount. All interactions with the filesystem going
through that mount will be shifted according to the mapping specified in that
user namespace.

Using user namespaces to mark mounts means we can reuse all the existing
infrastructure in the kernel that already exists to handle idmappings and can
also use this for permission checking to allow unprivileged users to create
idmapped mounts.

Idmapping a mount is decoupled from the caller's user and mount namespace.
This means idmapped mounts can be created in the initial user namespace
which is an important use-case for e.g. systemd-homed, portable usb-sticks
between systems, and other use-cases that have been brought up. For example,
assume a home directory where all files are owned by uid and gid 1000 and the
home directory is brought to a new laptop where the user has id 12345. The
system administrator can simply create a mount of this home directory with a
mapping of 1000:12345:1 and other mappings to indicate the ids should be kept.
(With this it is e.g. also possible to create idmapped mounts on the host with
 an identity mapping 1:1:100000 where the root user is not mapped. A user with
 root access that e.g. has been pivot rooted into such a mount on the host will
 not be able to execute, read, write, or create files as root.)

Given that idmapping a mount is decoupled from the caller's user namespace
a sufficiently privileged process such as a container manager can set up a
shifted mount for the container and the container can simply pivot root to
it. There's no need for the container to do anything. The mount will appear
correctly mapped independent of the user namespace the container uses. This
means we don't need to mark a mount as idmappable.

In order to create an idmapped mount the following conditions must be
fulfilled. The caller must either be privileged in the user namespace of
the superblock the mount belongs to or the mount must have already been
shifted before and the caller must be privileged in the user namespace that
this mount has been shifted to. The latter case means that shifted mounts
can e.g. be created by unprivileged users provided that the underlying
mount has already been idmapped to a user namespace they have privilege
over.

Once a mount has been idmapped its idmapping cannot be changed. This is to
keep things simple. Callers that want another idmapping can simply create
another detached mount and idmap it.

The new CONFIG_IDMAP_MOUNTS option can be used to compile the
kernel with idmapped mount support. It will default to off for quite
some time. Let's not be overconfident.

Cc: Al Viro 
Cc: David Howells 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
 fs/Kconfig |   6 ++
 fs/internal.h  |   1 +
 fs/namespace.c | 157 -
 include/linux/fs.h |   1 +
 include/linux/mount.h  |  20 -
 include/uapi/linux/mount.h |   6 +-
 6 files changed, 186 insertions(+), 5 deletions(-)

diff --git a/fs/Kconfig b/fs/Kconfig
index aa4c12282301..2d45ec3c7e04 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -15,6 +15,12 @@ config VALIDATE_FS_PARSER
  Enable this to perform validation of the parameter description for a
  filesystem when it is registered.
 
+config IDMAP_MOUNTS
+   bool "Support id mappings per mount"
+   default n
+   help
+ This allows the vfs to create idmappings per vfsmount.
+
 if BLOCK
 
 config FS_IOMAP
diff --git a/fs/internal.h b/fs/internal.h
index a5a6c470dc07..b6046b5186cd 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -88,6 +88,7 @@ struct mount_kattr {
unsigned int propagation;
unsigned int lookup_flags;
bool recurse;
+   struct user_namespace *userns;
 };
 
 extern struct vfsmount *lookup_mnt(const struct path *);
diff --git a/fs/namespace.c b/fs/namespace.c
index e9c515b012a4..aef39fc74afa 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -210,6 +211,9 @@ static struct mount *alloc_vfsmnt(const char *name)
INIT_HLIST_NODE(>mnt_mp_list);
INIT_LIST_HEAD(>mnt_umounting);
INIT_HLIST_HEAD(>mnt_stuck_children);
+#ifdef CONFIG_IDMAP_MOUNTS
+   mnt->mnt.mnt_user_ns = _user_ns;
+#endif
}
return mnt;
 
@@ -555,6 +559,13 @@ int sb_prepare_remount_readonly(struct super_block *sb)
 
 static void free_vfsmnt(struct mount *mnt)
 {
+#ifdef CONFIG_IDMAP_MOUNTS
+   if ((mnt->mnt.mnt_flags & MNT_IDMAPPED) &&
+   mnt_user_ns(>mnt) != _us

[PATCH 34/34] fat: handle idmapped mounts

2020-10-29 Thread Christian Brauner

Let fat handle idmapped mounts. This allows the same fat mount to appear
in multiple locations with different id mappings. This makes it possible to
expose a vfat formatted USB stick to multiple users with different ids on the
host or in user namespaces:

mount -o uid=1000,gid=1000 /dev/sdb /mnt

u1001@f2-vm:/lower1$ ls -ln /mnt/
total 4
-rwxr-xr-x 1 1000 1000 4 Oct 28 03:44 aaa
-rwxr-xr-x 1 1000 1000 0 Oct 28 01:09 bbb
-rwxr-xr-x 1 1000 1000 0 Oct 28 01:10 ccc
-rwxr-xr-x 1 1000 1000 0 Oct 28 03:46 ddd
-rwxr-xr-x 1 1000 1000 0 Oct 28 04:01 eee

mount2 --idmap both:1000:1001:1

u1001@f2-vm:/lower1$ ls -ln /lower1/
total 4
-rwxr-xr-x 1 1001 1001 4 Oct 28 03:44 aaa
-rwxr-xr-x 1 1001 1001 0 Oct 28 01:09 bbb
-rwxr-xr-x 1 1001 1001 0 Oct 28 01:10 ccc
-rwxr-xr-x 1 1001 1001 0 Oct 28 03:46 ddd
-rwxr-xr-x 1 1001 1001 0 Oct 28 04:01 eee

u1001@f2-vm:/lower1$ touch /lower1/fff

u1001@f2-vm:/lower1$ ls -ln /lower1/fff
-rwxr-xr-x 1 1001 1001 0 Oct 28 04:03 /lower1/fff

u1001@f2-vm:/lower1$ ls -ln /mnt/fff
-rwxr-xr-x 1 1000 1000 0 Oct 28 04:03 /mnt/fff

Signed-off-by: Christian Brauner 
---
 fs/fat/fat.h |  2 ++
 fs/fat/file.c| 27 +++
 fs/fat/namei_msdos.c |  7 +++
 fs/fat/namei_vfat.c  |  7 +++
 4 files changed, 35 insertions(+), 8 deletions(-)

diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 922a0c6ba46c..56d661e93d2a 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -398,6 +398,8 @@ extern long fat_generic_ioctl(struct file *filp, unsigned 
int cmd,
 extern const struct file_operations fat_file_operations;
 extern const struct inode_operations fat_file_inode_operations;
 extern int fat_setattr(struct dentry *dentry, struct iattr *attr);
+extern int fat_setattr_mapped(struct user_namespace *user_ns,
+ struct dentry *dentry, struct iattr *attr);
 extern void fat_truncate_blocks(struct inode *inode, loff_t offset);
 extern int fat_getattr(const struct path *path, struct kstat *stat,
   u32 request_mask, unsigned int flags);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index f9ee27cf4d7c..f97d46711b37 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -398,7 +398,7 @@ int fat_getattr(const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int flags)
 {
struct inode *inode = d_inode(path->dentry);
-   generic_fillattr(inode, stat);
+   mapped_generic_fillattr(mnt_user_ns(path->mnt), inode, stat);
stat->blksize = MSDOS_SB(inode->i_sb)->cluster_size;
 
if (MSDOS_SB(inode->i_sb)->options.nfs == FAT_NFS_NOSTALE_RO) {
@@ -447,12 +447,13 @@ static int fat_sanitize_mode(const struct msdos_sb_info 
*sbi,
return 0;
 }
 
-static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode)
+static int fat_allow_set_time(struct user_namespace *user_ns,
+ struct msdos_sb_info *sbi, struct inode *inode)
 {
umode_t allow_utime = sbi->options.allow_utime;
 
-   if (!uid_eq(current_fsuid(), inode->i_uid)) {
-   if (in_group_p(inode->i_gid))
+   if (!uid_eq(current_fsuid(), i_uid_into_mnt(user_ns, inode))) {
+   if (in_group_p(i_gid_into_mnt(user_ns, inode)))
allow_utime >>= 3;
if (allow_utime & MAY_WRITE)
return 1;
@@ -466,7 +467,8 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, 
struct inode *inode)
 /* valid file mode bits */
 #define FAT_VALID_MODE (S_IFREG | S_IFDIR | S_IRWXUGO)
 
-int fat_setattr(struct dentry *dentry, struct iattr *attr)
+int fat_setattr_mapped(struct user_namespace *user_ns, struct dentry *dentry,
+  struct iattr *attr)
 {
struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb);
struct inode *inode = d_inode(dentry);
@@ -476,11 +478,11 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
/* Check for setting the inode time. */
ia_valid = attr->ia_valid;
if (ia_valid & TIMES_SET_FLAGS) {
-   if (fat_allow_set_time(sbi, inode))
+   if (fat_allow_set_time(user_ns, sbi, inode))
attr->ia_valid &= ~TIMES_SET_FLAGS;
}
 
-   error = setattr_prepare(dentry, attr);
+   error = setattr_mapped_prepare(user_ns, dentry, attr);
attr->ia_valid = ia_valid;
if (error) {
if (sbi->options.quiet)
@@ -550,15 +552,24 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
fat_truncate_time(inode, >ia_mtime, S_MTIME);
attr->ia_valid &= ~(ATTR_ATIME|ATTR_CTIME|ATTR_MTIME);
 
-   setattr_copy(inode, attr);
+   setattr_mapped_copy(user_ns, inode, attr);
mark_inode_dirty(inode);
 out:
return error;
 }
+EXPORT_SYMBOL_GPL(fat_setattr_mapped);
+
+int fat_setattr(struct dentry *dentry, struct iattr *attr)
+{
+   return fat_setattr_mapped(_us

[PATCH 20/34] open: handle idmapped mounts in do_truncate()

2020-10-29 Thread Christian Brauner

When truncating files the vfs will verify that the caller is privileged over
the inode. Since the do_truncate() helper is only used in a few places in the
vfs code extend it to handle idmapped mounts instead of adding a new helper.
If the inode is accessed through an idmapped mount it is mapped according to
the mount's user namespace. Afterwards the permissions checks are identical to
non-idmapped mounts. If the initial user namespace is passed all mapping
operations are a nop so non-idmapped mounts will not see a change in behavior
and will also not see any performance impact.

Signed-off-by: Christian Brauner 
---
 fs/coredump.c  | 12 +---
 fs/inode.c | 13 +
 fs/namei.c |  6 +++---
 fs/open.c  | 21 +
 include/linux/fs.h |  4 ++--
 5 files changed, 36 insertions(+), 20 deletions(-)

diff --git a/fs/coredump.c b/fs/coredump.c
index 0cd9056d79cc..25beac7230ff 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -703,6 +703,7 @@ void do_coredump(const kernel_siginfo_t *siginfo)
goto close_fail;
}
} else {
+   struct user_namespace *user_ns;
struct inode *inode;
int open_flags = O_CREAT | O_RDWR | O_NOFOLLOW |
 O_LARGEFILE | O_EXCL;
@@ -786,7 +787,8 @@ void do_coredump(const kernel_siginfo_t *siginfo)
goto close_fail;
if (!(cprm.file->f_mode & FMODE_CAN_WRITE))
goto close_fail;
-   if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
+   user_ns = mnt_user_ns(cprm.file->f_path.mnt);
+   if (do_truncate(user_ns, cprm.file->f_path.dentry, 0, 0, 
cprm.file))
goto close_fail;
}
 
@@ -931,8 +933,12 @@ void dump_truncate(struct coredump_params *cprm)
 
if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
offset = file->f_op->llseek(file, 0, SEEK_CUR);
-   if (i_size_read(file->f_mapping->host) < offset)
-   do_truncate(file->f_path.dentry, offset, 0, file);
+   if (i_size_read(file->f_mapping->host) < offset) {
+   struct user_namespace *user_ns;
+
+   user_ns = mnt_user_ns(file->f_path.mnt);
+   do_truncate(user_ns, file->f_path.dentry, offset, 0, 
file);
+   }
}
 }
 EXPORT_SYMBOL(dump_truncate);
diff --git a/fs/inode.c b/fs/inode.c
index 22de3cb3b1f4..a9e2c8232e61 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1904,7 +1904,8 @@ int dentry_needs_remove_privs(struct dentry *dentry)
return mask;
 }
 
-static int __remove_privs(struct dentry *dentry, int kill)
+static int __remove_privs(struct user_namespace *user_ns, struct dentry 
*dentry,
+ int kill)
 {
struct iattr newattrs;
 
@@ -1913,7 +1914,7 @@ static int __remove_privs(struct dentry *dentry, int kill)
 * Note we call this on write, so notify_change will not
 * encounter any conflicting delegations:
 */
-   return notify_change(dentry, , NULL);
+   return notify_mapped_change(user_ns, dentry, , NULL);
 }
 
 /*
@@ -1939,8 +1940,12 @@ int file_remove_privs(struct file *file)
kill = dentry_needs_remove_privs(dentry);
if (kill < 0)
return kill;
-   if (kill)
-   error = __remove_privs(dentry, kill);
+   if (kill) {
+   struct user_namespace *user_ns;
+
+   user_ns = mnt_user_ns(file->f_path.mnt);
+   error = __remove_privs(user_ns, dentry, kill);
+   }
if (!error)
inode_has_no_xattr(inode);
 
diff --git a/fs/namei.c b/fs/namei.c
index 7901ea09e80e..76c9637eccb9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2985,9 +2985,9 @@ static int handle_truncate(struct file *filp)
if (!error)
error = security_path_truncate(path);
if (!error) {
-   error = do_truncate(path->dentry, 0,
-   ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
-   filp);
+   error = do_truncate(mnt_user_ns(filp->f_path.mnt),
+   path->dentry, 0,
+   ATTR_MTIME | ATTR_CTIME | ATTR_OPEN, filp);
}
put_write_access(inode);
return error;
diff --git a/fs/open.c b/fs/open.c
index 9af548fb841b..efa462b6b9c7 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -35,8 +35,8 @@
 
 #include "internal.h"
 
-int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
-   struct file *filp)
+int do_truncate(struct user_namespace *user_ns, struct dentry *dentry,
+   loff_t length, unsigned int time_attrs, struct file *filp)
 {
int ret;
s

[PATCH 08/34] namei: add idmapped mount aware permission helpers

2020-10-29 Thread Christian Brauner

The two helpers inode_permission() and generic_permission() are used by
the vfs to perform basic permission checking by verifying that the
caller is privileged over an inode. In order to handle idmapped mounts we
add the two helpers mapped_inode_permission() and
mapped_generic_permission() which take a user namespace argument. On
idmapped mounts the two new helpers will make sure to map the inode
according to the mount's user namespace and then perform identical
permission checks to inode_permission() and generic_permission(). If the
initial user namespace is passed mapped_inode_permission() and
mapped_generic_permission() are identical to inode_permission() and
generic_permission() so there will be no performance impact on
non-idmapped mounts. This also means that the inode_permission() and
generic_permission() helpers can be implemented on top of
mapped_inode_permission() and mapped_generic_permission() respectively
by just passing in the initial user namespace so no code is
unnecessarily duplicated.

Signed-off-by: Christian Brauner 
---
 fs/namei.c| 71 ---
 fs/posix_acl.c| 16 ++---
 include/linux/fs.h|  2 ++
 include/linux/posix_acl.h |  4 ++-
 4 files changed, 67 insertions(+), 26 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index d4a6dd772303..2635f6a57de5 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -259,7 +259,7 @@ void putname(struct filename *name)
__putname(name);
 }
 
-static int check_acl(struct inode *inode, int mask)
+static int check_acl(struct user_namespace *user_ns, struct inode *inode, int 
mask)
 {
 #ifdef CONFIG_FS_POSIX_ACL
struct posix_acl *acl;
@@ -271,14 +271,14 @@ static int check_acl(struct inode *inode, int mask)
/* no ->get_acl() calls in RCU mode... */
if (is_uncached_acl(acl))
return -ECHILD;
-   return posix_acl_permission(inode, acl, mask);
+   return posix_acl_permission(user_ns, inode, acl, mask);
}
 
acl = get_acl(inode, ACL_TYPE_ACCESS);
if (IS_ERR(acl))
return PTR_ERR(acl);
if (acl) {
-   int error = posix_acl_permission(inode, acl, mask);
+   int error = posix_acl_permission(user_ns, inode, acl, mask);
posix_acl_release(acl);
return error;
}
@@ -293,12 +293,14 @@ static int check_acl(struct inode *inode, int mask)
  * Note that the POSIX ACL check cares about the MAY_NOT_BLOCK bit,
  * for RCU walking.
  */
-static int acl_permission_check(struct inode *inode, int mask)
+static int acl_permission_check(struct user_namespace *user_ns, struct inode 
*inode, int mask)
 {
unsigned int mode = inode->i_mode;
+   kuid_t i_uid;
 
/* Are we the owner? If so, ACL's don't matter */
-   if (likely(uid_eq(current_fsuid(), inode->i_uid))) {
+   i_uid = i_uid_into_mnt(user_ns, inode);
+   if (likely(uid_eq(current_fsuid(), i_uid))) {
mask &= 7;
mode >>= 6;
return (mask & ~mode) ? -EACCES : 0;
@@ -306,7 +308,7 @@ static int acl_permission_check(struct inode *inode, int 
mask)
 
/* Do we have ACL's? */
if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
-   int error = check_acl(inode, mask);
+   int error = check_acl(user_ns, inode, mask);
if (error != -EAGAIN)
return error;
}
@@ -320,7 +322,8 @@ static int acl_permission_check(struct inode *inode, int 
mask)
 * about? Need to check group ownership if so.
 */
if (mask & (mode ^ (mode >> 3))) {
-   if (in_group_p(inode->i_gid))
+   kgid_t kgid = i_gid_into_mnt(user_ns, inode);
+   if (in_group_p(kgid))
mode >>= 3;
}
 
@@ -329,7 +332,7 @@ static int acl_permission_check(struct inode *inode, int 
mask)
 }
 
 /**
- * generic_permission -  check for access rights on a Posix-like filesystem
+ * mapped_generic_permission -  check for access rights on a Posix-like 
filesystem
  * @inode: inode to check access rights for
  * @mask:  right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC,
  * %MAY_NOT_BLOCK ...)
@@ -343,24 +346,25 @@ static int acl_permission_check(struct inode *inode, int 
mask)
  * request cannot be satisfied (eg. requires blocking or too much complexity).
  * It would then be called again in ref-walk mode.
  */
-int generic_permission(struct inode *inode, int mask)
+int mapped_generic_permission(struct user_namespace *user_ns, struct inode 
*inode,
+ int mask)
 {
int ret;
 
/*
 * Do the basic permission checks.
 */
-   ret = acl_permission_check(inode, mask);
+   ret = acl_permission_check(user_ns, inode, mask);
if (ret != -EACCES)

[PATCH 07/34] capability: handle idmapped mounts

2020-10-29 Thread Christian Brauner

In order to determine whether a caller holds privilege over a given
inode the capability framework exposes the two helpers
privileged_wrt_inode_uidgid() and capable_wrt_inode_uidgid(). The former
verifies that the inode has a mapping in the caller's user namespace and
the latter additionally verifies that the caller has the requested
capability in their current user namespace. If the inode is accessed
through an idmapped mount we first need to map it according to the
mount's user namespace. Afterwards the checks are identical to
non-idmapped inodes. If the initial user namespace is passed all
operations are a nop so non-idmapped mounts will not see a change in
behavior and will also not see any performance impact.
Since the privileged_wrt_inode_uidgid() helper only has one caller it
makes more sense to simply add an additional user namespace argument and
adapt the single callsite it is used in. The capable_wrt_inode_uidgid()
helper is used in more places so we introduce a new
capable_wrt_mapped_inode_uidgid() helper which can be used by the vfs.

Signed-off-by: Christian Brauner 
---
 fs/exec.c  |  2 +-
 include/linux/capability.h |  6 +-
 kernel/capability.c| 22 --
 3 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 547a2390baf5..8e75d7a33514 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1398,7 +1398,7 @@ void would_dump(struct linux_binprm *bprm, struct file 
*file)
/* Ensure mm->user_ns contains the executable */
user_ns = old = bprm->mm->user_ns;
while ((user_ns != _user_ns) &&
-  !privileged_wrt_inode_uidgid(user_ns, inode))
+  !privileged_wrt_inode_uidgid(user_ns, _user_ns, 
inode))
user_ns = user_ns->parent;
 
if (old != user_ns) {
diff --git a/include/linux/capability.h b/include/linux/capability.h
index 1e7fe311cabe..308d88096745 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -247,8 +247,12 @@ static inline bool ns_capable_setid(struct user_namespace 
*ns, int cap)
return true;
 }
 #endif /* CONFIG_MULTIUSER */
-extern bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const 
struct inode *inode);
+extern bool privileged_wrt_inode_uidgid(struct user_namespace *ns,
+   struct user_namespace *mnt_user_ns,
+   const struct inode *inode);
 extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap);
+extern bool capable_wrt_mapped_inode_uidgid(struct user_namespace *mnt_user_ns,
+   const struct inode *inode, int cap);
 extern bool file_ns_capable(const struct file *file, struct user_namespace 
*ns, int cap);
 extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace 
*ns);
 static inline bool perfmon_capable(void)
diff --git a/kernel/capability.c b/kernel/capability.c
index de7eac903a2a..427776414487 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -484,12 +484,24 @@ EXPORT_SYMBOL(file_ns_capable);
  *
  * Return true if the inode uid and gid are within the namespace.
  */
-bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode 
*inode)
+bool privileged_wrt_inode_uidgid(struct user_namespace *ns,
+struct user_namespace *mnt_user_ns,
+const struct inode *inode)
 {
-   return kuid_has_mapping(ns, inode->i_uid) &&
-   kgid_has_mapping(ns, inode->i_gid);
+   return kuid_has_mapping(ns, i_uid_into_mnt(mnt_user_ns, inode)) &&
+  kgid_has_mapping(ns, i_gid_into_mnt(mnt_user_ns, inode));
 }
 
+bool capable_wrt_mapped_inode_uidgid(struct user_namespace *mnt_user_ns,
+const struct inode *inode, int cap)
+{
+   struct user_namespace *ns = current_user_ns();
+
+   return ns_capable(ns, cap) &&
+  privileged_wrt_inode_uidgid(ns, mnt_user_ns, inode);
+}
+EXPORT_SYMBOL(capable_wrt_mapped_inode_uidgid);
+
 /**
  * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
  * @inode: The inode in question
@@ -501,9 +513,7 @@ bool privileged_wrt_inode_uidgid(struct user_namespace *ns, 
const struct inode *
  */
 bool capable_wrt_inode_uidgid(const struct inode *inode, int cap)
 {
-   struct user_namespace *ns = current_user_ns();
-
-   return ns_capable(ns, cap) && privileged_wrt_inode_uidgid(ns, inode);
+   return capable_wrt_mapped_inode_uidgid(_user_ns, inode, cap);
 }
 EXPORT_SYMBOL(capable_wrt_inode_uidgid);
 
-- 
2.29.0

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

[PATCH 18/34] namei: prepare for idmapped mounts

2020-10-29 Thread Christian Brauner

The various vfs_*() helpers are called by filesystems or by the vfs itself to
perform core operations create, link, mkdir, mknod, rename, rmdir, tmpfile and
unlink. Add a set of helpers that handle idmapped mounts. If the inode is
accessed through an idmapped mount it is mapped according to the mount's user
namespace. Afterwards the checks and operations are identical to non-idmapped
mounts. If the initial user namespace is passed all mapping operations are a
nop so non-idmapped mounts will not see a change in behavior and will also not
see any performance impact. It also means that the non-idmapped-mount aware
helpers can be implemented on top of their idmapped-mount aware counterparts by
passing the initial user namespace.

Signed-off-by: Christian Brauner 
---
 fs/namei.c   | 229 +++
 fs/overlayfs/overlayfs.h |   2 +-
 include/linux/fs.h   |  32 +-
 3 files changed, 192 insertions(+), 71 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 781f11795a22..a8a3de936cfc 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2835,10 +2835,10 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
 }
 EXPORT_SYMBOL(unlock_rename);
 
-int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-   bool want_excl)
+int vfs_mapped_create(struct user_namespace *user_ns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, bool want_excl)
 {
-   int error = may_create(_user_ns, dir, dentry);
+   int error = may_create(user_ns, dir, dentry);
if (error)
return error;
 
@@ -2854,6 +2854,13 @@ int vfs_create(struct inode *dir, struct dentry *dentry, 
umode_t mode,
fsnotify_create(dir, dentry);
return error;
 }
+EXPORT_SYMBOL(vfs_mapped_create);
+
+int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+   bool want_excl)
+{
+   return vfs_mapped_create(_user_ns, dir, dentry, mode, want_excl);
+}
 EXPORT_SYMBOL(vfs_create);
 
 int vfs_mkobj(struct dentry *dentry, umode_t mode,
@@ -3313,7 +3320,9 @@ static int do_open(struct nameidata *nd,
return error;
 }
 
-struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag)
+struct dentry *vfs_mapped_tmpfile(struct user_namespace *user_ns,
+ struct dentry *dentry, umode_t mode,
+ int open_flag)
 {
struct dentry *child = NULL;
struct inode *dir = dentry->d_inode;
@@ -3321,7 +3330,7 @@ struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t 
mode, int open_flag)
int error;
 
/* we want directory to be writable */
-   error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+   error = mapped_inode_permission(user_ns, dir, MAY_WRITE | MAY_EXEC);
if (error)
goto out_err;
error = -EOPNOTSUPP;
@@ -3350,12 +3359,19 @@ struct dentry *vfs_tmpfile(struct dentry *dentry, 
umode_t mode, int open_flag)
dput(child);
return ERR_PTR(error);
 }
+EXPORT_SYMBOL(vfs_mapped_tmpfile);
+
+struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag)
+{
+   return vfs_mapped_tmpfile(_user_ns, dentry, mode, open_flag);
+}
 EXPORT_SYMBOL(vfs_tmpfile);
 
 static int do_tmpfile(struct nameidata *nd, unsigned flags,
const struct open_flags *op,
struct file *file)
 {
+   struct user_namespace *user_ns;
struct dentry *child;
struct path path;
int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, );
@@ -3364,7 +3380,8 @@ static int do_tmpfile(struct nameidata *nd, unsigned 
flags,
error = mnt_want_write(path.mnt);
if (unlikely(error))
goto out;
-   child = vfs_tmpfile(path.dentry, op->mode, op->open_flag);
+   user_ns = mnt_user_ns(path.mnt);
+   child = vfs_mapped_tmpfile(user_ns, path.dentry, op->mode, 
op->open_flag);
error = PTR_ERR(child);
if (IS_ERR(child))
goto out2;
@@ -3576,10 +3593,11 @@ inline struct dentry *user_path_create(int dfd, const 
char __user *pathname,
 }
 EXPORT_SYMBOL(user_path_create);
 
-int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t 
dev)
+int vfs_mapped_mknod(struct user_namespace *user_ns, struct inode *dir,
+struct dentry *dentry, umode_t mode, dev_t dev)
 {
bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;
-   int error = may_create(_user_ns, dir, dentry);
+   int error = may_create(user_ns, dir, dentry);
 
if (error)
return error;
@@ -3604,6 +3622,12 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, 
umode_t mode, dev_t dev)
fsnotify_create(dir, dentry);
return error;
 }
+EXPORT_SYMBOL(vfs_mapped_mknod);
+
+int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t 
dev)
+{
+   return vf

[PATCH 14/34] commoncap: handle idmapped mounts

2020-10-29 Thread Christian Brauner

When interacting with user namespace and non-user namespace aware
filesystem capabilities the vfs will perform various security checks to
determine whether or not the filesystem capabilities can be used by the
caller (e.g. during exec), or even whether they need to be removed. The
main infrastructure for this resides in the capability codepaths but
they are called through the LSM security infrastructure even though they
are not technically an LSM or optional. This extends the existing
security hooks security_inode_removexattr(), security_inode_killpriv(),
security_inode_getsecurity() to pass down the mount's user namespace and
makes them aware of idmapped mounts.
In order to actually get filesystem capabilities from disk the capability
infrastructure exposes the get_vfs_caps_from_disk() helper. For user namespace
aware filesystem capabilities a root uid is stored alongside the capabilities.
In order to determine whether the caller can make use of the filesystem
capability or whether it needs to be ignored it is translated according to the
superblock's user namespace. If it can be translated to uid 0 according to that
id mapping the caller can use the filesystem capabilities stored on disk. If we
are accessing the inode that holds the filesystem capabilities through an
idmapped mount we need to map root uid according to the mount's user namespace.
Afterwards the checks are identical to non-idmapped mounts. Reading filesystem
caps from disk enforces that the root uid associated with the filesystem
capability must have a mapping in the superblock's user namespace and that the
caller is either in the same user namespace or is a descendant of the
superblock's user namespace. For filesystems that are mountable inside user
namespaces the container can just mount the filesystem and won't usually need
to idmap it. If it does create an idmapped mount it can mark it with a user
namespace it has created and which is therefore a descendant of the s_user_ns.
For filesystems that are not mountable inside user namespaces the descendant
rule is trivially true because the s_user_ns will be the initial user
namespace.

If the initial user namespace is passed all operations are a nop so
non-idmapped mounts will not see a change in behavior and will also not
see any performance impact. It also means that the non-idmapped-mount
aware helpers can be implemented on top of their idmapped-mount aware
counterparts by passing the initial user namespace.

Signed-off-by: Christian Brauner 
---
 fs/attr.c |  2 +-
 fs/xattr.c| 12 ++--
 include/linux/capability.h|  3 +++
 include/linux/lsm_hook_defs.h | 10 ++
 include/linux/lsm_hooks.h |  1 +
 include/linux/security.h  | 36 +++
 kernel/auditsc.c  |  3 ++-
 security/commoncap.c  | 35 --
 security/security.c   | 18 +++---
 security/selinux/hooks.c  | 13 -
 security/smack/smack_lsm.c| 11 ++-
 11 files changed, 93 insertions(+), 51 deletions(-)

diff --git a/fs/attr.c b/fs/attr.c
index f39c03ac85e0..4daf6ac6de6d 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -113,7 +113,7 @@ int setattr_mapped_prepare(struct user_namespace *user_ns,
if (ia_valid & ATTR_KILL_PRIV) {
int error;
 
-   error = security_inode_killpriv(dentry);
+   error = security_inode_killpriv(user_ns, dentry);
if (error)
return error;
}
diff --git a/fs/xattr.c b/fs/xattr.c
index cdda2baeb9f7..40b02227257e 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -331,18 +331,18 @@ vfs_setxattr(struct dentry *dentry, const char *name, 
const void *value,
 EXPORT_SYMBOL_GPL(vfs_setxattr);
 
 static ssize_t
-xattr_getsecurity(struct inode *inode, const char *name, void *value,
-   size_t size)
+xattr_getsecurity(struct user_namespace *user_ns, struct inode *inode,
+ const char *name, void *value, size_t size)
 {
void *buffer = NULL;
ssize_t len;
 
if (!value || !size) {
-   len = security_inode_getsecurity(inode, name, , false);
+   len = security_inode_getsecurity(user_ns, inode, name, , 
false);
goto out_noalloc;
}
 
-   len = security_inode_getsecurity(inode, name, , true);
+   len = security_inode_getsecurity(user_ns, inode, name, , true);
if (len < 0)
return len;
if (size < len) {
@@ -440,7 +440,7 @@ vfs_mapped_getxattr(struct user_namespace *user_ns, struct 
dentry *dentry,
if (!strncmp(name, XATTR_SECURITY_PREFIX,
XATTR_SECURITY_PREFIX_LEN)) {
const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
-   int ret = xattr_getsecurity(inode, suffix, value, size);
+   int ret = xattr_getsecurity(user_ns, inode, suffix, va

[PATCH 19/34] namei: add lookup helpers with idmapped mounts aware permission checking

2020-10-29 Thread Christian Brauner

The lookup_one_len(), lookup_one_len_unlocked(), and lookup_positive_unlocked()
helpers are used by filesystems targeted in this first iteration to lookup
dentries if the caller is privileged over the inode of the base dentry. Add
three new helpers lookup_one_len_mapped(), lookup_one_len_mapped_unlocked(),
and lookup_positive_mapped_unlocked() to handle idmapped mounts. If the inode is
accessed through an idmapped mount it is mapped according to the mount's user
namespace. Afterwards the permissions checks are identical to non-idmapped
mounts. If the initial user namespace is passed all mapping operations are a
nop so non-idmapped mounts will not see a change in behavior and will also not
see any performance impact. It also means that the non-idmapped-mount aware
helpers can be implemented on top of their idmapped-mount aware counterparts by
passing the initial user namespace.

Signed-off-by: Christian Brauner 
---
 fs/namei.c| 47 ---
 include/linux/namei.h |  6 ++
 2 files changed, 41 insertions(+), 12 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index a8a3de936cfc..7901ea09e80e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2517,8 +2517,9 @@ int vfs_path_lookup(struct dentry *dentry, struct 
vfsmount *mnt,
 }
 EXPORT_SYMBOL(vfs_path_lookup);
 
-static int lookup_one_len_common(const char *name, struct dentry *base,
-int len, struct qstr *this)
+static int lookup_one_len_common(const char *name, struct dentry *base, int 
len,
+struct qstr *this,
+struct user_namespace *mnt_user_ns)
 {
this->name = name;
this->len = len;
@@ -2546,7 +2547,7 @@ static int lookup_one_len_common(const char *name, struct 
dentry *base,
return err;
}
 
-   return inode_permission(base->d_inode, MAY_EXEC);
+   return mapped_inode_permission(mnt_user_ns, base->d_inode, MAY_EXEC);
 }
 
 /**
@@ -2570,7 +2571,7 @@ struct dentry *try_lookup_one_len(const char *name, 
struct dentry *base, int len
 
WARN_ON_ONCE(!inode_is_locked(base->d_inode));
 
-   err = lookup_one_len_common(name, base, len, );
+   err = lookup_one_len_common(name, base, len, , _user_ns);
if (err)
return ERR_PTR(err);
 
@@ -2589,7 +2590,8 @@ EXPORT_SYMBOL(try_lookup_one_len);
  *
  * The caller must hold base->i_mutex.
  */
-struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
+struct dentry *lookup_one_len_mapped(const char *name, struct dentry *base, 
int len,
+struct user_namespace *mnt_user_ns)
 {
struct dentry *dentry;
struct qstr this;
@@ -2597,13 +2599,19 @@ struct dentry *lookup_one_len(const char *name, struct 
dentry *base, int len)
 
WARN_ON_ONCE(!inode_is_locked(base->d_inode));
 
-   err = lookup_one_len_common(name, base, len, );
+   err = lookup_one_len_common(name, base, len, , mnt_user_ns);
if (err)
return ERR_PTR(err);
 
dentry = lookup_dcache(, base, 0);
return dentry ? dentry : __lookup_slow(, base, 0);
 }
+EXPORT_SYMBOL(lookup_one_len_mapped);
+
+struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
+{
+   return lookup_one_len_mapped(name, base, len, _user_ns);
+}
 EXPORT_SYMBOL(lookup_one_len);
 
 /**
@@ -2618,14 +2626,14 @@ EXPORT_SYMBOL(lookup_one_len);
  * Unlike lookup_one_len, it should be called without the parent
  * i_mutex held, and will take the i_mutex itself if necessary.
  */
-struct dentry *lookup_one_len_unlocked(const char *name,
-  struct dentry *base, int len)
+struct dentry *lookup_one_len_mapped_unlocked(const char *name, struct dentry 
*base,
+ int len, struct user_namespace 
*mnt_user_ns)
 {
struct qstr this;
int err;
struct dentry *ret;
 
-   err = lookup_one_len_common(name, base, len, );
+   err = lookup_one_len_common(name, base, len, , mnt_user_ns);
if (err)
return ERR_PTR(err);
 
@@ -2634,6 +2642,13 @@ struct dentry *lookup_one_len_unlocked(const char *name,
ret = lookup_slow(, base, 0);
return ret;
 }
+EXPORT_SYMBOL(lookup_one_len_mapped_unlocked);
+
+struct dentry *lookup_one_len_unlocked(const char *name,
+  struct dentry *base, int len)
+{
+   return lookup_one_len_mapped_unlocked(name, base, len, _user_ns);
+}
 EXPORT_SYMBOL(lookup_one_len_unlocked);
 
 /*
@@ -2644,16 +2659,24 @@ EXPORT_SYMBOL(lookup_one_len_unlocked);
  * need to be very careful; pinned positives have ->d_inode stable, so
  * this one avoids such problems.
  */
-struct dentry *lookup_positive_unlocked(const char *name,
-  struct dentry *base, int len)
+struct dentry *lookup_positive_mapped_u

[PATCH 22/34] af_unix: handle idmapped mounts

2020-10-29 Thread Christian Brauner

When binding a non-abstract AF_UNIX socket it will gain a representation in the
filesystem. Enable the socket infrastructure to handle idmapped mounts by using
the new vfs_mapped_mknod() helper. Non-idmapped mounts will not see any altered
behavior.

Signed-off-by: Christian Brauner 
---
 net/unix/af_unix.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 41c3303c3357..f79f7ce3243f 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -996,7 +996,7 @@ static int unix_mknod(const char *sun_path, umode_t mode, 
struct path *res)
 */
err = security_path_mknod(, dentry, mode, 0);
if (!err) {
-   err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
+   err = vfs_mapped_mknod(mnt_user_ns(path.mnt), 
d_inode(path.dentry), dentry, mode, 0);
if (!err) {
res->mnt = mntget(path.mnt);
res->dentry = dget(dentry);
-- 
2.29.0

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

[PATCH 17/34] namei: introduce struct renamedata

2020-10-29 Thread Christian Brauner

In order to handle idmapped mounts we will extend the vfs rename helper
to take two new arguments in follow-up patches. Since this operation already
takes a bunch of arguments add a simple struct renamedata (based on struct
nameidata) and make the current helper use it before we extend it.

Signed-off-by: Christian Brauner 
---
 fs/namei.c | 144 -
 1 file changed, 88 insertions(+), 56 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 76ee4d52bd5e..781f11795a22 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4221,62 +4221,24 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, 
const char __user *, newname
return do_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
 }
 
-/**
- * vfs_rename - rename a filesystem object
- * @old_dir:   parent of source
- * @old_dentry:source
- * @new_dir:   parent of destination
- * @new_dentry:destination
- * @delegated_inode: returns an inode needing a delegation break
- * @flags: rename flags
- *
- * The caller must hold multiple mutexes--see lock_rename()).
- *
- * If vfs_rename discovers a delegation in need of breaking at either
- * the source or destination, it will return -EWOULDBLOCK and return a
- * reference to the inode in delegated_inode.  The caller should then
- * break the delegation and retry.  Because breaking a delegation may
- * take a long time, the caller should drop all locks before doing
- * so.
- *
- * Alternatively, a caller may pass NULL for delegated_inode.  This may
- * be appropriate for callers that expect the underlying filesystem not
- * to be NFS exported.
- *
- * The worst of all namespace operations - renaming directory. "Perverted"
- * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
- * Problems:
- *
- * a) we can get into loop creation.
- * b) race potential - two innocent renames can create a loop together.
- *That's where 4.4 screws up. Current fix: serialization on
- *sb->s_vfs_rename_mutex. We might be more accurate, but that's another
- *story.
- * c) we have to lock _four_ objects - parents and victim (if it exists),
- *and source (if it is not a directory).
- *And that - after we got ->i_mutex on parents (until then we don't 
know
- *whether the target exists).  Solution: try to be smart with locking
- *order for inodes.  We rely on the fact that tree topology may change
- *only under ->s_vfs_rename_mutex _and_ that parent of the object we
- *move will be locked.  Thus we can rank directories by the tree
- *(ancestors first) and rank all non-directories after them.
- *That works since everybody except rename does "lock parent, lookup,
- *lock child" and rename is under ->s_vfs_rename_mutex.
- *HOWEVER, it relies on the assumption that any object with ->lookup()
- *has no more than 1 dentry.  If "hybrid" objects will ever appear,
- *we'd better make sure that there's no link(2) for them.
- * d) conversion from fhandle to dentry may come in the wrong moment - when
- *we are removing the target. Solution: we will have to grab ->i_mutex
- *in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
- *->i_mutex on parents, which works but leads to some truly excessive
- *locking].
- */
-int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-  struct inode *new_dir, struct dentry *new_dentry,
-  struct inode **delegated_inode, unsigned int flags)
+struct renamedata {
+   struct inode *old_dir;
+   struct dentry *old_dentry;
+   struct inode *new_dir;
+   struct dentry *new_dentry;
+   struct inode **delegated_inode;
+   unsigned int flags;
+} __randomize_layout;
+
+static int __vfs_rename(struct renamedata *rd)
 {
int error;
struct user_namespace *user_ns = _user_ns;
+   struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir;
+   struct dentry *old_dentry = rd->old_dentry,
+ *new_dentry = rd->new_dentry;
+   struct inode **delegated_inode = rd->delegated_inode;
+   unsigned int flags = rd->flags;
bool is_dir = d_is_dir(old_dentry);
struct inode *source = old_dentry->d_inode;
struct inode *target = new_dentry->d_inode;
@@ -4395,11 +4357,76 @@ int vfs_rename(struct inode *old_dir, struct dentry 
*old_dentry,
 
return error;
 }
+
+/**
+ * vfs_rename - rename a filesystem object
+ * @old_dir:   parent of source
+ * @old_dentry:source
+ * @new_dir:   parent of destination
+ * @new_dentry:destination
+ * @delegated_inode: returns an inode needing a delegation break
+ * @flags: rename flags
+ *
+ * The caller must hold multiple mutexes--see lock_rename()).
+ *
+ * If vfs_rename discovers a delegation in need of br

[PATCH 04/34] tests: add mount_setattr() selftests

2020-10-29 Thread Christian Brauner

Add a range of selftests for the new mount_setattr() syscall to verify
that it works as expected. This tests that:
- no invalid flags can be specified
- changing properties of a single mount works and leaves other mounts in
  the mount tree unchanged
- changing a mount tree to read-only when one of the mounts has writers
  fails and leaves the whole mount tree unchanged
- changing mount properties from multiple threads works
- changing atime settings works
- changing mount propagation works
- changing the mount options of a mount tree where the individual mounts
  in the tree have different mount options only changes the flags that
  were requested to change
- changing mount options from another mount namespace fails
- changing mount options from another user namespace fails

[==] Running 9 tests from 2 test cases.
[ RUN  ] mount_setattr.invalid_attributes
[   OK ] mount_setattr.invalid_attributes
[ RUN  ] mount_setattr.basic
[   OK ] mount_setattr.basic
[ RUN  ] mount_setattr.basic_recursive
[   OK ] mount_setattr.basic_recursive
[ RUN  ] mount_setattr.mount_has_writers
[   OK ] mount_setattr.mount_has_writers
[ RUN  ] mount_setattr.mixed_mount_options
[   OK ] mount_setattr.mixed_mount_options
[ RUN  ] mount_setattr.time_changes
[   OK ] mount_setattr.time_changes
[ RUN  ] mount_setattr.multi_threaded
[   OK ] mount_setattr.multi_threaded
[ RUN  ] mount_setattr.wrong_user_namespace
[   OK ] mount_setattr.wrong_user_namespace
[ RUN  ] mount_setattr.wrong_mount_namespace
[   OK ] mount_setattr.wrong_mount_namespace
[==] 9 / 9 tests passed.
[  PASSED  ]

Cc: Al Viro 
Cc: David Howells 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
 tools/testing/selftests/Makefile  |   1 +
 .../selftests/mount_setattr/.gitignore|   1 +
 .../testing/selftests/mount_setattr/Makefile  |   7 +
 tools/testing/selftests/mount_setattr/config  |   1 +
 .../mount_setattr/mount_setattr_test.c| 888 ++
 5 files changed, 898 insertions(+)
 create mode 100644 tools/testing/selftests/mount_setattr/.gitignore
 create mode 100644 tools/testing/selftests/mount_setattr/Makefile
 create mode 100644 tools/testing/selftests/mount_setattr/config
 create mode 100644 tools/testing/selftests/mount_setattr/mount_setattr_test.c

diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index d9c283503159..87b7107dd9a6 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -34,6 +34,7 @@ TARGETS += memfd
 TARGETS += memory-hotplug
 TARGETS += mincore
 TARGETS += mount
+TARGETS += mount_setattr
 TARGETS += mqueue
 TARGETS += net
 TARGETS += net/forwarding
diff --git a/tools/testing/selftests/mount_setattr/.gitignore 
b/tools/testing/selftests/mount_setattr/.gitignore
new file mode 100644
index ..5f74d8488472
--- /dev/null
+++ b/tools/testing/selftests/mount_setattr/.gitignore
@@ -0,0 +1 @@
+mount_setattr_test
diff --git a/tools/testing/selftests/mount_setattr/Makefile 
b/tools/testing/selftests/mount_setattr/Makefile
new file mode 100644
index ..2250f7dcb81e
--- /dev/null
+++ b/tools/testing/selftests/mount_setattr/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+# Makefile for mount selftests.
+CFLAGS = -g -I../../../../usr/include/ -Wall -O2 -pthread
+
+TEST_GEN_FILES += mount_setattr_test
+
+include ../lib.mk
diff --git a/tools/testing/selftests/mount_setattr/config 
b/tools/testing/selftests/mount_setattr/config
new file mode 100644
index ..416bd53ce982
--- /dev/null
+++ b/tools/testing/selftests/mount_setattr/config
@@ -0,0 +1 @@
+CONFIG_USER_NS=y
diff --git a/tools/testing/selftests/mount_setattr/mount_setattr_test.c 
b/tools/testing/selftests/mount_setattr/mount_setattr_test.c
new file mode 100644
index ..7d320cfa7d3b
--- /dev/null
+++ b/tools/testing/selftests/mount_setattr/mount_setattr_test.c
@@ -0,0 +1,888 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "../kselftest_harness.h"
+
+#ifndef CLONE_NEWNS
+#define CLONE_NEWNS 0x0002
+#endif
+
+#ifndef CLONE_NEWUSER
+#define CLONE_NEWUSER 0x1000
+#endif
+
+#ifndef MS_REC
+#define MS_REC 16384
+#endif
+
+#ifndef MS_RELATIME
+#define MS_RELATIME (1 << 21)
+#endif
+
+#ifndef MS_STRICTATIME
+#define MS_STRICTATIME (1 << 24)
+#endif
+
+#ifndef MOUNT_ATTR_RDONLY
+#define MOUNT_ATTR_RDONLY 0x0001
+#endif
+
+#ifndef MOUNT_ATTR_NOSUID
+#define MOUNT_ATTR_NOSUID 0x0002
+#endif
+
+#ifndef MOUNT_ATTR_NOEXEC
+#define MOUNT_ATTR_NOEXEC 0x0008
+#endif
+
+#ifndef MOUNT_ATTR_NODIRATIME
+#define MOUNT_ATTR_NODIRATIME 0x0080
+#endif
+
+#ifndef MOUNT_ATTR__ATIME
+#define MOUNT_

[PATCH 25/34] exec: handle idmapped mounts

2020-10-29 Thread Christian Brauner

When executing a setuid binary the kernel will verify in bprm_fill_uid() that
the inode has a mapping in the caller's user namespace before setting the
caller's uid and gid. Let bprm_fill_uid() handle idmapped mounts. If the inode
is accessed through an idmapped mount it is mapped according to the mount's
user namespace. Afterwards the checks are identical to non-idmapped mounts. On
regular mounts this is a nop.

Signed-off-by: Christian Brauner 
---
 fs/exec.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 76de175eeba8..cd11ab505a41 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1567,6 +1567,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
 static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
 {
/* Handle suid and sgid on files */
+   struct user_namespace *user_ns;
struct inode *inode;
unsigned int mode;
kuid_t uid;
@@ -1583,13 +1584,15 @@ static void bprm_fill_uid(struct linux_binprm *bprm, 
struct file *file)
if (!(mode & (S_ISUID|S_ISGID)))
return;
 
+   user_ns = mnt_user_ns(file->f_path.mnt);
+
/* Be careful if suid/sgid is set */
inode_lock(inode);
 
/* reload atomically mode/uid/gid now that lock held */
mode = inode->i_mode;
-   uid = inode->i_uid;
-   gid = inode->i_gid;
+   uid = i_uid_into_mnt(user_ns, inode);
+   gid = i_gid_into_mnt(user_ns, inode);
inode_unlock(inode);
 
/* We ignore suid/sgid if there are no mappings for them in the ns */
-- 
2.29.0

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

[PATCH 12/34] xattr: handle idmapped mounts

2020-10-29 Thread Christian Brauner

From: Tycho Andersen 

When interacting with extended attributes the vfs verifies that the
caller is privileged over the inode with which the extended attribute is
associated. Add helpers to handle extended attributes on idmapped
mounts. If the inode is accessed through an idmapped mount we need to
map it according to the mount's user namespace. Afterwards the checks
are identical to non-idmapped mounts.
This patch adds helpers to get, set, and remove extended attributes on
idmapped mounts. The four helpers vfs_mapped_getxattr(),
vfs_mapped_setxattr(), __vfs_mapped_removexattr(), and
vfs_mapped_removexattr() are either used directly by the vfs (e.g.
vfs_mapped_getxattr_alloc()) or by the filesystems targeted in this
first iteration.

If the initial user namespace is passed all operations are a nop so
non-idmapped mounts will not see a change in behavior and will also not
see any performance impact. It also means that the non-idmapped-mount
aware helpers can be implemented on top of their idmapped-mount aware
counterparts by passing the initial user namespace.

Signed-off-by: Tycho Andersen 
Signed-off-by: Christian Brauner 
---
 fs/xattr.c| 252 +-
 include/linux/xattr.h |  23 
 2 files changed, 196 insertions(+), 79 deletions(-)

diff --git a/fs/xattr.c b/fs/xattr.c
index 96ff53b42251..cdda2baeb9f7 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -83,7 +83,8 @@ xattr_resolve_name(struct inode *inode, const char **name)
  * because different namespaces have very different rules.
  */
 static int
-xattr_permission(struct inode *inode, const char *name, int mask)
+xattr_permission(struct user_namespace *user_ns, struct inode *inode,
+const char *name, int mask)
 {
/*
 * We can never set or remove an extended attribute on a read-only
@@ -127,11 +128,11 @@ xattr_permission(struct inode *inode, const char *name, 
int mask)
if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
return (mask & MAY_WRITE) ? -EPERM : -ENODATA;
if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) &&
-   (mask & MAY_WRITE) && !inode_owner_or_capable(inode))
+   (mask & MAY_WRITE) && 
!mapped_inode_owner_or_capable(user_ns, inode))
return -EPERM;
}
 
-   return inode_permission(inode, mask);
+   return mapped_inode_permission(user_ns, inode, mask);
 }
 
 /*
@@ -161,9 +162,10 @@ xattr_supported_namespace(struct inode *inode, const char 
*prefix)
 }
 EXPORT_SYMBOL(xattr_supported_namespace);
 
-int
-__vfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name,
-  const void *value, size_t size, int flags)
+static int
+__vfs_mapped_setxattr(struct user_namespace *user_ns, struct dentry *dentry,
+ struct inode *inode, const char *name, const void *value,
+ size_t size, int flags)
 {
const struct xattr_handler *handler;
 
@@ -174,7 +176,14 @@ __vfs_setxattr(struct dentry *dentry, struct inode *inode, 
const char *name,
return -EOPNOTSUPP;
if (size == 0)
value = "";  /* empty EA, do not remove */
-   return handler->set(handler, dentry, inode, name, value, size, flags);
+   return xattr_handler_set(handler, user_ns, dentry, inode, name, value, 
size, flags);
+}
+
+int
+__vfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name,
+  const void *value, size_t size, int flags)
+{
+   return __vfs_mapped_setxattr(_user_ns, dentry, inode, name, value, 
size, flags);
 }
 EXPORT_SYMBOL(__vfs_setxattr);
 
@@ -182,6 +191,7 @@ EXPORT_SYMBOL(__vfs_setxattr);
  *  __vfs_setxattr_noperm - perform setxattr operation without performing
  *  permission checks.
  *
+ *  @user_ns - user namespace of the mount
  *  @dentry - object to perform setxattr on
  *  @name - xattr name to set
  *  @value - value to set @name to
@@ -194,8 +204,10 @@ EXPORT_SYMBOL(__vfs_setxattr);
  *  is executed. It also assumes that the caller will make the appropriate
  *  permission checks.
  */
-int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
-   const void *value, size_t size, int flags)
+static int
+__vfs_mapped_setxattr_noperm(struct user_namespace *user_ns,
+struct dentry *dentry, const char *name,
+const void *value, size_t size, int flags)
 {
struct inode *inode = dentry->d_inode;
int error = -EAGAIN;
@@ -205,7 +217,7 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char 
*name,
if (issec)
inode->i_flags &= ~S_NOSEC;
if (inode->i_opflags & IOP_XATTR) {
-   error = __vfs_setxattr(dentry, inode, name, value, size, flags);
+   error = __

[PATCH 02/34] namespace: only take read lock in do_reconfigure_mnt()

2020-10-29 Thread Christian Brauner

do_reconfigure_mnt() used to take the down_write(&sb->s_umount) lock
which seems unnecessary since we're not changing the superblock. We're
only checking whether it is already read-only. Setting other mount
attributes is protected by lock_mount_hash() afaict and not by s_umount.

So I think the history of down_write(&sb->s_umount) lock being taken
when setting mount attributes dates back to the introduction of
MNT_READONLY in [2]. Afaict, this introduced the concept of having
read-only mounts in contrast to just having a read-only superblock. When
it got introduced it was simply plumbed into do_remount() which already
took down_write(&sb->s_umount) because it was only used to actually
change the superblock before [2]. Afaict, it would've already been
possible back then to only use down_read(&sb->s_umount) for
MS_BIND | MS_REMOUNT since actual mount options were protected by
the vfsmount lock already. But that would've meant special casing the
locking for MS_BIND | MS_REMOUNT in do_remount() which people might not
have considered worth it.
Then in [1] MS_BIND | MS_REMOUNT mount option changes were split out of
do_remount() into do_reconfigure_mnt() but the down_write(&sb->s_umount)
lock was simply copied over.
Now that we have this be a separate helper only take
the down_read(&sb->s_umount) lock since we're only interested in
checking whether the super block is currently read-only and blocking any
writers from changing it. Essentially, checking that the super block is
read-only has the advantage that we can avoid having to go into the
slowpath and through MNT_WRITE_HOLD and can simply set the read-only
flag on the mount in set_mount_attributes().

[1]: commit 43f5e655eff7 ("vfs: Separate changing mount flags full remount")
[2]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts 
at remount")
Cc: Al Viro 
Cc: David Howells 
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
 fs/namespace.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 20ee291a7af4..6e0d5bb63197 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2507,10 +2507,6 @@ static int change_mount_ro_state(struct mount *mnt, 
unsigned int mnt_flags)
return __mnt_unmake_readonly(mnt);
 }
 
-/*
- * Update the user-settable attributes on a mount.  The caller must hold
- * sb->s_umount for writing.
- */
 static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags)
 {
mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
@@ -2560,13 +2556,17 @@ static int do_reconfigure_mnt(struct path *path, 
unsigned int mnt_flags)
if (!can_change_locked_flags(mnt, mnt_flags))
return -EPERM;
 
-   down_write(>s_umount);
+   /*
+* We're only checking whether the superblock is read-only not changing
+* it, so only take down_read(>s_umount).
+*/
+   down_read(>s_umount);
lock_mount_hash();
ret = change_mount_ro_state(mnt, mnt_flags);
if (ret == 0)
set_mount_attributes(mnt, mnt_flags);
unlock_mount_hash();
-   up_write(>s_umount);
+   up_read(>s_umount);
 
mnt_warn_timestamp_expiry(path, >mnt);
 
-- 
2.29.0

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

[PATCH 21/34] open: handle idmapped mounts

2020-10-29 Thread Christian Brauner

For core file operations such as changing directories or chrooting, determining
file access, changing mode or ownership the vfs will verify that the caller is
privileged over the inode. Extend the various helpers to handle idmapped
mounts. If the inode is accessed through an idmapped mount it is mapped
according to the mount's user namespace. Afterwards the permissions checks are
identical to non-idmapped mounts. When changing file ownership we need to map
the mount from the mount's user namespace. If the initial user namespace is
passed all mapping operations are a nop so non-idmapped mounts will not see a
change in behavior and will also not see any performance impact.

Signed-off-by: Christian Brauner 
---
 fs/open.c | 31 ---
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/fs/open.c b/fs/open.c
index efa462b6b9c7..ca113399010a 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -401,6 +401,7 @@ static const struct cred *access_override_creds(void)
 
 static long do_faccessat(int dfd, const char __user *filename, int mode, int 
flags)
 {
+   struct user_namespace *user_ns;
struct path path;
struct inode *inode;
int res;
@@ -441,7 +442,8 @@ static long do_faccessat(int dfd, const char __user 
*filename, int mode, int fla
goto out_path_release;
}
 
-   res = inode_permission(inode, mode | MAY_ACCESS);
+   user_ns = mnt_user_ns(path.mnt);
+   res = mapped_inode_permission(user_ns, inode, mode | MAY_ACCESS);
/* SuS v2 requires we report a read only fs too */
if (res || !(mode & S_IWOTH) || special_file(inode->i_mode))
goto out_path_release;
@@ -489,6 +491,7 @@ SYSCALL_DEFINE2(access, const char __user *, filename, int, 
mode)
 
 SYSCALL_DEFINE1(chdir, const char __user *, filename)
 {
+   struct user_namespace *user_ns;
struct path path;
int error;
unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
@@ -497,7 +500,8 @@ SYSCALL_DEFINE1(chdir, const char __user *, filename)
if (error)
goto out;
 
-   error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
+   user_ns = mnt_user_ns(path.mnt);
+   error = mapped_inode_permission(user_ns, path.dentry->d_inode, MAY_EXEC 
| MAY_CHDIR);
if (error)
goto dput_and_out;
 
@@ -515,6 +519,7 @@ SYSCALL_DEFINE1(chdir, const char __user *, filename)
 
 SYSCALL_DEFINE1(fchdir, unsigned int, fd)
 {
+   struct user_namespace *user_ns;
struct fd f = fdget_raw(fd);
int error;
 
@@ -526,7 +531,8 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd)
if (!d_can_lookup(f.file->f_path.dentry))
goto out_putf;
 
-   error = inode_permission(file_inode(f.file), MAY_EXEC | MAY_CHDIR);
+   user_ns = mnt_user_ns(f.file->f_path.mnt);
+   error = mapped_inode_permission(user_ns, file_inode(f.file), MAY_EXEC | 
MAY_CHDIR);
if (!error)
set_fs_pwd(current->fs, >f_path);
 out_putf:
@@ -537,6 +543,7 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd)
 
 SYSCALL_DEFINE1(chroot, const char __user *, filename)
 {
+   struct user_namespace *user_ns;
struct path path;
int error;
unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
@@ -545,7 +552,8 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
if (error)
goto out;
 
-   error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
+   user_ns = mnt_user_ns(path.mnt);
+   error = mapped_inode_permission(user_ns, path.dentry->d_inode, MAY_EXEC 
| MAY_CHDIR);
if (error)
goto dput_and_out;
 
@@ -570,6 +578,7 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
 
 int chmod_common(const struct path *path, umode_t mode)
 {
+   struct user_namespace *user_ns;
struct inode *inode = path->dentry->d_inode;
struct inode *delegated_inode = NULL;
struct iattr newattrs;
@@ -585,7 +594,8 @@ int chmod_common(const struct path *path, umode_t mode)
goto out_unlock;
newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
-   error = notify_change(path->dentry, , _inode);
+   user_ns = mnt_user_ns(path->mnt);
+   error = notify_mapped_change(user_ns, path->dentry, , 
_inode);
 out_unlock:
inode_unlock(inode);
if (delegated_inode) {
@@ -646,6 +656,7 @@ SYSCALL_DEFINE2(chmod, const char __user *, filename, 
umode_t, mode)
 
 int chown_common(const struct path *path, uid_t user, gid_t group)
 {
+   struct user_namespace *user_ns;
struct inode *inode = path->dentry->d_inode;
struct inode *delegated_inode = NULL;
int error;
@@ -656,6 +667,12 @@ int chown_common(const struct path *path, uid_t user, 
gid_t group

[PATCH 31/34] expfs: handle idmapped mounts

2020-10-29 Thread Christian Brauner

In follow-up patches we will port overlayfs to support idmapped mounts. Since
it makes use of expfs, port expfs to handle idmapped mounts by using one of
our newly introduced idmapped mount aware lookup helpers.

Signed-off-by: Christian Brauner 
---
 fs/exportfs/expfs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 2dd55b172d57..8066e8bdc912 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -145,7 +145,7 @@ static struct dentry *reconnect_one(struct vfsmount *mnt,
if (err)
goto out_err;
dprintk("%s: found name: %s\n", __func__, nbuf);
-   tmp = lookup_one_len_unlocked(nbuf, parent, strlen(nbuf));
+   tmp = lookup_one_len_mapped_unlocked(nbuf, parent, strlen(nbuf), 
mnt_user_ns(mnt));
if (IS_ERR(tmp)) {
dprintk("%s: lookup failed: %d\n", __func__, PTR_ERR(tmp));
err = PTR_ERR(tmp);
@@ -525,7 +525,7 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, 
struct fid *fid,
}
 
inode_lock(target_dir->d_inode);
-   nresult = lookup_one_len(nbuf, target_dir, strlen(nbuf));
+   nresult = lookup_one_len_mapped(nbuf, target_dir, strlen(nbuf), 
mnt_user_ns(mnt));
if (!IS_ERR(nresult)) {
if (unlikely(nresult->d_inode != result->d_inode)) {
dput(nresult);
-- 
2.29.0

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

[PATCH 23/34] utimes: handle idmapped mounts

2020-10-29 Thread Christian Brauner

Enable the vfs_utimes() helper to handle idmapped mounts by passing down the
mount's user namespace to the earlier introduced notify_mapped_change() helper.

Signed-off-by: Christian Brauner 
---
 fs/utimes.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/utimes.c b/fs/utimes.c
index fd3cc4226224..1c0b1f56fce2 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -22,6 +22,7 @@ int vfs_utimes(const struct path *path, struct timespec64 
*times)
struct iattr newattrs;
struct inode *inode = path->dentry->d_inode;
struct inode *delegated_inode = NULL;
+   struct user_namespace *user_ns;
 
if (times) {
if (!nsec_valid(times[0].tv_nsec) ||
@@ -61,8 +62,9 @@ int vfs_utimes(const struct path *path, struct timespec64 
*times)
newattrs.ia_valid |= ATTR_TOUCH;
}
 retry_deleg:
+   user_ns = mnt_user_ns(path->mnt);
inode_lock(inode);
-   error = notify_change(path->dentry, , _inode);
+   error = notify_mapped_change(user_ns, path->dentry, , 
_inode);
inode_unlock(inode);
if (delegated_inode) {
error = break_deleg_wait(_inode);
-- 
2.29.0

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

[PATCH 26/34] fs: add helpers for idmap mounts

2020-10-29 Thread Christian Brauner

When the kernel is configured with CONFIG_IDMAP_MOUNTS additional inode methods
are provided. A filesystem that is aware of idmapped mounts will receive the
user namespace the mount has been marked with as an additional argument. This
can be used for additional permission checking and also to enable filesystems
to translate between uids and gids if they need to. We have implemented all
relevant helpers in earlier patches.

In this iteration I've decided to add a set of new inode methods instead of
adapting the existing ones. This is mainly done to keep the noise-level as low
as possible. But we're very happy to adapt the existing methods and all
filesystems using it instead of adding dedicated new helpers. In any case we
expect to be down to a single set of inode methods once we've transitioned
filesystems, whether or not we add new methods.

Signed-off-by: Christian Brauner 
---
 fs/attr.c  |   2 +-
 fs/namei.c |  24 +
 fs/posix_acl.c |   4 +-
 include/linux/fs.h | 129 +
 4 files changed, 146 insertions(+), 13 deletions(-)

diff --git a/fs/attr.c b/fs/attr.c
index 4daf6ac6de6d..d13ef3f8eac0 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -378,7 +378,7 @@ int notify_mapped_change(struct user_namespace *user_ns, 
struct dentry *dentry,
return error;
 
if (inode->i_op->setattr)
-   error = inode->i_op->setattr(dentry, attr);
+   error = iop_setattr(inode, user_ns, dentry, attr);
else
error = simple_setattr(dentry, attr);
 
diff --git a/fs/namei.c b/fs/namei.c
index 76c9637eccb9..d6dbfab126d7 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -407,7 +407,7 @@ static inline int do_inode_permission(struct user_namespace 
*user_ns, struct ino
 {
if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
if (likely(inode->i_op->permission))
-   return inode->i_op->permission(inode, mask);
+   return iop_permission(inode, user_ns, inode, mask);
 
/* This gets set once for the inode lifetime */
spin_lock(>i_lock);
@@ -2872,7 +2872,7 @@ int vfs_mapped_create(struct user_namespace *user_ns, 
struct inode *dir,
error = security_inode_create(dir, dentry, mode);
if (error)
return error;
-   error = dir->i_op->create(dir, dentry, mode, want_excl);
+   error = iop_create(dir, user_ns, dir, dentry, mode, want_excl);
if (!error)
fsnotify_create(dir, dentry);
return error;
@@ -3175,14 +3175,18 @@ static struct dentry *lookup_open(struct nameidata *nd, 
struct file *file,
 
/* Negative dentry, just create the file */
if (!dentry->d_inode && (open_flag & O_CREAT)) {
+   struct user_namespace *user_ns;
+
file->f_mode |= FMODE_CREATED;
audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
if (!dir_inode->i_op->create) {
error = -EACCES;
goto out_dput;
}
-   error = dir_inode->i_op->create(dir_inode, dentry, mode,
-   open_flag & O_EXCL);
+
+   user_ns = mnt_user_ns(nd->path.mnt);
+   error = iop_create(dir_inode, user_ns, dir_inode, dentry, mode,
+  open_flag & O_EXCL);
if (error)
goto out_dput;
}
@@ -3363,7 +3367,7 @@ struct dentry *vfs_mapped_tmpfile(struct user_namespace 
*user_ns,
child = d_alloc(dentry, _name);
if (unlikely(!child))
goto out_err;
-   error = dir->i_op->tmpfile(dir, child, mode);
+   error = iop_tmpfile(dir, user_ns, dir, child, mode);
if (error)
goto out_err;
error = -ENOENT;
@@ -3640,7 +3644,7 @@ int vfs_mapped_mknod(struct user_namespace *user_ns, 
struct inode *dir,
if (error)
return error;
 
-   error = dir->i_op->mknod(dir, dentry, mode, dev);
+   error = iop_mknod(dir, user_ns, dir, dentry, mode, dev);
if (!error)
fsnotify_create(dir, dentry);
return error;
@@ -3750,7 +3754,7 @@ int vfs_mapped_mkdir(struct user_namespace *user_ns, 
struct inode *dir,
if (max_links && dir->i_nlink >= max_links)
return -EMLINK;
 
-   error = dir->i_op->mkdir(dir, dentry, mode);
+   error = iop_mkdir(dir, user_ns, dir, dentry, mode);
if (!error)
fsnotify_mkdir(dir, dentry);
return error;
@@ -4089,7 +4093,7 @@ int vfs_mapped_symlink(struct user_namespace *user_ns, 
struct inode *dir,
if (error)
return error;
 
-   error = dir->i_op->symlink(dir, dentry, oldname);
+   e

[PATCH 24/34] would_dump: handle idmapped mounts

2020-10-29 Thread Christian Brauner

When determining whether or not to create a coredump the vfs will verify that
the caller is privileged over the inode. Make the would_dump() helper handle
idmapped mounts by passing down the mount's user namespace of the exec file.

Signed-off-by: Christian Brauner 
---
 fs/exec.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 8e75d7a33514..76de175eeba8 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1391,14 +1391,15 @@ EXPORT_SYMBOL(begin_new_exec);
 void would_dump(struct linux_binprm *bprm, struct file *file)
 {
struct inode *inode = file_inode(file);
-   if (inode_permission(inode, MAY_READ) < 0) {
+   struct user_namespace *ns = mnt_user_ns(file->f_path.mnt);
+   if (mapped_inode_permission(ns, inode, MAY_READ) < 0) {
struct user_namespace *old, *user_ns;
bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
 
/* Ensure mm->user_ns contains the executable */
user_ns = old = bprm->mm->user_ns;
while ((user_ns != _user_ns) &&
-  !privileged_wrt_inode_uidgid(user_ns, _user_ns, 
inode))
+  !privileged_wrt_inode_uidgid(user_ns, ns, inode))
user_ns = user_ns->parent;
 
if (old != user_ns) {
-- 
2.29.0

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

[PATCH 11/34] acl: handle idmapped mounts

2020-10-29 Thread Christian Brauner

The posix acl permission checking helpers determine whether a caller is
privileged over an inode according to the acls associated with the
inode. Add helpers that make it possible to handle acls on idmapped
mounts.

The vfs and the filesystems targeted by this first iteration make use of
posix_acl_fix_xattr_from_user() and posix_acl_fix_xattr_to_user() to
translate basic posix access and default permissions such as the
ACL_USER and ACL_GROUP type according to the initial user namespace (or
the superblock's user namespace) to and from the caller's current user
namespace. Adapt these two helpers to handle idmapped mounts whereby we
either shift from or into the mount's user namespace depending on in
which direction we're translating.
Similarly, cap_convert_nscap() is used by the vfs to translate user
namespace and non-user namespace aware filesystem capabilities from the
superblock's user namespace to the caller's user namespace. Enable it to
handle idmapped mounts by accounting for the mount's user namespace.

In addition the filesystems targeted in the first iteration of this patch
series make use of the posix_acl_chmod() and posix_acl_update_mode()
helpers. Both helpers perform permission checks on the target inode. Add
two new helpers posix_mapped_acl_chmod() and
posix_mapped_acl_update_mode() to handle idmapped mounts. These two
helpers are called when acls are set by the respective filesystems. To
handle this case we add a new ->set_mapped() method to struct
xattr_handler which passes the mount's user namespace down.

To this end the standard posix access and default attribute handlers
posix_acl_access_xattr_handler and posix_acl_default_xattr_handler gain
a new posix_acl_xattr_set_mapped() callback which serves as the
implementation of the newly added ->set_mapped() method in struct
xattr_handler. This callback maps the inode according to the mount's
user namespace but otherwise performs identical checks as its
non-idmapped aware counterpart.

If the initial user namespace is passed to any of the new helpers the
permission checking is identical to their non-idmapped aware
counterparts without any performance impact. This means that the
non-idmapped aware helpers can simply be implemented on top of their
idmapped-mount aware counterparts by passing the initial user namespace
without any change in behavior or performance.

Signed-off-by: Christian Brauner 
---
 fs/posix_acl.c  | 110 ++--
 fs/xattr.c  |   6 +-
 include/linux/capability.h  |   3 +-
 include/linux/posix_acl.h   |  10 +++
 include/linux/posix_acl_xattr.h |  12 ++--
 include/linux/xattr.h   |   6 ++
 security/commoncap.c|  15 +++--
 7 files changed, 128 insertions(+), 34 deletions(-)

diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index f15b6ad35ec3..665eb7921e1c 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -558,7 +558,7 @@ __posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, 
umode_t mode)
 EXPORT_SYMBOL(__posix_acl_chmod);
 
 int
-posix_acl_chmod(struct inode *inode, umode_t mode)
+posix_mapped_acl_chmod(struct user_namespace *user_ns, struct inode *inode, 
umode_t mode)
 {
struct posix_acl *acl;
int ret = 0;
@@ -582,6 +582,12 @@ posix_acl_chmod(struct inode *inode, umode_t mode)
posix_acl_release(acl);
return ret;
 }
+
+int
+posix_acl_chmod(struct inode *inode, umode_t mode)
+{
+   return posix_mapped_acl_chmod(_user_ns, inode, mode);
+}
 EXPORT_SYMBOL(posix_acl_chmod);
 
 int
@@ -636,7 +642,8 @@ posix_acl_create(struct inode *dir, umode_t *mode,
 EXPORT_SYMBOL_GPL(posix_acl_create);
 
 /**
- * posix_acl_update_mode  -  update mode in set_acl
+ * posix_mapped_acl_update_mode  -  update mode in set_acl
+ * @user_ns: user namespace the inode is accessed from
  * @inode: target inode
  * @mode_p: mode (pointer) for update
  * @acl: acl pointer
@@ -650,8 +657,9 @@ EXPORT_SYMBOL_GPL(posix_acl_create);
  *
  * Called from set_acl inode operations.
  */
-int posix_acl_update_mode(struct inode *inode, umode_t *mode_p,
- struct posix_acl **acl)
+int posix_mapped_acl_update_mode(struct user_namespace *user_ns,
+struct inode *inode, umode_t *mode_p,
+struct posix_acl **acl)
 {
umode_t mode = inode->i_mode;
int error;
@@ -661,12 +669,34 @@ int posix_acl_update_mode(struct inode *inode, umode_t 
*mode_p,
return error;
if (error == 0)
*acl = NULL;
-   if (!in_group_p(inode->i_gid) &&
-   !capable_wrt_inode_uidgid(inode, CAP_FSETID))
+   if (!in_group_p(i_gid_into_mnt(user_ns, inode)) &&
+   !capable_wrt_mapped_inode_uidgid(user_ns, inode, CAP_FSETID))
mode &= ~S_ISGID;
*mode_p = mode;
return 0;
 }
+EXPORT_SYMBOL(posix_mapped_acl_update_mode);
+
+/**
+ * posix_acl_update_mode  -  update mode in

[PATCH 06/34] fs: add id translation helpers

2020-10-29 Thread Christian Brauner

Add simple helpers to make it easy to map kuids into and from idmapped
mounts. We provide simple wrappers that filesystems can use to
e.g. initialize inodes similar to i_{uid,gid}_read() and
i_{uid,gid}_write(). Accessing an inode through an idmapped mount will
require the inode to be mapped according to the mount's user namespace.
If the fsids are used to compare against inodes or to initialize inodes
they are required to be shifted from the mount's user namespace. Passing
the initial user namespace to these helpers makes them a nop and so any
non-idmapped paths will not be impacted.

Signed-off-by: Christian Brauner 
---
 include/linux/fs.h | 75 ++
 1 file changed, 75 insertions(+)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8314cd351673..8a891b80d0b4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -39,6 +39,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -1574,6 +1575,80 @@ static inline void i_gid_write(struct inode *inode, 
gid_t gid)
inode->i_gid = make_kgid(inode->i_sb->s_user_ns, gid);
 }
 
+static inline kuid_t kuid_into_mnt(struct user_namespace *to, kuid_t kuid)
+{
+#ifdef CONFIG_IDMAP_MOUNTS
+   return make_kuid(to, __kuid_val(kuid));
+#else
+   return kuid;
+#endif
+}
+
+static inline kgid_t kgid_into_mnt(struct user_namespace *to, kgid_t kgid)
+{
+#ifdef CONFIG_IDMAP_MOUNTS
+   return make_kgid(to, __kgid_val(kgid));
+#else
+   return kgid;
+#endif
+}
+
+static inline kuid_t i_uid_into_mnt(struct user_namespace *to,
+   const struct inode *inode)
+{
+#ifdef CONFIG_IDMAP_MOUNTS
+   return kuid_into_mnt(to, inode->i_uid);
+#else
+   return inode->i_uid;
+#endif
+}
+
+static inline kgid_t i_gid_into_mnt(struct user_namespace *to,
+   const struct inode *inode)
+{
+#ifdef CONFIG_IDMAP_MOUNTS
+   return kgid_into_mnt(to, inode->i_gid);
+#else
+   return inode->i_gid;
+#endif
+}
+
+static inline kuid_t kuid_from_mnt(struct user_namespace *to, kuid_t kuid)
+{
+#ifdef CONFIG_IDMAP_MOUNTS
+   return KUIDT_INIT(from_kuid(to, kuid));
+#else
+   return kuid;
+#endif
+}
+
+static inline kgid_t kgid_from_mnt(struct user_namespace *to, kgid_t kgid)
+{
+#ifdef CONFIG_IDMAP_MOUNTS
+   return KGIDT_INIT(from_kgid(to, kgid));
+#else
+   return kgid;
+#endif
+}
+
+static inline kuid_t fsuid_into_mnt(struct user_namespace *to)
+{
+#ifdef CONFIG_IDMAP_MOUNTS
+   return kuid_from_mnt(to, current_fsuid());
+#else
+   return current_fsuid();
+#endif
+}
+
+static inline kgid_t fsgid_into_mnt(struct user_namespace *to)
+{
+#ifdef CONFIG_IDMAP_MOUNTS
+   return kgid_from_mnt(to, current_fsgid());
+#else
+   return current_fsgid();
+#endif
+}
+
 extern struct timespec64 current_time(struct inode *inode);
 
 /*
-- 
2.29.0

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

[PATCH 28/34] audit: handle idmapped mounts

2020-10-29 Thread Christian Brauner

Audit will sometimes log the inode's i_uid and i_gid. Enable audit to log the
mapped inode when it is accessed from an idmapped mount.

Signed-off-by: Christian Brauner 
---
 fs/namei.c| 14 +++---
 include/linux/audit.h | 10 ++
 ipc/mqueue.c  |  8 
 kernel/auditsc.c  | 26 ++
 4 files changed, 31 insertions(+), 27 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index d6dbfab126d7..545ce391a12d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1006,7 +1006,7 @@ static inline int may_follow_link(struct nameidata *nd, 
const struct inode *inod
if (nd->flags & LOOKUP_RCU)
return -ECHILD;
 
-   audit_inode(nd->name, nd->stack[0].link.dentry, 0);
+   audit_inode(nd->name, user_ns, nd->stack[0].link.dentry, 0);
audit_log_path_denied(AUDIT_ANOM_LINK, "follow_link");
return -EACCES;
 }
@@ -2413,7 +2413,7 @@ int filename_lookup(int dfd, struct filename *name, 
unsigned flags,
retval = path_lookupat(, flags | LOOKUP_REVAL, path);
 
if (likely(!retval))
-   audit_inode(name, path->dentry,
+   audit_inode(name, mnt_user_ns(path->mnt), path->dentry,
flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : 0);
restore_nameidata();
putname(name);
@@ -2455,7 +2455,7 @@ static struct filename *filename_parentat(int dfd, struct 
filename *name,
if (likely(!retval)) {
*last = nd.last;
*type = nd.last_type;
-   audit_inode(name, parent->dentry, AUDIT_INODE_PARENT);
+   audit_inode(name, mnt_user_ns(parent->mnt), parent->dentry, 
AUDIT_INODE_PARENT);
} else {
putname(name);
name = ERR_PTR(retval);
@@ -3239,7 +3239,7 @@ static const char *open_last_lookups(struct nameidata *nd,
if (unlikely(error))
return ERR_PTR(error);
}
-   audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
+   audit_inode(nd->name, mnt_user_ns(nd->path.mnt), dir, 
AUDIT_INODE_PARENT);
/* trailing slashes? */
if (unlikely(nd->last.name[nd->last.len]))
return ERR_PTR(-EISDIR);
@@ -3305,7 +3305,7 @@ static int do_open(struct nameidata *nd,
return error;
}
if (!(file->f_mode & FMODE_CREATED))
-   audit_inode(nd->name, nd->path.dentry, 0);
+   audit_inode(nd->name, mnt_user_ns(nd->path.mnt), 
nd->path.dentry, 0);
if (open_flag & O_CREAT) {
if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
return -EEXIST;
@@ -3414,7 +3414,7 @@ static int do_tmpfile(struct nameidata *nd, unsigned 
flags,
goto out2;
dput(path.dentry);
path.dentry = child;
-   audit_inode(nd->name, child, 0);
+   audit_inode(nd->name, user_ns, child, 0);
/* Don't check for other permissions, the inode was just created */
error = may_open(, 0, op->open_flag);
if (error)
@@ -3433,7 +3433,7 @@ static int do_o_path(struct nameidata *nd, unsigned 
flags, struct file *file)
struct path path;
int error = path_lookupat(nd, flags, );
if (!error) {
-   audit_inode(nd->name, path.dentry, 0);
+   audit_inode(nd->name, mnt_user_ns(path.mnt), path.dentry, 0);
error = vfs_open(, file);
path_put();
}
diff --git a/include/linux/audit.h b/include/linux/audit.h
index b3d859831a31..217d2b0c273e 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -293,8 +293,8 @@ extern void __audit_syscall_exit(int ret_success, long 
ret_value);
 extern struct filename *__audit_reusename(const __user char *uptr);
 extern void __audit_getname(struct filename *name);
 extern void __audit_getcwd(void);
-extern void __audit_inode(struct filename *name, const struct dentry *dentry,
-   unsigned int flags);
+extern void __audit_inode(struct filename *name, struct user_namespace 
*user_ns,
+ const struct dentry *dentry, unsigned int flags);
 extern void __audit_file(const struct file *);
 extern void __audit_inode_child(struct inode *parent,
const struct dentry *dentry,
@@ -357,10 +357,11 @@ static inline void audit_getcwd(void)
__audit_getcwd();
 }
 static inline void audit_inode(struct filename *name,
+   struct user_namespace *user_ns,
const struct dentry *dentry,
unsigned int aflags) {
if (unlikely(!audit_dummy_context()))
-   __audit_inode(name, dentry, aflags);
+

[PATCH 15/34] stat: add mapped_generic_fillattr()

2020-10-29 Thread Christian Brauner

The generic_fillattr() helper fills in the basic attributes associated with an
inode. Add a mapped_generic_fillattr() helper to handle idmapped mounts. If the
inode is accessed through an idmapped mount we need to map it according to the
mount's user namespace. If the initial user namespace is passed all operations
are a nop so non-idmapped mounts will not see a change in behavior and will
also not see any performance impact. This also means that the
non-idmapped-mount aware generic_fillattr() helper can be implemented on top of
the idmapped-mount aware mapped_generic_fillattr() helper.

Signed-off-by: Christian Brauner 
---
 fs/stat.c  | 18 +-
 include/linux/fs.h |  1 +
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/fs/stat.c b/fs/stat.c
index dacecdda2e79..ee6d92aec7ac 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -25,7 +25,8 @@
 #include "mount.h"
 
 /**
- * generic_fillattr - Fill in the basic attributes from the inode struct
+ * mapped_generic_fillattr - Fill in the basic attributes from the inode 
struct on idmapped mounts
+ * @user_ns: the user namespace from which we access this inode
  * @inode: Inode to use as the source
  * @stat: Where to fill in the attributes
  *
@@ -33,14 +34,15 @@
  * found on the VFS inode structure.  This is the default if no getattr inode
  * operation is supplied.
  */
-void generic_fillattr(struct inode *inode, struct kstat *stat)
+void mapped_generic_fillattr(struct user_namespace *mnt_user_ns,
+struct inode *inode, struct kstat *stat)
 {
stat->dev = inode->i_sb->s_dev;
stat->ino = inode->i_ino;
stat->mode = inode->i_mode;
stat->nlink = inode->i_nlink;
-   stat->uid = inode->i_uid;
-   stat->gid = inode->i_gid;
+   stat->uid = i_uid_into_mnt(mnt_user_ns, inode);
+   stat->gid = i_gid_into_mnt(mnt_user_ns, inode);
stat->rdev = inode->i_rdev;
stat->size = i_size_read(inode);
stat->atime = inode->i_atime;
@@ -49,6 +51,12 @@ void generic_fillattr(struct inode *inode, struct kstat 
*stat)
stat->blksize = i_blocksize(inode);
stat->blocks = inode->i_blocks;
 }
+EXPORT_SYMBOL(mapped_generic_fillattr);
+
+void generic_fillattr(struct inode *inode, struct kstat *stat)
+{
+   mapped_generic_fillattr(_user_ns, inode, stat);
+}
 EXPORT_SYMBOL(generic_fillattr);
 
 /**
@@ -87,7 +95,7 @@ int vfs_getattr_nosec(const struct path *path, struct kstat 
*stat,
return inode->i_op->getattr(path, stat, request_mask,
query_flags);
 
-   generic_fillattr(inode, stat);
+   mapped_generic_fillattr(mnt_user_ns(path->mnt), inode, stat);
return 0;
 }
 EXPORT_SYMBOL(vfs_getattr_nosec);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f41d93b0e6d7..e66852dee65d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3162,6 +3162,7 @@ extern int page_symlink(struct inode *inode, const char 
*symname, int len);
 extern const struct inode_operations page_symlink_inode_operations;
 extern void kfree_link(void *);
 extern void generic_fillattr(struct inode *, struct kstat *);
+extern void mapped_generic_fillattr(struct user_namespace *, struct inode *, 
struct kstat *);
 extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, 
unsigned int);
 extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int);
 void __inode_add_bytes(struct inode *inode, loff_t bytes);
-- 
2.29.0

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

[PATCH 10/34] attr: handle idmapped mounts

2020-10-29 Thread Christian Brauner

When file attributes are changed filesystems mostly rely on the
setattr_prepare(), setattr_copy(), and notify_change() helpers for
initialization and permission checking. Add the
setattr_mapped_prepare(), setattr_mapped_copy(), and
notify_mapped_change() helpers to handle idmapped mounts. If the inode
is accessed through an idmapped mount we need to map it according to the
mount's user namespace. Afterwards the checks are identical to
non-idmapped mounts. If the initial user namespace is passed all
operations are a nop so non-idmapped mounts will not see a change in
behavior and will also not see any performance impact. It also means
that the inode_owner_or_capable() helper can be implemented on top of
mapped_inode_owner_or_capable() by passing in the initial user
namespace. Helpers that perform checks on the ia_uid and ia_gid fields
in struct iattr assume that ia_uid and ia_gid are intended values and so
they won't be mapped according to the mount's user namespace. This is
more transparent to the caller and further aligns the permission for
notify_change() and notify_mapped_change().

If the initial user namespace is passed all operations are a nop so
non-idmapped mounts will not see a change in behavior and will also not
see any performance impact. It also means that the
setattr_prepare(), setattr_copy(), and notify_change() helpers can
simply be implemented on top of setattr_mapped_prepare(),
setattr_mapped_copy(), and notify_mapped_change() by passing in the
initial user namespace.

Signed-off-by: Christian Brauner 
---
 fs/attr.c  | 136 ++---
 include/linux/fs.h |   6 ++
 2 files changed, 110 insertions(+), 32 deletions(-)

diff --git a/fs/attr.c b/fs/attr.c
index b4bbdbd4c8ca..f39c03ac85e0 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -18,34 +18,39 @@
 #include 
 #include 
 
-static bool chown_ok(const struct inode *inode, kuid_t uid)
+static bool chown_ok(struct user_namespace *user_ns,
+const struct inode *inode,
+kuid_t uid)
 {
-   if (uid_eq(current_fsuid(), inode->i_uid) &&
-   uid_eq(uid, inode->i_uid))
+   kuid_t kuid = i_uid_into_mnt(user_ns, inode);
+   if (uid_eq(current_fsuid(), kuid) && uid_eq(uid, kuid))
return true;
-   if (capable_wrt_inode_uidgid(inode, CAP_CHOWN))
+   if (capable_wrt_mapped_inode_uidgid(user_ns, inode, CAP_CHOWN))
return true;
-   if (uid_eq(inode->i_uid, INVALID_UID) &&
+   if (uid_eq(kuid, INVALID_UID) &&
ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN))
return true;
return false;
 }
 
-static bool chgrp_ok(const struct inode *inode, kgid_t gid)
+static bool chgrp_ok(struct user_namespace *user_ns,
+const struct inode *inode, kgid_t gid)
 {
-   if (uid_eq(current_fsuid(), inode->i_uid) &&
-   (in_group_p(gid) || gid_eq(gid, inode->i_gid)))
+   kgid_t kgid = i_gid_into_mnt(user_ns, inode);
+   if (uid_eq(current_fsuid(), i_uid_into_mnt(user_ns, inode)) &&
+   (in_group_p(gid) || gid_eq(gid, kgid)))
return true;
-   if (capable_wrt_inode_uidgid(inode, CAP_CHOWN))
+   if (capable_wrt_mapped_inode_uidgid(user_ns, inode, CAP_CHOWN))
return true;
-   if (gid_eq(inode->i_gid, INVALID_GID) &&
+   if (gid_eq(kgid, INVALID_GID) &&
ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN))
return true;
return false;
 }
 
 /**
- * setattr_prepare - check if attribute changes to a dentry are allowed
+ * setattr_mapped_prepare - check if attribute changes to a dentry are allowed
+ * @user_ns:   user namespace of the mount
  * @dentry:dentry to check
  * @attr:  attributes to change
  *
@@ -58,7 +63,8 @@ static bool chgrp_ok(const struct inode *inode, kgid_t gid)
  * Should be called as the first thing in ->setattr implementations,
  * possibly after taking additional locks.
  */
-int setattr_prepare(struct dentry *dentry, struct iattr *attr)
+int setattr_mapped_prepare(struct user_namespace *user_ns,
+  struct dentry *dentry, struct iattr *attr)
 {
struct inode *inode = d_inode(dentry);
unsigned int ia_valid = attr->ia_valid;
@@ -78,27 +84,27 @@ int setattr_prepare(struct dentry *dentry, struct iattr 
*attr)
goto kill_priv;
 
/* Make sure a caller can chown. */
-   if ((ia_valid & ATTR_UID) && !chown_ok(inode, attr->ia_uid))
+   if ((ia_valid & ATTR_UID) && !chown_ok(user_ns, inode, attr->ia_uid))
return -EPERM;
 
/* Make sure caller can chgrp. */
-   if ((ia_valid & ATTR_GID) && !chgrp_ok(inode, attr->ia_gid))
+   if ((ia_valid & ATTR_GID) && !chgrp_ok(user_ns, inode, attr->ia_gid))

[PATCH 27/34] apparmor: handle idmapped mounts

2020-10-29 Thread Christian Brauner

The i_uid and i_gid are only ever used when logging for AppArmor. This is
already broken in a bunch of places where the global root id is reported
instead of the i_uid or i_gid of the file. Nonetheless, be kind and log the
mapped inode if we're coming from an idmapped mount.

Signed-off-by: Christian Brauner 
---
 security/apparmor/domain.c |  9 ++---
 security/apparmor/file.c   |  5 -
 security/apparmor/lsm.c| 12 
 3 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c
index f919ebd042fd..91b132961b67 100644
--- a/security/apparmor/domain.c
+++ b/security/apparmor/domain.c
@@ -10,12 +10,14 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
 #include 
 #include 
 #include 
+#include 
 
 #include "include/audit.h"
 #include "include/apparmorfs.h"
@@ -858,8 +860,10 @@ int apparmor_bprm_creds_for_exec(struct linux_binprm *bprm)
const char *info = NULL;
int error = 0;
bool unsafe = false;
+   struct user_namespace *user_ns = mnt_user_ns(bprm->file->f_path.mnt);
+   kuid_t i_uid = i_uid_into_mnt(user_ns, file_inode(bprm->file));
struct path_cond cond = {
-   file_inode(bprm->file)->i_uid,
+   i_uid,
file_inode(bprm->file)->i_mode
};
 
@@ -967,8 +971,7 @@ int apparmor_bprm_creds_for_exec(struct linux_binprm *bprm)
error = fn_for_each(label, profile,
aa_audit_file(profile, , OP_EXEC, MAY_EXEC,
  bprm->filename, NULL, new,
- file_inode(bprm->file)->i_uid, info,
- error));
+ i_uid, info, error));
aa_put_label(new);
goto done;
 }
diff --git a/security/apparmor/file.c b/security/apparmor/file.c
index 92acf9a49405..d6d9e71f1900 100644
--- a/security/apparmor/file.c
+++ b/security/apparmor/file.c
@@ -11,6 +11,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include "include/apparmor.h"
 #include "include/audit.h"
@@ -508,8 +510,9 @@ static int __file_path_perm(const char *op, struct aa_label 
*label,
 {
struct aa_profile *profile;
struct aa_perms perms = {};
+   struct user_namespace *user_ns = mnt_user_ns(file->f_path.mnt);
struct path_cond cond = {
-   .uid = file_inode(file)->i_uid,
+   .uid = i_uid_into_mnt(user_ns, file_inode(file)),
.mode = file_inode(file)->i_mode
};
char *buffer;
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index ffeaee5ed968..ece9afc3994f 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -224,7 +224,8 @@ static int common_perm(const char *op, const struct path 
*path, u32 mask,
  */
 static int common_perm_cond(const char *op, const struct path *path, u32 mask)
 {
-   struct path_cond cond = { d_backing_inode(path->dentry)->i_uid,
+   struct user_namespace *user_ns = mnt_user_ns(path->mnt);
+   struct path_cond cond = { i_uid_into_mnt(user_ns, 
d_backing_inode(path->dentry)),
  d_backing_inode(path->dentry)->i_mode
};
 
@@ -266,12 +267,13 @@ static int common_perm_rm(const char *op, const struct 
path *dir,
  struct dentry *dentry, u32 mask)
 {
struct inode *inode = d_backing_inode(dentry);
+   struct user_namespace *user_ns = mnt_user_ns(dir->mnt);
struct path_cond cond = { };
 
if (!inode || !path_mediated_fs(dentry))
return 0;
 
-   cond.uid = inode->i_uid;
+   cond.uid = i_uid_into_mnt(user_ns, inode);
cond.mode = inode->i_mode;
 
return common_perm_dir_dentry(op, dir, dentry, mask, );
@@ -361,11 +363,12 @@ static int apparmor_path_rename(const struct path 
*old_dir, struct dentry *old_d
 
label = begin_current_label_crit_section();
if (!unconfined(label)) {
+   struct user_namespace *user_ns = mnt_user_ns(old_dir->mnt);
struct path old_path = { .mnt = old_dir->mnt,
 .dentry = old_dentry };
struct path new_path = { .mnt = new_dir->mnt,
 .dentry = new_dentry };
-   struct path_cond cond = { d_backing_inode(old_dentry)->i_uid,
+   struct path_cond cond = { i_uid_into_mnt(user_ns, 
d_backing_inode(old_dentry)),
  d_backing_inode(old_dentry)->i_mode
};
 
@@ -420,8 +423,9 @@ static int apparmor_file_open(struct file *file)
 
label = aa_get_newest_cred_label(file->f_cred);
if (!unconfined(label)) {
+   struct user_namespace *user_ns = mnt_user_ns(file->f_path.mnt)

[PATCH 32/34] overlayfs: handle idmapped lower directories

2020-10-29 Thread Christian Brauner

   3 root   root4096 Sep 25 08:04 dev
 drwxr-xr-x  61 root   root4096 Sep 25 08:04 etc

 # Create a file as root
 root@f2-vm:/merged# touch /merged/A-FILE

 root@f2-vm:/merged# ls -al /merged/A-FILE
 -rw-r--r-- 1 root root 0 Oct 28 12:16 /merged/A-FILE

 # Chown the file to a simple user
 root@f2-vm:/merged# chown 1000:1000 /merged/A-FILE

 root@f2-vm:/merged# ls -al /merged/A-FILE
 -rw-r--r-- 1 ubuntu ubuntu 0 Oct 28 12:16 /merged/A-FILE

 # Create a directory and delegate to simple user
 root@f2-vm:/merged# mkdir /merged/A-DIR

 root@f2-vm:/merged# chown 1000:1000 /merged/A-DIR/

 # Login as user
 root@f2-vm:/merged# sudo -u ubuntu -- bash -i

 # Create a file as simple user
 ubuntu@f2-vm:/merged$ touch /merged/A-DIR/A-USER-FILE

 ubuntu@f2-vm:/merged$ ls -al /merged/A-DIR/A-USER-FILE
 -rw-rw-r-- 1 ubuntu ubuntu 0 Oct 28 12:18 /merged/A-DIR/A-USER-FILE

 # Let's look at these files in our idmapped upper directory
 ubuntu@f2-vm:/$ ls -alR /upper/upper/
 /upper/upper/:
 total 12
 drwxr-xr-x 3 root   root   4096 Oct 28 12:23 .
 drwxr-xr-x 4 root   root   4096 Oct 21 13:48 ..
 drwxr-xr-x 2 ubuntu ubuntu 4096 Oct 28 12:18 A-DIR
 -rw-r--r-- 1 ubuntu ubuntu0 Oct 28 12:16 A-FILE

 /upper/upper/A-DIR:
 total 8
 drwxr-xr-x 2 ubuntu ubuntu 4096 Oct 28 12:18 .
 drwxr-xr-x 3 root   root   4096 Oct 28 12:23 ..
 -rw-rw-r-- 1 ubuntu ubuntu0 Oct 28 12:18 A-USER-FILE

 # Let's remove the idmapped /upper mount (overlayfs will have its own private 
mount anyway)
 umount /upper

 # Let's look at these files in our upper directory with the idmapped mount 
removed
 ubuntu@f2-vm:/$ ls -alR /upper/upper/
 /upper/upper/:
 total 12
 drwxr-xr-x 3 1 1 4096 Oct 28 12:23 .
 drwxr-xr-x 4 1 1 4096 Oct 21 13:48 ..
 drwxr-xr-x 2 11000 11000 4096 Oct 28 12:18 A-DIR
 -rw-r--r-- 1 11000 110000 Oct 28 12:16 A-FILE

 /upper/upper/A-DIR:
 total 8
 drwxr-xr-x 2 11000 11000 4096 Oct 28 12:18 .
 drwxr-xr-x 3 1 1 4096 Oct 28 12:23 ..
 -rw-rw-r-- 1 11000 110000 Oct 28 12:18 A-USER-FILE

 # Let's create a few acls from the /merged directory  on an already existing 
file
 # triggering a copy-up operation
  root@f2-vm:/merged# setfacl -m u:1000:rwx /merged/asdf
  root@f2-vm:/merged# getfacl /merged/asdf
  getfacl: Removing leading '/' from absolute path names
  # file: merged/asdf
  # owner: root
  # group: root
  user::rw-
  user:ubuntu:rwx
  group::r--
  mask::rwx
  other::r--

  # Let's look at this file from our upper directory
  root@f2-vm:/merged# getfacl /upper/upper/asdf
  getfacl: Removing leading '/' from absolute path names
  # file: upper/upper/asdf
  # owner: 1
  # group: 1
  user::rw-
  user:11000:rwx
  group::r--
  mask::rwx
  other::r--

Cc: Seth Forshee 
Cc: Amir Goldstein 
Signed-off-by: Christian Brauner 
---
 fs/overlayfs/copy_up.c   | 100 +++---
 fs/overlayfs/dir.c   | 151 ++-
 fs/overlayfs/export.c|   3 +-
 fs/overlayfs/file.c  |  23 +++---
 fs/overlayfs/inode.c |  89 ++-
 fs/overlayfs/namei.c |  64 ++---
 fs/overlayfs/overlayfs.h | 149 ++
 fs/overlayfs/ovl_entry.h |   1 +
 fs/overlayfs/readdir.c   |  34 +
 fs/overlayfs/super.c | 106 ---
 fs/overlayfs/util.c  |  38 +-
 11 files changed, 494 insertions(+), 264 deletions(-)

diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 955ecd4030f0..1b8721796fd4 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -43,7 +43,8 @@ static bool ovl_must_copy_xattr(const char *name)
   !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
 }
 
-int ovl_copy_xattr(struct super_block *sb, struct dentry *old,
+int ovl_copy_xattr(struct super_block *sb, struct user_namespace *old_user_ns,
+  struct dentry *old, struct user_namespace *new_user_ns,
   struct dentry *new)
 {
ssize_t list_size, size, value_size = 0;
@@ -85,9 +86,9 @@ int ovl_copy_xattr(struct super_block *sb, struct dentry *old,
if (ovl_is_private_xattr(sb, name))
continue;
 retry:
-   size = vfs_getxattr(old, name, value, value_size);
+   size = vfs_mapped_getxattr(old_user_ns, old, name, value, 
value_size);
if (size == -ERANGE)
-   size = vfs_getxattr(old, name, NULL, 0);
+   size = vfs_mapped_getxattr(old_user_ns, old, name, 
NULL, 0);
 
if (size < 0) {
error = size;
@@ -114,7 +115,7 @@ int ovl_copy_xattr(struct super_block *sb, struct dentry 
*old,
error = 0;
continue; /* Discard */
}
-   error = vfs_setxattr(new, name, value, size, 0);
+   error = vfs_mapped_setxattr(new_user_ns, new, name, value, 
size, 0);

[PATCH 13/34] selftests: add idmapped mounts xattr selftest

2020-10-29 Thread Christian Brauner

From: Tycho Andersen 

Add some tests for setting extended attributes on idmapped mounts.

Signed-off-by: Tycho Andersen 
Signed-off-by: Christian Brauner 
---
 .../testing/selftests/idmap_mounts/.gitignore |   1 +
 tools/testing/selftests/idmap_mounts/Makefile |   8 +
 tools/testing/selftests/idmap_mounts/config   |   1 +
 tools/testing/selftests/idmap_mounts/xattr.c  | 389 ++
 4 files changed, 399 insertions(+)
 create mode 100644 tools/testing/selftests/idmap_mounts/.gitignore
 create mode 100644 tools/testing/selftests/idmap_mounts/Makefile
 create mode 100644 tools/testing/selftests/idmap_mounts/config
 create mode 100644 tools/testing/selftests/idmap_mounts/xattr.c

diff --git a/tools/testing/selftests/idmap_mounts/.gitignore 
b/tools/testing/selftests/idmap_mounts/.gitignore
new file mode 100644
index ..18c5e90522ad
--- /dev/null
+++ b/tools/testing/selftests/idmap_mounts/.gitignore
@@ -0,0 +1 @@
+xattr
diff --git a/tools/testing/selftests/idmap_mounts/Makefile 
b/tools/testing/selftests/idmap_mounts/Makefile
new file mode 100644
index ..ce0549b09b2a
--- /dev/null
+++ b/tools/testing/selftests/idmap_mounts/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+# Makefile for mount selftests.
+CFLAGS = -g -I../../../../usr/include/ -Wall -O2 -pthread
+
+TEST_GEN_FILES += xattr
+
+include ../lib.mk
+
diff --git a/tools/testing/selftests/idmap_mounts/config 
b/tools/testing/selftests/idmap_mounts/config
new file mode 100644
index ..80730abc534b
--- /dev/null
+++ b/tools/testing/selftests/idmap_mounts/config
@@ -0,0 +1 @@
+CONFIG_IDMAP_MOUNTS=y
diff --git a/tools/testing/selftests/idmap_mounts/xattr.c 
b/tools/testing/selftests/idmap_mounts/xattr.c
new file mode 100644
index ..a3d70294ce43
--- /dev/null
+++ b/tools/testing/selftests/idmap_mounts/xattr.c
@@ -0,0 +1,389 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "../kselftest_harness.h"
+
+#ifndef __NR_mount_setattr
+   #if defined __alpha__
+   #define __NR_mount_setattr 551
+   #elif defined _MIPS_SIM
+   #if _MIPS_SIM == _MIPS_SIM_ABI32/* o32 */
+   #define __NR_mount_setattr 4441
+   #endif
+   #if _MIPS_SIM == _MIPS_SIM_NABI32   /* n32 */
+   #define __NR_mount_setattr 6441
+   #endif
+   #if _MIPS_SIM == _MIPS_SIM_ABI64/* n64 */
+   #define __NR_mount_setattr 5441
+   #endif
+   #elif defined __ia64__
+   #define __NR_mount_setattr (441 + 1024)
+   #else
+   #define __NR_mount_setattr 441
+   #endif
+
+#ifndef __NR_open_tree
+   #if defined __alpha__
+   #define __NR_open_tree 538
+   #elif defined _MIPS_SIM
+   #if _MIPS_SIM == _MIPS_SIM_ABI32/* o32 */
+   #define __NR_open_tree 4428
+   #endif
+   #if _MIPS_SIM == _MIPS_SIM_NABI32   /* n32 */
+   #define __NR_open_tree 6428
+   #endif
+   #if _MIPS_SIM == _MIPS_SIM_ABI64/* n64 */
+   #define __NR_open_tree 5428
+   #endif
+   #elif defined __ia64__
+   #define __NR_open_tree (428 + 1024)
+   #else
+   #define __NR_open_tree 428
+   #endif
+#endif
+
+#ifndef __NR_move_mount
+   #if defined __alpha__
+   #define __NR_move_mount 539
+   #elif defined _MIPS_SIM
+   #if _MIPS_SIM == _MIPS_SIM_ABI32/* o32 */
+   #define __NR_move_mount 4429
+   #endif
+   #if _MIPS_SIM == _MIPS_SIM_NABI32   /* n32 */
+   #define __NR_move_mount 6429
+   #endif
+   #if _MIPS_SIM == _MIPS_SIM_ABI64/* n64 */
+   #define __NR_move_mount 5429
+   #endif
+   #elif defined __ia64__
+   #define __NR_move_mount (428 + 1024)
+   #else
+   #define __NR_move_mount 429
+   #endif
+#endif
+
+
+struct mount_attr {
+   __u64 attr_set;
+   __u64 attr_clr;
+   __u64 propagation;
+   __u32 userns;
+};
+#endif
+
+#ifndef MOVE_MOUNT_F_EMPTY_PATH
+#define MOVE_MOUNT_F_EMPTY_PATH 0x0004 /* Empty from path permitted */
+#endif
+
+#ifndef MOUNT_ATTR_SHIFT
+#define MOUNT_ATTR_SHIFT 0x0010
+#endif
+
+#ifndef OPEN_TREE_CLONE
+#define OPEN_TREE_CLONE 1
+#endif
+
+#ifndef OPEN_TREE_CLOEXEC
+#define OPEN_TREE_CLOEXEC O_CLOEXEC
+#endif
+
+#ifndef AT_RECURSIVE
+#define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */
+#endif
+
+static inline int sys_mount_setattr(int dfd, const char *path, unsigned int 
flags,
+   struct mount_attr *attr, size_t size)
+{
+   return syscall(__NR_mount_setattr, dfd, p

[PATCH 29/34] ima: handle idmapped mounts

2020-10-29 Thread Christian Brauner

IMA does sometimes access the inode's i_uid and compares it against the rules'
fowner. Enable IMA to handle idmapped mounts by passing down the mount's user
namespace. We simply make use of the helpers we introduced before.

Signed-off-by: Christian Brauner 
---
 fs/attr.c|  2 +-
 fs/namei.c   |  4 +--
 include/linux/ima.h  | 15 ++-
 security/integrity/ima/ima.h | 19 -
 security/integrity/ima/ima_api.c | 10 ---
 security/integrity/ima/ima_appraise.c| 14 +-
 security/integrity/ima/ima_asymmetric_keys.c |  2 +-
 security/integrity/ima/ima_main.c| 28 
 security/integrity/ima/ima_policy.c  | 17 ++--
 security/integrity/ima/ima_queue_keys.c  |  2 +-
 10 files changed, 66 insertions(+), 47 deletions(-)

diff --git a/fs/attr.c b/fs/attr.c
index d13ef3f8eac0..9b05608bacd3 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -384,7 +384,7 @@ int notify_mapped_change(struct user_namespace *user_ns, 
struct dentry *dentry,
 
if (!error) {
fsnotify_change(dentry, ia_valid);
-   ima_inode_post_setattr(dentry);
+   ima_inode_post_setattr(user_ns, dentry);
evm_inode_post_setattr(dentry, ia_valid);
}
 
diff --git a/fs/namei.c b/fs/namei.c
index 545ce391a12d..ba78b57e1d86 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3379,7 +3379,7 @@ struct dentry *vfs_mapped_tmpfile(struct user_namespace 
*user_ns,
inode->i_state |= I_LINKABLE;
spin_unlock(>i_lock);
}
-   ima_post_create_tmpfile(inode);
+   ima_post_create_tmpfile(user_ns, inode);
return child;
 
 out_err:
@@ -3703,7 +3703,7 @@ static long do_mknodat(int dfd, const char __user 
*filename, umode_t mode,
error = vfs_mapped_create(user_ns, path.dentry->d_inode,
  dentry, mode, true);
if (!error)
-   ima_post_path_mknod(dentry);
+   ima_post_path_mknod(user_ns, dentry);
break;
case S_IFCHR: case S_IFBLK:
error = vfs_mapped_mknod(user_ns, path.dentry->d_inode,
diff --git a/include/linux/ima.h b/include/linux/ima.h
index 8fa7bcfb2da2..c3e3c260ad40 100644
--- a/include/linux/ima.h
+++ b/include/linux/ima.h
@@ -16,7 +16,7 @@ struct linux_binprm;
 #ifdef CONFIG_IMA
 extern int ima_bprm_check(struct linux_binprm *bprm);
 extern int ima_file_check(struct file *file, int mask);
-extern void ima_post_create_tmpfile(struct inode *inode);
+extern void ima_post_create_tmpfile(struct user_namespace *user_ns, struct 
inode *inode);
 extern void ima_file_free(struct file *file);
 extern int ima_file_mmap(struct file *file, unsigned long prot);
 extern int ima_file_mprotect(struct vm_area_struct *vma, unsigned long prot);
@@ -27,7 +27,8 @@ extern int ima_read_file(struct file *file, enum 
kernel_read_file_id id,
 bool contents);
 extern int ima_post_read_file(struct file *file, void *buf, loff_t size,
  enum kernel_read_file_id id);
-extern void ima_post_path_mknod(struct dentry *dentry);
+extern void ima_post_path_mknod(struct user_namespace *user_ns,
+   struct dentry *dentry);
 extern int ima_file_hash(struct file *file, char *buf, size_t buf_size);
 extern void ima_kexec_cmdline(int kernel_fd, const void *buf, int size);
 
@@ -61,7 +62,8 @@ static inline int ima_file_check(struct file *file, int mask)
return 0;
 }
 
-static inline void ima_post_create_tmpfile(struct inode *inode)
+static inline void ima_post_create_tmpfile(struct user_namespace *user_ns,
+  struct inode *inode)
 {
 }
 
@@ -105,7 +107,8 @@ static inline int ima_post_read_file(struct file *file, 
void *buf, loff_t size,
return 0;
 }
 
-static inline void ima_post_path_mknod(struct dentry *dentry)
+static inline void ima_post_path_mknod(struct user_namespace *user_ns,
+  struct dentry *dentry)
 {
return;
 }
@@ -141,7 +144,7 @@ static inline void ima_post_key_create_or_update(struct key 
*keyring,
 
 #ifdef CONFIG_IMA_APPRAISE
 extern bool is_ima_appraise_enabled(void);
-extern void ima_inode_post_setattr(struct dentry *dentry);
+extern void ima_inode_post_setattr(struct user_namespace *user_ns, struct 
dentry *dentry);
 extern int ima_inode_setxattr(struct dentry *dentry, const char *xattr_name,
   const void *xattr_value, size_t xattr_value_len);
 extern int ima_inode_removexattr(struct dentry *dentry, const char 
*xattr_name);
@@ -151,7 +154,7 @@ static inline bool is_ima_appraise_enabled(void)
return 0;
 }
 
-static inline void ima_inode_post_setattr(struct dentry *den

[PATCH 03/34] fs: add mount_setattr()

2020-10-29 Thread Christian Brauner

This implements the mount_setattr() syscall. While the new mount api
allows changing the properties of a superblock, there is currently no
way to change the mount properties of a mount or mount tree using mount
file descriptors which the new mount api is based on. In addition the
old mount api has the restriction that mount options cannot be
applied recursively. This hasn't changed since changing mount options on
a per-mount basis was implemented in [1] and has been a frequent
request.
The legacy mount() syscall is currently unable to accommodate this behavior
without introducing a whole new set of flags because MS_REC | MS_REMOUNT
| MS_BIND | MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to
the topmost mount. Changing MS_REC to apply to the whole mount tree
would mean introducing a significant uapi change and would likely cause
significant regressions.

The new mount_setattr() syscall allows recursively clearing and setting
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:

int mount_setattr(int dfd, const char *path, unsigned flags,
  struct mount_attr *uattr, size_t usize);

Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.

mount_setattr() can be expected to grow over time and is designed with
extensibility in mind. It follows the extensible syscall pattern we have
used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:

struct mount_attr {
__u64 attr_set;
__u64 attr_clr;
__u32 propagation;
};

The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.
Note, since the MOUNT_ATTR_*ATIME values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.

The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.

Changing mount options has quite a few moving parts and the locking is
quite intricate so it is not unlikely that I got subtleties wrong.

[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts 
at remount")
Cc: David Howells 
Cc: Aleksa Sarai 
Cc: Al Viro 
Cc: linux-...@vger.kernel.org
Cc: linux-fsde...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
 arch/alpha/kernel/syscalls/syscall.tbl  |   1 +
 arch/arm/tools/syscall.tbl  |   1 +
 arch/arm64/include/asm/unistd32.h   |   2 +
 arch/ia64/kernel/syscalls/syscall.tbl   |   1 +
 arch/m68k/kernel/syscalls/syscall.tbl   |   1 +
 arch/microblaze/kernel/syscalls/syscall.tbl |   1 +
 arch/mips/kernel/syscalls/syscall_n32.tbl   |   1 +
 arch/mips/kernel/syscalls/syscall_n64.tbl   |   1 +
 arch/mips/kernel/syscalls/syscall_o32.tbl   |   1 +
 arch/parisc/kernel/syscalls/syscall.tbl |   1 +
 arch/powerpc/kernel/syscalls/syscall.tbl|   1 +
 arch/s390/kernel/syscalls/syscall.tbl   |   1 +
 arch/sh/kernel/syscalls/syscall.tbl |   1 +
 arch/sparc/kernel/syscalls/syscall.tbl  |   1 +
 arch/x86/entry/syscalls/syscall_32.tbl  |   1 +
 arch/x86/entry/syscalls/syscall_64.tbl  |   1 +
 arch/xtensa/kernel/syscalls/syscall.tbl |   1 +
 fs/internal.h   |   8 +
 fs/namespace.c  | 286 ++--
 include/linux/syscalls.h|   3 +
 include/uapi/asm-generic/unistd.h   |   4 +-
 include/uapi/linux/mount.h  |  22 ++
 tools/include/uapi/asm-generic/unistd.h |   4 +-
 23 files changed, 319 insertions(+), 26 deletions(-)

diff --git a/arch/alpha/kernel/syscalls/syscall.tbl 
b/arch/alpha/kerne

[PATCH 00/34] fs: idmapped mounts

2020-10-29 Thread Christian Brauner

096 Apr 15  2020 proc
drwx--  2 2 2 4096 Sep 24 07:43 root
drwxr-xr-x  2 2 2 4096 Sep 24 07:45 run
lrwxrwxrwx  1 2 28 Sep 24 07:43 sbin -> usr/sbin
drwxr-xr-x  2 2 2 4096 Sep 24 07:43 srv
drwxr-xr-x  2 2 2 4096 Apr 15  2020 sys
drwxrwxrwt  2 2 2 4096 Sep 24 07:44 tmp
drwxr-xr-x 13 2 2 4096 Sep 24 07:43 usr
drwxr-xr-x 12 2 2 4096 Sep 24 07:44 var
root@f2-vm:~# /mount-idmapped --map-mount b:2:1:10 
/var/lib/lxc/f2/rootfs/ /mnt
root@f2-vm:~# ls -al /mnt
total 68
drwxr-xr-x 17 1 1 4096 Sep 24 07:48 .
drwxr-xr-x 34 root  root  4096 Oct 28 22:24 ..
lrwxrwxrwx  1 1 17 Sep 24 07:43 bin -> usr/bin
drwxr-xr-x  2 1 1 4096 Apr 15  2020 boot
drwxr-xr-x  3 1 1 4096 Oct 16 19:26 dev
drwxr-xr-x 61 1 1 4096 Oct 16 19:26 etc
drwxr-xr-x  3 1 1 4096 Sep 24 07:45 home
lrwxrwxrwx  1 1 17 Sep 24 07:43 lib -> usr/lib
lrwxrwxrwx  1 1 19 Sep 24 07:43 lib32 -> usr/lib32
lrwxrwxrwx  1 1 19 Sep 24 07:43 lib64 -> usr/lib64
lrwxrwxrwx  1 1 1   10 Sep 24 07:43 libx32 -> usr/libx32
drwxr-xr-x  2 1 1 4096 Sep 24 07:43 media
drwxr-xr-x  2 1 1 4096 Sep 24 07:43 mnt
drwxr-xr-x  2 1 1 4096 Sep 24 07:43 opt
drwxr-xr-x  2 1 1 4096 Apr 15  2020 proc
drwx--  2 1 1 4096 Sep 24 07:43 root
drwxr-xr-x  2 1 1 4096 Sep 24 07:45 run
lrwxrwxrwx  1 1 18 Sep 24 07:43 sbin -> usr/sbin
drwxr-xr-x  2 1 1 4096 Sep 24 07:43 srv
drwxr-xr-x  2 1 1 4096 Apr 15  2020 sys
drwxrwxrwt  2 1 1 4096 Sep 24 07:44 tmp
drwxr-xr-x 13 1 1 4096 Sep 24 07:43 usr
drwxr-xr-x 12 1 1 4096 Sep 24 07:44 var
root@f2-vm:~# lxc-start f2 # uses /mnt as rootfs
root@f2-vm:~# lxc-attach f2 -- cat /proc/1/uid_map
 0  1  1
root@f2-vm:~# lxc-attach f2 -- cat /proc/1/gid_map
 0  1  1
root@f2-vm:~# lxc-attach f2 -- ls -al /
total 52
drwxr-xr-x  17 root   root4096 Sep 24 07:48 .
drwxr-xr-x  17 root   root4096 Sep 24 07:48 ..
lrwxrwxrwx   1 root   root   7 Sep 24 07:43 bin -> usr/bin
drwxr-xr-x   2 root   root4096 Apr 15  2020 boot
drwxr-xr-x   5 root   root 500 Oct 28 23:39 dev
drwxr-xr-x  61 root   root4096 Oct 28 23:39 etc
drwxr-xr-x   3 root   root4096 Sep 24 07:45 home
lrwxrwxrwx   1 root   root   7 Sep 24 07:43 lib -> usr/lib
lrwxrwxrwx   1 root   root   9 Sep 24 07:43 lib32 -> usr/lib32
lrwxrwxrwx   1 root   root   9 Sep 24 07:43 lib64 -> usr/lib64
lrwxrwxrwx   1 root   root  10 Sep 24 07:43 libx32 -> usr/libx32
drwxr-xr-x   2 root   root4096 Sep 24 07:43 media
drwxr-xr-x   2 root   root4096 Sep 24 07:43 mnt
drwxr-xr-x   2 root   root4096 Sep 24 07:43 opt
dr-xr-xr-x 232 nobody nogroup0 Oct 28 23:39 proc
drwx--   2 root   root4096 Oct 28 23:41 root
drwxr-xr-x  12 root   root 360 Oct 28 23:39 run
lrwxrwxrwx   1 root   root   8 Sep 24 07:43 sbin -> usr/sbin
drwxr-xr-x   2 root   root4096 Sep 24 07:43 srv
dr-xr-xr-x  13 nobody nogroup0 Oct 28 23:39 sys
drwxrwxrwt  11 root   root4096 Oct 28 23:40 tmp
drwxr-xr-x  13 root   root4096 Sep 24 07:43 usr
drwxr-xr-x  12 root   root4096 Sep 24 07:44 var
root@f2-vm:~# lxc-attach f2 -- ls -al /my-file
-rw-r--r-- 1 root root 0 Oct 28 23:43 /my-file
root@f2-vm:~# ls -al /var/lib/lxc/f2/rootfs/my-file
-rw-r--r-- 1 2 2 0 Oct 28 23:43 /var/lib/lxc/f2/rootfs/my-file

[1]: https://systemd.io/HOME_DIRECTORY/
 "If the UID assigned to a user does not match the owner of the home
  directory in the file system, the home directory is automatically
  and recursively chown()ed to the correct UID."
  This has huge performance impact and is also problematic since it
  chowns all files independent of ownership.
[2]: https://github.com/brauner/mount-idmapped

In no particular order I'd like to say thanks to:
Al for pointing me into the direction to avoid inode alias issues during
lookup. David for various discussions around this. Tycho for helping
with this series and on future patches if this is in any shape or form
acceptable. Alban Crequy for pointing out more application container
use-cases. Stéphane for various valuable input on various use-cases and
letting me work on this. Amir for explaining and discussing aspects of
overlayfs with me.
I'd like to especially thank Seth Forshee because he provided a lot of
good analysis, suggestions, and participated in short-notice discussions
in both chat and video.

This series can be found and pulled in three locations:
https://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git/log/?h=idmapped_mounts
https://github.com/brauner/linux/tree/idmapped_mounts
https://gitlab.com/brauner/linux/-/commits/idmapped_mounts

Thanks!
Christian

Christian Brauner (32):
  namespace: take lock_mount_hash() directly when changing flags
  namespac

[PATCH 09/34] inode: add idmapped mount aware init and permission helpers

2020-10-29 Thread Christian Brauner

The inode_owner_or_capable() helper determines whether the caller is the
owner of the inode or is capable with respect to that inode. Add a new
mapped_inode_owner_or_capable() helper to handle idmapped mounts. If the
inode is accessed through an idmapped mount we first need to map
it according to the mount's user namespace. Afterwards the checks are
identical to non-idmapped mounts. If the initial user namespace is
passed all operations are a nop so non-idmapped mounts will not see a
change in behavior and will also not see any performance impact. It also
means that the inode_owner_or_capable() helper can be implemented on top
of mapped_inode_owner_or_capable() by passing in the initial user
namespace.

Similarly, we add a new mapped_inode_init_owner() helper which
initializes a new inode on idmapped mounts by mapping the fsuid and
fsgid of the caller from the mount's user namespace. If the initial user
namespace is passed all operations are a nop so non-idmapped mounts will
not see a change in behavior and will also not see any performance
impact. It also means that the inode_init_owner() helper can be
implemented on top of mapped_inode_init_owner() by passing in the
initial user namespace.

Signed-off-by: Christian Brauner 
---
 fs/inode.c | 53 --
 include/linux/fs.h |  4 
 2 files changed, 46 insertions(+), 11 deletions(-)

diff --git a/fs/inode.c b/fs/inode.c
index 9d78c37b00b8..22de3cb3b1f4 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2130,15 +2130,17 @@ void init_special_inode(struct inode *inode, umode_t 
mode, dev_t rdev)
 EXPORT_SYMBOL(init_special_inode);
 
 /**
- * inode_init_owner - Init uid,gid,mode for new inode according to posix 
standards
+ * mapped_inode_init_owner - Init uid,gid,mode for new inode according to posix
+ *   standards on idmapped mounts
  * @inode: New inode
+ * @user_ns: User namespace the inode is accessed from
  * @dir: Directory inode
  * @mode: mode of the new inode
  */
-void inode_init_owner(struct inode *inode, const struct inode *dir,
-   umode_t mode)
+void mapped_inode_init_owner(struct inode *inode, struct user_namespace 
*user_ns,
+const struct inode *dir, umode_t mode)
 {
-   inode->i_uid = current_fsuid();
+   inode->i_uid = fsuid_into_mnt(user_ns);
if (dir && dir->i_mode & S_ISGID) {
inode->i_gid = dir->i_gid;
 
@@ -2146,34 +2148,63 @@ void inode_init_owner(struct inode *inode, const struct 
inode *dir,
if (S_ISDIR(mode))
mode |= S_ISGID;
else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) &&
-!in_group_p(inode->i_gid) &&
-!capable_wrt_inode_uidgid(dir, CAP_FSETID))
+!in_group_p(i_gid_into_mnt(user_ns, inode)) &&
+!capable_wrt_mapped_inode_uidgid(user_ns, dir, 
CAP_FSETID))
mode &= ~S_ISGID;
} else
-   inode->i_gid = current_fsgid();
+   inode->i_gid = fsgid_into_mnt(user_ns);
inode->i_mode = mode;
 }
+EXPORT_SYMBOL(mapped_inode_init_owner);
+
+/**
+ * inode_init_owner - Init uid,gid,mode for new inode according to posix 
standards
+ * @inode: New inode
+ * @dir: Directory inode
+ * @mode: mode of the new inode
+ */
+void inode_init_owner(struct inode *inode, const struct inode *dir,
+   umode_t mode)
+{
+   return mapped_inode_init_owner(inode, &init_user_ns, dir, mode);
+}
 EXPORT_SYMBOL(inode_init_owner);
 
 /**
- * inode_owner_or_capable - check current task permissions to inode
+ * mapped_inode_owner_or_capable - check current task permissions to inode on 
idmapped mounts
+ * @user_ns: User namespace the inode is accessed from
  * @inode: inode being checked
  *
  * Return true if current either has CAP_FOWNER in a namespace with the
  * inode owner uid mapped, or owns the file.
  */
-bool inode_owner_or_capable(const struct inode *inode)
+bool mapped_inode_owner_or_capable(struct user_namespace *user_ns, const 
struct inode *inode)
 {
+   kuid_t i_uid;
struct user_namespace *ns;
 
-   if (uid_eq(current_fsuid(), inode->i_uid))
+   i_uid = i_uid_into_mnt(user_ns, inode);
+   if (uid_eq(current_fsuid(), i_uid))
return true;
 
ns = current_user_ns();
-   if (kuid_has_mapping(ns, inode->i_uid) && ns_capable(ns, CAP_FOWNER))
+   if (kuid_has_mapping(ns, i_uid) && ns_capable(ns, CAP_FOWNER))
return true;
return false;
 }
+EXPORT_SYMBOL(mapped_inode_owner_or_capable);
+
+/**
+ * inode_owner_or_capable - check current task permissions to inode
+ * @inode: inode being checked
+ *
+ * Return true if current either has CAP_FOWNER in a namespace with the
+ * inode owner uid mapped, or

96 matches

Mail list logo