Re: [PATCH] binderfs: implement "max" mount option

2018-12-23 Thread Christian Brauner
On Sat, Dec 22, 2018 at 10:18:06PM +0100, Christian Brauner wrote:
> Since binderfs can be mounted by userns root in non-initial user namespaces
> some precautions are in order. First, a way to set a maximum on the number
> of binder devices that can be allocated per binderfs instance and second, a
> way to reserve a reasonable chunk of binderfs devices for the initial ipc
> namespace.
> A first approach as seen in [1] used sysctls similar to devpts but was
> shown to be flawed (cf. [2] and [3]) since some aspects were unneeded. This
> is an alternative approach which avoids sysctls completely and instead
> switches to a single mount option.
> 
> Starting with this commit binderfs instances can be mounted with a limit on
> the number of binder devices that can be allocated. The max=<count> mount
> option serves as a per-instance limit. If max=<count> is set then only
> <count> number of binder devices can be allocated in this binderfs
> instance.
> Additionally, the binderfs instance in the initial ipc namespace will
> always have a reserve of at least 1024 binder devices unless explicitly
> capped via max=<count>.
> 
> This allows binderfs instances to be safely bind-mounted into unprivileged
> user namespaces since userns root in a non-initial user namespace cannot
> change the mount option as long as it does not own the mount namespace the
> binderfs mount was created in and hence cannot drain the host of minor
> device numbers.
> 
> [1]: https://lore.kernel.org/lkml/20181221133909.18794-1-christ...@brauner.io/
> [2]: https://lore.kernel.org/lkml/20181221163316.ga8...@kroah.com/
> [3]: https://lore.kernel.org/lkml/cahrssex+gdvw4fkkk8oznair9g5icjlyodo8hykv3o0o1jt...@mail.gmail.com/
> [4]: https://lore.kernel.org/lkml/20181221192044.5yvfnuri7gdop...@brauner.io/
> 
> Cc: Todd Kjos 
> Cc: Greg Kroah-Hartman 
> Signed-off-by: Christian Brauner 
> ---
>  drivers/android/binderfs.c | 109 +++--
>  1 file changed, 104 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/android/binderfs.c b/drivers/android/binderfs.c
> index 7496b10532aa..93aee00994d4 100644
> --- a/drivers/android/binderfs.c
> +++ b/drivers/android/binderfs.c
> @@ -20,6 +20,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -39,6 +40,11 @@
>  #define INODE_OFFSET 3
>  #define INTSTRLEN 21
>  #define BINDERFS_MAX_MINOR (1U << MINORBITS)
> +/*
> + * Ensure that the initial ipc namespace always has a good chunk of devices
> + * available.
> + */
> +#define BINDERFS_MAX_MINOR_CAPPED (BINDERFS_MAX_MINOR - 1024)
>  
>  static struct vfsmount *binderfs_mnt;
>  
> @@ -46,6 +52,24 @@ static dev_t binderfs_dev;
>  static DEFINE_MUTEX(binderfs_minors_mutex);
>  static DEFINE_IDA(binderfs_minors);
>  
> +/**
> + * binderfs_mount_opts - mount options for binderfs
> + * @max: maximum number of allocatable binderfs binder devices
> + */
> +struct binderfs_mount_opts {
> + int max;
> +};
> +
> +enum {
> + Opt_max,
> + Opt_err
> +};
> +
> +static const match_table_t tokens = {
> + { Opt_max, "max=%d" },
> + { Opt_err, NULL }
> +};
> +
>  /**
>   * binderfs_info - information about a binderfs mount
>   * @ipc_ns: The ipc namespace the binderfs mount belongs to.
> @@ -55,13 +79,16 @@ static DEFINE_IDA(binderfs_minors);
>   *  created.
>   * @root_gid:   gid that needs to be used when a new binder device is
>   *  created.
> + * @mount_opts: The mount options in use.
> + * @device_count:   The current number of allocated binder devices.
>   */
>  struct binderfs_info {
>   struct ipc_namespace *ipc_ns;
>   struct dentry *control_dentry;
>   kuid_t root_uid;
>   kgid_t root_gid;
> -
> + struct binderfs_mount_opts mount_opts;
> + atomic_t device_count;
>  };
>  
>  static inline struct binderfs_info *BINDERFS_I(const struct inode *inode)
> @@ -107,13 +134,22 @@ static int binderfs_binder_device_create(struct inode 
> *ref_inode,
>   struct inode *inode = NULL;
>   struct super_block *sb = ref_inode->i_sb;
>   struct binderfs_info *info = sb->s_fs_info;
> + bool use_reserve = (info->ipc_ns == _ipc_ns);
>  
>   /* Reserve new minor number for the new device. */
>   mutex_lock(_minors_mutex);
> - minor = ida_alloc_max(_minors, BINDERFS_MAX_MINOR, GFP_KERNEL);
> + if (atomic_inc_return(>device_count) < info->mount_opts.max)
> + minor = ida_alloc_max(_minors,
> +   use_reserve ? BINDERFS_MAX_MINOR :
> + BINDERFS_MAX_MINOR_CAPPED,
> +   GFP_KERNEL);
> + else
> + minor = -ENOSPC;
>   mutex_unlock(_minors_mutex);
> - if (minor < 0)
> + if (minor < 0) {
> + atomic_dec(>device_count);
>   return minor;
> + }
>  
>   ret = -ENOMEM;
>   device = kzalloc(sizeof(*device), GFP_KERNEL);
> @@ -187,6 +223,7 @@ static 

Re: [PATCH] binderfs: implement "max" mount option

2018-12-23 Thread Christian Brauner
On Sun, Dec 23, 2018 at 12:29:44PM +0100, Greg KH wrote:
> On Sat, Dec 22, 2018 at 10:18:06PM +0100, Christian Brauner wrote:
> > Since binderfs can be mounted by userns root in non-initial user namespaces
> > some precautions are in order. First, a way to set a maximum on the number
> > of binder devices that can be allocated per binderfs instance and second, a
> > way to reserve a reasonable chunk of binderfs devices for the initial ipc
> > namespace.
> > A first approach as seen in [1] used sysctls similar to devpts but was
> > shown to be flawed (cf. [2] and [3]) since some aspects were unneeded. This
> > is an alternative approach which avoids sysctls completely and instead
> > switches to a single mount option.
> > 
> > Starting with this commit binderfs instances can be mounted with a limit on
> > the number of binder devices that can be allocated. The max=<count> mount
> > option serves as a per-instance limit. If max=<count> is set then only
> > <count> number of binder devices can be allocated in this binderfs
> > instance.
> 
> Ok, this is fine, but why such a big default?  You only need 4 to run a
> modern android system, and anyone using binder outside of android is
> really too crazy to ever be using it in a container :)
> 
> > Additionally, the binderfs instance in the initial ipc namespace will
> > always have a reserve of at least 1024 binder devices unless explicitly
> > capped via max=<count>.
> 
> Again, why so many?  And why wouldn't that initial ipc namespace already
> have their device nodes created _before_ anything else is mounted?

Right, my issue is with re-creating devices, like if binderfs gets
unmounted or if devices get removed via rm. But we can lower the number
to 4 (see below).

> 
> Some comments on the patch below:

Thanks!

> 
> > +/*
> > + * Ensure that the initial ipc namespace always has a good chunk of devices
> > + * available.
> > + */
> > +#define BINDERFS_MAX_MINOR_CAPPED (BINDERFS_MAX_MINOR - 1024)
> 
> Again that seems crazy big, how about splitting this into two different
> patches, one for the max= stuff, and one for this "reserve some minors"
> thing, so we can review them separately.

Yes, let's do that. I will also lower this to 4 reserved devices.

> 
> >  
> >  static struct vfsmount *binderfs_mnt;
> >  
> > @@ -46,6 +52,24 @@ static dev_t binderfs_dev;
> >  static DEFINE_MUTEX(binderfs_minors_mutex);
> >  static DEFINE_IDA(binderfs_minors);
> >  
> > +/**
> > + * binderfs_mount_opts - mount options for binderfs
> > + * @max: maximum number of allocatable binderfs binder devices
> > + */
> > +struct binderfs_mount_opts {
> > +   int max;
> > +};
> > +
> > +enum {
> > +   Opt_max,
> > +   Opt_err
> > +};
> > +
> > +static const match_table_t tokens = {
> > +   { Opt_max, "max=%d" },
> > +   { Opt_err, NULL }
> > +};
> > +
> >  /**
> >   * binderfs_info - information about a binderfs mount
> >   * @ipc_ns: The ipc namespace the binderfs mount belongs to.
> > @@ -55,13 +79,16 @@ static DEFINE_IDA(binderfs_minors);
> >   *  created.
> >   * @root_gid:   gid that needs to be used when a new binder device is
> >   *  created.
> > + * @mount_opts: The mount options in use.
> > + * @device_count:   The current number of allocated binder devices.
> >   */
> >  struct binderfs_info {
> > struct ipc_namespace *ipc_ns;
> > struct dentry *control_dentry;
> > kuid_t root_uid;
> > kgid_t root_gid;
> > -
> > +   struct binderfs_mount_opts mount_opts;
> > +   atomic_t device_count;
> 
> Why atomic?
> 
> You should already have the lock held every time this is accessed,
> so no need to use an atomic value, just use an int.
> 
> > /* Reserve new minor number for the new device. */
> > mutex_lock(_minors_mutex);
> > -   minor = ida_alloc_max(_minors, BINDERFS_MAX_MINOR, GFP_KERNEL);
> > +   if (atomic_inc_return(>device_count) < info->mount_opts.max)
> 
> No need for atomic, see, your lock is held :)

Habit, to be honest.

Thanks, fixed version to follow in a bit.
Christian


Re: [PATCH] binderfs: implement "max" mount option

2018-12-23 Thread Greg KH
On Sat, Dec 22, 2018 at 10:18:06PM +0100, Christian Brauner wrote:
> Since binderfs can be mounted by userns root in non-initial user namespaces
> some precautions are in order. First, a way to set a maximum on the number
> of binder devices that can be allocated per binderfs instance and second, a
> way to reserve a reasonable chunk of binderfs devices for the initial ipc
> namespace.
> A first approach as seen in [1] used sysctls similar to devpts but was
> shown to be flawed (cf. [2] and [3]) since some aspects were unneeded. This
> is an alternative approach which avoids sysctls completely and instead
> switches to a single mount option.
> 
> Starting with this commit binderfs instances can be mounted with a limit on
> the number of binder devices that can be allocated. The max=<count> mount
> option serves as a per-instance limit. If max=<count> is set then only
> <count> number of binder devices can be allocated in this binderfs
> instance.

Ok, this is fine, but why such a big default?  You only need 4 to run a
modern android system, and anyone using binder outside of android is
really too crazy to ever be using it in a container :)

> Additionally, the binderfs instance in the initial ipc namespace will
> always have a reserve of at least 1024 binder devices unless explicitly
> capped via max=<count>.

Again, why so many?  And why wouldn't that initial ipc namespace already
have their device nodes created _before_ anything else is mounted?

Some comments on the patch below:

> +/*
> + * Ensure that the initial ipc namespace always has a good chunk of devices
> + * available.
> + */
> +#define BINDERFS_MAX_MINOR_CAPPED (BINDERFS_MAX_MINOR - 1024)

Again that seems crazy big, how about splitting this into two different
patches, one for the max= stuff, and one for this "reserve some minors"
thing, so we can review them separately.

>  
>  static struct vfsmount *binderfs_mnt;
>  
> @@ -46,6 +52,24 @@ static dev_t binderfs_dev;
>  static DEFINE_MUTEX(binderfs_minors_mutex);
>  static DEFINE_IDA(binderfs_minors);
>  
> +/**
> + * binderfs_mount_opts - mount options for binderfs
> + * @max: maximum number of allocatable binderfs binder devices
> + */
> +struct binderfs_mount_opts {
> + int max;
> +};
> +
> +enum {
> + Opt_max,
> + Opt_err
> +};
> +
> +static const match_table_t tokens = {
> + { Opt_max, "max=%d" },
> + { Opt_err, NULL }
> +};
> +
>  /**
>   * binderfs_info - information about a binderfs mount
>   * @ipc_ns: The ipc namespace the binderfs mount belongs to.
> @@ -55,13 +79,16 @@ static DEFINE_IDA(binderfs_minors);
>   *  created.
>   * @root_gid:   gid that needs to be used when a new binder device is
>   *  created.
> + * @mount_opts: The mount options in use.
> + * @device_count:   The current number of allocated binder devices.
>   */
>  struct binderfs_info {
>   struct ipc_namespace *ipc_ns;
>   struct dentry *control_dentry;
>   kuid_t root_uid;
>   kgid_t root_gid;
> -
> + struct binderfs_mount_opts mount_opts;
> + atomic_t device_count;

Why atomic?

You should already have the lock held every time this is accessed,
so no need to use an atomic value, just use an int.

>   /* Reserve new minor number for the new device. */
>   mutex_lock(_minors_mutex);
> - minor = ida_alloc_max(_minors, BINDERFS_MAX_MINOR, GFP_KERNEL);
> + if (atomic_inc_return(>device_count) < info->mount_opts.max)

No need for atomic, see, your lock is held :)

thanks,

greg k-h


[PATCH] binderfs: implement "max" mount option

2018-12-22 Thread Christian Brauner
Since binderfs can be mounted by userns root in non-initial user namespaces
some precautions are in order. First, a way to set a maximum on the number
of binder devices that can be allocated per binderfs instance and second, a
way to reserve a reasonable chunk of binderfs devices for the initial ipc
namespace.
A first approach as seen in [1] used sysctls similar to devpts but was
shown to be flawed (cf. [2] and [3]) since some aspects were unneeded. This
is an alternative approach which avoids sysctls completely and instead
switches to a single mount option.

Starting with this commit binderfs instances can be mounted with a limit on
the number of binder devices that can be allocated. The max=<count> mount
option serves as a per-instance limit. If max=<count> is set then only
<count> number of binder devices can be allocated in this binderfs
instance.
Additionally, the binderfs instance in the initial ipc namespace will
always have a reserve of at least 1024 binder devices unless explicitly
capped via max=<count>.

This allows binderfs instances to be safely bind-mounted into unprivileged
user namespaces since userns root in a non-initial user namespace cannot
change the mount option as long as it does not own the mount namespace the
binderfs mount was created in and hence cannot drain the host of minor
device numbers.

[1]: https://lore.kernel.org/lkml/20181221133909.18794-1-christ...@brauner.io/
[2]: https://lore.kernel.org/lkml/20181221163316.ga8...@kroah.com/
[3]: https://lore.kernel.org/lkml/cahrssex+gdvw4fkkk8oznair9g5icjlyodo8hykv3o0o1jt...@mail.gmail.com/
[4]: https://lore.kernel.org/lkml/20181221192044.5yvfnuri7gdop...@brauner.io/

Cc: Todd Kjos 
Cc: Greg Kroah-Hartman 
Signed-off-by: Christian Brauner 
---
 drivers/android/binderfs.c | 109 +++--
 1 file changed, 104 insertions(+), 5 deletions(-)

diff --git a/drivers/android/binderfs.c b/drivers/android/binderfs.c
index 7496b10532aa..93aee00994d4 100644
--- a/drivers/android/binderfs.c
+++ b/drivers/android/binderfs.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -39,6 +40,11 @@
 #define INODE_OFFSET 3
 #define INTSTRLEN 21
 #define BINDERFS_MAX_MINOR (1U << MINORBITS)
+/*
+ * Ensure that the initial ipc namespace always has a good chunk of devices
+ * available.
+ */
+#define BINDERFS_MAX_MINOR_CAPPED (BINDERFS_MAX_MINOR - 1024)
 
 static struct vfsmount *binderfs_mnt;
 
@@ -46,6 +52,24 @@ static dev_t binderfs_dev;
 static DEFINE_MUTEX(binderfs_minors_mutex);
 static DEFINE_IDA(binderfs_minors);
 
+/**
+ * binderfs_mount_opts - mount options for binderfs
+ * @max: maximum number of allocatable binderfs binder devices
+ */
+struct binderfs_mount_opts {
+   int max;
+};
+
+enum {
+   Opt_max,
+   Opt_err
+};
+
+static const match_table_t tokens = {
+   { Opt_max, "max=%d" },
+   { Opt_err, NULL }
+};
+
 /**
  * binderfs_info - information about a binderfs mount
  * @ipc_ns: The ipc namespace the binderfs mount belongs to.
@@ -55,13 +79,16 @@ static DEFINE_IDA(binderfs_minors);
  *  created.
  * @root_gid:   gid that needs to be used when a new binder device is
  *  created.
+ * @mount_opts: The mount options in use.
+ * @device_count:   The current number of allocated binder devices.
  */
 struct binderfs_info {
struct ipc_namespace *ipc_ns;
struct dentry *control_dentry;
kuid_t root_uid;
kgid_t root_gid;
-
+   struct binderfs_mount_opts mount_opts;
+   atomic_t device_count;
 };
 
 static inline struct binderfs_info *BINDERFS_I(const struct inode *inode)
@@ -107,13 +134,22 @@ static int binderfs_binder_device_create(struct inode 
*ref_inode,
struct inode *inode = NULL;
struct super_block *sb = ref_inode->i_sb;
struct binderfs_info *info = sb->s_fs_info;
+   bool use_reserve = (info->ipc_ns == _ipc_ns);
 
/* Reserve new minor number for the new device. */
mutex_lock(_minors_mutex);
-   minor = ida_alloc_max(_minors, BINDERFS_MAX_MINOR, GFP_KERNEL);
+   if (atomic_inc_return(>device_count) < info->mount_opts.max)
+   minor = ida_alloc_max(_minors,
+ use_reserve ? BINDERFS_MAX_MINOR :
+   BINDERFS_MAX_MINOR_CAPPED,
+ GFP_KERNEL);
+   else
+   minor = -ENOSPC;
mutex_unlock(_minors_mutex);
-   if (minor < 0)
+   if (minor < 0) {
+   atomic_dec(>device_count);
return minor;
+   }
 
ret = -ENOMEM;
device = kzalloc(sizeof(*device), GFP_KERNEL);
@@ -187,6 +223,7 @@ static int binderfs_binder_device_create(struct inode 
*ref_inode,
kfree(name);
kfree(device);
mutex_lock(_minors_mutex);
+   atomic_dec(>device_count);
ida_free(_minors, minor);
mutex_unlock(_minors_mutex);
iput(inode);