from:"Christian Brauner"

Re: [PATCH v3 04/47] filelock: add some new helper functions

2024-02-05 Thread Christian Brauner

On Mon, Feb 05, 2024 at 07:06:00AM -0500, Jeff Layton wrote:
> On Mon, 2024-02-05 at 12:57 +0100, Christian Brauner wrote:
> > On Mon, Feb 05, 2024 at 06:55:44AM -0500, Jeff Layton wrote:
> > > On Mon, 2024-02-05 at 12:36 +0100, Christian Brauner wrote:
> > > > > diff --git a/include/linux/filelock.h b/include/linux/filelock.h
> > > > > index 085ff6ba0653..a814664b1053 100644
> > > > > --- a/include/linux/filelock.h
> > > > > +++ b/include/linux/filelock.h
> > > > > @@ -147,6 +147,29 @@ int fcntl_setlk64(unsigned int, struct file *, 
> > > > > unsigned int,
> > > > >  int fcntl_setlease(unsigned int fd, struct file *filp, int arg);
> > > > >  int fcntl_getlease(struct file *filp);
> > > > >  
> > > > > 
> > > > > 
> > > > > 
> > > > > 
> > > > > 
> > > > > 
> > > > > 
> > > > > +static inline bool lock_is_unlock(struct file_lock *fl)
> > > > > +{
> > > > > + return fl->fl_type == F_UNLCK;
> > > > > +}
> > > > > +
> > > > > +static inline bool lock_is_read(struct file_lock *fl)
> > > > > +{
> > > > > + return fl->fl_type == F_RDLCK;
> > > > > +}
> > > > > +
> > > > > +static inline bool lock_is_write(struct file_lock *fl)
> > > > > +{
> > > > > + return fl->fl_type == F_WRLCK;
> > > > > +}
> > > > > +
> > > > > +static inline void locks_wake_up(struct file_lock *fl)
> > > > > +{
> > > > > + wake_up(&fl->fl_wait);
> > > > > +}
> > > > > +
> > > > > +/* for walking lists of file_locks linked by fl_list */
> > > > > +#define for_each_file_lock(_fl, _head)   
> > > > > list_for_each_entry(_fl, _head, fl_list)
> > > > > +
> > > > 
> > > > This causes a build warning for fs/ceph/ and fs/afs when
> > > > !CONFIG_FILE_LOCKING. I'm about to fold the following diff into this
> > > > patch. The diff looks a bit wonky but essentially I've moved
> > > > lock_is_unlock(), lock_is_{read,write}(), locks_wake_up() and
> > > > for_each_file_lock() out of the ifdef CONFIG_FILE_LOCKING:
> > > > 
> > > 
> > > I sent a patch for this problem yesterday. Did you not get it?
> > 
> > Whoops, probably missed it on the trip back from fosdem.
> > I'll double check now.
> 
> No worries. If you choose to go with your version, you can add:

No, I took yours. :)

Re: [PATCH v3 04/47] filelock: add some new helper functions

2024-02-05 Thread Christian Brauner

On Mon, Feb 05, 2024 at 06:55:44AM -0500, Jeff Layton wrote:
> On Mon, 2024-02-05 at 12:36 +0100, Christian Brauner wrote:
> > > diff --git a/include/linux/filelock.h b/include/linux/filelock.h
> > > index 085ff6ba0653..a814664b1053 100644
> > > --- a/include/linux/filelock.h
> > > +++ b/include/linux/filelock.h
> > > @@ -147,6 +147,29 @@ int fcntl_setlk64(unsigned int, struct file *, 
> > > unsigned int,
> > >  int fcntl_setlease(unsigned int fd, struct file *filp, int arg);
> > >  int fcntl_getlease(struct file *filp);
> > >  
> > > 
> > > 
> > > 
> > > 
> > > 
> > > 
> > > 
> > > +static inline bool lock_is_unlock(struct file_lock *fl)
> > > +{
> > > + return fl->fl_type == F_UNLCK;
> > > +}
> > > +
> > > +static inline bool lock_is_read(struct file_lock *fl)
> > > +{
> > > + return fl->fl_type == F_RDLCK;
> > > +}
> > > +
> > > +static inline bool lock_is_write(struct file_lock *fl)
> > > +{
> > > + return fl->fl_type == F_WRLCK;
> > > +}
> > > +
> > > +static inline void locks_wake_up(struct file_lock *fl)
> > > +{
> > > + wake_up(&fl->fl_wait);
> > > +}
> > > +
> > > +/* for walking lists of file_locks linked by fl_list */
> > > +#define for_each_file_lock(_fl, _head)   list_for_each_entry(_fl, _head, 
> > > fl_list)
> > > +
> > 
> > This causes a build warning for fs/ceph/ and fs/afs when
> > !CONFIG_FILE_LOCKING. I'm about to fold the following diff into this
> > patch. The diff looks a bit wonky but essentially I've moved
> > lock_is_unlock(), lock_is_{read,write}(), locks_wake_up() and
> > for_each_file_lock() out of the ifdef CONFIG_FILE_LOCKING:
> > 
> 
> I sent a patch for this problem yesterday. Did you not get it?

Whoops, probably missed it on the trip back from fosdem.
I'll double check now.

Re: [PATCH v3 04/47] filelock: add some new helper functions

2024-02-05 Thread Christian Brauner

> diff --git a/include/linux/filelock.h b/include/linux/filelock.h
> index 085ff6ba0653..a814664b1053 100644
> --- a/include/linux/filelock.h
> +++ b/include/linux/filelock.h
> @@ -147,6 +147,29 @@ int fcntl_setlk64(unsigned int, struct file *, unsigned 
> int,
>  int fcntl_setlease(unsigned int fd, struct file *filp, int arg);
>  int fcntl_getlease(struct file *filp);
>  
> +static inline bool lock_is_unlock(struct file_lock *fl)
> +{
> + return fl->fl_type == F_UNLCK;
> +}
> +
> +static inline bool lock_is_read(struct file_lock *fl)
> +{
> + return fl->fl_type == F_RDLCK;
> +}
> +
> +static inline bool lock_is_write(struct file_lock *fl)
> +{
> + return fl->fl_type == F_WRLCK;
> +}
> +
> +static inline void locks_wake_up(struct file_lock *fl)
> +{
> + wake_up(&fl->fl_wait);
> +}
> +
> +/* for walking lists of file_locks linked by fl_list */
> +#define for_each_file_lock(_fl, _head)   list_for_each_entry(_fl, _head, 
> fl_list)
> +

This causes a build warning for fs/ceph/ and fs/afs when
!CONFIG_FILE_LOCKING. I'm about to fold the following diff into this
patch. The diff looks a bit wonky but essentially I've moved
lock_is_unlock(), lock_is_{read,write}(), locks_wake_up() and
for_each_file_lock() out of the ifdef CONFIG_FILE_LOCKING:

diff --git a/include/linux/filelock.h b/include/linux/filelock.h
index a814664b1053..62be9c6b1e59 100644
--- a/include/linux/filelock.h
+++ b/include/linux/filelock.h
@@ -133,20 +133,6 @@ struct file_lock_context {
 	struct list_head	flc_lease;
 };

-#ifdef CONFIG_FILE_LOCKING
-int fcntl_getlk(struct file *, unsigned int, struct flock *);
-int fcntl_setlk(unsigned int, struct file *, unsigned int,
-   struct flock *);
-
-#if BITS_PER_LONG == 32
-int fcntl_getlk64(struct file *, unsigned int, struct flock64 *);
-int fcntl_setlk64(unsigned int, struct file *, unsigned int,
-   struct flock64 *);
-#endif
-
-int fcntl_setlease(unsigned int fd, struct file *filp, int arg);
-int fcntl_getlease(struct file *filp);
-
 static inline bool lock_is_unlock(struct file_lock *fl)
 {
return fl->fl_type == F_UNLCK;
@@ -170,6 +156,20 @@ static inline void locks_wake_up(struct file_lock *fl)
 /* for walking lists of file_locks linked by fl_list */
 #define for_each_file_lock(_fl, _head) list_for_each_entry(_fl, _head, fl_list)

+#ifdef CONFIG_FILE_LOCKING
+int fcntl_getlk(struct file *, unsigned int, struct flock *);
+int fcntl_setlk(unsigned int, struct file *, unsigned int,
+   struct flock *);
+
+#if BITS_PER_LONG == 32
+int fcntl_getlk64(struct file *, unsigned int, struct flock64 *);
+int fcntl_setlk64(unsigned int, struct file *, unsigned int,
+   struct flock64 *);
+#endif
+
+int fcntl_setlease(unsigned int fd, struct file *filp, int arg);
+int fcntl_getlease(struct file *filp);
+
 /* fs/locks.c */
 void locks_free_lock_context(struct inode *inode);
 void locks_free_lock(struct file_lock *fl);

Re: [PATCH v3 00/47] filelock: split file leases out of struct file_lock

2024-02-02 Thread Christian Brauner

On Wed, 31 Jan 2024 18:01:41 -0500, Jeff Layton wrote:
> I'm not sure this is much prettier than the last, but contracting
> "fl_core" to "c", as Neil suggested is a bit easier on the eyes.
> 
> I also added a few small helpers and converted several users over to
> them. That reduces the size of the per-fs conversion patches later in
> the series. I played with some others too, but they were too awkward
> or not frequently used enough to make it worthwhile.
> 
> [...]

Fyi, I've merged this series as in I've turned this series into a pull
request based on the patches. And this has a merge commit of the
following form:

commit 363af2435e403ac323ab2543da91f5984047bdb8
Merge: 6613476e225e 6c6109548454
Author: Christian Brauner 
AuthorDate: Fri Feb 2 12:09:26 2024 +0100
Commit: Christian Brauner 
CommitDate: Fri Feb 2 12:09:26 2024 +0100

Merge patch series "filelock: split file leases out of struct file_lock"

Pull file locking patch series from Jeff Layton:

For larger series such as this, this is what I think we should end up
doing because it gives bigger series an overall summary without forcing
the author to always provide a tag or branch or whatever. Often the
cover letter description is good for long term contributors already. So
I stole most of it from v1.

Thanks for basing this on a mainline tag!

---

Applied to the vfs.file branch of the vfs/vfs.git tree.
Patches in the vfs.file branch should appear in linux-next soon.

Please report any outstanding bugs that were missed during review in a
new review to the original patch series allowing us to drop it.

It's encouraged to provide Acked-bys and Reviewed-bys even though the
patch has now been applied. If possible patch trailers will be updated.

Note that commit hashes shown below are subject to change due to rebase,
trailer updates or similar. If in doubt, please check the listed branch.

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git
branch: vfs.file

[01/47] filelock: fl_pid field should be signed int
https://git.kernel.org/vfs/vfs/c/0e9876d8e88d
[02/47] filelock: rename some fields in tracepoints
https://git.kernel.org/vfs/vfs/c/587a67b6830b
[03/47] filelock: rename fl_pid variable in lock_get_status
https://git.kernel.org/vfs/vfs/c/6021d62c677f
[04/47] filelock: add some new helper functions
https://git.kernel.org/vfs/vfs/c/403594111407
[05/47] 9p: rename fl_type variable in v9fs_file_do_lock
https://git.kernel.org/vfs/vfs/c/2911c0e3a5dd
[06/47] afs: convert to using new filelock helpers
https://git.kernel.org/vfs/vfs/c/46a9b98baecc
[07/47] ceph: convert to using new filelock helpers
https://git.kernel.org/vfs/vfs/c/7c82f3103915
[08/47] dlm: convert to using new filelock helpers
https://git.kernel.org/vfs/vfs/c/7851cb526662
[09/47] gfs2: convert to using new filelock helpers
https://git.kernel.org/vfs/vfs/c/47bc8fa51b46
[10/47] lockd: convert to using new filelock helpers
https://git.kernel.org/vfs/vfs/c/b9570e87b652
[11/47] nfs: convert to using new filelock helpers
https://git.kernel.org/vfs/vfs/c/28ad1884a338
[12/47] nfsd: convert to using new filelock helpers
https://git.kernel.org/vfs/vfs/c/4e2cd366d826
[13/47] ocfs2: convert to using new filelock helpers
https://git.kernel.org/vfs/vfs/c/a336b91b2340
[14/47] smb/client: convert to using new filelock helpers
https://git.kernel.org/vfs/vfs/c/39647541cb26
[15/47] smb/server: convert to using new filelock helpers
https://git.kernel.org/vfs/vfs/c/1d9b1c4525f6
[16/47] filelock: drop the IS_* macros
https://git.kernel.org/vfs/vfs/c/22716eba8323
[17/47] filelock: split common fields into struct file_lock_core
https://git.kernel.org/vfs/vfs/c/b2566e35e7d6
[18/47] filelock: have fs/locks.c deal with file_lock_core directly
https://git.kernel.org/vfs/vfs/c/424dc929f8f1
[19/47] filelock: convert more internal functions to use file_lock_core
https://git.kernel.org/vfs/vfs/c/2d1cfb3cf69e
[20/47] filelock: make posix_same_owner take file_lock_core pointers
https://git.kernel.org/vfs/vfs/c/c91b6f218894
[21/47] filelock: convert posix_owner_key to take file_lock_core arg
https://git.kernel.org/vfs/vfs/c/6944d789d1a1
[22/47] filelock: make locks_{insert,delete}_global_locks take file_lock_core 
arg
https://git.kernel.org/vfs/vfs/c/ff30006ce158
[23/47] filelock: convert locks_{insert,delete}_global_blocked
https://git.kernel.org/vfs/vfs/c/b7ae01bb4138
[24/47] filelock: make __locks_delete_block and __locks_wake_up_blocks take 
file_lock_core
https://git.kernel.org/vfs/vfs/c/6ada65e99171
[25/47] filelock: convert __locks_insert_block, conflict and deadlock checks to 
use file_lock_core
https://git.kernel.org/vfs/vfs/c/f449edd19f07
[26/47] filelock: convert fl_blocker to file_lock_core
https://git.ker

Re: [PATCH 00/20] filelock: split struct file_lock into file_lock and file_lease structs

2024-01-17 Thread Christian Brauner

> I'd like to have this considered for inclusion in v6.9. Christian, would
> you be amenable to shepherding this into mainline (assuming there are no
> major objections, of course)?

Yes, of course I will be happy to.

Re: [PATCH] tracefs/eventfs: Use root and instance inodes as default ownership

2024-01-12 Thread Christian Brauner

On Thu, Jan 11, 2024 at 04:53:19PM -0500, Steven Rostedt wrote:
> On Thu, 11 Jan 2024 22:01:32 +0100
> Christian Brauner  wrote:
> 
> > What I'm pointing out in the current logic is that the caller is
> > taxed twice:
> > 
> > (1) Once when the VFS has done inode_permission(MAY_EXEC, "xfs")
> > (2) And again when you call lookup_one_len() in eventfs_start_creating()
> > _because_ the permission check in lookup_one_len() is the exact
> > same permission check again that the vfs has done
> > inode_permission(MAY_EXEC, "xfs").
> 
> As I described in: 
> https://lore.kernel.org/all/20240110133154.6e18f...@gandalf.local.home/
> 
> The eventfs files below "events" doesn't need the .permissions callback at
> all. It's only there because the "events" inode uses it.
> 
> The .permissions call for eventfs has:

It doesn't matter whether there's a ->permission handler. If you don't
add one explicitly the VFS will simply call generic_permission():

inode_permission()
-> do_inode_permission()
   {
if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
if (likely(inode->i_op->permission))
return inode->i_op->permission(idmap, inode, mask);
   
/* This gets set once for the inode lifetime */
spin_lock(&inode->i_lock);
inode->i_opflags |= IOP_FASTPERM;
spin_unlock(&inode->i_lock);
}
return generic_permission(idmap, inode, mask);
   }

> Anyway, the issue is with "events" directory and remounting, because like
> the tracefs system, the inode and dentry for "events" is created at boot
> up, before the mount happens. The VFS layer is going to check the
> permissions of its inode and dentry, which will be incorrect if the mount
> was mounted with a "gid" option.

The gid option has nothing to do with this and it is just handled fine
if you remove the second permission checking in (2).

You need to remove the inode_permission() code from
eventfs_start_creating(). It is just an internal lookup and the fact
that you have it in there allows userspace to break readdir on the
eventfs portions of tracefs as I've shown in the parts of the mail that
you cut off.

Re: [PATCH] tracefs/eventfs: Use root and instance inodes as default ownership

2024-01-11 Thread Christian Brauner

On Wed, Jan 10, 2024 at 08:07:46AM -0500, Steven Rostedt wrote:
> On Wed, 10 Jan 2024 12:45:36 +0100
> Christian Brauner  wrote:
> 
> > So say you do:
> > 
> > mkdir /sys/kernel/tracing/instances/foo
> > 
> > After this has returned we know everything we need to know about the new
> > tracefs instance including the ownership and the mode of all inodes in
> > /sys/kernel/tracing/instances/foo/events/* and below precisely because
> > ownership is always inherited from the parent dentry and recorded in the
> > metadata struct eventfs_inode.
> > 
> > So say someone does:
> > 
> > open("/sys/kernel/tracing/instances/foo/events/xfs");
> > 
> > and say this is the first time that someone accesses that events/
> > directory.
> > 
> > When the open pathwalk is done, the vfs will determine via
> > 
> > [1] may_lookup(inode_of(events))
> > 
> > whether you are able to list entries such as "xfs" in that directory.
> > The vfs checks inode_permission(MAY_EXEC) on "events" and if that holds
> > it ends up calling i_op->eventfs_root_lookup(events).
> > 
> > At this point tracefs/eventfs adds the inodes for all entries in that
> > "events" directory including "xfs" based on the metadata it recorded
> > during the mkdir. Since now someone is actually interested in them. And
> > it initializes the inodes with ownership and everything and adds the
> > dentries that belong into that directory.
> > 
> > Nothing here depends on the permissions of the caller. The only
> > permission that mattered was done in the VFS in [1]. If the caller has
> > permissions to enter a directory they can lookup and list its contents.
> > And its contents were determined/fixed etc when mkdir was called.
> > 
> > So we just need to add the required objects into the caches (inode,
> > dentry) whose addition we intentionally deferred until someone actually
> > needed them.
> > 
> > So, eventfs_root_lookup() now initializes the inodes with the ownership
> > from the stored metadata or from the parent dentry and splices in inodes
> > and dentries. No permission checking is needed for this because it is
> > always a recheck of what the vfs did in [1].
> > 
> > We now return to the vfs and path walk continues to the final component
> > that you actually want to open which is that "xfs" directory in this
> > example. We check the permissions on that inode via may_open("xfs") and
> > we open that directory returning an fd to userspace ultimately.
> > 
> > (I'm going by memory since I need to step out the door.)
> 
> So, let's say we do:
> 
>  chgrp -R rostedt /sys/kernel/tracing/

The rostedt group needs exec permissions and "other" cannot have exec
permissions otherwise you can trivially list the entries even if it's
owned by root:

chmod 750 /sys/kernel/tracing

user1@localhost:~$ ls -aln /sys/kernel/ | grep tracing
drwxr-x---   6 0 10000 Jan 11 18:23 tracing

> 
> But I don't want rostedt to have access to xfs
> 
>  chgrp -R root /sys/kernel/tracing/events/xfs

chmod 750 /sys/kernel/tracing/events/xfs

user1@localhost:~$ ls -aln /sys/kernel/tracing/events/ | grep xfs
drwxr-x--- 601 0 0 0 Jan 11 18:24 xfs

This ensures that if a user is in the group and the group has exec perms,
lookup is possible. (For root this will usually work because
CAP_DAC_READ_SEARCH overrides the exec requirement.)

> 
> Both actions will create the inodes and dentries of all files and
> directories (because of "-R"). But once that is done, the ref counts go to
> zero. They stay around until reclaim. But then I open Chrome ;-) and it
> reclaims all the dentries and inodes, so we are back to where we were on
> boot.
> 
> Now as rostedt I do:
> 
>  ls /sys/kernel/tracing/events/xfs
> 
> The VFS layer doesn't know if I have permission to that or not, because all
> the inodes and dentries have been freed. It has to call back to eventfs to
> find out. Which the eventfs_root_lookup() and eventfs_iterate_shared() will
> recreate the inodes with the proper permission.

Very roughly, ignoring most of the complexity of lookup and focussing on
the permission checking:

When a caller looks up an entry in a directory then the VFS will call
inode_permission(MAY_EXEC) on the directory the caller is trying to
perform that lookup in.

If the caller wants to lookup the "events" entry in the "tracing"
directory then the VFS will call inode_permission(MAY_EXEC, "tracing")
and then - assuming it's not in the cache - call into the lookup method
of the filesystem.

After the VFS has determin

Re: [PATCH] tracefs/eventfs: Use root and instance inodes as default ownership

2024-01-10 Thread Christian Brauner

On Mon, Jan 08, 2024 at 10:23:31AM -0500, Steven Rostedt wrote:
> On Mon, 8 Jan 2024 12:04:54 +0100
> Christian Brauner  wrote:
> 
> > > > IOW, the inode_permission() in lookup_one_len() that eventfs does is
> > > > redundant and just wrong.  
> > > 
> > > I don't think so.  
> > 
> > I'm very well aware that the dentries and inode aren't created during
> > > mkdir but the complete directory layout is determined. You're just
> > splicing in dentries and inodes during lookup and readdir.
> > 
> > If mkdir /sys/kernel/tracing/instances/foo has succeeded and you later
> > do a lookup/readdir on
> > 
> > ls -al /sys/kernel/tracing/instances/foo/events
> > 
> > Why should the creation of the dentries and inodes ever fail due to a
> > permission failure?
> 
> They shouldn't.
> 
> > The vfs did already verify that you had the required
> > permissions to list entries in that directory. Why should filling up
> > > /sys/kernel/tracing/instances/foo/events ever fail then? It shouldn't.
> > That tracefs instance would be half-functional. And again, right now
> > that inode_permission() check cannot even fail.
> 
> And it shouldn't. But without dentries and inodes, how does VFS know what
> is allowed to open the files?

So say you do:

mkdir /sys/kernel/tracing/instances/foo

After this has returned we know everything we need to know about the new
tracefs instance including the ownership and the mode of all inodes in
/sys/kernel/tracing/instances/foo/events/* and below precisely because
ownership is always inherited from the parent dentry and recorded in the
metadata struct eventfs_inode.

So say someone does:

open("/sys/kernel/tracing/instances/foo/events/xfs");

and say this is the first time that someone accesses that events/
directory.

When the open pathwalk is done, the vfs will determine via

[1] may_lookup(inode_of(events))

whether you are able to list entries such as "xfs" in that directory.
The vfs checks inode_permission(MAY_EXEC) on "events" and if that holds
it ends up calling i_op->eventfs_root_lookup(events).

At this point tracefs/eventfs adds the inodes for all entries in that
"events" directory including "xfs" based on the metadata it recorded
during the mkdir. Since now someone is actually interested in them. And
it initializes the inodes with ownership and everything and adds the
dentries that belong into that directory.

Nothing here depends on the permissions of the caller. The only
permission that mattered was done in the VFS in [1]. If the caller has
permissions to enter a directory they can lookup and list its contents.
And its contents were determined/fixed etc when mkdir was called.

So we just need to add the required objects into the caches (inode,
dentry) whose addition we intentionally deferred until someone actually
needed them.

So, eventfs_root_lookup() now initializes the inodes with the ownership
from the stored metadata or from the parent dentry and splices in inodes
and dentries. No permission checking is needed for this because it is
always a recheck of what the vfs did in [1].

We now return to the vfs and path walk continues to the final component
that you actually want to open which is that "xfs" directory in this
example. We check the permissions on that inode via may_open("xfs") and
we open that directory returning an fd to userspace ultimately.

(I'm going by memory since I need to step out the door.)

Re: [PATCH] tracefs/eventfs: Use root and instance inodes as default ownership

2024-01-08 Thread Christian Brauner

On Sun, Jan 07, 2024 at 01:32:28PM -0500, Steven Rostedt wrote:
> On Sun, 7 Jan 2024 13:29:12 -0500
> Steven Rostedt  wrote:
> 
> > > 
> > > IOW, the inode_permission() in lookup_one_len() that eventfs does is
> > > redundant and just wrong.  
> > 
> > I don't think so.
> 
> Just to make it clear. eventfs has nothing to do with mkdir instance/foo.
> It exists without that. Although one rationale to do eventfs was so

Every instance/foo/ tracefs instance also contains an events directory
and thus an eventfs portion. Eventfs is just a subtree of tracefs. It's
not a separate filesystem. Both eventfs and tracefs are on the same
single, system wide superblock.

> that the instance directories wouldn't recreate the same 10thousands
> event inodes and dentries for every mkdir done.

I know but that's irrelevant to what I'm trying to tell you.

A mkdir /sys/kernel/tracing/instances/foo creates a new tracefs
instance. With or without the on-demand dentry and inode creation for
the eventfs portion that tracefs "instance" has now been created in its
entirety including all the required information for someone to later
come along and perform a lookup on /sys/kernel/tracing/instances/foo/events.

All you've done is to defer the addition of the dentries and inodes when
someone does actually look at the events directory of the tracefs
instance.

Whether you choose to splice in the dentries and inodes for the eventfs
portion during lookup and readdir or if you had chosen to not do the
on-demand thing at all and the entries were created at the same time as
the mkdir call are equivalent from the perspective of permission
checking.

If you have the required permissions to look at the events directory
then there's no reason why listing the directory entries in there should
fail. This can't even happen right now.

Re: [PATCH] tracefs/eventfs: Use root and instance inodes as default ownership

2024-01-08 Thread Christian Brauner

> > * Tracefs supports the creation of instances from userspace via mkdir.
> >   For example,
> > 
> > mkdir /sys/kernel/tracing/instances/foo
> > 
> >   And here the idmapping is relevant so we need to make the helpers
> >   aware of the idmapping.
> > 
> >   I just went and plumbed this through to most helpers.
> > 
> > There's some subtlety in eventfs. Afaict, the directories and files for
> > the individual events are created on-demand during lookup or readdir.
> > 
> > The ownership of these events is again inherited from the parent inode
> > or recovered from stored state. In both cases the actual idmapping is
> > irrelevant.
> > 
> > The callchain here is:
> > 
> > eventfs_root_lookup("xfs", "events")
> > -> create_{dir,file}_dentry("xfs", "events")
> >-> create_{dir,file}("xfs", "events")
> >   -> eventfs_start_creating("xfs", "events")
> >  -> lookup_one_len("xfs", "events")  
> > 
> > And the subtlety is that lookup_one_len() does permission checking on
> > the parent inode (IOW, if you want a dentry for "blech" under "events"
> > it'll do a permission check on events->d_inode) for exec permissions
> > and then goes on to give you a new dentry.
> > 
> > Usually this call would have to be changed to lookup_one() and the
> > idmapping be handed down to it. But I think that's irrelevant here.
> > 
> > Lookup generally doesn't need to be aware of idmappings at all. The
> > permission checking is done purely in the vfs via may_lookup() and the
> > idmapping is irrelevant because we always initialize inodes with the
> > filesystem level ownership; otherwise you get inode aliases, which you
> > really don't want (see the idmappings.rst documentation if you're
> > interested in excessive details).
> > 
> > For tracefs it would not matter for lookup per se but only because
> > tracefs seemingly creates inodes/dentries during lookup (and readdir()).
> 
> tracefs creates the inodes/dentries at boot up, it's eventfs that does
> it on demand during lookup.
> 
> For inodes/dentries:
> 
>  /sys/kernel/tracing/* is all created at boot up, except for "events".

Yes.

>  /sys/kernel/tracing/events/* is created on demand.

Yes.

> 
> > 
> > But imho the permission checking done in current eventfs_root_lookup()
> > via lookup_one_len() is meaningless in any way; possibly even
> > (conceptually) wrong.
> > 
> > Because, the actual permission checking for the creation of the eventfs
> > entries isn't really done during lookup or readdir, it's done when mkdir
> > is called:
> > 
> > mkdir /sys/kernel/tracing/instances/foo
> 
> No. that creates a entire new tracefs instance, which happens to
> include another eventfs directory.

Yes, I'm aware of all that.

> No. Only the meta data is created for the eventfs directory with a
> mkdir instances/foo. The inodes and dentries are not there.

I know, that is what I'm saying.

> 
> > 
> > When one goes and looks up stuff under foo/events/ or readdir the entries
> > in that directory:
> > 
> > fd = open("foo/events")
> > readdir(fd, ...)
> > 
> > then they are licensed to list an entry in that directory. So all that
> > needs to be done is to actually list those files in that directory. And
> > since they already exist (they were created during mkdir) we just need
> > to splice in inodes and dentries for them. But for that we shouldn't
> > check permissions on the directory again. Because we've done that
> > already correctly when the VFS called may_lookup().
> 
> No they do not exist.

I am aware.

> 
> > 
> > IOW, the inode_permission() in lookup_one_len() that eventfs does is
> > redundant and just wrong.
> 
> I don't think so.

I'm very well aware that the dentries and inode aren't created during
mkdir but the complete directory layout is determined. You're just
splicing in dentries and inodes during lookup and readdir.

If mkdir /sys/kernel/tracing/instances/foo has succeeded and you later
do a lookup/readdir on

ls -al /sys/kernel/tracing/instances/foo/events

Why should the creation of the dentries and inodes ever fail due to a
permission failure? The vfs did already verify that you had the required
permissions to list entries in that directory. Why should filling up
/sys/kernel/tracing/instances/foo/events ever fail then? It shouldn't.
That tracefs instance would be half-functional. And again, right now
that inode_permission() check cannot even fail.

Re: [PATCH] tracefs/eventfs: Use root and instance inodes as default ownership

2024-01-07 Thread Christian Brauner

On Sun, Jan 07, 2024 at 06:42:33PM +0100, Christian Brauner wrote:
> On Sun, Jan 07, 2024 at 01:42:39PM +0100, Christian Brauner wrote:
> > > > So tracefs supports remounting with different uid/gid mount options and
> > > > then actually wades through _all_ of the inodes and changes their
> > > > ownership internally? What's the use-case for this? Containers?
> > > 
> > > No, in fact tracing doesn't work well with containers as tracing is global
> > > to the entire machine. It can work with privileged containers though.
> > 
> > At least the tracefs interface is easily supportable within a delegation
> > model. IOW, you have a privileged process that delegates relevant
> > portions to a container via idmapped mounts _without_ doing the insane thing
> > and making it mountable by a container aka the fs-to-CVE pipeline.
> > 
> > > 
> > > The reason for this is because tracefs was based off of debugfs where the
> > > > files and directories are created at boot up and mounted later. The reason
> > > to do this was to allow users to mount with gid=GID to allow a given group
> > > to have access to tracing. Without this update, tracefs would ignore it
> > > like debugfs and proc does today.
> > > 
> > > I think its time I explain the purpose of tracefs and how it came to be.
> > > 
> > > The tracing system required a way to control tracing and read the traces.
> > > It could have just used a new system like perf (although
> > > /sys/kernel/debug/tracing predates perf), where it created a single 
> > > ioctl()
> > > > like system call to do everything.
> > > 
> > > As the ftrace tracing came from PREEMPT_RT latency tracer and my own 
> > > logdev
> > > tracer, which both have an embedded background, I chose an interface that
> > > could work with just an unmodified version of busybox. That is, I wanted 
> > > it
> > > to work with just cat and echo.
> > > 
> > > The main difference with tracefs compared to other file systems is that it
> > > is a control interface, where writes happen as much as reads. The data 
> > > read
> > > is controlled. The closest thing I can think of is how cgroups work.
> > > 
> > > As tracing is a privileged operation, but something that could be changed
> > > to allow a group to have access to, I wanted to make it easy for an admin
> > > to decide who gets to do what at boot up via the /etc/fstab file.
> > 
> > Yeah, ok. I think you could achieve the same thing via idmapped mounts. You
> > just need to swap out the mnt on /sys/kernel/tracing with an idmapped mount.
> > 
> > mount(8) should just give you the ability to specify "map the ids I 
> > explicitly
> > want to remap to something else and for the rest use the identity mapping". 
> > I
> > wanted that for other reasons anyway.
> > 
> > So in one of the next versions of mount(8) you can then do (where --beneath
> > means place the mount beneath the current one and --replace is
> > self-explanatory):
> > 
> > sudo mount --beneath -o X-mount.idmap='g:0:1234:1 u:0:0:1' 
> > /sys/kernel/tracing
> > sudo umount /sys/kernel/tracing
> > 
> > or as a shortcut provided by mount(8):
> > 
> > sudo mount --replace -o X-mount.idmap='g:0:1234:1 u:0:0:1' 
> > /sys/kernel/tracing 
> > 
> > In both cases you replace the mount without unmounting tracefs.
> > 
> > I can illustrate this right now though:
> > 
> > user1@localhost:~$ sudo mount --bind -o X-mount.idmap='g:0:1000:1 
> > u:0:1000:1' /sys/kernel/tracing/ /mnt/
> > 
> > # This is a tool I wrote for testing the patchset I wrote back then.
> > user1@localhost:~/data/move-mount-beneath$ sudo ./move-mount --beneath 
> > --detached /mnt /sys/kernel/tracing
> > Mounting beneath top mount
> > Creating anonymous mount
> > Attaching mount /mnt -> /sys/kernel/tracing
> > Creating single detached mount
> > 
> > user1@localhost:~/data/move-mount-beneath$
> > 
> > # Now there's two mounts stacked on top of each other.
> > user1@localhost:~/data/move-mount-beneath$ findmnt | grep tracing
> > > | `-/sys/kernel/tracing    tracefs  tracefs  rw,nosuid,nodev,noexec,relatime,idmapped
> > > |   `-/sys/kernel/tracing  tracefs  tracefs  rw,nosuid,nodev,noexec,relatime
> > 
> > user1@localhost:~/data/move-mount-beneath$ sudo ls -al 
> > /sys/kernel/tracing/| head
> > total 0
> > drwx--  6 root root 0 J

Re: [PATCH] tracefs/eventfs: Use root and instance inodes as default ownership

2024-01-07 Thread Christian Brauner

On Sun, Jan 07, 2024 at 01:42:39PM +0100, Christian Brauner wrote:
> > > So tracefs supports remounting with different uid/gid mount options and
> > > then actually wades through _all_ of the inodes and changes their
> > > ownership internally? What's the use-case for this? Containers?
> > 
> > No, in fact tracing doesn't work well with containers as tracing is global
> > to the entire machine. It can work with privileged containers though.
> 
> At least the tracefs interface is easily supportable within a delegation
> model. IOW, you have a privileged process that delegates relevant
> portions to a container via idmapped mounts _without_ doing the insane thing
> and making it mountable by a container aka the fs-to-CVE pipeline.
> 
> > 
> > The reason for this is because tracefs was based off of debugfs where the
> > files and directories are created at boot up and mounted later. The reason
> > to do this was to allow users to mount with gid=GID to allow a given group
> > to have access to tracing. Without this update, tracefs would ignore it
> > like debugfs and proc does today.
> > 
> > I think it's time I explain the purpose of tracefs and how it came to be.
> > 
> > The tracing system required a way to control tracing and read the traces.
> > It could have just used a new system like perf (although
> > /sys/kernel/debug/tracing predates perf), where it created a single ioctl()
> > like system call to do everything.
> > 
> > As the ftrace tracing came from PREEMPT_RT latency tracer and my own logdev
> > tracer, which both have an embedded background, I chose an interface that
> > could work with just an unmodified version of busybox. That is, I wanted it
> > to work with just cat and echo.
> > 
> > The main difference with tracefs compared to other file systems is that it
> > is a control interface, where writes happen as much as reads. The data read
> > is controlled. The closest thing I can think of is how cgroups work.
> > 
> > As tracing is a privileged operation, but something that could be changed
> > to allow a group to have access to, I wanted to make it easy for an admin
> > to decide who gets to do what at boot up via the /etc/fstab file.
> 
> Yeah, ok. I think you could achieve the same thing via idmapped mounts. You
> just need to swap out the mnt on /sys/kernel/tracing with an idmapped mount.
> 
> mount(8) should just give you the ability to specify "map the ids I explicitly
> want to remap to something else and for the rest use the identity mapping". I
> wanted that for other reasons anyway.
> 
> So in one of the next versions of mount(8) you can then do (where --beneath
> means place the mount beneath the current one and --replace is
> self-explanatory):
> 
> sudo mount --beneath -o X-mount.idmap='g:0:1234:1 u:0:0:1' /sys/kernel/tracing
> sudo umount /sys/kernel/tracing
> 
> or as a shortcut provided by mount(8):
> 
> sudo mount --replace -o X-mount.idmap='g:0:1234:1 u:0:0:1' 
> /sys/kernel/tracing 
> 
> In both cases you replace the mount without unmounting tracefs.
> 
> I can illustrate this right now though:
> 
> user1@localhost:~$ sudo mount --bind -o X-mount.idmap='g:0:1000:1 u:0:1000:1' 
> /sys/kernel/tracing/ /mnt/
> 
> # This is a tool I wrote for testing the patchset I wrote back then.
> user1@localhost:~/data/move-mount-beneath$ sudo ./move-mount --beneath 
> --detached /mnt /sys/kernel/tracing
> Mounting beneath top mount
> Creating anonymous mount
> Attaching mount /mnt -> /sys/kernel/tracing
> Creating single detached mount
> 
> user1@localhost:~/data/move-mount-beneath$
> 
> # Now there's two mounts stacked on top of each other.
> user1@localhost:~/data/move-mount-beneath$ findmnt | grep tracing
> | `-/sys/kernel/tracing    tracefs  tracefs  rw,nosuid,nodev,noexec,relatime,idmapped
> |   `-/sys/kernel/tracing  tracefs  tracefs  rw,nosuid,nodev,noexec,relatime
> 
> user1@localhost:~/data/move-mount-beneath$ sudo ls -al /sys/kernel/tracing/| 
> head
> total 0
> drwx--  6 root root 0 Jan  7 13:33 .
> drwxr-xr-x 16 root root 0 Jan  7 13:33 ..
> -r--r-  1 root root 0 Jan  7 13:33 README
> -r--r-  1 root root 0 Jan  7 13:33 available_events
> -r--r-  1 root root 0 Jan  7 13:33 available_filter_functions
> -r--r-  1 root root 0 Jan  7 13:33 available_filter_functions_addrs
> -r--r-  1 root root 0 Jan  7 13:33 available_tracers
> -rw-r-  1 root root 0 Jan  7 13:33 buffer_percent
> -rw-r-  1 root root 0 Jan  7 13:33 buffer_size_kb
> 
> # Reveal updated mount
> user1@localhost:~/data/move-mount-beneath$ sudo umount /sys/k

Re: [PATCH] tracefs/eventfs: Use root and instance inodes as default ownership

2024-01-07 Thread Christian Brauner

ould have to be changed to lookup_one() and the
idmapping be handed down to it. But I think that's irrelevant here.

Lookup generally doesn't need to be aware of idmappings at all. The
permission checking is done purely in the vfs via may_lookup() and the
idmapping is irrelevant because we always initialize inodes with the
filesystem level ownership; otherwise you get inode aliases, which you
really don't want (see the idmappings.rst documentation if you're
interested in excessive details).

For tracefs it would not matter for lookup per se but only because
tracefs seemingly creates inodes/dentries during lookup (and readdir()).

But imho the permission checking done in current eventfs_root_lookup()
via lookup_one_len() is meaningless in any way; possibly even
(conceptually) wrong.

Because, the actual permission checking for the creation of the eventfs
entries isn't really done during lookup or readdir, it's done when mkdir
is called:

mkdir /sys/kernel/tracing/instances/foo

Here, all possible entries beneath foo including "events" and further
below are recorded and stored. So once mkdir returns it basically means
that it succeeded with the creation of all the necessary directories and
files. For all purposes the foo/events/ directory and below have all the
entries that matter. They have been created. It's comparable to them not
being in the {d,i}cache, I guess.

When one goes and looks up stuff under foo/events/ or readdir the entries
in that directory:

fd = open("foo/events")
readdir(fd, ...)

then they are licensed to list an entry in that directory. So all that
needs to be done is to actually list those files in that directory. And
since they already exist (they were created during mkdir) we just need
to splice in inodes and dentries for them. But for that we shouldn't
check permissions on the directory again. Because we've done that
already correctly when the VFS called may_lookup().

IOW, the inode_permission() in lookup_one_len() that eventfs does is
redundant and just wrong.

Luckily, I don't think we need to even change anything because all
directories that eventfs creates always grant exec permissions to the
other group so lookup_one_len() will trivially succeed. IIUC.

Drafted-by-with-no-guarantees-whatsoever-that-this-wont-burn-the-house-down: 
Christian Brauner 
---
 fs/tracefs/event_inode.c |   8 +-
 fs/tracefs/inode.c   |  38 +++--
 fs/tracefs/internal.h|   3 +-
 include/linux/tracefs.h  |  20 +--
 kernel/trace/ftrace.c|  43 +++---
 kernel/trace/trace.c | 201 ++-
 kernel/trace/trace.h |  22 +--
 kernel/trace/trace_dynevent.c|   2 +-
 kernel/trace/trace_events.c  |  27 ++--
 kernel/trace/trace_events_synth.c|   5 +-
 kernel/trace/trace_functions.c   |   6 +-
 kernel/trace/trace_functions_graph.c |   4 +-
 kernel/trace/trace_hwlat.c   |   8 +-
 kernel/trace/trace_kprobe.c  |   4 +-
 kernel/trace/trace_osnoise.c |  48 ---
 kernel/trace/trace_printk.c  |   4 +-
 kernel/trace/trace_stack.c   |  11 +-
 kernel/trace/trace_stat.c|   6 +-
 kernel/trace/trace_uprobe.c  |   4 +-
 19 files changed, 251 insertions(+), 213 deletions(-)

diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
index 2ccc849a5bda..e2f352bd8779 100644
--- a/fs/tracefs/event_inode.c
+++ b/fs/tracefs/event_inode.c
@@ -852,11 +852,11 @@ struct eventfs_inode *eventfs_create_dir(const char 
*name, struct eventfs_inode
  *
  * See eventfs_create_dir() for use of @entries.
  */
-struct eventfs_inode *eventfs_create_events_dir(const char *name, struct 
dentry *parent,
-   const struct eventfs_entry 
*entries,
-   int size, void *data)
+struct eventfs_inode *eventfs_create_events_dir(
+   struct mnt_idmap *idmap, const char *name, struct dentry *parent,
+   const struct eventfs_entry *entries, int size, void *data)
 {
-   struct dentry *dentry = tracefs_start_creating(name, parent);
+   struct dentry *dentry = tracefs_start_creating(idmap, name, parent);
struct eventfs_inode *ei;
struct tracefs_inode *ti;
struct inode *inode;
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index ae648deed019..f4f4904eb3a0 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -68,7 +68,7 @@ static const struct file_operations tracefs_file_operations = 
{
 };
 
 static struct tracefs_dir_ops {
-   int (*mkdir)(const char *name);
+   int (*mkdir)(struct mnt_idmap *idmap, const char *name);
int (*rmdir)(const char *name);
 } tracefs_ops __ro_after_init;
 
@@ -104,7 +104,7 @@ static int tracefs_syscall_mkdir(struct mnt_idmap *idmap,
 * mkdir routine to handle races.
 */
inode_unlock(inode);
-   ret = tracefs_ops.mkdir(na

Re: [PATCH] tracefs/eventfs: Use root and instance inodes as default ownership

2024-01-05 Thread Christian Brauner

On Wed, Jan 03, 2024 at 08:32:46PM -0500, Steven Rostedt wrote:
> From: "Steven Rostedt (Google)" 
> 
> Instead of walking the dentries on mount/remount to update the gid values of
> all the dentries if a gid option is specified on mount, just update the root
> inode. Add .getattr, .setattr, and .permissions on the tracefs inode
> operations to update the permissions of the files and directories.
> 
> For all files and directories in the top level instance:
> 
>  /sys/kernel/tracing/*
> 
> It will use the root inode as the default permissions. The inode that
> represents: /sys/kernel/tracing (or wherever it is mounted).
> 
> When an instance is created:
> 
>  mkdir /sys/kernel/tracing/instance/foo
> 
> The directory "foo" and all its files and directories underneath will use
> the default of what foo is when it was created. A remount of tracefs will
> not affect it.

That kinda sounds like eventfs should actually be a separate filesystem.
But I don't know enough about the relationship between the two concepts.

> 
> If a user were to modify the permissions of any file or directory in
> tracefs, it will also no longer be modified by a change in ownership of a
> remount.

Very odd semantics and I would recommend to avoid that. It's just plain
weird imo.

> 
> The events directory, if it is in the top level instance, will use the
> tracefs root inode as the default ownership for itself and all the files and
> directories below it.
> 
> For the events directory in an instance ("foo"), it will keep the ownership
> of what it was when it was created, and that will be used as the default
> ownership for the files and directories beneath it.
> 
> Link: 
> https://lore.kernel.org/linux-trace-kernel/CAHk-=wjvdgkjdxbbvln2wbznqp4ush46e3gqj9m7ug6dpx2...@mail.gmail.com/
> 
> Signed-off-by: Steven Rostedt (Google) 
> ---

So tracefs supports remounting with different uid/gid mount options and
then actually wades through _all_ of the inodes and changes their
ownership internally? What's the use-case for this? Containers?

Aside from optimizing this and the special semantics for this eventfs
stuff that you really should think twice about doing, here's one idea for
an extension that might alleviate some of the pain:

If you need flexible dynamic ownership change to e.g., be able to
delegate (all, a directory, a single file of) tracefs to
unprivileged/containers/whatever then you might want to consider
supporting idmapped mounts for tracefs. Because then you can do stuff
like:

user1@localhost:~/data/scripts$ sudo mount --bind -o X-mount.idmap='g:0:1000:1 
u:0:1234:1' /run/ /mnt
user1@localhost:~/data/scripts$ ls -ln /run/
total 12
drwxr-xr-x  2 0  0   40 Jan  5 12:12 credentials
drwx--  2 0  0   40 Jan  5 11:57 cryptsetup
drwxr-xr-x  2 0  0   60 Jan  5 11:57 dbus
drwx--  6 0  0  280 Jan  5 11:57 incus_agent
prw---  1 0  00 Jan  5 11:57 initctl
drwxrwxrwt  4 0  0   80 Jan  5 11:57 lock
drwxr-xr-x  3 0  0   60 Jan  5 11:57 log
drwx--  2 0  0   40 Jan  5 11:57 lvm
-r--r--r--  1 0  0   33 Jan  5 11:57 machine-id
-rw-r--r--  1 0  0  101 Jan  5 11:58 motd.dynamic
drwxr-xr-x  2 0  0   40 Jan  5 11:57 mount
drwx--  2 0  0   40 Jan  5 11:57 multipath
drwxr-xr-x  2 0  0   40 Jan  5 11:57 sendsigs.omit.d
lrwxrwxrwx  1 0  08 Jan  5 11:57 shm -> /dev/shm
drwx--x--x  2 0  0   40 Jan  5 11:57 sudo
drwxr-xr-x 24 0  0  660 Jan  5 14:30 systemd
drwxr-xr-x  6 0  0  140 Jan  5 14:30 udev
drwxr-xr-x  4 0  0   80 Jan  5 11:58 user
-rw-rw-r--  1 0 43 2304 Jan  5 15:15 utmp

user1@localhost:~/data/scripts$ ls -ln /mnt/
total 12
drwxr-xr-x  2 1234  1000   40 Jan  5 12:12 credentials
drwx--  2 1234  1000   40 Jan  5 11:57 cryptsetup
drwxr-xr-x  2 1234  1000   60 Jan  5 11:57 dbus
drwxr-xr-x  2 1234  1000   40 Jan  5 11:57 incus_agent
prw---  1 1234  10000 Jan  5 11:57 initctl
drwxr-xr-x  2 1234  1000   40 Jan  5 11:57 lock
drwxr-xr-x  3 1234  1000   60 Jan  5 11:57 log
drwx--  2 1234  1000   40 Jan  5 11:57 lvm
-r--r--r--  1 1234  1000   33 Jan  5 11:57 machine-id
-rw-r--r--  1 1234  1000  101 Jan  5 11:58 motd.dynamic
drwxr-xr-x  2 1234  1000   40 Jan  5 11:57 mount
drwx--  2 1234  1000   40 Jan  5 11:57 multipath
drwxr-xr-x  2 1234  1000   40 Jan  5 11:57 sendsigs.omit.d
lrwxrwxrwx  1 1234  10008 Jan  5 11:57 shm -> /dev/shm
drwx--x--x  2 1234  1000   40 Jan  5 11:57 sudo
drwxr-xr-x 24 1234  1000  660 Jan  5 14:30 systemd
drwxr-xr-x  6 1234  1000  140 Jan  5 14:30 udev
drwxr-xr-x  4 1234  1000   80 Jan  5 11:58 user
-rw-rw-r--  1 1234 65534 2304 Jan  5 15:15 utmp

Where you can see that ownership of this tmpfs instance in this example
is changed. I'm not trying to advocate here but this will probably
ultimately be nicer for your users because it means that a container
manager or whatever can be handed a part of tracefs (or all of it) and
the ownership and access rights for that thing is correct. And you can
get rid of that gid based access completely.

You can change uids, gids, or both. You can

Re: [PATCH] fs : Fix warning using plain integer as NULL

2023-11-08 Thread Christian Brauner

On Wed, 08 Nov 2023 10:15:50 +0530, Abhinav Singh wrote:
> Sparse static analysis tools generate a warning with this message
> "Using plain integer as NULL pointer". In this case this warning is
> being shown because we are trying to initialize  pointer to NULL using
> integer value 0.
> 
> 

Applied to the vfs.misc branch of the vfs/vfs.git tree.
Patches in the vfs.misc branch should appear in linux-next soon.

Please report any outstanding bugs that were missed during review in a
new review to the original patch series allowing us to drop it.

It's encouraged to provide Acked-bys and Reviewed-bys even though the
patch has now been applied. If possible patch trailers will be updated.

Note that commit hashes shown below are subject to change due to rebase,
trailer updates or similar. If in doubt, please check the listed branch.

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git
branch: vfs.misc

[1/1] fs : Fix warning using plain integer as NULL
  https://git.kernel.org/vfs/vfs/c/372bfbd2ea43

Re: [PATCH 03/19] fs: release anon dev_t in deactivate_locked_super

2023-09-15 Thread Christian Brauner

> Lifetime rules for fs-private parts of superblock are really private to

Fine, I'll drop that. It's still correct that a filesystem needs to take
care when it frees sb->s_fs_info. See the RCU fun you just encountered.

Re: [PATCH 03/19] fs: release anon dev_t in deactivate_locked_super

2023-09-15 Thread Christian Brauner

> > tree of any filesystem (in-tree one or not) will have to go through the
> > changes and figure out WTF to do with their existing code.  We are
> > going to play whack-a-mole for at least several years as development
> > branches get rebased and merged.
> 
> Let me write something up.

So here I've written two porting.rst patches that aim to reflect the
current state of things (They do _not_ reflect what's in Christoph's
series here as that's again pretty separate and will require additional
spelling out.).

I'm adding explanation for both the old and new logic fwiw. I hope to
upstream these docs soon so we all have something to point to.

>From 200666901f53db74edf309d48e3c74fd275a822a Mon Sep 17 00:00:00 2001
From: Christian Brauner 
Date: Fri, 15 Sep 2023 16:01:02 +0200
Subject: [PATCH 1/2] porting: document new block device opening order

Signed-off-by: Christian Brauner 
---
 Documentation/filesystems/porting.rst | 24 
 1 file changed, 24 insertions(+)

diff --git a/Documentation/filesystems/porting.rst 
b/Documentation/filesystems/porting.rst
index deac4e973ddc..f436b64b77bf 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -949,3 +949,27 @@ mmap_lock held.  All in-tree users have been audited and 
do not seem to
 depend on the mmap_lock being held, but out of tree users should verify
 for themselves.  If they do need it, they can return VM_FAULT_RETRY to
 be called with the mmap_lock held.
+
+---
+
+**mandatory**
+
+The order of opening block devices and matching or creating superblocks has
+changed.
+
+The old logic opened block devices first and then tried to find a
+suitable superblock to reuse based on the block device pointer.
+
+The new logic finds or creates a superblock first, opening block devices
+afterwards. Since opening block devices cannot happen under s_umount because of
+lock ordering requirements s_umount is now dropped while opening block
+devices and reacquired before calling fill_super().
+
+In the old logic concurrent mounters would find the superblock on the list of
+active superblock for the filesystem type. Since the first opener of the block
+device would hold s_umount they would wait until the superblock became either
+born or died prematurely due to initialization failure.
+
+Since the new logic drops s_umount concurrent mounters could grab s_umount and
+would spin. Instead they are now made to wait using an explicit wait-wake
+mechanism without having to hold s_umount.
-- 
2.34.1

>From 1f09898322b4402219d8d3219d399c9e56a76bae Mon Sep 17 00:00:00 2001
From: Christian Brauner 
Date: Fri, 15 Sep 2023 16:01:40 +0200
Subject: [PATCH 2/2] porting: document superblock as block device holder

Signed-off-by: Christian Brauner 
---
 Documentation/filesystems/porting.rst | 79 +++
 1 file changed, 79 insertions(+)

diff --git a/Documentation/filesystems/porting.rst 
b/Documentation/filesystems/porting.rst
index f436b64b77bf..fefefaf289b4 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -973,3 +973,82 @@ born or died prematurely due to initialization failure.
 Since the new logic drops s_umount concurrent mounters could grab s_umount and
 would spin. Instead they are now made to wait using an explicit wait-wake
 mechanism without having to hold s_umount.
+
+---
+
+**mandatory**
+
+The holder of a block device is now the superblock.
+
+The holder of a block device used to be the file_system_type which wasn't
+particularly useful. It wasn't possible to go from block device to owning
+superblock without matching on the device pointer stored in the superblock.
+This mechanism would only work for a single device so the block layer couldn't
+find the owning superblock associated with additional devices.
+
+In the old mechanism reusing or creating a superblock for racing mount(2) and
+umount(2) relied on the file_system_type as the holder. This was severely
+underdocumented however:
+
+(1) If the concurrent mount(2) managed to grab an active reference before the
+umount(2) dropped the last active reference in deactivate_locked_super()
+the mounter would simply reuse the existing superblock.
+
+(2) If the mounter came after deactivate_locked_super() but before
+the superblock had been removed from the list of superblocks of the
+filesystem type the mounter would wait until the superblock was shutdown
+and allocated a new superblock.
+
+(3) If the mounter came after deactivate_locked_super() and after
+the superblock had been removed from the list of superblocks of the
+filesystem type the mounter would allocate a new superblock.
+
+Because the holder of the block device was the filesystem type any concurrent
+mounter could open the block device without risking seeing EBUSY because the
+block device was still in use.
+
+Making the superblock the owner of the block device changes

Re: [PATCH 03/19] fs: release anon dev_t in deactivate_locked_super

2023-09-15 Thread Christian Brauner

On Thu, Sep 14, 2023 at 05:58:05PM +0100, Al Viro wrote:
> On Thu, Sep 14, 2023 at 04:02:25PM +0200, Christian Brauner wrote:
> 
> > Yes, you're right that making the superblock and not the filesystem type
> > the bd_holder changes the logic and we are aware of that of course. And
> > it requires changes such as moving additional block device closing from
> > where some callers currently do it.
> 
> Details, please?

Filesystems like xfs and ext4 that closed additional block devices (For
example, the logdev= mount option for xfs.) in put_super() could go
through stuff like:

blkdev_put()
-> bdev->bd_disk->fops->release() == lo_release()
   -> __loop_clr_fd()
  -> disk_force_media_change()
 -> __invalidate_device()
-> get_super()

which wouldn't have been a problem before because get_super() matched on
sb->s_bdev which obviously doesn't work because a log device or rt
device or whatever isn't the main block device. So we couldn't have
deadlocked.

But the fact that it is called in that manner from that place in the
first place is wildly adventurous especially considering that there
isn't __a single comment__ in that code why that is safe. So good luck
figuring this all out.

Now that we don't have to do that s_bdev matching thing anymore because
we directly associate the superblock with the block device we can go
straight from block device to superblock. But now calling blkdev_put()
under put_super() which holds s_umount could deadlock. So it's moved to
kill_sb where it should've always been called. Even without the
potential deadlock in the new scheme that's cleaner and easier to
understand imho and it just works for any block device.

> Note that Christoph's series has mashed (2) and (3) together, resulting
> in UAF in a bunch of places.  And I'm dead serious about

Yes, that I did fix as far as I'm aware. If the rules for where and when
something is freed had been written down, we would've had an easier
time figuring this out though. But they weren't, so we missed it.

> Documentation/filesystems/porting being the right place; any development

Yes, agreed. I'll write a document for Christoph's next version.

I know that what you're saying is roughly that we shouldn't make the
same mistakes as were made before but the fact that the old lifetime
rules weren't documented in any meaningful way and now we get grumbled
at in turn makes me grumble a bit. :) But overall point duly taken.

> tree of any filesystem (in-tree one or not) will have to go through the
> changes and figure out WTF to do with their existing code.  We are
> going to play whack-a-mole for at least several years as development
> branches get rebased and merged.

Let me write something up.

> 
> Incidentally, I'm going to add a (belated by 10 years) chunk in porting.rst
> re making sure that anything in superblock that might be needed by methods
> called in RCU mode should *not* be freed without an RCU delay...  Should've
> done that back in 3.12 merge window when RCU'd vfsmounts went in; as it
> is, today we have several filesystems with exact same kind of breakage.
> hfsplus and affs breakage had been there in 3.13 (missed those two), exfat
> and ntfs3 - introduced later, by initial merges of filesystems in question.
> Missed on review...

Cool, thanks for adding that.

Re: [PATCH 03/19] fs: release anon dev_t in deactivate_locked_super

2023-09-15 Thread Christian Brauner

On Thu, Sep 14, 2023 at 08:23:31PM +0100, Al Viro wrote:
> On Thu, Sep 14, 2023 at 05:58:05PM +0100, Al Viro wrote:
> 
> > Incidentally, I'm going to add a (belated by 10 years) chunk in porting.rst
> > re making sure that anything in superblock that might be needed by methods
> > called in RCU mode should *not* be freed without an RCU delay...  Should've
> > done that back in 3.12 merge window when RCU'd vfsmounts went in; as it
> > is, today we have several filesystems with exact same kind of breakage.
> > hfsplus and affs breakage had been there in 3.13 (missed those two), exfat
> > and ntfs3 - introduced later, by initial merges of filesystems in question.
> > Missed on review...
> > 
> > Hell knows - perhaps Documentation/filesystems/whack-a-mole might be a good
> > idea...

pitfalls.rst or common-bugs.rst

or something like that.

> 
> Actually, utf8 casefolding stuff also has the same problem, so ext4 and f2fs
> with casefolding are also affected ;-/

Re: [PATCH 03/19] fs: release anon dev_t in deactivate_locked_super

2023-09-14 Thread Christian Brauner

> Christoph, could you explain what the hell do we need that for?  It does
> create the race in question and AFAICS 2c18a63b760a (and followups trying
> to plug holes in it) had been nothing but headache.
> 
> Old logics: if mount attempt with a different fs type happens, -EBUSY
> is precisely correct - we would've gotten just that if mount() came
> before umount().  If the type matches, we might
>   1) come before deactivate_locked_super() by umount(2).
> No problem, we succeed.
>   2) come after the beginning of shutdown, but before the
> removal from the list; fine, we'll wait for the sucker to be
> unlocked (which happens in the end of generic_shutdown_super()),
> notice it's dead and create a new superblock.  Since the only
> part left on the umount side is closing the device, we are
> just fine.
>   3) come after the removal from the list.  So we won't
> wait for the old superblock to be unlocked, other than that
> it's exactly the same as (2).  It doesn't matter whether we
> open the device before or after close by umount - same owner
> anyway, no -EBUSY.
> 
> Your "owner shall be the superblock" breaks that...
> 
> If you want to mess with _three_-way split of ->kill_sb(),
> please start with writing down the rules re what should
> go into each of those parts; such writeup should go into
> Documentation/filesystems/porting anyway, even if the
> split is a two-way one, BTW.

Hm, I think that characterization of Christoph's changes is a bit harsh.

Yes, you're right that making the superblock and not the filesystem type
the bd_holder changes the logic and we are aware of that of course. And
it requires changes such as moving additional block device closing from
where some callers currently do it.

But the filesystem type is not a very useful holder itself and has other
drawbacks. The obvious one being that it requires us to wade through all
superblocks on the system trying to find the superblock associated with
a given block device continuously grabbing and dropping sb_lock and
s_umount. None of that is very pleasant nor elegant and it is for sure
not very easy to understand (Plus, it's broken for btrfs freezing and
syncing via block level ioctls.).

Using the superblock as holder makes this go away and is overall a lot
more useful and intuitive and can be extended to filesystems with
multiple devices (Of which we apparently are bound to get more.).

So I think this change is worth the pain.

It's a fair point that these lifetime rules should be documented in
Documentation/filesystems/. The old lifetime documentation is too sparse
to be useful though.

Re: [PATCH 03/19] fs: release anon dev_t in deactivate_locked_super

2023-09-14 Thread Christian Brauner

> BTW, this part of commit message in 2c18a63b760a is rather confused:
> Recent rework moved block device closing out of sb->put_super() and into
> sb->kill_sb() to avoid deadlocks as s_umount is held in put_super() and
> blkdev_put() can end up taking s_umount again.
> 
> That was *NOT* what a recent rework had done.  Block device closing had never
> been inside ->put_super() - at no point since that (closing, that is) had been
> introduced back in 0.97 ;-)  ->put_super() predates it (0.95c+).

I think the commit message probably just isn't clear enough. The main
block device of a superblock isn't closed in sb->put_super(). That's
always been closed in kill_block_super() after generic_shutdown_super().

But afaict filesystems like ext4 and xfs may have additional block
devices open exclusively and closed them in sb->put_super():

xfs_fs_put_super()
-> xfs_close_devices()
   -> xfs_blkdev_put()
  -> blkdev_put()

ext4_put_super()
-> ext4_blkdev_remove()
   -> blkdev_put()

Re: [PATCH 01/19] fs: reflow deactivate_locked_super

2023-09-13 Thread Christian Brauner

On Wed, Sep 13, 2023 at 08:09:55AM -0300, Christoph Hellwig wrote:
> Return early for the case where the super block isn't cleaned up to
> reduce level of indentation.
> 
> Signed-off-by: Christoph Hellwig 
> ---
>  fs/super.c | 35 ++-
>  1 file changed, 18 insertions(+), 17 deletions(-)
> 
> diff --git a/fs/super.c b/fs/super.c
> index 2d762ce67f6e6c..127a17d958a482 100644
> --- a/fs/super.c
> +++ b/fs/super.c
> @@ -476,27 +476,28 @@ static void kill_super_notify(struct super_block *sb)
>  void deactivate_locked_super(struct super_block *s)

I wouldn't mind s/s/sb/ here as well. So we stop using @s in some and
@sb in other places.

Otherwise looks good to me,
Reviewed-by: Christian Brauner

Re: [PATCH v3.4] capabilities: require CAP_SETFCAP to map uid 0

2021-04-20 Thread Christian Brauner

On Tue, Apr 20, 2021 at 08:43:34AM -0500, Serge Hallyn wrote:
> cap_setfcap is required to create file capabilities.
> 
> Since 8db6c34f1dbc ("Introduce v3 namespaced file capabilities"), a
> process running as uid 0 but without cap_setfcap is able to work around
> this as follows: unshare a new user namespace which maps parent uid 0
> into the child namespace.  While this task will not have new
> capabilities against the parent namespace, there is a loophole due to
> the way namespaced file capabilities are represented as xattrs.  File
> capabilities valid in userns 1 are distinguished from file capabilities
> valid in userns 2 by the kuid which underlies uid 0.  Therefore the
> restricted root process can unshare a new self-mapping namespace, add a
> namespaced file capability onto a file, then use that file capability in
> the parent namespace.
> 
> To prevent that, do not allow mapping parent uid 0 if the process which
> opened the uid_map file does not have CAP_SETFCAP, which is the capability
> for setting file capabilities.
> 
> As a further wrinkle:  a task can unshare its user namespace, then
> open its uid_map file itself, and map (only) its own uid.  In this
> case we do not have the credential from before unshare,  which was
> potentially more restricted.  So, when creating a user namespace, we
> record whether the creator had CAP_SETFCAP.  Then we can use that
> during map_write().
> 
> With this patch:
> 
> 1. Unprivileged user can still unshare -Ur
> 
> ubuntu@caps:~$ unshare -Ur
> root@caps:~# logout
> 
> 2. Root user can still unshare -Ur
> 
> ubuntu@caps:~$ sudo bash
> root@caps:/home/ubuntu# unshare -Ur
> root@caps:/home/ubuntu# logout
> 
> 3. Root user without CAP_SETFCAP cannot unshare -Ur:
> 
> root@caps:/home/ubuntu# /sbin/capsh --drop=cap_setfcap --
> root@caps:/home/ubuntu# /sbin/setcap cap_setfcap=p /sbin/setcap
> unable to set CAP_SETFCAP effective capability: Operation not permitted
> root@caps:/home/ubuntu# unshare -Ur
> unshare: write failed /proc/self/uid_map: Operation not permitted
> 
> Note: an alternative solution would be to allow uid 0 mappings by
> processes without CAP_SETFCAP, but to prevent such a namespace from
> writing any file capabilities.  This approach can be seen here:
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/sergeh/linux.git/log/?h=2021-04-15/setfcap-nsfscaps-v4
> 
> History:
> 
> Commit 95ebabde382 ("capabilities: Don't allow writing ambiguous v3 file
> capabilities") tried to fix the issue by preventing v3 fscaps to be
> written to disk when the root uid would map to the same uid in nested
> user namespaces. This led to regressions for various workloads. For
> example, see [1]. Ultimately this is a valid use-case we have to support
> meaning we had to revert this change in 3b0c2d3eaa83 ("Revert
> 95ebabde382c ("capabilities: Don't allow writing ambiguous v3 file
> capabilities")").
> 
> [1]: https://github.com/containers/buildah/issues/3071
> 
> Signed-off-by: Serge Hallyn 
> Reviewed-by: Andrew G. Morgan 
> Tested-by: Christian Brauner 
> Reviewed-by: Christian Brauner 
> Tested-by: Giuseppe Scrivano 
> Cc: "Eric W. Biederman" 

If there are no objections then Linus can probably just pick up the single
patch here directly:
https://lore.kernel.org/lkml/20210420134334.ga11...@mail.hallyn.com

I'm not sure it's worth waiting and releasing another kernel with this
bug. This tightens the semantics nicely and makes for a simple check at
userns creation time instead of repeatedly checking at setxattr(). With
all the testing done we can be quite confident the risk of regressions
is way lower than the old patch and even if we see one I think this
version of the fix is actually worth the risk.

Christian

Re: [PATCH] capabilities: require CAP_SETFCAP to map uid 0 (v3.3)

2021-04-20 Thread Christian Brauner

On Mon, Apr 19, 2021 at 10:42:08PM -0500, Serge Hallyn wrote:
> On Mon, Apr 19, 2021 at 06:09:11PM +0200, Christian Brauner wrote:
> > On Mon, Apr 19, 2021 at 07:25:14AM -0500, Serge Hallyn wrote:
> > > cap_setfcap is required to create file capabilities.
> > > 
> > > Since 8db6c34f1dbc ("Introduce v3 namespaced file capabilities"), a
> > > process running as uid 0 but without cap_setfcap is able to work around
> > > this as follows: unshare a new user namespace which maps parent uid 0
> > > into the child namespace.  While this task will not have new
> > > capabilities against the parent namespace, there is a loophole due to
> > > the way namespaced file capabilities are represented as xattrs.  File
> > > capabilities valid in userns 1 are distinguished from file capabilities
> > > valid in userns 2 by the kuid which underlies uid 0.  Therefore the
> > > restricted root process can unshare a new self-mapping namespace, add a
> > > namespaced file capability onto a file, then use that file capability in
> > > the parent namespace.
> > > 
> > > To prevent that, do not allow mapping parent uid 0 if the process which
> > > opened the uid_map file does not have CAP_SETFCAP, which is the capability
> > > for setting file capabilities.
> > > 
> > > As a further wrinkle:  a task can unshare its user namespace, then
> > > open its uid_map file itself, and map (only) its own uid.  In this
> > > case we do not have the credential from before unshare,  which was
> > > potentially more restricted.  So, when creating a user namespace, we
> > > record whether the creator had CAP_SETFCAP.  Then we can use that
> > > during map_write().
> > > 
> > > With this patch:
> > > 
> > > 1. Unprivileged user can still unshare -Ur
> > > 
> > > ubuntu@caps:~$ unshare -Ur
> > > root@caps:~# logout
> > > 
> > > 2. Root user can still unshare -Ur
> > > 
> > > ubuntu@caps:~$ sudo bash
> > > root@caps:/home/ubuntu# unshare -Ur
> > > root@caps:/home/ubuntu# logout
> > > 
> > > 3. Root user without CAP_SETFCAP cannot unshare -Ur:
> > > 
> > > root@caps:/home/ubuntu# /sbin/capsh --drop=cap_setfcap --
> > > root@caps:/home/ubuntu# /sbin/setcap cap_setfcap=p /sbin/setcap
> > > unable to set CAP_SETFCAP effective capability: Operation not permitted
> > > root@caps:/home/ubuntu# unshare -Ur
> > > unshare: write failed /proc/self/uid_map: Operation not permitted
> > > 
> > > Note: an alternative solution would be to allow uid 0 mappings by
> > > processes without CAP_SETFCAP, but to prevent such a namespace from
> > > writing any file capabilities.  This approach can be seen here:
> > > 
> > > https://git.kernel.org/pub/scm/linux/kernel/git/sergeh/linux.git/log/?h=2021-04-15/setfcap-nsfscaps-v4
> > > 
> > 
> > Ah, can you link to the previous fix and its revert, please? I think
> > that was mentioned in the formerly private thread as well but we forgot:
> > 
> > commit 95ebabde382c371572297915b104e55403674e73
> > Author: Eric W. Biederman 
> > Date:   Thu Dec 17 09:42:00 2020 -0600
> > 
> > capabilities: Don't allow writing ambiguous v3 file capabilities
> > 
> > commit 3b0c2d3eaa83da259d7726192cf55a137769012f
> > Author: Eric W. Biederman 
> > Date:   Fri Mar 12 15:07:09 2021 -0600
> > 
> > Revert 95ebabde382c ("capabilities: Don't allow writing ambiguous v3 
> > file capabilities")
> 
> Sure.
> 
> Is there a tag for that kind of thing or do I just mention it at the end
> of the description?

In this case it might make sense to just have a little paragraph that
explains the regression. How do you feel about adding?:

  Commit 95ebabde382 ("capabilities: Don't allow writing ambiguous v3 file
  capabilities") tried to fix the issue by preventing v3 fscaps to be
  written to disk when the root uid would map to the same uid in nested
  user namespaces. This led to regressions for various workloads. For
  example, see [1]. Ultimately this is a valid use-case we have to support
  meaning we had to revert this change in 3b0c2d3eaa83 ("Revert
  95ebabde382c ("capabilities: Don't allow writing ambiguous v3 file
  capabilities")").
  
  [1]: https://github.com/containers/buildah/issues/3071

Re: [PATCH] capabilities: require CAP_SETFCAP to map uid 0 (v3.3)

2021-04-19 Thread Christian Brauner

On Mon, Apr 19, 2021 at 07:25:14AM -0500, Serge Hallyn wrote:
> cap_setfcap is required to create file capabilities.
> 
> Since 8db6c34f1dbc ("Introduce v3 namespaced file capabilities"), a
> process running as uid 0 but without cap_setfcap is able to work around
> this as follows: unshare a new user namespace which maps parent uid 0
> into the child namespace.  While this task will not have new
> capabilities against the parent namespace, there is a loophole due to
> the way namespaced file capabilities are represented as xattrs.  File
> capabilities valid in userns 1 are distinguished from file capabilities
> valid in userns 2 by the kuid which underlies uid 0.  Therefore the
> restricted root process can unshare a new self-mapping namespace, add a
> namespaced file capability onto a file, then use that file capability in
> the parent namespace.
> 
> To prevent that, do not allow mapping parent uid 0 if the process which
> opened the uid_map file does not have CAP_SETFCAP, which is the capability
> for setting file capabilities.
> 
> As a further wrinkle:  a task can unshare its user namespace, then
> open its uid_map file itself, and map (only) its own uid.  In this
> case we do not have the credential from before unshare,  which was
> potentially more restricted.  So, when creating a user namespace, we
> record whether the creator had CAP_SETFCAP.  Then we can use that
> during map_write().
> 
> With this patch:
> 
> 1. Unprivileged user can still unshare -Ur
> 
> ubuntu@caps:~$ unshare -Ur
> root@caps:~# logout
> 
> 2. Root user can still unshare -Ur
> 
> ubuntu@caps:~$ sudo bash
> root@caps:/home/ubuntu# unshare -Ur
> root@caps:/home/ubuntu# logout
> 
> 3. Root user without CAP_SETFCAP cannot unshare -Ur:
> 
> root@caps:/home/ubuntu# /sbin/capsh --drop=cap_setfcap --
> root@caps:/home/ubuntu# /sbin/setcap cap_setfcap=p /sbin/setcap
> unable to set CAP_SETFCAP effective capability: Operation not permitted
> root@caps:/home/ubuntu# unshare -Ur
> unshare: write failed /proc/self/uid_map: Operation not permitted
> 
> Note: an alternative solution would be to allow uid 0 mappings by
> processes without CAP_SETFCAP, but to prevent such a namespace from
> writing any file capabilities.  This approach can be seen here:
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/sergeh/linux.git/log/?h=2021-04-15/setfcap-nsfscaps-v4
> 

Ah, can you link to the previous fix and its revert, please? I think
that was mentioned in the formerly private thread as well but we forgot:

commit 95ebabde382c371572297915b104e55403674e73
Author: Eric W. Biederman 
Date:   Thu Dec 17 09:42:00 2020 -0600

capabilities: Don't allow writing ambiguous v3 file capabilities

commit 3b0c2d3eaa83da259d7726192cf55a137769012f
Author: Eric W. Biederman 
Date:   Fri Mar 12 15:07:09 2021 -0600

Revert 95ebabde382c ("capabilities: Don't allow writing ambiguous v3 file 
capabilities")

> Signed-off-by: Serge Hallyn 
> Reviewed-by: Andrew G. Morgan 
> Tested-by: Christian Brauner 
> Reviewed-by: Christian Brauner 
> Cc: "Eric W. Biederman"

Re: [PATCH] capabilities: require CAP_SETFCAP to map uid 0 (v3.2)

2021-04-19 Thread Christian Brauner

On Mon, Apr 19, 2021 at 05:52:39PM +0200, Giuseppe Scrivano wrote:
> ebied...@xmission.com (Eric W. Biederman) writes:
> 
> > Guiseppe can you take a look at this?
> >
> > This is a second attempt at tightening up the semantics of writing to
> > file capabilities from a user namespace.
> >
> > The first attempt was reverted with 3b0c2d3eaa83 ("Revert 95ebabde382c
> > ("capabilities: Don't allow writing ambiguous v3 file capabilities")"),
> > which corrected the issue reported in:
> > https://github.com/containers/buildah/issues/3071
> >
> > There is a report the podman testsuite passes.  While different this
> > looks in many ways much more strict than the code that was reverted.  So
> > while I can imagine this change doesn't cause problems as is, I will be
> > surprised.
> 
> thanks for pulling me in the discussion.
> 
> I've tested the patch with several cases similar to the issue we had in
> the past and the patch seems to work well.  
> 
> Podman creates all the user namespaces within the same parent user
> namespace.  In the parent user namespace all the capabilities are kept
> and AFAIK Docker does the same.  I'd expect a change in behavior only
> for nested user namespaces in containers where CAP_SETFCAP is not
> granted, but that is not a common configuration given that CAP_SETFCAP
> is added by default.

Same for us and we do have extensive nested container workloads with
other runtimes running containers too.

> 
> 
> > "Serge E. Hallyn"  writes:
> >
> >> +/**
> >> + * verify_root_map() - check the uid 0 mapping
> >> + * @file: idmapping file
> >> + * @map_ns: user namespace of the target process
> >> + * @new_map: requested idmap
> >> + *
> >> + * If a process requested a mapping for uid 0 onto uid 0, verify that the
> >> + * process writing the map had the CAP_SETFCAP capability as the target 
> >> process
> >> + * will be able to write fscaps that are valid in ancestor user 
> >> namespaces.
> >> + *
> >> + * Return: true if the mapping is allowed, false if not.
> >> + */
> >> +static bool verify_root_map(const struct file *file,
> >> +  struct user_namespace *map_ns,
> >> +  struct uid_gid_map *new_map)
> >> +{
> >> +  int idx;
> >> +  const struct user_namespace *file_ns = file->f_cred->user_ns;
> >> +  struct uid_gid_extent *extent0 = NULL;
> >> +
> >> +  for (idx = 0; idx < new_map->nr_extents; idx++) {
> >> +  u32 lower_first;
> 
> nit: lower_first seems unused?
> 
> >> +
> >> +  if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
> >> +  extent0 = &new_map->extent[idx];
> >> +  else
> >> +  extent0 = &new_map->forward[idx];
> >> +  if (extent0->lower_first == 0)
> >> +  break;
> >> +
> >> +  extent0 = NULL;
> >> +  }
> 
> Tested-by: Giuseppe Scrivano 

Thanks for running the tests and confirming my results!
Christian

Re: [PATCH] linux/mount.h: Remove duplicate struct declaration

2021-04-19 Thread Christian Brauner

On Mon, Apr 19, 2021 at 07:33:04PM +0800, Wan Jiabing wrote:
> struct path is declared at 85th line.
> The declaration here is unnecessary. Remove it.
> 
> Signed-off-by: Wan Jiabing 
> ---

Looks good,
Reviewed-by: Christian Brauner

Re: [PATCH] capabilities: require CAP_SETFCAP to map uid 0 (v3.2)

2021-04-18 Thread Christian Brauner

On Sat, Apr 17, 2021 at 03:04:34PM -0500, Serge Hallyn wrote:
> A process running as uid 0 but without cap_setfcap currently can simply
> unshare a new user namespace with uid 0 mapped to 0.  While this task
> will not have new capabilities against the parent namespace, there is
> a loophole due to the way namespaced file capabilities work.  File
> capabilities valid in userns 1 are distinguished from file capabilities
> valid in userns 2 by the kuid which underlies uid 0.  Therefore
> the restricted root process can unshare a new self-mapping namespace,
> add a namespaced file capability onto a file, then use that file
> capability in the parent namespace.
> 
> To prevent that, do not allow mapping uid 0 if the process which
> opened the uid_map file does not have CAP_SETFCAP, which is the capability
> for setting file capabilities.
> 
> A further wrinkle:  a task can unshare its user namespace, then
> open its uid_map file itself, and map (only) its own uid.  In this
> case we do not have the credential from before unshare,  which was
> potentially more restricted.  So, when creating a user namespace, we
> record whether the creator had CAP_SETFCAP.  Then we can use that
> during map_write().
> 
> With this patch:
> 
> 1. unprivileged user can still unshare -Ur
> 
> ubuntu@caps:~$ unshare -Ur
> root@caps:~# logout
> 
> 2. root user can still unshare -Ur
> 
> ubuntu@caps:~$ sudo bash
> root@caps:/home/ubuntu# unshare -Ur
> root@caps:/home/ubuntu# logout
> 
> 3. root user without CAP_SETFCAP cannot unshare -Ur:
> 
> root@caps:/home/ubuntu# /sbin/capsh --drop=cap_setfcap --
> root@caps:/home/ubuntu# /sbin/setcap cap_setfcap=p /sbin/setcap
> unable to set CAP_SETFCAP effective capability: Operation not permitted
> root@caps:/home/ubuntu# unshare -Ur
> unshare: write failed /proc/self/uid_map: Operation not permitted
> 
> Signed-off-by: Serge Hallyn 
> 
> Changelog:
>* fix logic in the case of writing to another task's uid_map
>* rename 'ns' to 'map_ns', and make a file_ns local variable
>* use /* comments */
>* update the CAP_SETFCAP comment in capability.h
>* rename parent_unpriv to parent_can_setfcap (and reverse the
>  logic)
>* remove printks
>* clarify (i hope) the code comments
>* update capability.h comment
>* renamed parent_can_setfcap to parent_could_setfcap
>* made the check its own disallowed_0_mapping() fn
>* moved the check into new_idmap_permitted
>* rename disallowed_0_mapping to verify_root_mapping
>* change verify_root_mapping to Christian's suggested flow
> ---

Thank you. This looks good. I tested this with:

- fstests
- LXD testsuite
- Podman testsuite
- libcap testsuite

Tested-by: Christian Brauner 
Reviewed-by: Christian Brauner

Re: [RFC PATCH] capabilities: require CAP_SETFCAP to map uid 0 (v3)

2021-04-16 Thread Christian Brauner

On Thu, Apr 15, 2021 at 11:58:51PM -0500, Serge Hallyn wrote:
> (Eric - this patch (v3) is a cleaned up version of the previous approach.
> v4 is at 
> https://git.kernel.org/pub/scm/linux/kernel/git/sergeh/linux.git/log/?h=2021-04-15/setfcap-nsfscaps-v4
> and is the approach you suggested.  I can send it also as a separate patch
> if you like)
> 
> A process running as uid 0 but without cap_setfcap currently can simply
> unshare a new user namespace with uid 0 mapped to 0.  While this task
> will not have new capabilities against the parent namespace, there is
> a loophole due to the way namespaced file capabilities work.  File
> capabilities valid in userns 1 are distinguished from file capabilities
> valid in userns 2 by the kuid which underlies uid 0.  Therefore
> the restricted root process can unshare a new self-mapping namespace,
> add a namespaced file capability onto a file, then use that file
> capability in the parent namespace.
> 
> To prevent that, do not allow mapping uid 0 if the process which
> opened the uid_map file does not have CAP_SETFCAP, which is the capability
> for setting file capabilities.
> 
> A further wrinkle:  a task can unshare its user namespace, then
> open its uid_map file itself, and map (only) its own uid.  In this
> case we do not have the credential from before unshare,  which was
> potentially more restricted.  So, when creating a user namespace, we
> record whether the creator had CAP_SETFCAP.  Then we can use that
> during map_write().
> 
> With this patch:
> 
> 1. unprivileged user can still unshare -Ur
> 
> ubuntu@caps:~$ unshare -Ur
> root@caps:~# logout
> 
> 2. root user can still unshare -Ur
> 
> ubuntu@caps:~$ sudo bash
> root@caps:/home/ubuntu# unshare -Ur
> root@caps:/home/ubuntu# logout
> 
> 3. root user without CAP_SETFCAP cannot unshare -Ur:
> 
> root@caps:/home/ubuntu# /sbin/capsh --drop=cap_setfcap --
> root@caps:/home/ubuntu# /sbin/setcap cap_setfcap=p /sbin/setcap
> unable to set CAP_SETFCAP effective capability: Operation not permitted
> root@caps:/home/ubuntu# unshare -Ur
> unshare: write failed /proc/self/uid_map: Operation not permitted
> 
> Signed-off-by: Serge Hallyn 
> 
> Changelog:
>* fix logic in the case of writing to another task's uid_map
>* rename 'ns' to 'map_ns', and make a file_ns local variable
>* use /* comments */
>* update the CAP_SETFCAP comment in capability.h
>* rename parent_unpriv to parent_can_setfcap (and reverse the
>  logic)
>* remove printks
>* clarify (i hope) the code comments
>* update capability.h comment
>* renamed parent_can_setfcap to parent_could_setfcap
>* made the check its own disallowed_0_mapping() fn
>* moved the check into new_idmap_permitted
> ---

Thank you for working on this fix!

I do prefer your approach of doing the check at user namespace creation
time instead of moving it into the setxattr() codepath.

Let me reiterate that the ability to write through fscaps is a valid
use-case and this should continue to work, but for locked-down user
namespaces, as Andrew wants to use them, your patch provides a clean
solution.
We are using identity mappings in quite a few scenarios, partly
when performing tests but also to write through fscaps.
We also had reports of users that use identity mappings. They create
their rootfs by running image extraction in an identity mapped userns
where fscaps are written through.
Podman has use-cases for this feature as well and has been affected by
the regression of the first fix.

>  include/linux/user_namespace.h  |  3 ++
>  include/uapi/linux/capability.h |  3 +-
>  kernel/user_namespace.c | 61 +++--
>  3 files changed, 63 insertions(+), 4 deletions(-)
> 
> diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
> index 64cf8ebdc4ec..f6c5f784be5a 100644
> --- a/include/linux/user_namespace.h
> +++ b/include/linux/user_namespace.h
> @@ -63,6 +63,9 @@ struct user_namespace {
>   kgid_t  group;
>   struct ns_commonns;
>   unsigned long   flags;
> + /* parent_could_setfcap: true if the creator if this ns had CAP_SETFCAP
> +  * in its effective capability set at the child ns creation time. */
> + boolparent_could_setfcap;
>  
>  #ifdef CONFIG_KEYS
>   /* List of joinable keyrings in this namespace.  Modification access of
> diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
> index c6ca33034147..2ddb4226cd23 100644
> --- a/include/uapi/linux/capability.h
> +++ b/include/uapi/linux/capability.h
> @@ -335,7 +335,8 @@ struct vfs_ns_cap_data {
>  
>  #define CAP_AUDIT_CONTROL30
>  
> -/* Set or remove capabilities on files */
> +/* Set or remove capabilities on files.
> +   Map uid=0 into a child user namespace. */
>  
>  #define CAP_SETFCAP   31
>  
> diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
> index af612945a4d0..8c75028a9aae 100644
>

Re: [PATCH] fs: split receive_fd_replace from __receive_fd

2021-04-16 Thread Christian Brauner

On Fri, Apr 16, 2021 at 04:15:43AM +0000, Al Viro wrote:
> On Fri, Apr 02, 2021 at 12:01:05PM -0700, Kees Cook wrote:
> > On Thu, Mar 25, 2021 at 09:22:09AM +0100, Christoph Hellwig wrote:
> > > receive_fd_replace shares almost no code with the general case, so split
> > > it out.  Also remove the "Bump the sock usage counts" comment from
> > > both copies, as that is now what __receive_sock actually does.
> > > 
> > > Signed-off-by: Christoph Hellwig 
> > 
> > I'm okay with repeating code in fs/file.c. What I wanted to avoid was
> > open coded combinations in various callers.
> 
> ... and that got you a lovely userland ABI, where you have
> 
>   (1) newfd >= 0, SECCOMP_ADDFD_FLAG_SETFD is present => replace
>   (2) newfd < 0, SECCOMP_ADDFD_FLAG_SETFD is present => insert
>   (3) newfd == 0, SECCOMP_ADDFD_FLAG_SETFD not present => insert
>   (4) newfd != 0, SECCOMP_ADDFD_FLAG_SETFD not present => -EINVAL
> 
> IMO (2) is a bug.  Whether we still can fix it or not... no idea, depends
> on whether the actual userland has come to depend upon it.

The only users actively making use of this right now are more or less
projects I maintain. There's a proposal to make that API part of another
project but they can just adapt to the new behavior too since there's no
released version. So we could just risk that change.

> 
> I suggest turning (2) into an error (-EBADF is what you'd get from
> attempt to set something at such descriptor) and seeing if anything
> breaks.  And having SECCOMP_ADDFD_FLAG_SETFD status passed into kaddfd
> explicitly, with explicit check in seccomp_handle_addfd().  As in
> 
> commit 42eb0d54c08a0331d6d295420f602237968d792b
> Author: Christoph Hellwig 
> Date:   Thu Mar 25 09:22:09 2021 +0100
> 
> fs: split receive_fd_replace from __receive_fd
> 
> receive_fd_replace shares almost no code with the general case, so split
> it out.  Also remove the "Bump the sock usage counts" comment from
> both copies, as that is now what __receive_sock actually does.
> 
> [AV: ... and make the only user of receive_fd_replace() choose between
> it and receive_fd() according to what userland had passed to it in
> flags]
> 
> Signed-off-by: Christoph Hellwig 
> Signed-off-by: Al Viro 
> 
> diff --git a/fs/file.c b/fs/file.c
> index f3a4bac2cbe9..d8ccb95a7f41 100644
> --- a/fs/file.c
> +++ b/fs/file.c
> @@ -1068,8 +1068,6 @@ int replace_fd(unsigned fd, struct file *file, unsigned 
> flags)
>  
>  /**
>   * __receive_fd() - Install received file into file descriptor table
> - *
> - * @fd: fd to install into (if negative, a new fd will be allocated)
>   * @file: struct file that was received from another process
>   * @ufd: __user pointer to write new fd number to
>   * @o_flags: the O_* flags to apply to the new fd entry
> @@ -1083,7 +1081,7 @@ int replace_fd(unsigned fd, struct file *file, unsigned 
> flags)
>   *
>   * Returns newly install fd or -ve on error.
>   */
> -int __receive_fd(int fd, struct file *file, int __user *ufd, unsigned int 
> o_flags)
> +int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
>  {
>   int new_fd;
>   int error;
> @@ -1092,32 +1090,33 @@ int __receive_fd(int fd, struct file *file, int 
> __user *ufd, unsigned int o_flag
>   if (error)
>   return error;
>  
> - if (fd < 0) {
> - new_fd = get_unused_fd_flags(o_flags);
> - if (new_fd < 0)
> - return new_fd;
> - } else {
> - new_fd = fd;
> - }
> + new_fd = get_unused_fd_flags(o_flags);
> + if (new_fd < 0)
> + return new_fd;
>  
>   if (ufd) {
>   error = put_user(new_fd, ufd);
>   if (error) {
> - if (fd < 0)
> - put_unused_fd(new_fd);
> + put_unused_fd(new_fd);
>   return error;
>   }
>   }
>  
> - if (fd < 0) {
> - fd_install(new_fd, get_file(file));
> - } else {
> - error = replace_fd(new_fd, file, o_flags);
> - if (error)
> - return error;
> - }
> + fd_install(new_fd, get_file(file));
> + __receive_sock(file);
> + return new_fd;
> +}
>  
> - /* Bump the sock usage counts, if any. */
> +int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags)
> +{
> + int error;
> +
> + error = security_file_receive(file);
> + if (error)
> + return error;
> + error = replace_fd(new_fd, file, o_flags);
> + if (error)
> + return error;
>   __receive_sock(file);
>   return new_fd;
>  }
> diff --git a/include/linux/file.h b/include/linux/file.h
> index 225982792fa2..2de2e4613d7b 100644
> --- a/include/linux/file.h
> +++ b/include/linux/file.h
> @@ -92,23 +92,20 @@ extern void put_unused_fd(unsigned int fd);
>  
>  extern void fd_install(unsigned int fd, struct file *file);

Re: [PATCH] Documentation: syscalls: add a note about ABI-agnostic types

2021-04-14 Thread Christian Brauner

On Wed, Apr 14, 2021 at 12:46:01PM +0300, Mike Rapoport wrote:
> On Wed, Apr 14, 2021 at 10:46:05AM +0200, Christian Brauner wrote:
> > On Wed, Apr 14, 2021 at 08:14:22AM +0200, Mauro Carvalho Chehab wrote:
> > > Em Tue, 13 Apr 2021 21:40:20 -0700
> > > Yury Norov  escreveu:
> > > 
> > > > Ping?
> > > > 
> > > > On Fri, Apr 09, 2021 at 01:43:04PM -0700, Yury Norov wrote:
> > > > > Recently added memfd_secret() syscall had a flags parameter passed
> > > > > as unsigned long, which requires creation of compat entry for it.
> > > > > It was possible to change the type of flags to unsigned int and so
> > > > > avoid bothering with compat layer.
> > > > > 
> > > > > https://www.spinics.net/lists/linux-mm/msg251550.html
> > > > > 
> > > > > Documentation/process/adding-syscalls.rst doesn't point clearly about
> > > > > preference of ABI-agnostic types. This patch adds such notification.
> > > > > 
> > > > > Signed-off-by: Yury Norov 
> > > > > ---
> > > > >  Documentation/process/adding-syscalls.rst | 7 +++
> > > > >  1 file changed, 7 insertions(+)
> > > > > 
> > > > > diff --git a/Documentation/process/adding-syscalls.rst 
> > > > > b/Documentation/process/adding-syscalls.rst
> > > > > index 9af35f4ec728..46add16edf14 100644
> > > > > --- a/Documentation/process/adding-syscalls.rst
> > > > > +++ b/Documentation/process/adding-syscalls.rst
> > > > > @@ -172,6 +172,13 @@ arguments (i.e. parameter 1, 3, 5), to allow use 
> > > > > of contiguous pairs of 32-bit
> > > > >  registers.  (This concern does not apply if the arguments are part 
> > > > > of a
> > > > >  structure that's passed in by pointer.)
> > > > >  
> > > > > +Whenever possible, try to use ABI-agnostic types for passing 
> > > > > parameters to
> > > > > +a syscall in order to avoid creating compat entry for it. Linux 
> > > > > supports two
> > > > > +ABI models - ILP32 and LP64. 
> > > 
> > > > > + The types like ``void *``, ``long``, ``size_t``,
> > > > > +``off_t`` have different size in those ABIs;
> > > 
> > > In the case of pointers, the best is to use __u64. The pointer can then
> > > be read on Kernelspace with something like this:
> > > 
> > >   static inline void __user *media_get_uptr(__u64 arg)
> > >   {
> > >   return (void __user *)(uintptr_t)arg;
> > >   }
> > > 
> > > 
> > > > > types like ``char`` and  ``int``
> > > > > +have the same size and don't require a compat layer support. For 
> > > > > flags, it's
> > > > > +always better to use ``unsigned int``.
> > > > > +
> > > 
> > > I don't think this is true for all compilers on userspace, as the C
> > > standard doesn't define how many bits an int/unsigned int has. 
> > > So, even if this is today's reality, things may change in the future.
> > > 
> > > For instance, I remember we had to replace "int" and "enum" by "__u32" 
> > > and "long" by "__u64" at the media uAPI in the past, when we start
> > > seeing x86_64 Kernels with 32-bits userspace and when cameras started 
> > > being supported on arm32.
> > > 
> > > We did have some real bugs with "enum", as, on that time, some
> > > compilers (gcc, I guess) were optimizing them to have less than
> > > 32 bits on certain architectures, when it fits.
> > 
> > Fwiw, Aleksa and I have written extended syscall documentation
> > documenting the agreement that we came to in a dedicated session with a
> > wide range of kernel folks during Linux Plumbers last year. We simply
> > never had time to actually send this series but fwiw here it is. It also
> > mentions the use of correct types. If people feel it's worth it I can
> > send as a proper series:
> 
> Yes, please.

Ok, I'll try to fix the commit messages and send it out.

Christian

Re: [PATCH] Documentation: syscalls: add a note about ABI-agnostic types

2021-04-14 Thread Christian Brauner

On Wed, Apr 14, 2021 at 08:14:22AM +0200, Mauro Carvalho Chehab wrote:
> Em Tue, 13 Apr 2021 21:40:20 -0700
> Yury Norov  escreveu:
> 
> > Ping?
> > 
> > On Fri, Apr 09, 2021 at 01:43:04PM -0700, Yury Norov wrote:
> > > Recently added memfd_secret() syscall had a flags parameter passed
> > > as unsigned long, which requires creation of compat entry for it.
> > > It was possible to change the type of flags to unsigned int and so
> > > avoid bothering with compat layer.
> > > 
> > > https://www.spinics.net/lists/linux-mm/msg251550.html
> > > 
> > > Documentation/process/adding-syscalls.rst doesn't point clearly about
> > > preference of ABI-agnostic types. This patch adds such notification.
> > > 
> > > Signed-off-by: Yury Norov 
> > > ---
> > >  Documentation/process/adding-syscalls.rst | 7 +++
> > >  1 file changed, 7 insertions(+)
> > > 
> > > diff --git a/Documentation/process/adding-syscalls.rst 
> > > b/Documentation/process/adding-syscalls.rst
> > > index 9af35f4ec728..46add16edf14 100644
> > > --- a/Documentation/process/adding-syscalls.rst
> > > +++ b/Documentation/process/adding-syscalls.rst
> > > @@ -172,6 +172,13 @@ arguments (i.e. parameter 1, 3, 5), to allow use of 
> > > contiguous pairs of 32-bit
> > >  registers.  (This concern does not apply if the arguments are part of a
> > >  structure that's passed in by pointer.)
> > >  
> > > +Whenever possible, try to use ABI-agnostic types for passing parameters 
> > > to
> > > +a syscall in order to avoid creating compat entry for it. Linux supports 
> > > two
> > > +ABI models - ILP32 and LP64. 
> 
> > > + The types like ``void *``, ``long``, ``size_t``,
> > > +``off_t`` have different size in those ABIs;
> 
> In the case of pointers, the best is to use __u64. The pointer can then
> be read on Kernelspace with something like this:
> 
>   static inline void __user *media_get_uptr(__u64 arg)
>   {
>   return (void __user *)(uintptr_t)arg;
>   }
> 
> 
> > > types like ``char`` and  ``int``
> > > +have the same size and don't require a compat layer support. For flags, 
> > > it's
> > > +always better to use ``unsigned int``.
> > > +
> 
> I don't think this is true for all compilers on userspace, as the C
> standard doesn't define how many bits an int/unsigned int has. 
> So, even if this is today's reality, things may change in the future.
> 
> For instance, I remember we had to replace "int" and "enum" by "__u32" 
> and "long" by "__u64" at the media uAPI in the past, when we start
> seeing x86_64 Kernels with 32-bits userspace and when cameras started 
> being supported on arm32.
> 
> We did have some real bugs with "enum", as, on that time, some
> compilers (gcc, I guess) were optimizing them to have less than
> 32 bits on certain architectures, when it fits.

Fwiw, Aleksa and I have written extended syscall documentation
documenting the agreement that we came to in a dedicated session with a
wide range of kernel folks during Linux Plumbers last year. We simply
never had time to actually send this series but fwiw here it is. It also
mentions the use of correct types. If people feel it's worth it I can
send as a proper series:

From 9035258aaa23c5e1bb5bc2242f97221a3e5b9a87 Mon Sep 17 00:00:00 2001
From: Christian Brauner 
Date: Fri, 4 Sep 2020 14:27:47 +0200
Subject: [PATCH 1/6] docs: split extensibility section into two subsections

The section already explains two different formats that are available to
extend a syscall. Move each into its own subsection. This clarifies the
structure and will be useful when we extend each section in follow-up
patches.

Signed-off-by: Christian Brauner 
Signed-off-by: Aleksa Sarai 
Co-developed-by: Aleksa Sarai 
Signed-off-by: Christian Brauner 
---
 Documentation/process/adding-syscalls.rst | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/Documentation/process/adding-syscalls.rst 
b/Documentation/process/adding-syscalls.rst
index 906c47f1a9e5..3853ce57e757 100644
--- a/Documentation/process/adding-syscalls.rst
+++ b/Documentation/process/adding-syscalls.rst
@@ -65,6 +65,9 @@ together with the corresponding follow-up system calls --
 ``pipe``/``pipe2``, ``renameat``/``renameat2`` -- so
 learn from the history of the kernel and plan for extensions from the start.)
 
+Baseline extensibility: adding a flag argument
+~~
+
 For simpler system calls that only take a couple of arguments, the preferred
 way to allow for future extensibility is to inc

Re: [PATCH 1/1] seccomp: Always "goto wait" if the list is empty

2021-04-13 Thread Christian Brauner

On Tue, Apr 13, 2021 at 06:01:51PM +0200, Rodrigo Campos wrote:
> It is possible for the thread with the seccomp filter attached (target)
> to be waken up by an addfd message, but the list be empty. This happens
> when the addfd ioctl on the other side (seccomp agent) is interrupted by
> a signal such as SIGURG. In that case, the target erroneously and
> prematurely returns from the syscall to userspace even though the
> seccomp agent didn't ask for it.
> 
> This happens in the following scenario:
> 
> seccomp_notify_addfd()   | 
> seccomp_do_user_notification()
>  |
>  |  err = 
> wait_for_completion_interruptible();
>  complete(>ready);   |
>  ret = wait_for_completion_interruptible();|
>  // interrupted  |
>  |
>  mutex_lock(>notify_lock);   |
>  list_del(); |
>  mutex_unlock(>notify_lock); |
>  |  
> mutex_lock(>notify_lock);
>  |  // This 
> is false, addfd is false
>  |  if (addfd 
> && n.state != SECCOMP_NOTIFY_REPLIED)
>  |
>  |  ret = 
> n.val;
>  |  err = 
> n.error;
>  |  flags = 
> n.flags;
> 
> So, the process blocked in seccomp_do_user_notification() will see a
> response. As n is 0 initialized and wasn't set, it will see a 0 as
> return value from the syscall.
> 
> The seccomp agent, when retrying the interrupted syscall, will see an
> ENOENT error as the notification no longer exists (it was already
> answered by this bug).
> 
> This patch fixes the issue by splitting the if in two parts: if we were
> woken up and the state is not replied, we will always do a "goto wait".
> And if that happens and there is an addfd element on the list, we will
> add the fd before "goto wait".
> 
> This issue is present since 5.9, when addfd was added.
> 
> Fixes: 7cf97b1254550
> Cc: sta...@vger.kernel.org # 5.9+
> Signed-off-by: Rodrigo Campos 
> ---

So the agent will see the return value from
wait_for_completion_interruptible() and know that the addfd wasn't
successful and the target will notice that no addfd request has actually
been added and essentially try again. Seems like a decent fix and can be
backported cleanly. I assume seccomp testsuite passes.

Acked-by: Christian Brauner

[GIT PULL] close_range fix

2021-04-08 Thread Christian Brauner

Hi Linus,

/* Summary */
Syzbot reported a bug in close_range. Debugging this showed we didn't
recalculate the current maximum fd number for CLOSE_RANGE_UNSHARE |
CLOSE_RANGE_CLOEXEC after we unshared the file descriptors table.

So max_fd could exceed the current fdtable maximum causing us to set excessive
bits. As a concrete example, let's say the user requested everything from fd 4
to ~0UL to be closed and their current fdtable size is 256 with their highest
open fd being 4. With CLOSE_RANGE_UNSHARE the caller will end up with a new
fdtable which has room for 64 file descriptors since that is the lowest fdtable
size we accept. But now max_fd will still point to 255 and needs to be
adjusted. Fix this by retrieving the correct maximum fd value in
__range_cloexec().

I've carried this fix for a little while but since there was no linux-next
release over easter I waited until now.

With this change close_range() can be simplified a bit but imho we are in no
hurry to do that and so I'll defer this for the 5.13 merge window.

(Fwiw, the two follow-up patches sit in
 https://git.kernel.org/brauner/h/fs/close_range.)

/* Testing */
All patches have seen exposure in linux-next and are based on v5.12-rc4.
The selftests pass and the reproducer provided by syzbot did not trigger. The
patch also has a Tested-by from Dmitry but I had already pushed it out by the
time that came in so it's missing from the patch itself.

/* Conflicts */
At the time of creating this pr no merge conflicts were reported. A test merge
and build with today's master 2021-04-08 12:20:00 CET worked fine.

The following changes since commit 0d02ec6b3136c73c09e7859f0d0e4e2c4c07b49b:

  Linux 5.12-rc4 (2021-03-21 14:56:43 -0700)

are available in the Git repository at:

  g...@gitolite.kernel.org:pub/scm/linux/kernel/git/brauner/linux 
tags/for-linus-2021-04-08

for you to fetch changes up to 9b5b872215fe6d1ca6a1ef411f130bd58e269012:

  file: fix close_range() for unshare+cloexec (2021-04-02 14:11:10 +0200)

Please consider pulling these changes from the signed for-linus-2021-04-08 tag.

Thanks!
Christian


for-linus-2021-04-08


Christian Brauner (1):
  file: fix close_range() for unshare+cloexec

 fs/file.c | 21 +
 1 file changed, 17 insertions(+), 4 deletions(-)

Re: [RFC bpf-next 1/1] bpf: Introduce iter_pagecache

2021-04-08 Thread Christian Brauner

On Wed, Apr 07, 2021 at 02:46:11PM -0700, Daniel Xu wrote:
> This commit introduces the bpf page cache iterator. This iterator allows
> users to run a bpf prog against each page in the "page cache".
> Internally, the "page cache" is extremely tied to VFS superblock + inode
> combo. Because of this, iter_pagecache will only examine pages in the
> caller's mount namespace.
> 
> Signed-off-by: Daniel Xu 
> ---
>  kernel/bpf/Makefile |   2 +-
>  kernel/bpf/pagecache_iter.c | 293 
>  2 files changed, 294 insertions(+), 1 deletion(-)
>  create mode 100644 kernel/bpf/pagecache_iter.c
> 
> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
> index 7f33098ca63f..3deb6a8d3f75 100644
> --- a/kernel/bpf/Makefile
> +++ b/kernel/bpf/Makefile
> @@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
>  endif
>  CFLAGS_core.o += $(call cc-disable-warning, override-init) 
> $(cflags-nogcse-yy)
>  
> -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o 
> bpf_iter.o map_iter.o task_iter.o prog_iter.o
> +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o 
> bpf_iter.o pagecache_iter.o map_iter.o task_iter.o prog_iter.o
>  obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o 
> bpf_lru_list.o lpm_trie.o map_in_map.o
>  obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
>  obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
> diff --git a/kernel/bpf/pagecache_iter.c b/kernel/bpf/pagecache_iter.c
> new file mode 100644
> index 000000000000..8442ab0d4221
> --- /dev/null
> +++ b/kernel/bpf/pagecache_iter.c
> @@ -0,0 +1,293 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (c) 2021 Facebook */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include "../../fs/mount.h"

This is a private header on purpose. Code outside of fs/ should not be
poking around in struct mount or struct mnt_namespace.

> +
> +struct bpf_iter_seq_pagecache_info {
> + struct mnt_namespace *ns;
> + struct radix_tree_root superblocks;
> + struct super_block *cur_sb;
> + struct inode *cur_inode;
> + unsigned long cur_page_idx;
> +};
> +
> +static struct super_block *goto_next_sb(struct bpf_iter_seq_pagecache_info 
> *info)
> +{
> + struct super_block *sb = NULL;
> + struct radix_tree_iter iter;
> + void **slot;
> +
> + radix_tree_for_each_slot(slot, >superblocks, ,
> +  ((unsigned long)info->cur_sb + 1)) {
> + sb = (struct super_block *)iter.index;
> + break;
> + }
> +
> + info->cur_sb = sb;
> + info->cur_inode = NULL;
> + info->cur_page_idx = 0;
> + return sb;
> +}
> +
> +static bool inode_unusual(struct inode *inode) {
> + return ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
> + (inode->i_mapping->nrpages == 0));
> +}
> +
> +static struct inode *goto_next_inode(struct bpf_iter_seq_pagecache_info 
> *info)
> +{
> + struct inode *prev_inode = info->cur_inode;
> + struct inode *inode;
> +
> +retry:
> + BUG_ON(!info->cur_sb);
> + spin_lock(>cur_sb->s_inode_list_lock);
> +
> + if (!info->cur_inode) {
> + list_for_each_entry(inode, >cur_sb->s_inodes, i_sb_list) {
> + spin_lock(>i_lock);
> + if (inode_unusual(inode)) {
> + spin_unlock(>i_lock);
> + continue;
> + }
> + __iget(inode);
> + spin_unlock(>i_lock);
> + info->cur_inode = inode;
> + break;
> + }
> + } else {
> + inode = info->cur_inode;
> + info->cur_inode = NULL;
> + list_for_each_entry_continue(inode, >cur_sb->s_inodes,
> +  i_sb_list) {
> + spin_lock(>i_lock);
> + if (inode_unusual(inode)) {
> + spin_unlock(>i_lock);
> + continue;
> + }
> + __iget(inode);
> + spin_unlock(>i_lock);
> + info->cur_inode = inode;
> + break;
> + }
> + }
> +
> + /* Seen all inodes in this superblock */
> + if (!info->cur_inode) {
> + spin_unlock(>cur_sb->s_inode_list_lock);
> + if (!goto_next_sb(info)) {
> + inode = NULL;
> + goto out;
> + }
> +
> + goto retry;
> + }
> +
> + spin_unlock(>cur_sb->s_inode_list_lock);
> + info->cur_page_idx = 0;
> +out:
> + iput(prev_inode);
> + return info->cur_inode;
> +}
> +
> +static struct page *goto_next_page(struct bpf_iter_seq_pagecache_info *info)
> +{
> + struct

Re: [RFC bpf-next 0/1] bpf: Add page cache iterator

2021-04-08 Thread Christian Brauner

On Wed, Apr 07, 2021 at 02:46:10PM -0700, Daniel Xu wrote:
> There currently does not exist a way to answer the question: "What is in
> the page cache?". There are various heuristics and counters but nothing
> that can tell you anything like:
> 
>   * 3M from /home/dxu/foo.txt
>   * 5K from ...
>   * etc.
> 
> The answer to the question is particularly useful in the stacked
> container world. Stacked containers implies multiple containers are run
> on the same physical host. Memory is precious resource on some (if not

Just to clarify: what are "stacked containers"? Do you mean nested
containers, i.e. containers running within containers?

Christian

Re: High kmalloc-32 slab cache consumption with 10k containers

2021-04-07 Thread Christian Brauner

On Wed, Apr 07, 2021 at 08:28:07AM +1000, Dave Chinner wrote:
> On Mon, Apr 05, 2021 at 11:18:48AM +0530, Bharata B Rao wrote:
> > Hi,
> > 
> > > When running 10000 (more-or-less-empty-)containers on a bare-metal Power9
> > server(160 CPUs, 2 NUMA nodes, 256G memory), it is seen that memory
> > consumption increases quite a lot (around 172G) when the containers are
> > running. Most of it comes from slab (149G) and within slab, the majority of
> > it comes from kmalloc-32 cache (102G)
> > 
> > The major allocator of kmalloc-32 slab cache happens to be the list_head
> > allocations of list_lru_one list. These lists are created whenever a
> > FS mount happens. Specially two such lists are registered by alloc_super(),
> > one for dentry and another for inode shrinker list. And these lists
> > are created for all possible NUMA nodes and for all given memcgs
> > (memcg_nr_cache_ids to be particular)
> > 
> > If,
> > 
> > A = Nr allocation request per mount: 2 (one for dentry and inode list)
> > B = Nr NUMA possible nodes
> > C = memcg_nr_cache_ids
> > D = size of each kmalloc-32 object: 32 bytes,
> > 
> > then for every mount, the amount of memory consumed by kmalloc-32 slab
> > cache for list_lru creation is A*B*C*D bytes.
> > 
> > Following factors contribute to the excessive allocations:
> > 
> > - Lists are created for possible NUMA nodes.
> > - memcg_nr_cache_ids grows in bulk (see memcg_alloc_cache_id() and 
> > additional
> >   list_lrus are created when it grows. Thus we end up creating list_lru_one
> >   list_heads even for those memcgs which are yet to be created.
> > >   For example, when 10000 memcgs are created, memcg_nr_cache_ids reach
> >   a value of 12286.
> 
> So, by your numbers, we have 2 * 2 * 12286 * 32 = 1.5MB per mount.
> 
> So for that to make up 100GB of RAM, you must have somewhere over
> 500,000 mounted superblocks on the machine?
> 
> That implies 50+ unique mounted superblocks per container, which
> seems like an awful lot.
> 
> > - When a memcg goes offline, the list elements are drained to the parent
> >   memcg, but the list_head entry remains.
> > - The lists are destroyed only when the FS is unmounted. So list_heads
> >   for non-existing memcgs remain and continue to contribute to the
> >   kmalloc-32 allocation. This is presumably done for performance
> >   reason as they get reused when new memcgs are created, but they end up
> >   consuming slab memory until then.
> > - In case of containers, a few file systems get mounted and are specific
> >   to the container namespace and hence to a particular memcg, but we
> >   end up creating lists for all the memcgs.
> >   As an example, if 7 FS mounts are done for every container and when
> >   10k containers are created, we end up creating 2*7*12286 list_lru_one
> >   lists for each NUMA node. It appears that no elements will get added
> >   to other than 2*7=14 of them in the case of containers.
> 
> Yeah, at first glance this doesn't strike me as a problem with the
> list_lru structure, it smells more like a problem resulting from a
> huge number of superblock instantiations on the machine. Which,
> probably, mostly have no significant need for anything other than a
> single memcg awareness?
> 
> Can you post a typical /proc/self/mounts output from one of these
> idle/empty containers so we can see exactly how many mounts and
> their type are being instantiated in each container?

Similar to Michal I wonder how much of that is really used in production
environments. From our experience it really depends on the type of
container we're talking about.
For a regular app container that essentially serves as an application
isolator the number of mounts could be fairly limited and essentially be
restricted to:

tmpfs
devpts
sysfs
[cgroupfs]
and a few bind-mounts of standard devices such as
/dev/null
/dev/zero
/dev/full
.
.
.
from the host's devtmpfs into the container.

Then there are containers that behave like regular systems and are
managed like regular systems and those might have quite a bit more. For
example, here is the output of a regular unprivileged Fedora 33
container I created out of the box:

[root@f33 ~]# findmnt 
TARGETSOURCE
   FSTYPE  OPTIONS
/ 
/dev/mapper/ubuntu--vg-ubuntu--lv[/var/lib/lxd/storage-pools/default/containers/f33/rootfs]
│   
   xfs 
rw,relatime,attr2,inode64,logbufs=8,logbsize=32k,noquota
├─/runtmpfs 
   tmpfs   
rw,nosuid,nodev,size=3226884k,nr_inodes=819200,mode=755,uid=10,gid=10
│ └─/run/user/0   tmpfs 
   tmpfs

Re: High kmalloc-32 slab cache consumption with 10k containers

2021-04-07 Thread Christian Brauner

On Wed, Apr 07, 2021 at 01:54:48PM +0200, Michal Hocko wrote:
> On Mon 05-04-21 11:18:48, Bharata B Rao wrote:
> > Hi,
> > 
> > When running 10000 (more-or-less-empty-)containers on a bare-metal Power9
> > server(160 CPUs, 2 NUMA nodes, 256G memory), it is seen that memory
> > consumption increases quite a lot (around 172G) when the containers are
> > running. Most of it comes from slab (149G) and within slab, the majority of
> > it comes from kmalloc-32 cache (102G)
> 
> Is this 10k cgroups a testing environment or does anybody really use that
> in production? I would be really curious to hear how that behaves when
> those containers are not idle. E.g. global memory reclaim iterating over
> 10k memcgs will likely be very visible. I do remember playing with
> similar setups few years back and the overhead was very high.

Ccing Stéphane Graber who has experience/insight about stuff like this.

Christian

Re: [PATCH v5 08/12] evm: Pass user namespace to set/remove xattr hooks

2021-04-07 Thread Christian Brauner

On Wed, Apr 07, 2021 at 12:52:48PM +0200, Roberto Sassu wrote:
> In preparation for 'evm: Allow setxattr() and setattr() for unmodified
> metadata', this patch passes mnt_userns to the inode set/remove xattr hooks
> so that the GID of the inode on an idmapped mount is correctly determined
> by posix_acl_update_mode().
> 
> Cc: Christian Brauner 
> Cc: Andreas Gruenbacher 
> Signed-off-by: Roberto Sassu 
> ---

Looks good,
Reviewed-by: Christian Brauner

Re: [PATCH v5 09/12] evm: Allow setxattr() and setattr() for unmodified metadata

2021-04-07 Thread Christian Brauner

On Wed, Apr 07, 2021 at 12:52:49PM +0200, Roberto Sassu wrote:
> With the patch to allow xattr/attr operations if a portable signature
> verification fails, cp and tar can copy all xattrs/attrs so that at the
> end of the process verification succeeds.
> 
> However, it might happen that the xattrs/attrs are already set to the
> correct value (taken at signing time) and signature verification succeeds
> before the copy has completed. For example, an archive might contains files
> owned by root and the archive is extracted by root.
> 
> Then, since portable signatures are immutable, all subsequent operations
> fail (e.g. fchown()), even if the operation is legitimate (does not alter
> the current value).
> 
> This patch avoids this problem by reporting successful operation to user
> space when that operation does not alter the current value of xattrs/attrs.
> 
> Cc: Christian Brauner 
> Cc: Andreas Gruenbacher 
> Signed-off-by: Roberto Sassu 
> ---
>  security/integrity/evm/evm_main.c | 107 ++
>  1 file changed, 107 insertions(+)
> 
> diff --git a/security/integrity/evm/evm_main.c 
> b/security/integrity/evm/evm_main.c
> index 74f9f3a2ae53..2a8fcba67d47 100644
> --- a/security/integrity/evm/evm_main.c
> +++ b/security/integrity/evm/evm_main.c
> @@ -18,6 +18,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include 
>  #include 
> @@ -328,6 +329,89 @@ static enum integrity_status 
> evm_verify_current_integrity(struct dentry *dentry)
>   return evm_verify_hmac(dentry, NULL, NULL, 0, NULL);
>  }
>  
> +/*
> + * evm_xattr_acl_change - check if passed ACL changes the inode mode
> + * @mnt_userns: user namespace of the idmapped mount
> + * @dentry: pointer to the affected dentry
> + * @xattr_name: requested xattr
> + * @xattr_value: requested xattr value
> + * @xattr_value_len: requested xattr value length
> + *
> + * Check if passed ACL changes the inode mode, which is protected by EVM.
> + *
> + * Returns 1 if passed ACL causes inode mode change, 0 otherwise.
> + */
> +static int evm_xattr_acl_change(struct user_namespace *mnt_userns,
> + struct dentry *dentry, const char *xattr_name,
> + const void *xattr_value, size_t xattr_value_len)
> +{
> + umode_t mode;
> + struct posix_acl *acl = NULL, *acl_res;
> + struct inode *inode = d_backing_inode(dentry);
> + int rc;
> +
> + /* user_ns is not relevant here, ACL_USER/ACL_GROUP don't have impact
> +  * on the inode mode (see posix_acl_equiv_mode()).
> +  */
> + acl = posix_acl_from_xattr(_user_ns, xattr_value, xattr_value_len);
> + if (IS_ERR_OR_NULL(acl))
> + return 1;
> +
> + acl_res = acl;
> + /* Passing mnt_userns is necessary to correctly determine the GID in
> +  * an idmapped mount, as the GID is used to clear the setgid bit in
> +  * the inode mode.
> +  */
> + rc = posix_acl_update_mode(mnt_userns, inode, , _res);
> +
> + posix_acl_release(acl);
> +
> + if (rc)
> + return 1;
> +
> + if (inode->i_mode != mode)
> + return 1;
> +
> + return 0;
> +}
> +
> +/*
> + * evm_xattr_change - check if passed xattr value differs from current value
> + * @mnt_userns: user namespace of the idmapped mount
> + * @dentry: pointer to the affected dentry
> + * @xattr_name: requested xattr
> + * @xattr_value: requested xattr value
> + * @xattr_value_len: requested xattr value length
> + *
> + * Check if passed xattr value differs from current value.
> + *
> + * Returns 1 if passed xattr value differs from current value, 0 otherwise.
> + */
> +static int evm_xattr_change(struct user_namespace *mnt_userns,
> + struct dentry *dentry, const char *xattr_name,
> + const void *xattr_value, size_t xattr_value_len)
> +{
> + char *xattr_data = NULL;
> + int rc = 0;
> +
> + if (posix_xattr_acl(xattr_name))
> + return evm_xattr_acl_change(mnt_userns, dentry, xattr_name,
> + xattr_value, xattr_value_len);
> +
> + rc = vfs_getxattr_alloc(_user_ns, dentry, xattr_name, _data,
> + 0, GFP_NOFS);
> + if (rc < 0)
> + return 1;
> +
> + if (rc == xattr_value_len)
> + rc = memcmp(xattr_value, xattr_data, rc);

Afaik memcmp() can return values greater than 1 and less than 0 so it
might make sense to explicitly do something like:
rc = memcmp() ? 1 : 0;
or
!!memcmp()
or alter the comment for evm_xattr_change().

other than that

Reviewed-by: Christian Brauner 

> +

Re: [syzbot] WARNING in mntput_no_expire (2)

2021-04-06 Thread Christian Brauner

On Tue, Apr 06, 2021 at 02:15:01PM +0000, Al Viro wrote:
> On Tue, Apr 06, 2021 at 03:22:05PM +0200, Christian Brauner wrote:
> 
> > Why is a another function in charge of checking the return value of an
> > initialization function. If something like path_init() fails why is the
> > next caller responsible for rejecting it's return value and then we're
> > passing that failure value through the whole function with if (!err)
> > ladders but as I said it's mostly style preferences.
> 
> Because otherwise you either need *all* paths leading to link_path_walk()
> duplicate the logics (and get hurt whenever you miss one) or have "well,
> in some cases link_path_walk() handles ERR_PTR() given to it, in some
> cases its caller do" mess.
> 
> > > > s = path_init(nd, flags);
> > > > -   if (IS_ERR(s))
> > > > -   return PTR_ERR(s);
> > > 
> > > Where has that come from, BTW?  Currently path_lookupat() does no such 
> > > thing.
> > 
> > Hm? Are you maybe overlooking path_init() which assigns straight into
> > the variable declaration? Or are you referring to sm else?
> 
> I'm referring to the fact that your diff is with an already modified 
> path_lookupat()
> _and_ those modifications have managed to introduce a bug your patch reverts.
> No terminate_walk() paired with that path_init() failure, i.e. path_init() is
> responsible for cleanups on its (many) failure exits...

Note that the patch I pasted was just a doodle to illustrate the
point, not something to review in earnest (I should probably prefix
things like this with "untested").

Re: [PATCH v1 1/1] kernel.h: Split out panic and oops helpers

2021-04-06 Thread Christian Brauner

On Tue, Apr 06, 2021 at 04:31:58PM +0300, Andy Shevchenko wrote:
> kernel.h is being used as a dump for all kinds of stuff for a long time.
> Here is the attempt to start cleaning it up by splitting out panic and
> oops helpers.
> 
> At the same time convert users in header and lib folder to use new header.
> Though for time being include new header back to kernel.h to avoid twisted
> indirected includes for existing users.
> 
> Signed-off-by: Andy Shevchenko 
> ---

(I think David has tried something like this a few years ago too?)
Good idea in any case. (Be good to see kbuild do an allmodconfig build
of this though.)
Acked-by: Christian Brauner 

>  arch/powerpc/kernel/setup-common.c   |  1 +
>  arch/x86/include/asm/desc.h  |  1 +
>  arch/x86/kernel/cpu/mshyperv.c   |  1 +
>  arch/x86/kernel/setup.c  |  1 +
>  drivers/char/ipmi/ipmi_msghandler.c  |  1 +
>  drivers/remoteproc/remoteproc_core.c |  1 +
>  include/asm-generic/bug.h|  3 +-
>  include/linux/kernel.h   | 84 +---
>  include/linux/panic.h| 98 
>  include/linux/panic_notifier.h   | 12 
>  kernel/hung_task.c   |  1 +
>  kernel/kexec_core.c  |  1 +
>  kernel/panic.c   |  1 +
>  kernel/rcu/tree.c|  2 +
>  kernel/sysctl.c  |  1 +
>  kernel/trace/trace.c |  1 +
>  16 files changed, 126 insertions(+), 84 deletions(-)
>  create mode 100644 include/linux/panic.h
>  create mode 100644 include/linux/panic_notifier.h
> 
> diff --git a/arch/powerpc/kernel/setup-common.c 
> b/arch/powerpc/kernel/setup-common.c
> index 74a98fff2c2f..046fe21b5c3b 100644
> --- a/arch/powerpc/kernel/setup-common.c
> +++ b/arch/powerpc/kernel/setup-common.c
> @@ -9,6 +9,7 @@
>  #undef DEBUG
>  
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
> index 476082a83d1c..ceb12683b6d1 100644
> --- a/arch/x86/include/asm/desc.h
> +++ b/arch/x86/include/asm/desc.h
> @@ -9,6 +9,7 @@
>  #include 
>  #include 
>  
> +#include 
>  #include 
>  #include 
>  
> diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
> index 22f13343b5da..9e5c6f2b044d 100644
> --- a/arch/x86/kernel/cpu/mshyperv.c
> +++ b/arch/x86/kernel/cpu/mshyperv.c
> @@ -17,6 +17,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> index 59e5e0903b0c..570699eecf90 100644
> --- a/arch/x86/kernel/setup.c
> +++ b/arch/x86/kernel/setup.c
> @@ -14,6 +14,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> diff --git a/drivers/char/ipmi/ipmi_msghandler.c 
> b/drivers/char/ipmi/ipmi_msghandler.c
> index 8a0e97b33cae..e96cb5c4f97a 100644
> --- a/drivers/char/ipmi/ipmi_msghandler.c
> +++ b/drivers/char/ipmi/ipmi_msghandler.c
> @@ -16,6 +16,7 @@
>  
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> diff --git a/drivers/remoteproc/remoteproc_core.c 
> b/drivers/remoteproc/remoteproc_core.c
> index 626a6b90fba2..76dd8e2b1e7e 100644
> --- a/drivers/remoteproc/remoteproc_core.c
> +++ b/drivers/remoteproc/remoteproc_core.c
> @@ -20,6 +20,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
> index 76a10e0dca9f..719410b93f99 100644
> --- a/include/asm-generic/bug.h
> +++ b/include/asm-generic/bug.h
> @@ -17,7 +17,8 @@
>  #endif
>  
>  #ifndef __ASSEMBLY__
> -#include 
> +#include 
> +#include 
>  
>  #ifdef CONFIG_BUG
>  
> diff --git a/include/linux/kernel.h b/include/linux/kernel.h
> index 09035ac67d4b..6c5a05ac1ecb 100644
> --- a/include/linux/kernel.h
> +++ b/include/linux/kernel.h
> @@ -14,6 +14,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -70,7 +71,6 @@
>  #define lower_32_bits(n) ((u32)((n) & 0xffffffff))
>  
>  struct completion;
> -struct pt_regs;
>  struct user;
>  
>  #ifdef CONFIG_PREEMPT_VOLUNTARY
> @@ -175,14 +175,6 @@ void __might_fault(const char *file, int line);
>  static inline void might_fault(void) { }
>  #endif
>  
> -extern struct atomic_notifier_head panic_notifier_list;
> -extern long (*panic_blink)(int state);
> -__printf(1, 2)
> -void panic(const char *fmt, ...) __noreturn __cold;
> -vo

Re: [syzbot] WARNING in mntput_no_expire (2)

2021-04-06 Thread Christian Brauner

On Tue, Apr 06, 2021 at 01:13:13PM +0000, Al Viro wrote:
> On Tue, Apr 06, 2021 at 02:35:05PM +0200, Christian Brauner wrote:
> 
> > And while we're at it might I bring up the possibility of an additional
> > cleanup of how we currently call path_init().
> > Right now we pass the return value from path_init() directly into e.g.
> > link_path_walk() which as a first thing checks for error. Which feels
> > rather wrong and has always confused me when looking at these codepaths.
> 
> Why?

Why is another function in charge of checking the return value of an
initialization function? If something like path_init() fails, why is the
next caller responsible for rejecting its return value, after which we
pass that failure value through the whole function with if (!err)
ladders? But as I said, it's mostly style preferences.

> 
> > I get that it might make sense for reasons unrelated to path_init() that
> > link_path_walk() checks its first argument for error but path_init()
> > should be checked for error right away especially now that we return
> > early when LOOKUP_CACHED is set without LOOKUP_RCU.
> 
> But you are making the _callers_ of path_init() do that, for no good
> reason.

I'm confused why having callers of functions responsible for checking
error values is such an out-of-band concept suddenly. I don't think it's
worth arguing over this though.

> 
> > thing especially in longer functions such as path_lookupat() where it
> > gets convoluted pretty quickly. I think it would be cleaner to have
> > > something like [1]. The early exits make the code easier to reason
> > about imho. But I get that that's a style discussion.
> 
> Your variant is a lot more brittle, actually.
> 
> > @@ -2424,33 +2424,49 @@ static int path_lookupat(struct nameidata *nd, 
> > unsigned flags, struct path *path
> > int err;
> > 
> > s = path_init(nd, flags);
> > -   if (IS_ERR(s))
> > -   return PTR_ERR(s);
> 
> Where has that come from, BTW?  Currently path_lookupat() does no such thing.

Hm? Are you maybe overlooking path_init() which assigns straight into
the variable declaration? Or are you referring to something else?

static int path_lookupat(struct nameidata *nd, unsigned flags, struct path 
*path)
{
const char *s = path_init(nd, flags);
int err;

if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(s)) {
err = handle_lookup_down(nd);
if (unlikely(err < 0))
s = ERR_PTR(err);
}

while (!(err = link_path_walk(s, nd)) &&
   (s = lookup_last(nd)) != NULL)
;

Re: [syzbot] WARNING in mntput_no_expire (2)

2021-04-06 Thread Christian Brauner

On Tue, Apr 06, 2021 at 01:38:39AM +0000, Al Viro wrote:
> On Mon, Apr 05, 2021 at 10:07:37PM +0200, Christian Brauner wrote:
> 
> > > diff --git a/fs/namei.c b/fs/namei.c
> > > index 216f16e74351..82344f1139ff 100644
> > > --- a/fs/namei.c
> > > +++ b/fs/namei.c
> > > @@ -2289,6 +2289,9 @@ static const char *path_init(struct nameidata *nd, 
> > > unsigned flags)
> > >   int error;
> > >   const char *s = nd->name->name;
> > >  
> > > + nd->path.mnt = NULL;
> > > + nd->path.dentry = NULL;
> > > +
> > >   /* LOOKUP_CACHED requires RCU, ask caller to retry */
> > >   if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)
> > >   return ERR_PTR(-EAGAIN);
> > > @@ -2322,8 +2325,6 @@ static const char *path_init(struct nameidata *nd, 
> > > unsigned flags)
> > >   }
> > >  
> > >   nd->root.mnt = NULL;
> > > - nd->path.mnt = NULL;
> > > - nd->path.dentry = NULL;
> > >  
> > >   /* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */
> > >   if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) {
> > 
> > Bingo. That fixes it.
> 
> *grumble*
> 
> OK, I suppose it'll do for backports, but longer term... I don't like how
> convoluted the rules for nameidata fields' validity are.  In particular,
> for nd->path I would rather have it
>   * cleared in set_nameidata()
>   * cleared when it becomes invalid.  That would be
>   * places that drop rcu_read_lock() without having legitimized 
> the sucker
> (already done, except for terminate_walk())
>   * terminate_walk() in non-RCU case after path_put(&nd->path)
> 
> OTOH... wait a sec - the whole thing is this cycle regression, so...
> 
> Could you verify that the variant below fixes that crap?
> 
> Make sure nd->path.mnt and nd->path.dentry are always valid pointers
> 
> Initialize them in set_nameidata() and make sure that terminate_walk() clears 
> them
> once the pointers become potentially invalid (i.e. we leave RCU mode or drop 
> them
> in non-RCU one).  Currently we have "path_init() always initializes them and 
> nobody
> accesses them outside of path_init()/terminate_walk() segments", which is 
> asking
> for trouble.
> 
> With that change we would have nd->path.{mnt,dentry}
>   1) always valid - NULL or pointing to currently allocated objects.
>   2) non-NULL while we are successfully walking
>   3) NULL when we are not walking at all
>   4) contributing to refcounts whenever non-NULL outside of RCU mode.
> 
> Hopefully-fixes: 6c6ec2b0a3e0 ("fs: add support for LOOKUP_CACHED")
> Signed-off-by: Al Viro 
> ---
> diff --git a/fs/namei.c b/fs/namei.c
> index 216f16e74351..fc8760d4314e 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -579,6 +579,8 @@ static void set_nameidata(struct nameidata *p, int dfd, 
> struct filename *name)
>   p->stack = p->internal;
>   p->dfd = dfd;
>   p->name = name;
> + p->path.mnt = NULL;
> + p->path.dentry = NULL;
>   p->total_link_count = old ? old->total_link_count : 0;
>   p->saved = old;
>   current->nameidata = p;
> @@ -652,6 +654,8 @@ static void terminate_walk(struct nameidata *nd)
>   rcu_read_unlock();
>   }
>   nd->depth = 0;
> + nd->path.mnt = NULL;
> + nd->path.dentry = NULL;
>  }
>  
>  /* path_put is needed afterwards regardless of success or failure */
> @@ -2322,8 +2326,6 @@ static const char *path_init(struct nameidata *nd, 
> unsigned flags)
>   }
>  
>   nd->root.mnt = NULL;
> - nd->path.mnt = NULL;
> - nd->path.dentry = NULL;
>  
>   /* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */
>   if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) {

Yeah, that works too.
Though I'm not a fan of this open-coding as it looks pretty brittle
especially if we ever introduce another LOOKUP_* flag that requires us
to clear some other field carefully. I think a tiny static inline void
helper might make it easier to grok what's going on.

And while we're at it might I bring up the possibility of an additional
cleanup of how we currently call path_init().
Right now we pass the return value from path_init() directly into e.g.
link_path_walk() which as a first thing checks for error. Which feels
rather wrong and has always confused me when looking at these codepaths.
I get that it might make sense for reasons unrelated to path_init() that
link_path_walk() check

Re: [syzbot] WARNING in mntput_no_expire (2)

2021-04-05 Thread Christian Brauner

On Mon, Apr 05, 2021 at 06:28:54PM +0000, Al Viro wrote:
> On Mon, Apr 05, 2021 at 06:23:49PM +0000, Al Viro wrote:
> > On Mon, Apr 05, 2021 at 07:08:01PM +0200, Christian Brauner wrote:
> > 
> > > Ah dentry count of -127 looks... odd.
> > 
> > dead + 1...
> > 
> > void lockref_mark_dead(struct lockref *lockref)
> > {
> > > assert_spin_locked(&lockref->lock);
> > lockref->count = -128;
> > }
> > 
> > IOW, a leaked (uncounted) reference to dentry, that got dget() called on
> > it after dentry had been freed.
> > 
> > IOW, current->fs->pwd.dentry happens to point to an already freed
> > struct dentry here.  Joy...
> > 
> > Could you slap
> > 
> > > spin_lock(&current->fs->lock);
> > > WARN_ON(d_count(current->fs->pwd.dentry) < 0);
> > > spin_unlock(&current->fs->lock);
> > 
> > before and after calls of io_issue_sqe() and see if it triggers?  We 
> > definitely
> > are seeing buggered dentry refcounting here.
> 
> Check if this helps, please.
> 
> diff --git a/fs/namei.c b/fs/namei.c
> index 216f16e74351..82344f1139ff 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -2289,6 +2289,9 @@ static const char *path_init(struct nameidata *nd, 
> unsigned flags)
>   int error;
>   const char *s = nd->name->name;
>  
> + nd->path.mnt = NULL;
> + nd->path.dentry = NULL;
> +
>   /* LOOKUP_CACHED requires RCU, ask caller to retry */
>   if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)
>   return ERR_PTR(-EAGAIN);
> @@ -2322,8 +2325,6 @@ static const char *path_init(struct nameidata *nd, 
> unsigned flags)
>   }
>  
>   nd->root.mnt = NULL;
> - nd->path.mnt = NULL;
> - nd->path.dentry = NULL;
>  
>   /* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */
>   if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) {

Bingo. That fixes it.

Re: [syzbot] WARNING in mntput_no_expire (2)

2021-04-05 Thread Christian Brauner

On Mon, Apr 05, 2021 at 04:18:58PM +0000, Al Viro wrote:
> On Mon, Apr 05, 2021 at 01:44:37PM +0200, Christian Brauner wrote:
> > On Sun, Apr 04, 2021 at 08:17:21PM +0000, Al Viro wrote:
> > > On Sun, Apr 04, 2021 at 06:50:10PM +0000, Al Viro wrote:
> > > 
> > > > > Yeah, I have at least namei.o
> > > > > 
> > > > > https://drive.google.com/file/d/1AvO1St0YltIrA86DXjp1Xg3ojtS9owGh/view?usp=sharing
> > > > 
> > > > *grumble*
> > > > 
> > > > Is it reproducible without KASAN?  Would be much easier to follow the 
> > > > produced
> > > > asm...
> > > 
> > >   Looks like inode_permission(_, NULL, _) from may_lookup(nd).  I.e.
> > > nd->inode == NULL.
> > 
> > Yeah, I already saw that.
> > 
> > > 
> > >   Mind slapping BUG_ON(!nd->inode) right before may_lookup() call in
> > > link_path_walk() and trying to reproduce that oops?
> > 
> > Yep, no problem. If you run the reproducer in a loop for a little while
> > you eventually trigger the BUG_ON() and then you get the following splat
> > (and then an endless loop) in [1] with nd->inode NULL.
> > 
> > _But_ I managed to debug this further and was able to trigger the BUG_ON()
> > directly in path_init() in the AT_FDCWD branch (after all it's
> > AT_FDCWD(./file0))
> > with the patch in [3] (it's in LOOKUP_RCU); the corresponding splat is in
> > [2].
> > So the crash happens for a PF_IO_WORKER thread with a NULL nd->inode for the
> > PF_IO_WORKER's pwd (The PF_IO_WORKER seems to be in async context.).
> 
> So we find current->fs->pwd.dentry negative, with current->fs->seq sampled
> equal before and after that?  Lovely...  The only places where we assign
> anything to ->pwd.dentry are
> void set_fs_pwd(struct fs_struct *fs, const struct path *path)
> {
> struct path old_pwd; 
> 
> path_get(path);
> spin_lock(&fs->lock);
> write_seqcount_begin(&fs->seq);
> old_pwd = fs->pwd;
> fs->pwd = *path;
> write_seqcount_end(&fs->seq);
> spin_unlock(&fs->lock);
> 
> if (old_pwd.dentry)
> path_put(&old_pwd);
> }
> where we have ->seq bumped between dget new/assignment/ dput old,
> copy_fs_struct() where we have
> spin_lock(&old->lock);
> fs->root = old->root;
> path_get(&fs->root);
> fs->pwd = old->pwd;
> path_get(&fs->pwd);
> spin_unlock(&old->lock);
> fs being freshly allocated instance that couldn't have been observed
> by anyone and chroot_fs_refs(), where we have
> spin_lock(&fs->lock);
> write_seqcount_begin(&fs->seq);
> hits += replace_path(&fs->root, old_root, new_root);
> hits += replace_path(&fs->pwd, old_root, new_root);
> write_seqcount_end(&fs->seq);
> while (hits--) {
> count++;
> path_get(new_root);
> }
> spin_unlock(&fs->lock);
> ...
> static inline int replace_path(struct path *p, const struct path *old, const 
> struct path *new)
> {
> if (likely(p->dentry != old->dentry || p->mnt != old->mnt))
> return 0;
> *p = *new;
> return 1;
> }
> Here we have new_root->dentry pinned from the very beginning,
> and assignments are wrapped into bumps of ->seq.  Moreover,
> we are holding ->lock through that sequence (as all writers
> do), so these references can't be dropped before path_get()
> bumps new_root->dentry refcount.
> 
> chroot_fs_refs() is called only by pivot_root(2):
> chroot_fs_refs(&root, &new);
> and there new is set by
> error = user_path_at(AT_FDCWD, new_root,
>  LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new);
> if (error)
> goto out0;
> which pins new.dentry *and* verifies that it's positive and a directory,
> at that.  Since pinned positive dentry can't be made negative by anybody
> else, we know it will remain in that state until
>   path_put(&new);
> well downstream of chroot_fs_refs().  In copy_fs_struct() we are
> copying someone's ->pwd, so it's also pinned positive.  And it
> won't be dropped outside of old->lock, so by the time somebody
> manages to drop the reference in old, path_get() effects will be
> visible (old->lock serving as a barrier).
> 
> That leaves set_fs

Re: [syzbot] WARNING in mntput_no_expire (2)

2021-04-05 Thread Christian Brauner

On Sun, Apr 04, 2021 at 08:17:21PM +0000, Al Viro wrote:
> On Sun, Apr 04, 2021 at 06:50:10PM +0000, Al Viro wrote:
> 
> > > Yeah, I have at least namei.o
> > > 
> > > https://drive.google.com/file/d/1AvO1St0YltIrA86DXjp1Xg3ojtS9owGh/view?usp=sharing
> > 
> > *grumble*
> > 
> > Is it reproducible without KASAN?  Would be much easier to follow the 
> > produced
> > asm...
> 
>   Looks like inode_permission(_, NULL, _) from may_lookup(nd).  I.e.
> nd->inode == NULL.

Yeah, I already saw that.

> 
>   Mind slapping BUG_ON(!nd->inode) right before may_lookup() call in
> link_path_walk() and trying to reproduce that oops?

Yep, no problem. If you run the reproducer in a loop for a little while
you eventually trigger the BUG_ON() and then you get the following splat
(and then an endless loop) in [1] with nd->inode NULL.

_But_ I managed to debug this further and was able to trigger the BUG_ON()
directly in path_init() in the AT_FDCWD branch (after all it's AT_FDCWD(./file0))
with the patch in [3] (it's in LOOKUP_RCU); the corresponding splat is in [2].
So the crash happens for a PF_IO_WORKER thread with a NULL nd->inode for the
PF_IO_WORKER's pwd (The PF_IO_WORKER seems to be in async context.).

[3]:
diff --git a/fs/namei.c b/fs/namei.c
index 28a7006bdb04..8a31ccb61c45 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2336,8 +2336,8 @@ static const char *path_init(struct nameidata *nd, 
unsigned flags)

/* Relative pathname -- get the starting-point it is relative to. */
if (nd->dfd == AT_FDCWD) {
+   struct fs_struct *fs = current->fs;
if (flags & LOOKUP_RCU) {
-   struct fs_struct *fs = current->fs;
unsigned seq;

do {
@@ -2347,9 +2347,14 @@ static const char *path_init(struct nameidata *nd, 
unsigned flags)
nd->seq = 
__read_seqcount_begin(&nd->path.dentry->d_seq);
} while (read_seqcount_retry(&fs->seq, seq));
} else {
-   get_fs_pwd(current->fs, &nd->path);
+   get_fs_pwd(fs, &nd->path);
nd->inode = nd->path.dentry->d_inode;
}
+
+   BUG_ON(!fs->users);
+   BUG_ON(fs->users < 0);
+   BUG_ON(!nd->inode && (current->flags & PF_IO_WORKER));
+   BUG_ON(!nd->inode);
} else {
/* Caller must check execute permissions on the starting path 
component */
struct fd f = fdget_raw(nd->dfd);

[1]:
[  257.564526] ------------[ cut here ]------------
[  257.564549] kernel BUG at fs/namei.c:2208!
[  257.564998] ------------[ cut here ]------------
[  257.565773] kernel BUG at fs/namei.c:2208!
[  257.567632] invalid opcode:  [#1] PREEMPT SMP KASAN
[  257.569113] CPU: 3 PID: 6086 Comm: uring_viro Tainted: GW   E 
5.12.0-rc5-b47394a22e3dce3d03165e48ef0462f41c198ac7 #47
[  257.572028] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009)/LXD, BIOS 
0.0.0 02/06/2015
[  257.573687] RIP: 0010:link_path_walk.part.0+0x6c9/0xd30
[  257.575008] Code: 00 48 8b 43 18 48 89 44 24 18 48 8b 44 24 38 80 38 00 0f 
85 b1 04 00 00 49 8b 6e 30 48 85 ed 0f 85 82 fb ff ff e8 67 35 a8 ff <0f> 0b e8 
60 35 a8 ff 49 83 c7 01 48 bf 00 00 00 00 00 fc ff df 4c
[  257.580853] RSP: 0018:c900052af5c0 EFLAGS: 00010246
[  257.582513] RAX:  RBX: 88802f7b2da0 RCX: 8ecd175c
[  257.584211] RDX:  RSI: 8880133a RDI: 0002
[  257.585985] RBP:  R08: 0001 R09: fbfff388f13e
[  257.588061] R10: 8880322e0353 R11: fbfff388f13d R12: c900052af820
[  257.590059] R13: 8880133a R14: c900052af820 R15: 888031149120
[  257.592605] FS:  7f445b331800() GS:888015b8() 
knlGS:
[  257.595266] CS:  0010 DS:  ES:  CR0: 80050033
[  257.597262] CR2: 5578c2c17008 CR3: 307a4000 CR4: 00350ee0
[  257.599546] DR0:  DR1:  DR2: 
[  257.601331] DR3:  DR6: fffe0ff0 DR7: 0400
[  257.602938] Call Trace:
[  257.603764]  ? write_comp_data+0x2a/0x90
[  257.605637]  ? __sanitizer_cov_trace_pc+0x1d/0x50
[  257.607107]  ? path_init+0x662/0x1800
[  257.608713]  ? walk_component+0x6a0/0x6a0
[  257.610125]  path_openat+0x269/0x2790
[  257.611601]  ? path_lookupat.isra.0+0x530/0x530
[  257.613180]  ? rcu_read_lock_bh_held+0xb0/0xb0
[  257.614586]  ? lockdep_hardirqs_on_prepare+0x400/0x400
[  257.615898]  do_filp_open+0x197/0x270
[  257.616881]  ? rcu_read_lock_bh_held+0xb0/0xb0
[  257.618293]  ? may_open_dev+0xf0/0xf0
[  257.619700]  ? do_raw_spin_lock+0x125/0x2e0
[  257.621159]  ? write_comp_data+0x2a/0x90
[  257.622574]  ? __sanitizer_cov_trace_pc+0x1d/0x50
[  257.624271]  ? _raw_spin_unlock+0x29/0x40
[  257.625737]  ? alloc_fd+0x499/0x640

Re: [syzbot] WARNING in mntput_no_expire (2)

2021-04-04 Thread Christian Brauner

On Sun, Apr 04, 2021 at 04:44:06PM +0000, Al Viro wrote:
> On Sun, Apr 04, 2021 at 06:40:40PM +0200, Christian Brauner wrote:
> 
> > > Very interesting.  What happens if you call loop() twice?  And now I 
> > > wonder
> > > whether it's root or cwd, actually...  Hmm...
> > > 
> > > How about this:
> > >   fd = open("/proc/self/mountinfo", 0);
> > >   mkdir("./newroot/foo", 0777);
> > >   mount("./newroot/foo", "./newroot/foo", 0, MS_BIND, NULL);
> > >   chroot("./newroot");
> > >   chdir("/foo");
> > >   while (1) {
> > >   static char buf[4096];
> > >   int n = read(fd, buf, 4096);
> > >   if (n <= 0)
> > >   break;
> > >   write(1, buf, n);
> > >   }
> > >   close(fd);
> > >   drop_caps();
> > >   loop();
> > > as the end of namespace_sandbox_proc(), instead of
> > >   chroot("./newroot");
> > >   chdir("/");
> > >   drop_caps();
> > >   loop();
> > > sequence we have there?
> > 
> > Uhum, well then we oops properly with a null-deref.
> 
> Cute...  Could you dump namei.o (ideally - with namei.s) from your build
> someplace public?

Yeah, I have at least namei.o

https://drive.google.com/file/d/1AvO1St0YltIrA86DXjp1Xg3ojtS9owGh/view?usp=sharing

Christian

Re: [syzbot] WARNING in mntput_no_expire (2)

2021-04-04 Thread Christian Brauner

On Sun, Apr 04, 2021 at 06:52:08PM +0200, Christian Brauner wrote:
> On Sun, Apr 04, 2021 at 06:40:40PM +0200, Christian Brauner wrote:
> > On Sun, Apr 04, 2021 at 03:56:02PM +0000, Al Viro wrote:
> > > On Sun, Apr 04, 2021 at 01:34:45PM +0200, Christian Brauner wrote:
> > > 
> > > > Sorry for not replying to your earlier mail but I've been debugging this
> > > > too. My current theory is that it's related to LOOKUP_ROOT_GRABBED when
> > > > LOOKUP_CACHED is specified _possibly_ with an interaction how
> > > > create_io_thread() is created with CLONE_FS. The reproducer requires you
> > > > either have called pivot_root() or chroot() in order for the failure to
> > > > happen. So I think the fact that we skip legitimize_root() when
> > > > LOOKUP_CACHED is set might figure into this. I can keep digging.
> > > > 
> > > 
> > > > Funny enough I already placed a printk statement into the place you
> > > > wanted one too so I just amended mine. Here's what you get:
> > > > 
> > > > If pivot_root() is used before the chroot() you get:
> > > > 
> > > > [  637.464555] : count(-1) | mnt_mntpoint(/) | mnt->mnt.mnt_root(/) 
> > > > | id(579) | dev(tmpfs)
> > > > 
> > > > if you only call chroot, i.e. make the pivot_root() branch a simple
> > > > if (true) you get:
> > > > 
> > > > [  955.206117] : count(-2) | mnt_mntpoint(/) | mnt->mnt.mnt_root(/) 
> > > > | id(580) | dev(tmpfs)
> > > 
> > > Very interesting.  What happens if you call loop() twice?  And now I 
> > > wonder
> > > whether it's root or cwd, actually...  Hmm...
> > > 
> > > How about this:
> > >   fd = open("/proc/self/mountinfo", 0);
> > >   mkdir("./newroot/foo", 0777);
> > >   mount("./newroot/foo", "./newroot/foo", 0, MS_BIND, NULL);
> > >   chroot("./newroot");
> > >   chdir("/foo");
> > >   while (1) {
> > >   static char buf[4096];
> > >   int n = read(fd, buf, 4096);
> > >   if (n <= 0)
> > >   break;
> > >   write(1, buf, n);
> > >   }
> > >   close(fd);
> > >   drop_caps();
> > >   loop();
> > > as the end of namespace_sandbox_proc(), instead of
> > >   chroot("./newroot");
> > >   chdir("/");
> > >   drop_caps();
> > >   loop();
> > > sequence we have there?
> > 
> > Uhum, well then we oops properly with a null-deref.
> 
> And note that the reproducer also requires CLONE_NEWNS which causes the
> fs_struct to be unshared as well. I'm not completely clear on what
> would happen if a new io worker thread were to be created after the
> caller has called unshare(CLONE_NEWNS).

And here's a non-null-deref version:

[  647.257107] : count(-1) | mnt_mntpoint(foo) | mnt->mnt.mnt_root(foo) | 
id(1358) | dev(tmpfs)

which is

1358 1326 0:66 /newroot/foo /home/ubuntu/syzkaller.Wgqj6W/syz-tmp/newroot/foo 
rw,relatime - tmpfs  rw

Just for kicks, here's the full mount table:

1224 513 8:2 / / rw,relatime - ext4 /dev/sda2 rw
1225 1224 0:5 / /dev rw,nosuid,noexec,relatime - devtmpfs udev 
rw,size=302716k,nr_inodes=75679,mode=755
1226 1225 0:26 / /dev/pts rw,nosuid,noexec,relatime - devpts devpts 
rw,gid=5,mode=620,ptmxmode=000
1227 1225 0:28 / /dev/shm rw,nosuid,nodev - tmpfs tmpfs rw
1228 1225 0:48 / /dev/hugepages rw,relatime - hugetlbfs hugetlbfs rw,pagesize=2M
1229 1225 0:21 / /dev/mqueue rw,nosuid,nodev,noexec,relatime - mqueue mqueue rw
1230 1224 0:27 / /run rw,nosuid,nodev,noexec,relatime - tmpfs tmpfs 
rw,size=62152k,mode=755
1231 1230 0:29 / /run/lock rw,nosuid,nodev,noexec,relatime - tmpfs tmpfs 
rw,size=5120k
1232 1230 0:49 / /run/lxd_agent rw,relatime - tmpfs tmpfs 
rw,size=51200k,mode=700
1233 1230 0:59 / /run/user/1000 rw,nosuid,nodev,relatime - tmpfs tmpfs 
rw,size=62148k,nr_inodes=15537,mode=700,uid=1000,gid=1000
1234 1224 0:24 / /sys rw,nosuid,nodev,noexec,relatime - sysfs sysfs rw
1235 1234 0:6 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime - 
securityfs securityfs rw
1236 1234 0:30 / /sys/fs/cgroup ro,nosuid,nodev,noexec - tmpfs tmpfs 
ro,size=4096k,nr_inodes=1024,mode=755
1237 1236 0:31 /../../.. /sys/fs/cgroup/unified rw,nosuid,nodev,noexec,relatime 
- cgroup2 cgroup2 rw
1238 1236 0:32 /../../.. /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime 
- cgroup cgroup rw,xattr,name=systemd
1239 1236 0:36 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime - 
cgroup cgroup rw,perf_event
1240 1236 0:37 /.. /sys/fs/cgroup

Re: [syzbot] WARNING in mntput_no_expire (2)

2021-04-04 Thread Christian Brauner

On Sun, Apr 04, 2021 at 06:40:40PM +0200, Christian Brauner wrote:
> On Sun, Apr 04, 2021 at 03:56:02PM +0000, Al Viro wrote:
> > On Sun, Apr 04, 2021 at 01:34:45PM +0200, Christian Brauner wrote:
> > 
> > > Sorry for not replying to your earlier mail but I've been debugging this
> > > too. My current theory is that it's related to LOOKUP_ROOT_GRABBED when
> > > LOOKUP_CACHED is specified _possibly_ with an interaction how
> > > create_io_thread() is created with CLONE_FS. The reproducer requires you
> > > either have called pivot_root() or chroot() in order for the failure to
> > > happen. So I think the fact that we skip legitimize_root() when
> > > LOOKUP_CACHED is set might figure into this. I can keep digging.
> > > 
> > 
> > > Funny enough I already placed a printk statement into the place you
> > > wanted one too so I just amended mine. Here's what you get:
> > > 
> > > If pivot_root() is used before the chroot() you get:
> > > 
> > > [  637.464555] : count(-1) | mnt_mntpoint(/) | mnt->mnt.mnt_root(/) | 
> > > id(579) | dev(tmpfs)
> > > 
> > > if you only call chroot, i.e. make the pivot_root() branch a simple
> > > if (true) you get:
> > > 
> > > [  955.206117] : count(-2) | mnt_mntpoint(/) | mnt->mnt.mnt_root(/) | 
> > > id(580) | dev(tmpfs)
> > 
> > Very interesting.  What happens if you call loop() twice?  And now I wonder
> > whether it's root or cwd, actually...  Hmm...
> > 
> > How about this:
> > fd = open("/proc/self/mountinfo", 0);
> > mkdir("./newroot/foo", 0777);
> > mount("./newroot/foo", "./newroot/foo", 0, MS_BIND, NULL);
> > chroot("./newroot");
> > chdir("/foo");
> > while (1) {
> > static char buf[4096];
> > int n = read(fd, buf, 4096);
> > if (n <= 0)
> > break;
> > write(1, buf, n);
> > }
> > close(fd);
> > drop_caps();
> > loop();
> > as the end of namespace_sandbox_proc(), instead of
> > chroot("./newroot");
> > chdir("/");
> > drop_caps();
> > loop();
> > sequence we have there?
> 
> Uhum, well then we oops properly with a null-deref.

And note that the reproducer also requires CLONE_NEWNS which causes the
fs_struct to be unshared as well. I'm not completely clear on what
would happen if a new io worker thread were to be created after the
caller has called unshare(CLONE_NEWNS).

> 
> f1-vm login: [  395.046971][ T5856] general protection fault, probably for 
> non-canonical address 0xdc00:  [#1] PREEMPT SMP KASAN
> [  395.049716][ T5856] KASAN: null-ptr-deref in range 
> [0x-0x0007]
> [  395.052847][ T5856] CPU: 1 PID: 5856 Comm: iou-wrk-5851 Tainted: G 
>E 5.12.0-rc5-020f68e042a19f59784f0962004d848181d13b9e #46
> [  395.056594][ T5856] Hardware name: QEMU Standard PC (Q35 + ICH9, 
> 2009)/LXD, BIOS 0.0.0 02/06/2015
> [  395.058962][ T5856] RIP: 0010:inode_permission+0x4e/0x530
> [  395.060362][ T5856] Code: ff 89 de e8 34 42 a9 ff 85 db 0f 85 ef 00 00 00 
> e8 87 40 a9 ff 4c 8d 7d 02 48 b8 00 00 00 00 00 fc ff df 4c 89 fa 48 c1 ea 03 
> <0f> b6 14 02 4c 89 f8 83 e0 07 83 c0 01 38 d0 7c 08 84 d2 0f 85 c2
> [  395.065442][ T5856] RSP: 0018:c9000681f640 EFLAGS: 00010246
> [  395.067274][ T5856] RAX: dc00 RBX:  RCX: 
> 84ac11fc
> [  395.069834][ T5856] RDX:  RSI: 88801321 RDI: 
> 0002
> [  395.072527][ T5856] RBP:  R08: 0001 R09: 
> ed144f22
> [  395.075287][ T5856] R10: 888031c105d3 R11: ed144f21 R12: 
> c9000681f8e0
> [  395.077026][ T5856] R13: 0001 R14: 8e441e60 R15: 
> 0002
> [  395.079001][ T5856] FS:  7f136886f800() GS:888015a8() 
> knlGS:
> [  395.081181][ T5856] CS:  0010 DS:  ES:  CR0: 80050033
> [  395.082659][ T5856] CR2: 55ecfefe8e70 CR3: 236b7000 CR4: 
> 00350ee0
> [  395.084727][ T5856] DR0:  DR1:  DR2: 
> 
> [  395.087117][ T5856] DR3:  DR6: fffe0ff0 DR7: 
> 0400
> [  395.089613][ T5856] Call Trace:
> [  395.090570][ T5856]  link_path_walk.part.0+0x790/0xc10
> [  395.091735][ T5856]  ? write_comp_data+0x2a/0x90
> [  395.092931][ T5856]  ? __sanitizer_cov_trace_pc+0x1d/0x50
> [  395.094

Re: [syzbot] WARNING in mntput_no_expire (2)

2021-04-04 Thread Christian Brauner

On Sun, Apr 04, 2021 at 03:56:02PM +0000, Al Viro wrote:
> On Sun, Apr 04, 2021 at 01:34:45PM +0200, Christian Brauner wrote:
> 
> > Sorry for not replying to your earlier mail but I've been debugging this
> > too. My current theory is that it's related to LOOKUP_ROOT_GRABBED when
> > LOOKUP_CACHED is specified _possibly_ with an interaction how
> > create_io_thread() is created with CLONE_FS. The reproducer requires you
> > either have called pivot_root() or chroot() in order for the failure to
> > happen. So I think the fact that we skip legitimize_root() when
> > LOOKUP_CACHED is set might figure into this. I can keep digging.
> > 
> 
> > Funny enough I already placed a printk statement into the place you
> > wanted one too so I just amended mine. Here's what you get:
> > 
> > If pivot_root() is used before the chroot() you get:
> > 
> > [  637.464555] : count(-1) | mnt_mntpoint(/) | mnt->mnt.mnt_root(/) | 
> > id(579) | dev(tmpfs)
> > 
> > if you only call chroot, i.e. make the pivot_root() branch a simple
> > if (true) you get:
> > 
> > [  955.206117] : count(-2) | mnt_mntpoint(/) | mnt->mnt.mnt_root(/) | 
> > id(580) | dev(tmpfs)
> 
> Very interesting.  What happens if you call loop() twice?  And now I wonder
> whether it's root or cwd, actually...  Hmm...
> 
> How about this:
>   fd = open("/proc/self/mountinfo", 0);
>   mkdir("./newroot/foo", 0777);
>   mount("./newroot/foo", "./newroot/foo", 0, MS_BIND, NULL);
>   chroot("./newroot");
>   chdir("/foo");
>   while (1) {
>   static char buf[4096];
>   int n = read(fd, buf, 4096);
>   if (n <= 0)
>   break;
>   write(1, buf, n);
>   }
>   close(fd);
>   drop_caps();
>   loop();
> as the end of namespace_sandbox_proc(), instead of
>   chroot("./newroot");
>   chdir("/");
>   drop_caps();
>   loop();
> sequence we have there?

Uhum, well then we oops properly with a null-deref.

f1-vm login: [  395.046971][ T5856] general protection fault, probably for 
non-canonical address 0xdc00:  [#1] PREEMPT SMP KASAN
[  395.049716][ T5856] KASAN: null-ptr-deref in range 
[0x-0x0007]
[  395.052847][ T5856] CPU: 1 PID: 5856 Comm: iou-wrk-5851 Tainted: G   
 E 5.12.0-rc5-020f68e042a19f59784f0962004d848181d13b9e #46
[  395.056594][ T5856] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009)/LXD, 
BIOS 0.0.0 02/06/2015
[  395.058962][ T5856] RIP: 0010:inode_permission+0x4e/0x530
[  395.060362][ T5856] Code: ff 89 de e8 34 42 a9 ff 85 db 0f 85 ef 00 00 00 e8 
87 40 a9 ff 4c 8d 7d 02 48 b8 00 00 00 00 00 fc ff df 4c 89 fa 48 c1 ea 03 <0f> 
b6 14 02 4c 89 f8 83 e0 07 83 c0 01 38 d0 7c 08 84 d2 0f 85 c2
[  395.065442][ T5856] RSP: 0018:c9000681f640 EFLAGS: 00010246
[  395.067274][ T5856] RAX: dc00 RBX:  RCX: 
84ac11fc
[  395.069834][ T5856] RDX:  RSI: 88801321 RDI: 
0002
[  395.072527][ T5856] RBP:  R08: 0001 R09: 
ed144f22
[  395.075287][ T5856] R10: 888031c105d3 R11: ed144f21 R12: 
c9000681f8e0
[  395.077026][ T5856] R13: 0001 R14: 8e441e60 R15: 
0002
[  395.079001][ T5856] FS:  7f136886f800() GS:888015a8() 
knlGS:
[  395.081181][ T5856] CS:  0010 DS:  ES:  CR0: 80050033
[  395.082659][ T5856] CR2: 55ecfefe8e70 CR3: 236b7000 CR4: 
00350ee0
[  395.084727][ T5856] DR0:  DR1:  DR2: 

[  395.087117][ T5856] DR3:  DR6: fffe0ff0 DR7: 
0400
[  395.089613][ T5856] Call Trace:
[  395.090570][ T5856]  link_path_walk.part.0+0x790/0xc10
[  395.091735][ T5856]  ? write_comp_data+0x2a/0x90
[  395.092931][ T5856]  ? __sanitizer_cov_trace_pc+0x1d/0x50
[  395.094093][ T5856]  ? walk_component+0x6a0/0x6a0
[  395.095021][ T5856]  ? percpu_counter_add_batch+0xbc/0x180
[  395.096255][ T5856]  path_openat+0x269/0x2790
[  395.097305][ T5856]  ? path_lookupat.isra.0+0x530/0x530
[  395.098391][ T5856]  ? ret_from_fork+0x1f/0x30
[  395.099431][ T5856]  ? lockdep_hardirqs_on_prepare+0x400/0x400
[  395.101326][ T5856]  do_filp_open+0x208/0x270
[  395.102894][ T5856]  ? rcu_read_lock_bh_held+0xb0/0xb0
[  395.104525][ T5856]  ? may_open_dev+0xf0/0xf0
[  395.105968][ T5856]  ? do_raw_spin_lock+0x125/0x2e0
[  395.107426][ T5856]  ? write_comp_data+0x2a/0x90
[  395.108651][ T5856]  ? __sanitizer_cov_trace_pc+0x1d/0x50
[  395.110033][ T5856]  ? _raw_spin_unlock+0x29/0x40
[  395.111397][ T5856]  ?

Re: [syzbot] WARNING in mntput_no_expire (2)

2021-04-04 Thread Christian Brauner

On Sun, Apr 04, 2021 at 02:34:08AM +0000, Al Viro wrote:
> On Thu, Apr 01, 2021 at 07:11:12PM +0000, Al Viro wrote:
> 
> > > I _think_ I see what the issue is. It seems that an assumption made in
> > > this commit might be wrong and we're missing a mnt_add_count() bump that
> > > we would otherwise have gotten if we've moved the failure handling into
> > > the unlazy helpers themselves.
> > > 
> > > Al, does that sound plausible?
> > 
> > mnt_add_count() on _what_?  Failure in legitimize_links() ends up with
> > nd->path.mnt zeroed, in both callers.  So which vfsmount would be
> > affected?

It looks to me like it's the vfsmount of the nd->root after we called
chroot or pivot_root.

> 
> Could you turn that WARN_ON(count < 0) into
>   if (WARN_ON(count < 0))
>   printk(KERN_ERR "id = %d, dev = %s, count = %d\n",
>   mnt->mnt_id,
>   mnt->mnt_sb->s_id,
>   count);
> add system("cat /proc/self/mountinfo"); right after sandbox_common()
> call and try to reproduce that?

Sorry for not replying to your earlier mail but I've been debugging this
too. My current theory is that it's related to LOOKUP_ROOT_GRABBED when
LOOKUP_CACHED is specified, _possibly_ in interaction with how
create_io_thread() creates threads with CLONE_FS. The reproducer requires
that you have called either pivot_root() or chroot() in order for the
failure to happen. So I think the fact that we skip legitimize_root() when
LOOKUP_CACHED is set might figure into this. I can keep digging.

Funny enough I already placed a printk statement into the place you
wanted one too so I just amended mine. Here's what you get:

If pivot_root() is used before the chroot() you get:

[  637.464555] : count(-1) | mnt_mntpoint(/) | mnt->mnt.mnt_root(/) | 
id(579) | dev(tmpfs)

if you only call chroot, i.e. make the pivot_root() branch a simple
if (true) you get:

[  955.206117] : count(-2) | mnt_mntpoint(/) | mnt->mnt.mnt_root(/) | 
id(580) | dev(tmpfs)

The cat /proc/self/mountinfo is for the id(579) below:

514 513 8:2 / / rw,relatime - ext4 /dev/sda2 rw
515 514 0:5 / /dev rw,nosuid,noexec,relatime - devtmpfs udev 
rw,size=302716k,nr_inodes=75679,mode=755
516 515 0:26 / /dev/pts rw,nosuid,noexec,relatime - devpts devpts 
rw,gid=5,mode=620,ptmxmode=000
517 515 0:28 / /dev/shm rw,nosuid,nodev - tmpfs tmpfs rw
518 515 0:48 / /dev/hugepages rw,relatime - hugetlbfs hugetlbfs rw,pagesize=2M
519 515 0:21 / /dev/mqueue rw,nosuid,nodev,noexec,relatime - mqueue mqueue rw
520 514 0:27 / /run rw,nosuid,nodev,noexec,relatime - tmpfs tmpfs 
rw,size=62152k,mode=755
521 520 0:29 / /run/lock rw,nosuid,nodev,noexec,relatime - tmpfs tmpfs 
rw,size=5120k
522 520 0:49 / /run/lxd_agent rw,relatime - tmpfs tmpfs rw,size=51200k,mode=700
523 520 0:59 / /run/user/1000 rw,nosuid,nodev,relatime - tmpfs tmpfs 
rw,size=62148k,nr_inodes=15537,mode=700,uid=1000,gid=1000
524 514 0:24 / /sys rw,nosuid,nodev,noexec,relatime - sysfs sysfs rw
525 524 0:6 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime - securityfs 
securityfs rw
526 524 0:30 / /sys/fs/cgroup ro,nosuid,nodev,noexec - tmpfs tmpfs 
ro,size=4096k,nr_inodes=1024,mode=755
527 526 0:31 /../../.. /sys/fs/cgroup/unified rw,nosuid,nodev,noexec,relatime - 
cgroup2 cgroup2 rw
528 526 0:32 /../../.. /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime - 
cgroup cgroup rw,xattr,name=systemd
529 526 0:36 /../../.. /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime - 
cgroup cgroup rw,memory
530 526 0:37 /.. /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime - 
cgroup cgroup rw,cpu,cpuacct
531 526 0:38 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime - cgroup 
cgroup rw,hugetlb
549 526 0:39 /.. /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime - cgroup 
cgroup rw,blkio
550 526 0:40 /../../.. /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime - 
cgroup cgroup rw,freezer
551 526 0:41 /../../.. /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime - 
cgroup cgroup rw,pids
552 526 0:42 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime 
- cgroup cgroup rw,net_cls,net_prio
553 526 0:43 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime - 
cgroup cgroup rw,perf_event
554 526 0:44 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime - cgroup 
cgroup rw,cpuset,clone_children
555 526 0:45 /.. /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime - 
cgroup cgroup rw,devices
556 526 0:46 / /sys/fs/cgroup/rdma rw,nosuid,nodev,noexec,relatime - cgroup 
cgroup rw,rdma
557 524 0:33 / /sys/fs/pstore rw,nosuid,nodev,noexec,relatime - pstore pstore rw
558 524 0:34 / /sys/firmware/efi/efivars rw,nosuid,nodev,noexec,relatime - 
efivarfs efivarfs rw
559 524 0:35 / /sys/fs/bpf rw,nosuid,nodev,noexec,relatime - bpf none 
rw,mode=700
560 524 0:7 / /sys/kernel/debug rw,nosuid,nodev,noexec,relatime - debugfs 
debugfs rw
561 524 0:12 / /sys/kernel/tracing rw,nosuid,nodev,noexec,relatime - tracefs 
tracefs rw

Re: [syzbot] WARNING in mntput_no_expire (2)

2021-04-01 Thread Christian Brauner

On Thu, Apr 01, 2021 at 05:16:03AM -0700, syzbot wrote:
> syzbot has bisected this issue to:
> 
> commit 73d90386b559d6f4c3c5db5e6bb1b68aae8fd3e7
> Author: Damien Le Moal 
> Date:   Thu Jan 28 04:47:27 2021 +
> 
> nvme: cleanup zone information initialization
> 
> bisection log:  https://syzkaller.appspot.com/x/bisect.txt?x=1440e986d0
> start commit:   d19cc4bf Merge tag 'trace-v5.12-rc5' of git://git.kernel.o..
> git tree:   upstream
> final oops: https://syzkaller.appspot.com/x/report.txt?x=1640e986d0
> console output: https://syzkaller.appspot.com/x/log.txt?x=1240e986d0
> kernel config:  https://syzkaller.appspot.com/x/.config?x=d1a3d65a48dbd1bc
> dashboard link: https://syzkaller.appspot.com/bug?extid=c88a7030da47945a3cc3
> syz repro:  https://syzkaller.appspot.com/x/repro.syz?x=12f50d11d0
> C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=137694a1d0
> 
> Reported-by: syzbot+c88a7030da47945a3...@syzkaller.appspotmail.com
> Fixes: 73d90386b559 ("nvme: cleanup zone information initialization")
> 
> For information about bisection process see: https://goo.gl/tpsmEJ#bisection

That seems like a bogus bisect. Please see the bisect I provided in the
thread which seems more likely (FYI, b4 mbox on the original thread
should help to get at those messages).

Christian

Re: [syzbot] WARNING in mntput_no_expire (2)

2021-04-01 Thread Christian Brauner

On Thu, Apr 01, 2021 at 02:09:20AM -0700, syzbot wrote:
> Hello,
> 
> syzbot found the following issue on:
> 
> HEAD commit:d19cc4bf Merge tag 'trace-v5.12-rc5' of git://git.kernel.o..
> git tree:   upstream
> console output: https://syzkaller.appspot.com/x/log.txt?x=1018f281d0
> kernel config:  https://syzkaller.appspot.com/x/.config?x=d1a3d65a48dbd1bc
> dashboard link: https://syzkaller.appspot.com/bug?extid=c88a7030da47945a3cc3
> syz repro:  https://syzkaller.appspot.com/x/repro.syz?x=12f50d11d0
> C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=137694a1d0
> 
> IMPORTANT: if you fix the issue, please add the following tag to the commit:
> Reported-by: syzbot+c88a7030da47945a3...@syzkaller.appspotmail.com
> 
> [ cut here ]
> WARNING: CPU: 1 PID: 8409 at fs/namespace.c:1186 mntput_no_expire+0xaca/0xcb0 
> fs/namespace.c:1186
> Modules linked in:
> CPU: 1 PID: 8409 Comm: syz-executor035 Not tainted 5.12.0-rc5-syzkaller #0
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS 
> Google 01/01/2011
> RIP: 0010:mntput_no_expire+0xaca/0xcb0 fs/namespace.c:1186
> Code: ff 48 c7 c2 e0 cb 78 89 be c2 02 00 00 48 c7 c7 a0 cb 78 89 c6 05 e5 6d 
> e5 0b 01 e8 ff e1 f6 06 e9 3f fd ff ff e8 c6 a5 a8 ff <0f> 0b e9 fc fc ff ff 
> e8 ba a5 a8 ff e8 55 dc 94 ff 31 ff 89 c5 89
> RSP: 0018:c9000165fc78 EFLAGS: 00010293
> RAX:  RBX: 1920002cbf95 RCX: 
> RDX: 88802072d4c0 RSI: 81cb4b8a RDI: 0003
> RBP: 888011656900 R08:  R09: 8fa978af
> R10: 81cb4884 R11:  R12: 0008
> R13: c9000165fcc8 R14: dc00 R15: 
> FS:  () GS:8880b9d0() knlGS:
> CS:  0010 DS:  ES:  CR0: 80050033
> CR2: 55a722053160 CR3: 0bc8e000 CR4: 001506e0
> DR0:  DR1:  DR2: 
> DR3:  DR6: fffe0ff0 DR7: 0400
> Call Trace:
>  mntput fs/namespace.c:1232 [inline]
>  cleanup_mnt+0x523/0x530 fs/namespace.c:1132
>  task_work_run+0xdd/0x1a0 kernel/task_work.c:140
>  exit_task_work include/linux/task_work.h:30 [inline]
>  do_exit+0xbfc/0x2a60 kernel/exit.c:825
>  do_group_exit+0x125/0x310 kernel/exit.c:922
>  __do_sys_exit_group kernel/exit.c:933 [inline]
>  __se_sys_exit_group kernel/exit.c:931 [inline]
>  __x64_sys_exit_group+0x3a/0x50 kernel/exit.c:931
>  do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
>  entry_SYSCALL_64_after_hwframe+0x44/0xae
> RIP: 0033:0x446af9
> Code: Unable to access opcode bytes at RIP 0x446acf.
> RSP: 002b:005dfe48 EFLAGS: 0246 ORIG_RAX: 00e7
> RAX: ffda RBX: 004ce450 RCX: 00446af9
> RDX: 003c RSI: 00e7 RDI: 0001
> RBP: 0001 R08: ffbc R09: 
> R10:  R11: 0246 R12: 004ce450
> R13: 0001 R14:  R15: 0001

[+Cc Jens + io_uring]

Hm, this reproducer uses io_uring and it's the io_uring_enter() that
triggers this reliably. With this reproducer I've managed to reproduce
the issue on v5.12-rc4, and v5.12-rc3, v5.12-rc2 and v5.12-rc1.
It's not reproducible at
9820b4dca0f9c6b7ab8b4307286cdace171b724d
which is the commit immediately before the first v5.12 io_uring merge.
It's first reproducible with the first io_uring merge for v5.12, i.e.
5bbb336ba75d95611a7b9456355b48705016bdb1

Christian

Re: [syzbot] WARNING in mntput_no_expire (2)

2021-04-01 Thread Christian Brauner

On Thu, Apr 01, 2021 at 07:46:13PM +0200, Christian Brauner wrote:
> On Thu, Apr 01, 2021 at 10:09:18AM -0600, Jens Axboe wrote:
> > On 4/1/21 9:45 AM, Christian Brauner wrote:
> > > On Thu, Apr 01, 2021 at 02:09:20AM -0700, syzbot wrote:
> > >> Hello,
> > >>
> > >> syzbot found the following issue on:
> > >>
> > >> HEAD commit:d19cc4bf Merge tag 'trace-v5.12-rc5' of 
> > >> git://git.kernel.o..
> > >> git tree:   upstream
> > >> console output: https://syzkaller.appspot.com/x/log.txt?x=1018f281d0
> > >> kernel config:  
> > >> https://syzkaller.appspot.com/x/.config?x=d1a3d65a48dbd1bc
> > >> dashboard link: 
> > >> https://syzkaller.appspot.com/bug?extid=c88a7030da47945a3cc3
> > >> syz repro:  
> > >> https://syzkaller.appspot.com/x/repro.syz?x=12f50d11d0
> > >> C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=137694a1d0
> > >>
> > >> IMPORTANT: if you fix the issue, please add the following tag to the 
> > >> commit:
> > >> Reported-by: syzbot+c88a7030da47945a3...@syzkaller.appspotmail.com
> > >>
> > >> [ cut here ]
> > >> WARNING: CPU: 1 PID: 8409 at fs/namespace.c:1186 
> > >> mntput_no_expire+0xaca/0xcb0 fs/namespace.c:1186
> > >> Modules linked in:
> > >> CPU: 1 PID: 8409 Comm: syz-executor035 Not tainted 5.12.0-rc5-syzkaller 
> > >> #0
> > >> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS 
> > >> Google 01/01/2011
> > >> RIP: 0010:mntput_no_expire+0xaca/0xcb0 fs/namespace.c:1186
> > >> Code: ff 48 c7 c2 e0 cb 78 89 be c2 02 00 00 48 c7 c7 a0 cb 78 89 c6 05 
> > >> e5 6d e5 0b 01 e8 ff e1 f6 06 e9 3f fd ff ff e8 c6 a5 a8 ff <0f> 0b e9 
> > >> fc fc ff ff e8 ba a5 a8 ff e8 55 dc 94 ff 31 ff 89 c5 89
> > >> RSP: 0018:c9000165fc78 EFLAGS: 00010293
> > >> RAX:  RBX: 1920002cbf95 RCX: 
> > >> RDX: 88802072d4c0 RSI: 81cb4b8a RDI: 0003
> > >> RBP: 888011656900 R08:  R09: 8fa978af
> > >> R10: 81cb4884 R11:  R12: 0008
> > >> R13: c9000165fcc8 R14: dc00 R15: 
> > >> FS:  () GS:8880b9d0() 
> > >> knlGS:
> > >> CS:  0010 DS:  ES:  CR0: 80050033
> > >> CR2: 55a722053160 CR3: 0bc8e000 CR4: 001506e0
> > >> DR0:  DR1:  DR2: 
> > >> DR3:  DR6: fffe0ff0 DR7: 0400
> > >> Call Trace:
> > >>  mntput fs/namespace.c:1232 [inline]
> > >>  cleanup_mnt+0x523/0x530 fs/namespace.c:1132
> > >>  task_work_run+0xdd/0x1a0 kernel/task_work.c:140
> > >>  exit_task_work include/linux/task_work.h:30 [inline]
> > >>  do_exit+0xbfc/0x2a60 kernel/exit.c:825
> > >>  do_group_exit+0x125/0x310 kernel/exit.c:922
> > >>  __do_sys_exit_group kernel/exit.c:933 [inline]
> > >>  __se_sys_exit_group kernel/exit.c:931 [inline]
> > >>  __x64_sys_exit_group+0x3a/0x50 kernel/exit.c:931
> > >>  do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
> > >>  entry_SYSCALL_64_after_hwframe+0x44/0xae
> > >> RIP: 0033:0x446af9
> > >> Code: Unable to access opcode bytes at RIP 0x446acf.
> > >> RSP: 002b:005dfe48 EFLAGS: 0246 ORIG_RAX: 00e7
> > >> RAX: ffda RBX: 004ce450 RCX: 00446af9
> > >> RDX: 003c RSI: 00e7 RDI: 0001
> > >> RBP: 0001 R08: ffbc R09: 
> > >> R10:  R11: 0246 R12: 004ce450
> > >> R13: 0001 R14:  R15: 0001
> > > 
> > > [+Cc Jens + io_uring]
> > > 
> > > Hm, this reproducer uses io_uring and it's the io_uring_enter() that
> > > triggers this reliably. With this reproducer I've managed to reproduce
> > > the issue on v5.12-rc4, and v5.12-rc3, v5.12-rc2 and v5.12-rc1.
> > > It's not reproducible at
> > > 9820b4dca0f9c6b7ab8b4307286cdace171b724d
> > > which is the commit immediately before the first v5.12 io_uring merge.
> > > It's first reproducible with the first io_uring merge

Re: [syzbot] WARNING in mntput_no_expire (2)

2021-04-01 Thread Christian Brauner

On Thu, Apr 01, 2021 at 10:09:18AM -0600, Jens Axboe wrote:
> On 4/1/21 9:45 AM, Christian Brauner wrote:
> > On Thu, Apr 01, 2021 at 02:09:20AM -0700, syzbot wrote:
> >> Hello,
> >>
> >> syzbot found the following issue on:
> >>
> >> HEAD commit:d19cc4bf Merge tag 'trace-v5.12-rc5' of 
> >> git://git.kernel.o..
> >> git tree:   upstream
> >> console output: https://syzkaller.appspot.com/x/log.txt?x=1018f281d0
> >> kernel config:  https://syzkaller.appspot.com/x/.config?x=d1a3d65a48dbd1bc
> >> dashboard link: 
> >> https://syzkaller.appspot.com/bug?extid=c88a7030da47945a3cc3
> >> syz repro:  https://syzkaller.appspot.com/x/repro.syz?x=12f50d11d0
> >> C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=137694a1d0
> >>
> >> IMPORTANT: if you fix the issue, please add the following tag to the 
> >> commit:
> >> Reported-by: syzbot+c88a7030da47945a3...@syzkaller.appspotmail.com
> >>
> >> [ cut here ]
> >> WARNING: CPU: 1 PID: 8409 at fs/namespace.c:1186 
> >> mntput_no_expire+0xaca/0xcb0 fs/namespace.c:1186
> >> Modules linked in:
> >> CPU: 1 PID: 8409 Comm: syz-executor035 Not tainted 5.12.0-rc5-syzkaller #0
> >> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS 
> >> Google 01/01/2011
> >> RIP: 0010:mntput_no_expire+0xaca/0xcb0 fs/namespace.c:1186
> >> Code: ff 48 c7 c2 e0 cb 78 89 be c2 02 00 00 48 c7 c7 a0 cb 78 89 c6 05 e5 
> >> 6d e5 0b 01 e8 ff e1 f6 06 e9 3f fd ff ff e8 c6 a5 a8 ff <0f> 0b e9 fc fc 
> >> ff ff e8 ba a5 a8 ff e8 55 dc 94 ff 31 ff 89 c5 89
> >> RSP: 0018:c9000165fc78 EFLAGS: 00010293
> >> RAX:  RBX: 1920002cbf95 RCX: 
> >> RDX: 88802072d4c0 RSI: 81cb4b8a RDI: 0003
> >> RBP: 888011656900 R08:  R09: 8fa978af
> >> R10: 81cb4884 R11:  R12: 0008
> >> R13: c9000165fcc8 R14: dc00 R15: 
> >> FS:  () GS:8880b9d0() 
> >> knlGS:
> >> CS:  0010 DS:  ES:  CR0: 80050033
> >> CR2: 55a722053160 CR3: 0bc8e000 CR4: 001506e0
> >> DR0:  DR1:  DR2: 
> >> DR3:  DR6: fffe0ff0 DR7: 0400
> >> Call Trace:
> >>  mntput fs/namespace.c:1232 [inline]
> >>  cleanup_mnt+0x523/0x530 fs/namespace.c:1132
> >>  task_work_run+0xdd/0x1a0 kernel/task_work.c:140
> >>  exit_task_work include/linux/task_work.h:30 [inline]
> >>  do_exit+0xbfc/0x2a60 kernel/exit.c:825
> >>  do_group_exit+0x125/0x310 kernel/exit.c:922
> >>  __do_sys_exit_group kernel/exit.c:933 [inline]
> >>  __se_sys_exit_group kernel/exit.c:931 [inline]
> >>  __x64_sys_exit_group+0x3a/0x50 kernel/exit.c:931
> >>  do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
> >>  entry_SYSCALL_64_after_hwframe+0x44/0xae
> >> RIP: 0033:0x446af9
> >> Code: Unable to access opcode bytes at RIP 0x446acf.
> >> RSP: 002b:005dfe48 EFLAGS: 0246 ORIG_RAX: 00e7
> >> RAX: ffda RBX: 004ce450 RCX: 00446af9
> >> RDX: 003c RSI: 00e7 RDI: 0001
> >> RBP: 0001 R08: ffbc R09: 
> >> R10:  R11: 0246 R12: 004ce450
> >> R13: 0001 R14:  R15: 0001
> > 
> > [+Cc Jens + io_uring]
> > 
> > Hm, this reproducer uses io_uring and it's the io_uring_enter() that
> > triggers this reliably. With this reproducer I've managed to reproduce
> > the issue on v5.12-rc4, and v5.12-rc3, v5.12-rc2 and v5.12-rc1.
> > It's not reproducible at
> > 9820b4dca0f9c6b7ab8b4307286cdace171b724d
> > which is the commit immediately before the first v5.12 io_uring merge.
> > It's first reproducible with the first io_uring merge for v5.12, i.e.
> > 5bbb336ba75d95611a7b9456355b48705016bdb1
> 
> Thanks, that's good info. I'll take a look at it and see if I can
> reproduce.

Ok, I was deep into this anyway and it didn't make much sense to do
anything else at that point so I bisected this a bit further. The first
bad commit is:

commit 3a81fd02045c329f25e5900fa61f613c9b317644
Author: Jens Axboe 
Date:   Thu Dec 10 12:25:36 2020 -0700

io_uring: enable LOOKUP_CACHED path resolution for filename lookups

Re: [syzbot] KASAN: null-ptr-deref Read in filp_close (2)

2021-03-29 Thread Christian Brauner

On Mon, Mar 29, 2021 at 11:21:34AM +0200, Christian Brauner wrote:
> On Sat, Mar 27, 2021 at 11:33:37PM +0000, Al Viro wrote:
> > On Fri, Mar 26, 2021 at 02:50:11PM +0100, Christian Brauner wrote:
> > > @@ -632,6 +632,7 @@ EXPORT_SYMBOL(close_fd); /* for ksys_close() */
> > >  static inline void __range_cloexec(struct files_struct *cur_fds,
> > >  unsigned int fd, unsigned int max_fd)
> > >  {
> > > + unsigned int cur_max;
> > >   struct fdtable *fdt;
> > >  
> > >   if (fd > max_fd)
> > > @@ -639,7 +640,12 @@ static inline void __range_cloexec(struct 
> > > files_struct *cur_fds,
> > >  
> > > >   spin_lock(&cur_fds->file_lock);
> > >   fdt = files_fdtable(cur_fds);
> > > - bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1);
> > > + /* make very sure we're using the correct maximum value */
> > > + cur_max = fdt->max_fds;
> > > + cur_max--;
> > > + cur_max = min(max_fd, cur_max);
> > > + if (fd <= cur_max)
> > > + bitmap_set(fdt->close_on_exec, fd, cur_max - fd + 1);
> > > >   spin_unlock(&cur_fds->file_lock);
> > >  }
> > 
> > Umm...  That's harder to follow than it ought to be.  What's the point of
> > having
> > max_fd = min(max_fd, cur_max);
> > done in the caller, anyway?  Note that in __range_close() you have to
> > compare with re-fetched ->max_fds (look at pick_file()), so...
> 
> Yeah, I'll massage that patch a bit. I wanted to know whether this fixes
> the issue first though.
> 
> > 
> > BTW, I really wonder if the cost of jerking ->file_lock up and down
> > in that loop in __range_close() is negligible.  What values do we
> 
> Just for the record, I remember you pointing at that originally. Linus
> argued that this likely wasn't going to be a problem and that if people
> see performance hits we'll optimize.
> 
> > typically get from callers and how sparse does descriptor table tend
> > to be for those?
> 
> Weirdly, I can actually somewhat answer that question since I tend to
> regularly "survey" large userspace projects I know or am involved in
> that adopt new APIs we added just to see how they use it.
> 
> A few users:
> 1. crun
>
> https://github.com/containers/crun/blob/a1c0ef1b886ca30c2fb0906c7c43be04b555c52c/src/libcrun/utils.c#L1490
>ret = syscall_close_range (n, UINT_MAX, CLOSE_RANGE_CLOEXEC);
> 
> 2. LXD
>
> https://github.com/lxc/lxd/blob/f12f03a4ba4645892ef6cc167c24da49d1217b02/lxd/main_forkexec.go#L293
>ret = close_range(EXEC_PIPE_FD + 1, UINT_MAX, CLOSE_RANGE_UNSHARE);
> 
> 3. LXC
>
> https://github.com/lxc/lxc/blob/1718e6d6018d5d6072a01d92a11d5aafc314f98f/src/lxc/rexec.c#L165
>ret = close_range(STDERR_FILENO + 1, MAX_FILENO, CLOSE_RANGE_CLOEXEC);
> 
> Of these three 1. and 3. don't matter because they rely on
> CLOSE_RANGE_CLOEXEC and exec.
> For 2. I can say that the fdtable is likely going to be sparse.
> close_range() here is basically used to prevent accidental fd leaks
> across an exec. So 2. should never have more than 4 files. In fact, this
> could and should probably be switched to CLOSE_RANGE_CLOEXEC too.
> 
> The next two cases might be more interesting:
> 
> 4. systemd
>- 
> https://github.com/systemd/systemd/blob/fe96c0f86d15e844d74d539c6cff7f971078cf84/src/basic/fd-util.c#L228
>  close_range(3, -1, 0)
>- 
> https://github.com/systemd/systemd/blob/fe96c0f86d15e844d74d539c6cff7f971078cf84/src/basic/fd-util.c#L271
>  
> https://github.com/systemd/systemd/blob/fe96c0f86d15e844d74d539c6cff7f971078cf84/src/basic/fd-util.c#L288
>  /* Close everything between the start and end fds (both of which shall 
> stay open) */
>  if (close_range(start + 1, end - 1, 0) < 0) {
>  if (close_range(sorted[n_sorted-1] + 1, -1, 0) >= 0)
> 
> 5. Python
>
> https://github.com/python/cpython/blob/9976834f807ea63ca51bc4f89be457d734148682/Python/fileutils.c#L2250
> 
> systemd has the regular case that others have too where it simply closes
> all fds over 3 and it also has the more complicated case where it has an
> ordered array of fds closing up to the lower bound and after the upper
> bound up to the maximum. PID 1 can have a large number of fds open
> because of socket activation so here close_range() will encounter less
> sparse fd tables where it needs to close a lot of fds.
> 
> For Python's os.closerange() implementation which depends on our syscall
> it's harder to say given that this will be used by a lot of projects but
> I would _guess_ that if people use closerange() they do so because they

Re: [syzbot] KASAN: null-ptr-deref Read in filp_close (2)

2021-03-29 Thread Christian Brauner

On Sat, Mar 27, 2021 at 11:33:37PM +0000, Al Viro wrote:
> On Fri, Mar 26, 2021 at 02:50:11PM +0100, Christian Brauner wrote:
> > @@ -632,6 +632,7 @@ EXPORT_SYMBOL(close_fd); /* for ksys_close() */
> >  static inline void __range_cloexec(struct files_struct *cur_fds,
> >unsigned int fd, unsigned int max_fd)
> >  {
> > +   unsigned int cur_max;
> > struct fdtable *fdt;
> >  
> > if (fd > max_fd)
> > @@ -639,7 +640,12 @@ static inline void __range_cloexec(struct files_struct 
> > *cur_fds,
> >  
> > > spin_lock(&cur_fds->file_lock);
> > fdt = files_fdtable(cur_fds);
> > -   bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1);
> > +   /* make very sure we're using the correct maximum value */
> > +   cur_max = fdt->max_fds;
> > +   cur_max--;
> > +   cur_max = min(max_fd, cur_max);
> > +   if (fd <= cur_max)
> > +   bitmap_set(fdt->close_on_exec, fd, cur_max - fd + 1);
> > > spin_unlock(&cur_fds->file_lock);
> >  }
> 
> Umm...  That's harder to follow than it ought to be.  What's the point of
> having
> max_fd = min(max_fd, cur_max);
> done in the caller, anyway?  Note that in __range_close() you have to
> compare with re-fetched ->max_fds (look at pick_file()), so...

Yeah, I'll massage that patch a bit. I wanted to know whether this fixes
the issue first though.

> 
> BTW, I really wonder if the cost of jerking ->file_lock up and down
> in that loop in __range_close() is negligible.  What values do we

Just for the record, I remember you pointing at that originally. Linus
argued that this likely wasn't going to be a problem and that if people
see performance hits we'll optimize.

> typically get from callers and how sparse does descriptor table tend
> to be for those?

Weirdly, I can actually somewhat answer that question since I tend to
regularly "survey" large userspace projects I know or am involved in
that adopt new APIs we added just to see how they use it.

A few users:
1. crun

https://github.com/containers/crun/blob/a1c0ef1b886ca30c2fb0906c7c43be04b555c52c/src/libcrun/utils.c#L1490
   ret = syscall_close_range (n, UINT_MAX, CLOSE_RANGE_CLOEXEC);

2. LXD

https://github.com/lxc/lxd/blob/f12f03a4ba4645892ef6cc167c24da49d1217b02/lxd/main_forkexec.go#L293
   ret = close_range(EXEC_PIPE_FD + 1, UINT_MAX, CLOSE_RANGE_UNSHARE);

3. LXC

https://github.com/lxc/lxc/blob/1718e6d6018d5d6072a01d92a11d5aafc314f98f/src/lxc/rexec.c#L165
   ret = close_range(STDERR_FILENO + 1, MAX_FILENO, CLOSE_RANGE_CLOEXEC);

Of these three 1. and 3. don't matter because they rely on
CLOSE_RANGE_CLOEXEC and exec.
For 2. I can say that the fdtable is likely going to be sparse.
close_range() here is basically used to prevent accidental fd leaks
across an exec. So 2. should never have more than 4 files. In fact, this
could and should probably be switched to CLOSE_RANGE_CLOEXEC too.

The next two cases might be more interesting:

4. systemd
   - 
https://github.com/systemd/systemd/blob/fe96c0f86d15e844d74d539c6cff7f971078cf84/src/basic/fd-util.c#L228
 close_range(3, -1, 0)
   - 
https://github.com/systemd/systemd/blob/fe96c0f86d15e844d74d539c6cff7f971078cf84/src/basic/fd-util.c#L271

https://github.com/systemd/systemd/blob/fe96c0f86d15e844d74d539c6cff7f971078cf84/src/basic/fd-util.c#L288
 /* Close everything between the start and end fds (both of which shall 
stay open) */
 if (close_range(start + 1, end - 1, 0) < 0) {
 if (close_range(sorted[n_sorted-1] + 1, -1, 0) >= 0)

5. Python

https://github.com/python/cpython/blob/9976834f807ea63ca51bc4f89be457d734148682/Python/fileutils.c#L2250

systemd has the regular case that others have too where it simply closes
all fds over 3 and it also has the more complicated case where it has an
ordered array of fds closing up to the lower bound and after the upper
bound up to the maximum. PID 1 can have a large number of fds open
because of socket activation so here close_range() will encounter less
sparse fd tables where it needs to close a lot of fds.

For Python's os.closerange() implementation which depends on our syscall
it's harder to say given that this will be used by a lot of projects but
I would _guess_ that if people use closerange() they do so because they
actually have something to close.

In short, I would think that close_range() without the
CLOSE_RANGE_CLOEXEC feature will usually be used in scenarios where
there's work to be done, i.e. where the caller likely knows that they
might inherit a non-trivial number of file descriptors (usually after a
fork) that they want to close and they want to do it either because they
don't exec or they don't know when they'll exec. All others I'd expect
to switch to CLOSE_RANGE_CLOEXEC on kernels where it's supported.

Christian

Re: [syzbot] KASAN: null-ptr-deref Read in filp_close (2)

2021-03-26 Thread Christian Brauner

On Fri, Mar 26, 2021 at 10:34:28AM +0100, Christian Brauner wrote:
> On Fri, Mar 26, 2021, 10:21 Dmitry Vyukov  wrote:
> 
> > On Fri, Mar 26, 2021 at 10:12 AM Christian Brauner
> >  wrote:
> > >
> > > On Fri, Mar 26, 2021 at 09:02:08AM +0100, Dmitry Vyukov wrote:
> > > > On Fri, Mar 26, 2021 at 8:55 AM syzbot
> > > >  wrote:
> > > > >
> > > > > Hello,
> > > > >
> > > > > syzbot found the following issue on:
> > > > >
> > > > > HEAD commit:5ee96fa9 Merge tag 'irq-urgent-2021-03-21' of git://
> > git.ke..
> > > > > git tree:   upstream
> > > > > console output:
> > https://syzkaller.appspot.com/x/log.txt?x=17fb84bed0
> > > > > kernel config:
> > https://syzkaller.appspot.com/x/.config?x=6abda3336c698a07
> > > > > dashboard link:
> > https://syzkaller.appspot.com/bug?extid=283ce5a46486d6acdbaf
> > > > >
> > > > > Unfortunately, I don't have any reproducer for this issue yet.
> > > > >
> > > > > IMPORTANT: if you fix the issue, please add the following tag to the
> > commit:
> > > > > Reported-by: syzbot+283ce5a46486d6acd...@syzkaller.appspotmail.com
> > > >
> > > > I was able to reproduce this with the following C program:
> > > >
> > https://gist.githubusercontent.com/dvyukov/00fb7aae489f22c60b4e64b45ef14d60/raw/cb368ca523d01986c2917f4414add0893b8f4243/gistfile1.txt
> > > >
> > > > +Christian
> > > > The repro also contains close_range as the previous similar crash:
> > > >
> > https://syzkaller.appspot.com/bug?id=1bef50bdd9622a1969608d1090b2b4a588d0c6ac
> > > > I don't know if it's related or not in this case, but looks suspicious.
> > >
> > > Hm, I fail to reproduce this with your repro. Do you need to have it run
> > > for a long time?
> > > One thing that strucky my eye is that binfmt_misc gets setup which made
> > > me go huh and I see commit
> > >
> > > commit e7850f4d844e0acfac7e570af611d89deade3146
> > > Author: Lior Ribak 
> > > Date:   Fri Mar 12 21:07:41 2021 -0800
> > >
> > > binfmt_misc: fix possible deadlock in bm_register_write
> > >
> > > which uses filp_close() after having called open_exec() on the
> > > interpreter which makes me wonder why this doesn't have to use fput()
> > > like in all other codepaths for binfmnt_*.
> > >
> > > Can you revert this commit and see if you can reproduce this issue.
> > > Maybe this is a complete red herring but worth a try.
> >
> >
> > This program reproduces the crash for me almost immediately. Are you
> > sure you used the right commit/config?
> >
> 
> I was trying to reproduce on v5.12-rc3 with all KASAN, KCSAN, KFENCE etc.
> turned on.
> I have an appointment I need to go to but will try to reproduce with commit
> and config you provided when I get home.
> I really hope it's not reproducible with v5.12-rc3 and only later commits
> since that would allow easier bisection.

Ok, I think I know what's going on. This fixes it for me. Can you test
too, please? I tried the #syz test way but syzbot doesn't have the
reproducer you gave me:

Thank you!
Christian

>From eeb120d02f40b15a925f54ebcf2b4c747c741ad0 Mon Sep 17 00:00:00 2001
From: Christian Brauner 
Date: Fri, 26 Mar 2021 13:33:03 +0100
Subject: [PATCH] file: fix close_range() for unshare+cloexec

syzbot reported a bug when putting the last reference to a task's file
descriptor table. Debugging this showed we didn't recalculate the
current maximum fd number for CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC
after we unshared the file descriptor table. So max_fd could exceed the
current fdtable maximum causing us to set excessive bits. As a concrete
example, let's say the user requested everything from fd 4 to ~0UL to be
closed and their current fdtable size is 256 with their highest open fd
being 4.  With CLOSE_RANGE_UNSHARE the caller will end up with a new
fdtable which has room for 64 file descriptors since that is the lowest
fdtable size we accept. But now max_fd will still point to 255 and needs
to be adjusted. Fix this by retrieving the correct maximum fd value in
__range_cloexec().

Reported-by: syzbot+283ce5a46486d6acd...@syzkaller.appspotmail.com
Cc: Christoph Hellwig 
Cc: Al Viro 
Cc: linux-fsde...@vger.kernel.org
Cc: sta...@vger.kernel.org
Signed-off-by: Christian Brauner 
---
 fs/file.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/fs/file.c b/fs/file.c
index f3a4bac2cbe9..5ef62377d924 100644
--- a/fs/file.c
+++ b/fs/fi

Re: [syzbot] KASAN: null-ptr-deref Read in filp_close (2)

2021-03-26 Thread Christian Brauner

On Fri, Mar 26, 2021 at 09:02:08AM +0100, Dmitry Vyukov wrote:
> On Fri, Mar 26, 2021 at 8:55 AM syzbot
>  wrote:
> >
> > Hello,
> >
> > syzbot found the following issue on:
> >
> > HEAD commit:5ee96fa9 Merge tag 'irq-urgent-2021-03-21' of git://git.ke..
> > git tree:   upstream
> > console output: https://syzkaller.appspot.com/x/log.txt?x=17fb84bed0
> > kernel config:  https://syzkaller.appspot.com/x/.config?x=6abda3336c698a07
> > dashboard link: https://syzkaller.appspot.com/bug?extid=283ce5a46486d6acdbaf
> >
> > Unfortunately, I don't have any reproducer for this issue yet.
> >
> > IMPORTANT: if you fix the issue, please add the following tag to the commit:
> > Reported-by: syzbot+283ce5a46486d6acd...@syzkaller.appspotmail.com
> 
> I was able to reproduce this with the following C program:
> https://gist.githubusercontent.com/dvyukov/00fb7aae489f22c60b4e64b45ef14d60/raw/cb368ca523d01986c2917f4414add0893b8f4243/gistfile1.txt
> 
> +Christian
> The repro also contains close_range as the previous similar crash:
> https://syzkaller.appspot.com/bug?id=1bef50bdd9622a1969608d1090b2b4a588d0c6ac
> I don't know if it's related or not in this case, but looks suspicious.

Hm, I fail to reproduce this with your repro. Do you need to have it run
for a long time?
One thing that struck my eye is that binfmt_misc gets setup which made
me go huh and I see commit

commit e7850f4d844e0acfac7e570af611d89deade3146
Author: Lior Ribak 
Date:   Fri Mar 12 21:07:41 2021 -0800

binfmt_misc: fix possible deadlock in bm_register_write

which uses filp_close() after having called open_exec() on the
interpreter which makes me wonder why this doesn't have to use fput()
like in all other codepaths for binfmt_*.

Can you revert this commit and see if you can reproduce this issue.
Maybe this is a complete red herring but worth a try.

Christian

> 
> 
> > ==
> > BUG: KASAN: null-ptr-deref in instrument_atomic_read 
> > include/linux/instrumented.h:71 [inline]
> > BUG: KASAN: null-ptr-deref in atomic64_read 
> > include/asm-generic/atomic-instrumented.h:837 [inline]
> > BUG: KASAN: null-ptr-deref in atomic_long_read 
> > include/asm-generic/atomic-long.h:29 [inline]
> > BUG: KASAN: null-ptr-deref in filp_close+0x22/0x170 fs/open.c:1289
> > Read of size 8 at addr 0077 by task syz-executor.4/16965
> >
> > CPU: 0 PID: 16965 Comm: syz-executor.4 Not tainted 5.12.0-rc3-syzkaller #0
> > Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS 
> > Google 01/01/2011
> > Call Trace:
> >  __dump_stack lib/dump_stack.c:79 [inline]
> >  dump_stack+0x141/0x1d7 lib/dump_stack.c:120
> >  __kasan_report mm/kasan/report.c:403 [inline]
> >  kasan_report.cold+0x5f/0xd8 mm/kasan/report.c:416
> >  check_region_inline mm/kasan/generic.c:180 [inline]
> >  kasan_check_range+0x13d/0x180 mm/kasan/generic.c:186
> >  instrument_atomic_read include/linux/instrumented.h:71 [inline]
> >  atomic64_read include/asm-generic/atomic-instrumented.h:837 [inline]
> >  atomic_long_read include/asm-generic/atomic-long.h:29 [inline]
> >  filp_close+0x22/0x170 fs/open.c:1289
> >  close_files fs/file.c:403 [inline]
> >  put_files_struct fs/file.c:418 [inline]
> >  put_files_struct+0x1d0/0x350 fs/file.c:415
> >  exit_files+0x7e/0xa0 fs/file.c:435
> >  do_exit+0xbc2/0x2a60 kernel/exit.c:820
> >  do_group_exit+0x125/0x310 kernel/exit.c:922
> >  get_signal+0x42c/0x2100 kernel/signal.c:2773
> >  arch_do_signal_or_restart+0x2a8/0x1eb0 arch/x86/kernel/signal.c:789
> >  handle_signal_work kernel/entry/common.c:147 [inline]
> >  exit_to_user_mode_loop kernel/entry/common.c:171 [inline]
> >  exit_to_user_mode_prepare+0x148/0x250 kernel/entry/common.c:208
> >  __syscall_exit_to_user_mode_work kernel/entry/common.c:290 [inline]
> >  syscall_exit_to_user_mode+0x19/0x60 kernel/entry/common.c:301
> >  entry_SYSCALL_64_after_hwframe+0x44/0xae
> > RIP: 0033:0x466459
> > Code: Unable to access opcode bytes at RIP 0x46642f.
> > RSP: 002b:7feb5e334218 EFLAGS: 0246 ORIG_RAX: 00ca
> > RAX: fe00 RBX: 0056bf68 RCX: 00466459
> > RDX:  RSI: 0080 RDI: 0056bf68
> > RBP: 0056bf60 R08:  R09: 
> > R10:  R11: 0246 R12: 0056bf6c
> > R13: 00a9fb1f R14: 7feb5e334300 R15: 00022000
> > ==
> >
> >
> > ---
> > This report is generated by a bot. It may contain errors.
> > See https://goo.gl/tpsmEJ for more information about syzbot.
> > syzbot engineers can be reached at syzkal...@googlegroups.com.
> >
> > syzbot will keep track of this issue. See:
> > https://goo.gl/tpsmEJ#status for how to communicate with syzbot.
> >
> > --
> > You received this message because you are subscribed to the Google Groups 
> > "syzkaller-bugs" group.
> > To unsubscribe from this group and stop

Re: [PATCH v4 08/11] evm: Allow setxattr() and setattr() for unmodified metadata

2021-03-25 Thread Christian Brauner

On Thu, Mar 25, 2021 at 01:13:41PM +0100, Christian Brauner wrote:
> On Thu, Mar 25, 2021 at 10:53:43AM +0000, Roberto Sassu wrote:
> > > From: Roberto Sassu
> > > Sent: Friday, March 5, 2021 4:19 PM
> > > With the patch to allow xattr/attr operations if a portable signature
> > > verification fails, cp and tar can copy all xattrs/attrs so that at the
> > > end of the process verification succeeds.
> > > 
> > > However, it might happen that the xattrs/attrs are already set to the
> > > correct value (taken at signing time) and signature verification succeeds
> > > > before the copy has completed. For example, an archive might contain 
> > > files
> > > owned by root and the archive is extracted by root.
> > > 
> > > Then, since portable signatures are immutable, all subsequent operations
> > > fail (e.g. fchown()), even if the operation is legitimate (does not alter
> > > the current value).
> > > 
> > > This patch avoids this problem by reporting successful operation to user
> > > space when that operation does not alter the current value of 
> > > xattrs/attrs.
> > > 
> > > Signed-off-by: Roberto Sassu 
> > > ---
> > >  security/integrity/evm/evm_main.c | 96
> > > +++
> > >  1 file changed, 96 insertions(+)
> > > 
> > > diff --git a/security/integrity/evm/evm_main.c
> > > b/security/integrity/evm/evm_main.c
> > > index eab536fa260f..a07516dcb920 100644
> > > --- a/security/integrity/evm/evm_main.c
> > > +++ b/security/integrity/evm/evm_main.c
> > > @@ -18,6 +18,7 @@
> > >  #include 
> > >  #include 
> > >  #include 
> > > +#include 
> > > 
> > >  #include 
> > >  #include 
> > > @@ -328,6 +329,79 @@ static enum integrity_status
> > > evm_verify_current_integrity(struct dentry *dentry)
> > >   return evm_verify_hmac(dentry, NULL, NULL, 0, NULL);
> > >  }
> > > 
> > > +/*
> > > + * evm_xattr_acl_change - check if passed ACL changes the inode mode
> > > + * @dentry: pointer to the affected dentry
> > > + * @xattr_name: requested xattr
> > > + * @xattr_value: requested xattr value
> > > + * @xattr_value_len: requested xattr value length
> > > + *
> > > + * Check if passed ACL changes the inode mode, which is protected by
> > > EVM.
> > > + *
> > > + * Returns 1 if passed ACL causes inode mode change, 0 otherwise.
> > > + */
> > > +static int evm_xattr_acl_change(struct dentry *dentry, const char
> > > *xattr_name,
> > > + const void *xattr_value, size_t
> > > xattr_value_len)
> > > +{
> > > + umode_t mode;
> > > + struct posix_acl *acl = NULL, *acl_res;
> > > + struct inode *inode = d_backing_inode(dentry);
> > > + int rc;
> > > +
> > > + /* UID/GID in ACL have been already converted from user to init ns
> > > */
> > > > + acl = posix_acl_from_xattr(&init_user_ns, xattr_value,
> > > xattr_value_len);
> > > + if (!acl)
> > 
> > Based on Mimi's review, I will change this to:
> > 
> > if (IS_ERR_OR_NULL(acl))
> > 
> > > + return 1;
> > > +
> > > + acl_res = acl;
> > > > + rc = posix_acl_update_mode(&init_user_ns, inode, &mode,
> > > > &acl_res);
> > 
> > About this part, probably it is not correct.
> > 
> > I'm writing a test for this patch that checks if operations
> > that don't change the file mode succeed and those that
> > do fail.
> > 
> > mount-idmapped --map-mount b:3001:0:1 /mnt /mnt-idmapped
> > pushd /mnt
> > echo "test" > test-file
> > chown 3001 test-file
> > chgrp 3001 test-file
> > chmod 2644 test-file
> > 
> > setfacl --set u::rw,g::r,o::r,m:r test-file (expected to succeed, caller 
> > has CAP_FSETID, so S_ISGID is not dropped)
> > setfacl --set u::rw,g::r,o::r,m:rw test-file (expected to fail)
> > pushd /mnt-idmapped
> > capsh --drop=cap_fsetid -- -c setfacl --set u::rw,g::r,o::r test-file 
> > (expected to succeed, caller is in the owning group of test-file, so 
> > S_ISGID is not dropped)
> > 
> > After adding a debug line in posix_acl_update_mode():
> > printk("%s: %d(%d) %d\n", __func__, in_group_p(i_gid_into_mnt(mnt_userns, 
> > inode)), __kgid_val(i_gid_into_mnt(mnt_userns, inode)), 
> > capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID));
> > 
> > wit

Re: [PATCH v4 08/11] evm: Allow setxattr() and setattr() for unmodified metadata

2021-03-25 Thread Christian Brauner

On Thu, Mar 25, 2021 at 10:53:43AM +0000, Roberto Sassu wrote:
> > From: Roberto Sassu
> > Sent: Friday, March 5, 2021 4:19 PM
> > With the patch to allow xattr/attr operations if a portable signature
> > verification fails, cp and tar can copy all xattrs/attrs so that at the
> > end of the process verification succeeds.
> > 
> > However, it might happen that the xattrs/attrs are already set to the
> > correct value (taken at signing time) and signature verification succeeds
> > > before the copy has completed. For example, an archive might contain files
> > owned by root and the archive is extracted by root.
> > 
> > Then, since portable signatures are immutable, all subsequent operations
> > fail (e.g. fchown()), even if the operation is legitimate (does not alter
> > the current value).
> > 
> > This patch avoids this problem by reporting successful operation to user
> > space when that operation does not alter the current value of xattrs/attrs.
> > 
> > Signed-off-by: Roberto Sassu 
> > ---
> >  security/integrity/evm/evm_main.c | 96
> > +++
> >  1 file changed, 96 insertions(+)
> > 
> > diff --git a/security/integrity/evm/evm_main.c
> > b/security/integrity/evm/evm_main.c
> > index eab536fa260f..a07516dcb920 100644
> > --- a/security/integrity/evm/evm_main.c
> > +++ b/security/integrity/evm/evm_main.c
> > @@ -18,6 +18,7 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> > 
> >  #include 
> >  #include 
> > @@ -328,6 +329,79 @@ static enum integrity_status
> > evm_verify_current_integrity(struct dentry *dentry)
> > return evm_verify_hmac(dentry, NULL, NULL, 0, NULL);
> >  }
> > 
> > +/*
> > + * evm_xattr_acl_change - check if passed ACL changes the inode mode
> > + * @dentry: pointer to the affected dentry
> > + * @xattr_name: requested xattr
> > + * @xattr_value: requested xattr value
> > + * @xattr_value_len: requested xattr value length
> > + *
> > + * Check if passed ACL changes the inode mode, which is protected by
> > EVM.
> > + *
> > + * Returns 1 if passed ACL causes inode mode change, 0 otherwise.
> > + */
> > +static int evm_xattr_acl_change(struct dentry *dentry, const char
> > *xattr_name,
> > +   const void *xattr_value, size_t
> > xattr_value_len)
> > +{
> > +   umode_t mode;
> > +   struct posix_acl *acl = NULL, *acl_res;
> > +   struct inode *inode = d_backing_inode(dentry);
> > +   int rc;
> > +
> > +   /* UID/GID in ACL have been already converted from user to init ns
> > */
> > > +   acl = posix_acl_from_xattr(&init_user_ns, xattr_value,
> > xattr_value_len);
> > +   if (!acl)
> 
> Based on Mimi's review, I will change this to:
> 
> if (IS_ERR_OR_NULL(acl))
> 
> > +   return 1;
> > +
> > +   acl_res = acl;
> > > +   rc = posix_acl_update_mode(&init_user_ns, inode, &mode,
> > > &acl_res);
> 
> About this part, probably it is not correct.
> 
> I'm writing a test for this patch that checks if operations
> that don't change the file mode succeed and those that
> do fail.
> 
> mount-idmapped --map-mount b:3001:0:1 /mnt /mnt-idmapped
> pushd /mnt
> echo "test" > test-file
> chown 3001 test-file
> chgrp 3001 test-file
> chmod 2644 test-file
> 
> setfacl --set u::rw,g::r,o::r,m:r test-file (expected to succeed, caller has 
> CAP_FSETID, so S_ISGID is not dropped)
> setfacl --set u::rw,g::r,o::r,m:rw test-file (expected to fail)
> pushd /mnt-idmapped
> capsh --drop=cap_fsetid -- -c setfacl --set u::rw,g::r,o::r test-file 
> (expected to succeed, caller is in the owning group of test-file, so S_ISGID 
> is not dropped)
> 
> After adding a debug line in posix_acl_update_mode():
> printk("%s: %d(%d) %d\n", __func__, in_group_p(i_gid_into_mnt(mnt_userns, 
> inode)), __kgid_val(i_gid_into_mnt(mnt_userns, inode)), 
> capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID));
> 
> without passing mnt_userns:
> [  748.262582] setfacl --set u::rw,g::r,o::r,m:r test-file
> [  748.268021] posix_acl_update_mode: 0(3001) 1
> [  748.268035] posix_acl_update_mode: 0(3001) 1
> [  748.268570] setfacl --set u::rw,g::r,o::r,m:rw test-file
> [  748.274193] posix_acl_update_mode: 0(3001) 1
> [  748.279198] capsh --drop=cap_fsetid -- -c setfacl --set u::rw,g::r,o::r 
> test-file
> [  748.287894] posix_acl_update_mode: 0(3001) 0
> 
> passing mnt_userns:
> [   81.159766] setfacl --set u::rw,g::r,o::r,m:r test-file
> [   81.165207] posix_acl_update_mode: 0(3001) 1
> [   81.165226] posix_acl_update_mode: 0(3001) 1
> [   81.165732] setfacl --set u::rw,g::r,o::r,m:rw test-file
> [   81.170978] posix_acl_update_mode: 0(3001) 1
> [   81.176014] capsh --drop=cap_fsetid -- -c setfacl --set u::rw,g::r,o::r 
> test-file
> [   81.184648] posix_acl_update_mode: 1(0) 0
> [   81.184663] posix_acl_update_mode: 1(0) 0
> 
> The difference is that, by passing mnt_userns, the caller (root) is
> in the owning group of the file (3001 -> 0). Without passing mnt_userns,
> it is not (3001 -> 3001).
> 
> Christian, Andreas, could you confirm that this is correct?

Hey Robert,

Thanks for

[PATCH] fanotify_user: use upper_32_bits() to verify mask

2021-03-25 Thread Christian Brauner

From: Christian Brauner 

I don't see an obvious reason why the upper 32 bit check needs to be
open-coded this way. Switch to upper_32_bits() which is more idiomatic and
should conceptually be the same check.

Cc: Amir Goldstein 
Cc: Jan Kara 
Signed-off-by: Christian Brauner 
---
 fs/notify/fanotify/fanotify_user.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/notify/fanotify/fanotify_user.c 
b/fs/notify/fanotify/fanotify_user.c
index 9e0c1afac8bd..d5683fa9d495 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -1126,7 +1126,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int 
flags, __u64 mask,
 __func__, fanotify_fd, flags, dfd, pathname, mask);
 
/* we only use the lower 32 bits as of right now. */
-   if (mask & ((__u64)0xffffffff << 32))
+   if (upper_32_bits(mask))
return -EINVAL;
 
if (flags & ~FANOTIFY_MARK_FLAGS)

base-commit: 0d02ec6b3136c73c09e7859f0d0e4e2c4c07b49b
-- 
2.27.0

Re: split receive_fd_replace from __receive_fd

2021-03-25 Thread Christian Brauner

On Thu, Mar 25, 2021 at 09:22:08AM +0100, Christoph Hellwig wrote:
> The receive_fd_replace case shares almost no logic with the more general
> __receive_fd case, so split it into a separate function.
> 
> BTW, I'm not sure if receive_fd_replace is such a useful primitive to
> start with, why not just open code it in seccomp?

I tend to agree and argued in a similar fashion back when we added this
but we ultimately decided to add it. So now we're back to the original
argument. :)

Christian

Re: [PATCH v2 01/18] vfs: add miscattr ops

2021-03-24 Thread Christian Brauner

On Mon, Mar 22, 2021 at 03:48:59PM +0100, Miklos Szeredi wrote:
> There's a substantial amount of boilerplate in filesystems handling
> FS_IOC_[GS]ETFLAGS/ FS_IOC_FS[GS]ETXATTR ioctls.
> 
> Also due to userspace buffers being involved in the ioctl API this is
> difficult to stack, as shown by overlayfs issues related to these ioctls.
> 
> Introduce a new internal API named "miscattr" (fsxattr can be confused with
> xattr, xflags is inappropriate, since this is more than just flags).
> 
> There's significant overlap between flags and xflags and this API handles
> the conversions automatically, so filesystems may choose which one to use.
> 
> In ->miscattr_get() a hint is provided to the filesystem whether flags or
> xattr are being requested by userspace, but in this series this hint is
> ignored by all filesystems, since generating all the attributes is cheap.
> 
> If a filesystem doesn't implement the miscattr API, just fall back to
> f_op->ioctl().  When all filesystems are converted, the fallback can be
> removed.
> 
> 32bit compat ioctls are now handled by the generic code as well.
> 
> Signed-off-by: Miklos Szeredi 
> ---

Fwiw, I think this is a good cleanup. Changing something like the
miscattr_set() method to take a mnt_userns would've been less
churn than having to audit all ioctls individually.

(Only one small comment below.)

>  Documentation/filesystems/locking.rst |   5 +
>  Documentation/filesystems/vfs.rst |  15 ++
>  fs/ioctl.c| 329 ++
>  include/linux/fs.h|   4 +
>  include/linux/miscattr.h  |  53 +
>  5 files changed, 406 insertions(+)
>  create mode 100644 include/linux/miscattr.h
> 
> diff --git a/Documentation/filesystems/locking.rst 
> b/Documentation/filesystems/locking.rst
> index b7dcc86c92a4..a5aa2046d48f 100644
> --- a/Documentation/filesystems/locking.rst
> +++ b/Documentation/filesystems/locking.rst
> @@ -80,6 +80,9 @@ prototypes::
>   struct file *, unsigned open_flag,
>   umode_t create_mode);
>   int (*tmpfile) (struct inode *, struct dentry *, umode_t);
> + int (*miscattr_set)(struct user_namespace *mnt_userns,
> + struct dentry *dentry, struct miscattr *ma);
> + int (*miscattr_get)(struct dentry *dentry, struct miscattr *ma);
>  
>  locking rules:
>   all may block
> @@ -107,6 +110,8 @@ fiemap:   no
>  update_time: no
>  atomic_open: shared (exclusive if O_CREAT is set in open flags)
>  tmpfile: no
> +miscattr_get:no or exclusive
> +miscattr_set:exclusive
>   =
>  
>  
> diff --git a/Documentation/filesystems/vfs.rst 
> b/Documentation/filesystems/vfs.rst
> index 2049bbf5e388..f125ce6c3b47 100644
> --- a/Documentation/filesystems/vfs.rst
> +++ b/Documentation/filesystems/vfs.rst
> @@ -441,6 +441,9 @@ As of kernel 2.6.22, the following members are defined:
>  unsigned open_flag, umode_t create_mode);
>   int (*tmpfile) (struct user_namespace *, struct inode *, struct 
> dentry *, umode_t);
>   int (*set_acl)(struct user_namespace *, struct inode *, struct 
> posix_acl *, int);
> + int (*miscattr_set)(struct user_namespace *mnt_userns,
> + struct dentry *dentry, struct miscattr *ma);
> + int (*miscattr_get)(struct dentry *dentry, struct miscattr *ma);
>   };
>  
>  Again, all methods are called without any locks being held, unless
> @@ -588,6 +591,18 @@ otherwise noted.
>   atomically creating, opening and unlinking a file in given
>   directory.
>  
> +``miscattr_get``
> + called on ioctl(FS_IOC_GETFLAGS) and ioctl(FS_IOC_FSGETXATTR) to
> + retrieve miscellaneous filesystem flags and attributes.  Also
> + called before the relevant SET operation to check what is being
> + changed (in this case with i_rwsem locked exclusive).  If unset,
> + then fall back to f_op->ioctl().
> +
> +``miscattr_set``
> + called on ioctl(FS_IOC_SETFLAGS) and ioctl(FS_IOC_FSSETXATTR) to
> + change miscellaneous filesystem flags and attributes.  Callers hold
> + i_rwsem exclusive.  If unset, then fall back to f_op->ioctl().
> +
>  
>  The Address Space Object
>  
> diff --git a/fs/ioctl.c b/fs/ioctl.c
> index 4e6cc0a7d69c..e5f3820809a4 100644
> --- a/fs/ioctl.c
> +++ b/fs/ioctl.c
> @@ -19,6 +19,9 @@
>  #include 
>  #include 
>  #include 
> +#include 
> +#include 
> +#include 
>  
>  #include "internal.h"
>  
> @@ -657,6 +660,311 @@ static int ioctl_file_dedupe_range(struct file *file,
>   return ret;
>  }
>  
> +/**
> + * miscattr_fill_xflags - initialize miscattr with xflags
> + * @ma:  miscattr pointer
> + * @xflags:  FS_XFLAG_* flags
> + *
> + * Set ->fsx_xflags, ->xattr_valid and ->flags (translated xflags).  All
> + * other fields

Re: [PATCH v2 01/18] vfs: add miscattr ops

2021-03-24 Thread Christian Brauner

On Mon, Mar 22, 2021 at 03:33:38PM -0700, Darrick J. Wong wrote:
> On Mon, Mar 22, 2021 at 03:48:59PM +0100, Miklos Szeredi wrote:
> > There's a substantial amount of boilerplate in filesystems handling
> > FS_IOC_[GS]ETFLAGS/ FS_IOC_FS[GS]ETXATTR ioctls.
> > 
> > Also due to userspace buffers being involved in the ioctl API this is
> > difficult to stack, as shown by overlayfs issues related to these ioctls.
> > 
> > Introduce a new internal API named "miscattr" (fsxattr can be confused with
> > xattr, xflags is inappropriate, since this is more than just flags).
> > 
> > There's significant overlap between flags and xflags and this API handles
> > the conversions automatically, so filesystems may choose which one to use.
> > 
> > In ->miscattr_get() a hint is provided to the filesystem whether flags or
> > xattr are being requested by userspace, but in this series this hint is
> > ignored by all filesystems, since generating all the attributes is cheap.
> > 
> > If a filesystem doesn't implement the miscattr API, just fall back to
> > f_op->ioctl().  When all filesystems are converted, the fallback can be
> > removed.
> > 
> > 32bit compat ioctls are now handled by the generic code as well.
> > 
> > Signed-off-by: Miklos Szeredi 
> > ---
> >  Documentation/filesystems/locking.rst |   5 +
> >  Documentation/filesystems/vfs.rst |  15 ++
> >  fs/ioctl.c| 329 ++
> >  include/linux/fs.h|   4 +
> >  include/linux/miscattr.h  |  53 +
> >  5 files changed, 406 insertions(+)
> >  create mode 100644 include/linux/miscattr.h
> > 
> > diff --git a/Documentation/filesystems/locking.rst 
> > b/Documentation/filesystems/locking.rst
> > index b7dcc86c92a4..a5aa2046d48f 100644
> > --- a/Documentation/filesystems/locking.rst
> > +++ b/Documentation/filesystems/locking.rst
> > @@ -80,6 +80,9 @@ prototypes::
> > struct file *, unsigned open_flag,
> > umode_t create_mode);
> > int (*tmpfile) (struct inode *, struct dentry *, umode_t);
> > +   int (*miscattr_set)(struct user_namespace *mnt_userns,
> > +   struct dentry *dentry, struct miscattr *ma);
> > +   int (*miscattr_get)(struct dentry *dentry, struct miscattr *ma);
> >  
> >  locking rules:
> > all may block
> > @@ -107,6 +110,8 @@ fiemap: no
> >  update_time:   no
> >  atomic_open:   shared (exclusive if O_CREAT is set in open flags)
> >  tmpfile:   no
> > +miscattr_get:  no or exclusive
> > +miscattr_set:  exclusive
> >     =
> >  
> >  
> > diff --git a/Documentation/filesystems/vfs.rst 
> > b/Documentation/filesystems/vfs.rst
> > index 2049bbf5e388..f125ce6c3b47 100644
> > --- a/Documentation/filesystems/vfs.rst
> > +++ b/Documentation/filesystems/vfs.rst
> > @@ -441,6 +441,9 @@ As of kernel 2.6.22, the following members are defined:
> >unsigned open_flag, umode_t create_mode);
> > int (*tmpfile) (struct user_namespace *, struct inode *, struct 
> > dentry *, umode_t);
> > int (*set_acl)(struct user_namespace *, struct inode *, struct 
> > posix_acl *, int);
> > +   int (*miscattr_set)(struct user_namespace *mnt_userns,
> > +   struct dentry *dentry, struct miscattr *ma);
> > +   int (*miscattr_get)(struct dentry *dentry, struct miscattr *ma);
> > };
> >  
> >  Again, all methods are called without any locks being held, unless
> > @@ -588,6 +591,18 @@ otherwise noted.
> > atomically creating, opening and unlinking a file in given
> > directory.
> >  
> > +``miscattr_get``
> 
> I wish this wasn't named "misc" because miscellaneous is vague.
> 
> fileattr_get, perhaps?
> 
> (FWIW I'm not /that/ passionate about starting a naming bikeshed, feel
> free to ignore.)
> 
> > +   called on ioctl(FS_IOC_GETFLAGS) and ioctl(FS_IOC_FSGETXATTR) to
> > +   retrieve miscellaneous filesystem flags and attributes.  Also
> 
> "...miscellaneous *file* flags and attributes."
> 
> > +   called before the relevant SET operation to check what is being
> > +   changed (in this case with i_rwsem locked exclusive).  If unset,
> > +   then fall back to f_op->ioctl().
> > +
> > +``miscattr_set``
> > +   called on ioctl(FS_IOC_SETFLAGS) and ioctl(FS_IOC_FSSETXATTR) to
> > +   change miscellaneous filesystem flags and attributes.  Callers hold
> 
> Same here.
> 
> > +   i_rwsem exclusive.  If unset, then fall back to f_op->ioctl().
> > +
> >  
> >  The Address Space Object
> >  
> > diff --git a/fs/ioctl.c b/fs/ioctl.c
> > index 4e6cc0a7d69c..e5f3820809a4 100644
> > --- a/fs/ioctl.c
> > +++ b/fs/ioctl.c
> > @@ -19,6 +19,9 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> > +#include 
> > +#include 
> >  
> >  #include "internal.h"
> >  
> > @@ -657,6 +660,311 @@ static int

Re: [PATCH 1/3] posic_acl: Add a helper determine if SGID should be cleared

2021-03-23 Thread Christian Brauner

On Mon, Mar 22, 2021 at 01:01:11PM -0400, Vivek Goyal wrote:
> On Sat, Mar 20, 2021 at 11:03:22AM +0100, Christian Brauner wrote:
> > On Fri, Mar 19, 2021 at 11:42:48PM +0100, Andreas Grünbacher wrote:
> > > Hi,
> > > 
> > > Am Fr., 19. März 2021 um 20:58 Uhr schrieb Vivek Goyal 
> > > :
> > > > posix_acl_update_mode() determines what's the equivalent mode and if 
> > > > SGID
> > > > needs to be cleared or not. I need to make use of this code in fuse
> > > > as well. Fuse will send this information to virtiofs file server and
> > > > file server will take care of clearing SGID if it needs to be done.
> > > >
> > > > Hence move this code in a separate helper so that more than one place
> > > > can call into it.
> > > >
> > > > Cc: Jan Kara 
> > > > Cc: Andreas Gruenbacher 
> > > > Cc: Alexander Viro 
> > > > Signed-off-by: Vivek Goyal 
> > > > ---
> > > >  fs/posix_acl.c|  3 +--
> > > >  include/linux/posix_acl.h | 11 +++
> > > >  2 files changed, 12 insertions(+), 2 deletions(-)
> > > >
> > > > diff --git a/fs/posix_acl.c b/fs/posix_acl.c
> > > > index f3309a7edb49..2d62494c4a5b 100644
> > > > --- a/fs/posix_acl.c
> > > > +++ b/fs/posix_acl.c
> > > > @@ -684,8 +684,7 @@ int posix_acl_update_mode(struct user_namespace 
> > > > *mnt_userns,
> > > > return error;
> > > > if (error == 0)
> > > > *acl = NULL;
> > > > -   if (!in_group_p(i_gid_into_mnt(mnt_userns, inode)) &&
> > > > -   !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
> > > > +   if (posix_acl_mode_clear_sgid(mnt_userns, inode))
> > > > mode &= ~S_ISGID;
> > > > *mode_p = mode;
> > > > return 0;
> > > > diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
> > > > index 307094ebb88c..073c5e546de3 100644
> > > > --- a/include/linux/posix_acl.h
> > > > +++ b/include/linux/posix_acl.h
> > > > @@ -59,6 +59,17 @@ posix_acl_release(struct posix_acl *acl)
> > > >  }
> > > >
> > > >
> > > > +static inline bool
> > > > +posix_acl_mode_clear_sgid(struct user_namespace *mnt_userns,
> > > > + struct inode *inode)
> > > > +{
> > > > +   if (!in_group_p(i_gid_into_mnt(mnt_userns, inode)) &&
> > > > +   !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
> > > > +   return true;
> > > > +
> > > > +   return false;
> > > 
> > > That's just
> > > 
> > > return !in_group_p(i_gid_into_mnt(mnt_userns, inode)) &&
> > > !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID);
> > > 
> > > The same pattern we have in posix_acl_update_mode also exists in
> > > setattr_copy and inode_init_owner, and almost the same pattern exists
> > > in setattr_prepare, so can this be cleaned up as well? The function
> > > also isn't POSIX ACL specific, so the function name is misleading.
> > 
> > Good idea but that should probably be spun into a separate patchset that
> > only touches the vfs parts.
> 
> IIUC, suggestion is that I should write a VFS helper (and not posix
> acl helper) and use that helper at other places too in the code. 

If there are other callers outside of acls (which there should be, iirc) then
yes.

> 
> I will do that and post in a separate patch series.

Yeah, I think that makes more sense to have this be a separate change
instead of putting it together with the fuse change if it touches more
than one place.

Thanks!
Christian

Re: [PATCH] hfs/hfsplus: use WARN_ON for sanity check

2021-03-23 Thread Christian Brauner

On Mon, Mar 22, 2021 at 11:32:40PM +0100, Arnd Bergmann wrote:
> From: Arnd Bergmann 
> 
> gcc warns about a couple of instances in which a sanity check
> exists but the author wasn't sure how to react to it failing,
> which makes it look like a possible bug:
> 
> fs/hfsplus/inode.c: In function 'hfsplus_cat_read_inode':
> fs/hfsplus/inode.c:503:37: error: suggest braces around empty body in an 'if' 
> statement [-Werror=empty-body]
>   503 | /* panic? */;
>   | ^
> fs/hfsplus/inode.c:524:37: error: suggest braces around empty body in an 'if' 
> statement [-Werror=empty-body]
>   524 | /* panic? */;
>   | ^
> fs/hfsplus/inode.c: In function 'hfsplus_cat_write_inode':
> fs/hfsplus/inode.c:582:37: error: suggest braces around empty body in an 'if' 
> statement [-Werror=empty-body]
>   582 | /* panic? */;
>   | ^
> fs/hfsplus/inode.c:608:37: error: suggest braces around empty body in an 'if' 
> statement [-Werror=empty-body]
>   608 | /* panic? */;
>   | ^
> fs/hfs/inode.c: In function 'hfs_write_inode':
> fs/hfs/inode.c:464:37: error: suggest braces around empty body in an 'if' 
> statement [-Werror=empty-body]
>   464 | /* panic? */;
>   | ^
> fs/hfs/inode.c:485:37: error: suggest braces around empty body in an 'if' 
> statement [-Werror=empty-body]
>   485 | /* panic? */;
>   | ^
> 
> panic() is probably not the correct choice here, but a WARN_ON
> seems appropriate and avoids the compile-time warning.
> 
> Signed-off-by: Arnd Bergmann 
> ---

Thanks!
Reviewed-by: Christian Brauner

Re: [PATCH] kernel/sys.c: Fix a typo

2021-03-23 Thread Christian Brauner

On Tue, Mar 23, 2021 at 06:49:09AM +0530, Bhaskar Chowdhury wrote:
> 
> s/concurent/concurrent/
> 
> Signed-off-by: Bhaskar Chowdhury 
> ---

Looks good,
Reviewed-by: Christian Brauner

Re: [PATCH] kernel/signal: Modify the comment of function check_kill_permission

2021-03-23 Thread Christian Brauner

On Tue, Mar 23, 2021 at 01:03:56AM -0700, zhouchuangao wrote:
> Maybe it's easier for us to understand the function of
> check_kill_permission.
> 
> Signed-off-by: zhouchuangao 
> ---
>  kernel/signal.c | 6 --
>  1 file changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/kernel/signal.c b/kernel/signal.c
> index f2a1b89..2fbf68a 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -823,8 +823,10 @@ static bool kill_ok_by_cred(struct task_struct *t)
>  }
>  
>  /*
> - * Bad permissions for sending the signal
> - * - the caller must hold the RCU read lock
> + * Check if the signal has permission to kill the process.

Hey,

This phrasing is misleading imho. It's not about whether the signal has
permission but whether the caller has permissions to send the signal.

Christian

Re: [PATCH 03/11] security: commoncap: fix -Wstringop-overread warning

2021-03-22 Thread Christian Brauner

On Mon, Mar 22, 2021 at 05:02:41PM +0100, Arnd Bergmann wrote:
> From: Arnd Bergmann 
> 
> gcc-11 introduces a harmless warning for cap_inode_getsecurity:
> 
> security/commoncap.c: In function ‘cap_inode_getsecurity’:
> security/commoncap.c:440:33: error: ‘memcpy’ reading 16 bytes from a region 
> of size 0 [-Werror=stringop-overread]
>   440 | memcpy(&cap->data, &nscap->data, 
> sizeof(__le32) * 2 * VFS_CAP_U32);
>   | 
> ^~
> 
> The problem here is that tmpbuf is initialized to NULL, so gcc assumes
> it is not accessible unless it gets set by vfs_getxattr_alloc().  This is
> a legitimate warning as far as I can tell, but the code is correct since
> it correctly handles the error when that function fails.
> 
> Add a separate NULL check to tell gcc about it as well.
> 
> Signed-off-by: Arnd Bergmann 
> ---

Seems reasonable,
Acked-by: Christian Brauner

Re: [PATCH] arcc/kernel/process: Few mundane typo fixes

2021-03-22 Thread Christian Brauner

On Mon, Mar 22, 2021 at 06:21:55PM +0530, Bhaskar Chowdhury wrote:
> s/defintion/definition/
> s/succeded/succeeded/
> s/commiting/committing/
> s/interrutps/interrupts/
> 
> Signed-off-by: Bhaskar Chowdhury 
> ---

Since you aim to be mirroring the path arc/kernel/process there's a typo
in the patch subject :)
s/arcc/arc/

otherwise

Reviewed-by: Christian Brauner

Re: [PATCH] [v2] posix-acl: avoid -Wempty-body warning

2021-03-22 Thread Christian Brauner

On Mon, Mar 22, 2021 at 02:13:59PM +0100, Arnd Bergmann wrote:
> From: Arnd Bergmann 
> 
> The fallthrough comment for an ignored cmpxchg() return value
> produces a harmless warning with 'make W=1':
> 
> fs/posix_acl.c: In function 'get_acl':
> fs/posix_acl.c:127:36: error: suggest braces around empty body in an 'if' 
> statement [-Werror=empty-body]
>   127 | /* fall through */ ;
>   |^
> 
> Simplify it as a step towards a clean W=1 build.  As all architectures
> define cmpxchg() as a statement expression these days, it is no longer
> necessary to evaluate its return code, and the if() can just be dropped.
> 
> Signed-off-by: Arnd Bergmann 
> ---

Thank you!
Reviewed-by: Christian Brauner

Re: [PATCH] posix-acl: avoid -Wempty-body warning

2021-03-22 Thread Christian Brauner

On Mon, Mar 22, 2021 at 02:02:54PM +0100, Arnd Bergmann wrote:
> On Mon, Mar 22, 2021 at 1:15 PM Christian Brauner
>  wrote:
> > On Mon, Mar 22, 2021 at 12:38:24PM +0100, Arnd Bergmann wrote:
> > > From: Arnd Bergmann 
> > >
> > > The fallthrough comment for an ignored cmpxchg() return value
> > > produces a harmless warning with 'make W=1':
> > >
> > > fs/posix_acl.c: In function 'get_acl':
> > > fs/posix_acl.c:127:36: error: suggest braces around empty body in an 'if' 
> > > statement [-Werror=empty-body]
> > >   127 | /* fall through */ ;
> > >   |^
> > >
> > > Rewrite it as gcc suggests as a step towards a clean W=1 build.
> > > On most architectures, we could just drop the if() entirely, but
> > > in some cases this causes a different warning.
> >
> > And you don't see the warning for the second unconditional
> > cmpxchg(p, sentinel, ACL_NOT_CACHED);
> > below?
> 
> I would have expected both to show that warning, didn't notice the other
> one.  I now see that all architectures use statement expressions for cmpxchg()
> and xchg() these days, after we fixed m68k, alpha and ia64, so the
> changelog text here no longer makes sense.
> 
> Should I just remove the if() then?

I think so. It seems like the straightforward thing to do. The comment
above this cmpxchg() also explains clearly what the expectations are.
At least to me the visual hint due to the "!= ACL_NOT_CACHED" check in
the if condition doesn't provide any additional clarity.

Christian

Re: [PATCH] posix-acl: avoid -Wempty-body warning

2021-03-22 Thread Christian Brauner

On Mon, Mar 22, 2021 at 12:38:24PM +0100, Arnd Bergmann wrote:
> From: Arnd Bergmann 
> 
> The fallthrough comment for an ignored cmpxchg() return value
> produces a harmless warning with 'make W=1':
> 
> fs/posix_acl.c: In function 'get_acl':
> fs/posix_acl.c:127:36: error: suggest braces around empty body in an 'if' 
> statement [-Werror=empty-body]
>   127 | /* fall through */ ;
>   |^
> 
> Rewrite it as gcc suggests as a step towards a clean W=1 build.
> On most architectures, we could just drop the if() entirely, but
> in some cases this causes a different warning.

And you don't see the warning for the second unconditional
cmpxchg(p, sentinel, ACL_NOT_CACHED);
below?

> 
> Signed-off-by: Arnd Bergmann 
> ---

In any case that should be fine,
Reviewed-by: Christian Brauner

Re: [PATCH 1/3] posic_acl: Add a helper determine if SGID should be cleared

2021-03-20 Thread Christian Brauner

On Fri, Mar 19, 2021 at 11:42:48PM +0100, Andreas Grünbacher wrote:
> Hi,
> 
> Am Fr., 19. März 2021 um 20:58 Uhr schrieb Vivek Goyal :
> > posix_acl_update_mode() determines what's the equivalent mode and if SGID
> > needs to be cleared or not. I need to make use of this code in fuse
> > as well. Fuse will send this information to virtiofs file server and
> > file server will take care of clearing SGID if it needs to be done.
> >
> > Hence move this code in a separate helper so that more than one place
> > can call into it.
> >
> > Cc: Jan Kara 
> > Cc: Andreas Gruenbacher 
> > Cc: Alexander Viro 
> > Signed-off-by: Vivek Goyal 
> > ---
> >  fs/posix_acl.c|  3 +--
> >  include/linux/posix_acl.h | 11 +++
> >  2 files changed, 12 insertions(+), 2 deletions(-)
> >
> > diff --git a/fs/posix_acl.c b/fs/posix_acl.c
> > index f3309a7edb49..2d62494c4a5b 100644
> > --- a/fs/posix_acl.c
> > +++ b/fs/posix_acl.c
> > @@ -684,8 +684,7 @@ int posix_acl_update_mode(struct user_namespace 
> > *mnt_userns,
> > return error;
> > if (error == 0)
> > *acl = NULL;
> > -   if (!in_group_p(i_gid_into_mnt(mnt_userns, inode)) &&
> > -   !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
> > +   if (posix_acl_mode_clear_sgid(mnt_userns, inode))
> > mode &= ~S_ISGID;
> > *mode_p = mode;
> > return 0;
> > diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
> > index 307094ebb88c..073c5e546de3 100644
> > --- a/include/linux/posix_acl.h
> > +++ b/include/linux/posix_acl.h
> > @@ -59,6 +59,17 @@ posix_acl_release(struct posix_acl *acl)
> >  }
> >
> >
> > +static inline bool
> > +posix_acl_mode_clear_sgid(struct user_namespace *mnt_userns,
> > + struct inode *inode)
> > +{
> > +   if (!in_group_p(i_gid_into_mnt(mnt_userns, inode)) &&
> > +   !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
> > +   return true;
> > +
> > +   return false;
> 
> That's just
> 
> return !in_group_p(i_gid_into_mnt(mnt_userns, inode)) &&
> !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID);
> 
> The same pattern we have in posix_acl_update_mode also exists in
> setattr_copy and inode_init_owner, and almost the same pattern exists
> in setattr_prepare, so can this be cleaned up as well? The function
> also isn't POSIX ACL specific, so the function name is misleading.

Good idea but that should probably be spun into a separate patchset that
only touches the vfs parts.

Christian

Re: seccomp: Delay filter activation

2021-03-19 Thread Christian Brauner

On Thu, Mar 18, 2021 at 08:39:13PM +0000, Sargun Dhillon wrote:
> On Thu, Mar 18, 2021 at 03:54:54PM +0100, Christian Brauner wrote:
> > Sorry, I just found that mail.
> > 
> > On Mon, Mar 01, 2021 at 03:44:06PM -0800, Kees Cook wrote:
> > > On Mon, Mar 01, 2021 at 02:21:56PM +0100, Christian Brauner wrote:
> > > > On Mon, Mar 01, 2021 at 12:09:09PM +0100, Christian Brauner wrote:
> > > > > On Sat, Feb 20, 2021 at 01:31:57AM -0800, Sargun Dhillon wrote:
> > > > > > We've run into a problem where attaching a filter can be quite messy
> > > > > > business because the filter itself intercepts sendmsg, and other
> > > > > > syscalls related to exfiltrating the listener FD. I believe that 
> > > > > > this
> > > > > > problem set has been brought up before, and although there are
> > > > > > "simpler" methods of exfiltrating the listener, like clone3 or
> > > > > > pidfd_getfd, but these are still less than ideal.
> > > 
> > > I'm trying to make sure I understand: the target process would like to
> > > have a filter attached that blocks sendmsg, but that would mean it has
> > > no way to send the listener FD to its manager?
> > 
> > With pidfd_getfd() that wouldn't be a problem, I think which is what I
> > was trying to say. Unless the supervising task doesn't have enough
> > privilege over the supervised task which seems like an odd scenario but
> > is technically possible, I guess.
> > 
> > > 
> > > And you'd want to have listening working for sendmsg (otherwise you
> > > could do it with two filters, I imagine)?
> > > 
> > > > >   int fd_filter = seccomp(SECCOMP_SET_MODE_FILTER, 
> > > > > SECCOMP_FILTER_DETACHED, );
> > > > > 
> > > > >   BARRIER_WAIT_SETUP_DONE;
> > > > > 
> > > > >   int ret = seccomp(SECCOMP_ATTACH_FILTER, 0, 
> > > > > INT_TO_PTR(fd_listener));
> > > > 
> > > > This obviously should've been sm like:
> > > > 
> > > > struct seccomp_filter_attach {
> > > > union {
> > > > __s32 pidfd;
> > > > __s32 pid;
> > > > };
> > > > __u32 fd_filter;
> > > > };
> > > > 
> > > > and then
> > > > 
> > > > int ret = seccomp(SECCOMP_ATTACH_FILTER, 0, seccomp_filter_attach);
> > > 
> > > Given the difficulty with TSYNC, I'm not excited about adding an
> > > "apply this filter to another process" API. :)
> > 
> > Just to give a more complete reason for suggesting something like this
> > without trying to argue that we must have this:
> > 
> > seccomp() has so far been an API that is caller-centric and by that I
> > mean that the caller loaded its seccomp profile and sandboxed itself. As
> > such seccomp is an example of "caller-managed" security. This security
> > model has obvious advantages and fits into the general fork()-like world
> > of unix. But imho that self-management model breaks down as soon as a
> > file descriptor that can be used to refer to the object in question
> > enters into the picture. For seccomp this "breaking point" was the
> > seccomp notifier fd.
> > 
> > Because with the introduction of that fd we have introduced the concept
> > of supervisor and supervisee for seccomp which imho didn't really exist
> > in the same way before. It's pretty obvious from the type of language
> > that we now use both in userspace and in kernelspace when we talk about
> > the seccomp notifier.
> > 
> > At the current point we're somewhere in the middle between caller-managed
> > and supervised seccomp which brings up funny problems and edge-cases.
> > One of the most obvious examples is in fact the question how to get the
> > seccomp notify fd out of the supervised task. This clearly points to the
> > fact that we're missing one of the fundamentals of an fd-based
> > supervision model: open(). This is why I was suggesting the
> > SECCOMP_ATTACH_FILTER command. It's in a sense an open-call for the
> > seccomp notify fd.
> > 
> > That all being said I know that it can be weird to implement this and if
> > you prefer we go with another simpler model to work around such things
> > then I fully understand.
> > 
> > Christian
> 
> So, beyond clone3 to get pidfds being kind of awkward, how do you see this

Afaict we

Re: [PATCH] fs/dcache: fix typos and sentence disorder

2021-03-18 Thread Christian Brauner

On Thu, Mar 18, 2021 at 04:35:34PM +0000, Al Viro wrote:
> On Thu, Mar 18, 2021 at 03:00:20PM +0000, Matthew Wilcox wrote:
> > On Thu, Mar 18, 2021 at 10:31:53PM +0800, Xiaofeng Cao wrote:
> > > change 'sould' to 'should'
> > > change 'colocated' to 'collocated'
> > 
> > uh.  collocated is incorrect.  colocated is correct.
> > https://www.merriam-webster.com/dictionary/colocate
> > https://www.merriam-webster.com/dictionary/collocate
> 
> A bit more condensed variant: these two are both derived from
> con- + loco, but have different meanings -
>   colocated: occupying the same place
>   collocated: sitting next to each other
> 
> In this case it's very much the former - the point of comment is that
> the fields in question share the same memory location, but we are
> guaranteed that any dentry we find in the alias list of an inode will
> have that location used for ->i_dentry.
> 
> "co-located" would probably work better there.
> 
> PS: history of that word pair is amusing.  Both are (English) past 
> participles,
> of co-locate and collocate resp.  The former had the (Latin) prefix applied in
> English to borrowing from Latin (co-locate < locate < locatus) , the latter
> is straight borrowing (collocate < collocatus).  Incidentally, in both cases
> the borrowed form had already been a past participle (of loco and
> colloco) resp.  And colloco had the same prefix (com-/con-/co-) applied
> in Latin, with regular assimilation of -nl- to -ll-.  But at that stage
> the meaning of the verb had been closer to "put in place" than to
> "be in place", so that gave "put next to each other" instead of "share
> the place".  Shift towards "be found next to each other" happened long after
> the prefix had been applied...

(Flashback to my latin exams. The only thing that is missing is
complete confusion about nested subordinate clauses... ;))

Re: [PATCH] fs/exec: fix typos and sentence disorder

2021-03-18 Thread Christian Brauner

On Thu, Mar 18, 2021 at 11:31:45PM +0800, Xiaofeng Cao wrote:
> change 'backwords' to 'backwards'
> change 'and argument' to 'an argument'
> change 'visibile' to 'visible'
> change 'wont't' to 'won't'
> reorganize sentence
> 
> Signed-off-by: Xiaofeng Cao 
> ---

Reviewed-by: Christian Brauner

Re: seccomp: Delay filter activation

2021-03-18 Thread Christian Brauner

Sorry, I just found that mail.

On Mon, Mar 01, 2021 at 03:44:06PM -0800, Kees Cook wrote:
> On Mon, Mar 01, 2021 at 02:21:56PM +0100, Christian Brauner wrote:
> > On Mon, Mar 01, 2021 at 12:09:09PM +0100, Christian Brauner wrote:
> > > On Sat, Feb 20, 2021 at 01:31:57AM -0800, Sargun Dhillon wrote:
> > > > We've run into a problem where attaching a filter can be quite messy
> > > > business because the filter itself intercepts sendmsg, and other
> > > > syscalls related to exfiltrating the listener FD. I believe that this
> > > > problem set has been brought up before, and although there are
> > > > "simpler" methods of exfiltrating the listener, like clone3 or
> > > > pidfd_getfd, but these are still less than ideal.
> 
> I'm trying to make sure I understand: the target process would like to
> have a filter attached that blocks sendmsg, but that would mean it has
> no way to send the listener FD to its manager?

With pidfd_getfd() that wouldn't be a problem, I think which is what I
was trying to say. Unless the supervising task doesn't have enough
privilege over the supervised task which seems like an odd scenario but
is technically possible, I guess.

> 
> And you'd want to have listening working for sendmsg (otherwise you
> could do it with two filters, I imagine)?
> 
> > >   int fd_filter = seccomp(SECCOMP_SET_MODE_FILTER, 
> > > SECCOMP_FILTER_DETACHED, );
> > > 
> > >   BARRIER_WAIT_SETUP_DONE;
> > > 
> > >   int ret = seccomp(SECCOMP_ATTACH_FILTER, 0, INT_TO_PTR(fd_listener));
> > 
> > This obviously should've been sm like:
> > 
> > struct seccomp_filter_attach {
> > union {
> > __s32 pidfd;
> > __s32 pid;
> > };
> > __u32 fd_filter;
> > };
> > 
> > and then
> > 
> > int ret = seccomp(SECCOMP_ATTACH_FILTER, 0, seccomp_filter_attach);
> 
> Given the difficulty with TSYNC, I'm not excited about adding an
> "apply this filter to another process" API. :)

Just to give a more complete reason for suggesting something like this
without trying to argue that we must have this:

seccomp() has so far been an API that is caller-centric and by that I
mean that the caller loaded its seccomp profile and sandboxed itself. As
such seccomp is an example of "caller-managed" security. This security
model has obvious advantages and fits into the general fork()-like world
of unix. But imho that self-management model breaks down as soon as a
file descriptor that can be used to refer to the object in question
enters into the picture. For seccomp this "breaking point" was the
seccomp notifier fd.

Because with the introduction of that fd we have introduced the concept
of supervisor and supervisee for seccomp which imho didn't really exist
in the same way before. It's pretty obvious from the type of language
that we now use both in userspace and in kernelspace when we talk about
the seccomp notifier.

At the current point we're somewhere in the middle between caller-managed
and supervised seccomp which brings up funny problems and edge-cases.
One of the most obvious examples is in fact the question how to get the
seccomp notify fd out of the supervised task. This clearly points to the
fact that we're missing one of the fundamentals of an fd-based
supervision model: open(). This is why I was suggesting the
SECCOMP_ATTACH_FILTER command. It's in a sense an open-call for the
seccomp notify fd.

That all being said I know that it can be weird to implement this and if
you prefer we go with another simpler model to work around such things
then I fully understand.

Christian

Re: [PATCH] proc: fix incorrect pde_is_permanent check

2021-03-18 Thread Christian Brauner

On Thu, Mar 18, 2021 at 12:26:33PM +0000, Colin King wrote:
> From: Colin Ian King 
> 
> Currently the pde_is_permanent check is being run on root multiple times
> rather than on the next proc directory entry. This looks like a copy-paste
> error.  Fix this by replacing root with next.
> 
> Addresses-Coverity: ("Copy-paste error")
> Fixes: d919b33dafb3 ("proc: faster open/read/close with "permanent" files")
> Signed-off-by: Colin Ian King 
> ---

Thanks! Seems very much like it.
Acked-by: Christian Brauner

Re: [PATCH 1/2] audit: add support for the openat2 syscall

2021-03-18 Thread Christian Brauner

On Thu, Mar 18, 2021 at 11:48:45AM +0100, Christian Brauner wrote:
> [+Cc Aleksa, the author of openat2()]
> 
> and a comment below. :)
> 
> On Wed, Mar 17, 2021 at 09:47:17PM -0400, Richard Guy Briggs wrote:
> > The openat2(2) syscall was added in kernel v5.6 with commit fddb5d430ad9
> > ("open: introduce openat2(2) syscall")
> > 
> > Add the openat2(2) syscall to the audit syscall classifier.
> > 
> > See the github issue
> > https://github.com/linux-audit/audit-kernel/issues/67
> > 
> > Signed-off-by: Richard Guy Briggs 
> > ---
> >  arch/alpha/kernel/audit.c  | 2 ++
> >  arch/ia64/kernel/audit.c   | 2 ++
> >  arch/parisc/kernel/audit.c | 2 ++
> >  arch/parisc/kernel/compat_audit.c  | 2 ++
> >  arch/powerpc/kernel/audit.c| 2 ++
> >  arch/powerpc/kernel/compat_audit.c | 2 ++
> >  arch/s390/kernel/audit.c   | 2 ++
> >  arch/s390/kernel/compat_audit.c| 2 ++
> >  arch/sparc/kernel/audit.c  | 2 ++
> >  arch/sparc/kernel/compat_audit.c   | 2 ++
> >  arch/x86/ia32/audit.c  | 2 ++
> >  arch/x86/kernel/audit_64.c | 2 ++
> >  kernel/auditsc.c   | 3 +++
> >  lib/audit.c| 4 
> >  lib/compat_audit.c | 4 
> >  15 files changed, 35 insertions(+)
> > 
> > diff --git a/arch/alpha/kernel/audit.c b/arch/alpha/kernel/audit.c
> > index 96a9d18ff4c4..06a911b685d1 100644
> > --- a/arch/alpha/kernel/audit.c
> > +++ b/arch/alpha/kernel/audit.c
> > @@ -42,6 +42,8 @@ int audit_classify_syscall(int abi, unsigned syscall)
> > return 3;
> > case __NR_execve:
> > return 5;
> > +   case __NR_openat2:
> > +   return 6;
> > default:
> > return 0;
> > }
> > diff --git a/arch/ia64/kernel/audit.c b/arch/ia64/kernel/audit.c
> > index 5192ca899fe6..5eaa888c8fd3 100644
> > --- a/arch/ia64/kernel/audit.c
> > +++ b/arch/ia64/kernel/audit.c
> > @@ -43,6 +43,8 @@ int audit_classify_syscall(int abi, unsigned syscall)
> > return 3;
> > case __NR_execve:
> > return 5;
> > +   case __NR_openat2:
> > +   return 6;
> > default:
> > return 0;
> > }
> > diff --git a/arch/parisc/kernel/audit.c b/arch/parisc/kernel/audit.c
> > index 9eb47b2225d2..fc721a7727ba 100644
> > --- a/arch/parisc/kernel/audit.c
> > +++ b/arch/parisc/kernel/audit.c
> > @@ -52,6 +52,8 @@ int audit_classify_syscall(int abi, unsigned syscall)
> > return 3;
> > case __NR_execve:
> > return 5;
> > +   case __NR_openat2:
> > +   return 6;
> > default:
> > return 0;
> > }
> > diff --git a/arch/parisc/kernel/compat_audit.c 
> > b/arch/parisc/kernel/compat_audit.c
> > index 20c39c9d86a9..fc6d35918c44 100644
> > --- a/arch/parisc/kernel/compat_audit.c
> > +++ b/arch/parisc/kernel/compat_audit.c
> > @@ -35,6 +35,8 @@ int parisc32_classify_syscall(unsigned syscall)
> > return 3;
> > case __NR_execve:
> > return 5;
> > +   case __NR_openat2:
> > +   return 6;
> > default:
> > return 1;
> > }
> > diff --git a/arch/powerpc/kernel/audit.c b/arch/powerpc/kernel/audit.c
> > index a27f3d09..8f32700b0baa 100644
> > --- a/arch/powerpc/kernel/audit.c
> > +++ b/arch/powerpc/kernel/audit.c
> > @@ -54,6 +54,8 @@ int audit_classify_syscall(int abi, unsigned syscall)
> > return 4;
> > case __NR_execve:
> > return 5;
> > +   case __NR_openat2:
> > +   return 6;
> > default:
> > return 0;
> > }
> > diff --git a/arch/powerpc/kernel/compat_audit.c 
> > b/arch/powerpc/kernel/compat_audit.c
> > index 55c6ccda0a85..ebe45534b1c9 100644
> > --- a/arch/powerpc/kernel/compat_audit.c
> > +++ b/arch/powerpc/kernel/compat_audit.c
> > @@ -38,6 +38,8 @@ int ppc32_classify_syscall(unsigned syscall)
> > return 4;
> > case __NR_execve:
> > return 5;
> > +   case __NR_openat2:
> > +   return 6;
> > default:
> > return 1;
> > }
> > diff --git a/arch/s390/kernel/audit.c b/arch/s390/kernel/audit.c
> > index d395c6c9944c..d964cb94cfaf 100644
> > --- a/arch/s390/kernel/audit.c
> > +++ b/arch/s390/kernel/audit.c
> > @@ -54,6 +54,8 @@ int audit_classify_syscall(int abi, unsigned sysc

Re: [PATCH 0/2] audit: add support for openat2

2021-03-18 Thread Christian Brauner

On Wed, Mar 17, 2021 at 09:47:16PM -0400, Richard Guy Briggs wrote:
> The openat2(2) syscall was added in v5.6.  Add support for openat2 to the
> audit syscall classifier and for recording openat2 parameters that cannot
> be captured in the syscall parameters of the SYSCALL record.
> 
> Supporting userspace code can be found in
> https://github.com/rgbriggs/audit-userspace/tree/ghau-openat2
> 
> Supporting test case can be found in
> https://github.com/linux-audit/audit-testsuite/pull/103

Seems sensible, thank you.

Re: [PATCH 1/2] audit: add support for the openat2 syscall

2021-03-18 Thread Christian Brauner

[+Cc Aleksa, the author of openat2()]

and a comment below. :)

On Wed, Mar 17, 2021 at 09:47:17PM -0400, Richard Guy Briggs wrote:
> The openat2(2) syscall was added in kernel v5.6 with commit fddb5d430ad9
> ("open: introduce openat2(2) syscall")
> 
> Add the openat2(2) syscall to the audit syscall classifier.
> 
> See the github issue
> https://github.com/linux-audit/audit-kernel/issues/67
> 
> Signed-off-by: Richard Guy Briggs 
> ---
>  arch/alpha/kernel/audit.c  | 2 ++
>  arch/ia64/kernel/audit.c   | 2 ++
>  arch/parisc/kernel/audit.c | 2 ++
>  arch/parisc/kernel/compat_audit.c  | 2 ++
>  arch/powerpc/kernel/audit.c| 2 ++
>  arch/powerpc/kernel/compat_audit.c | 2 ++
>  arch/s390/kernel/audit.c   | 2 ++
>  arch/s390/kernel/compat_audit.c| 2 ++
>  arch/sparc/kernel/audit.c  | 2 ++
>  arch/sparc/kernel/compat_audit.c   | 2 ++
>  arch/x86/ia32/audit.c  | 2 ++
>  arch/x86/kernel/audit_64.c | 2 ++
>  kernel/auditsc.c   | 3 +++
>  lib/audit.c| 4 
>  lib/compat_audit.c | 4 
>  15 files changed, 35 insertions(+)
> 
> diff --git a/arch/alpha/kernel/audit.c b/arch/alpha/kernel/audit.c
> index 96a9d18ff4c4..06a911b685d1 100644
> --- a/arch/alpha/kernel/audit.c
> +++ b/arch/alpha/kernel/audit.c
> @@ -42,6 +42,8 @@ int audit_classify_syscall(int abi, unsigned syscall)
>   return 3;
>   case __NR_execve:
>   return 5;
> + case __NR_openat2:
> + return 6;
>   default:
>   return 0;
>   }
> diff --git a/arch/ia64/kernel/audit.c b/arch/ia64/kernel/audit.c
> index 5192ca899fe6..5eaa888c8fd3 100644
> --- a/arch/ia64/kernel/audit.c
> +++ b/arch/ia64/kernel/audit.c
> @@ -43,6 +43,8 @@ int audit_classify_syscall(int abi, unsigned syscall)
>   return 3;
>   case __NR_execve:
>   return 5;
> + case __NR_openat2:
> + return 6;
>   default:
>   return 0;
>   }
> diff --git a/arch/parisc/kernel/audit.c b/arch/parisc/kernel/audit.c
> index 9eb47b2225d2..fc721a7727ba 100644
> --- a/arch/parisc/kernel/audit.c
> +++ b/arch/parisc/kernel/audit.c
> @@ -52,6 +52,8 @@ int audit_classify_syscall(int abi, unsigned syscall)
>   return 3;
>   case __NR_execve:
>   return 5;
> + case __NR_openat2:
> + return 6;
>   default:
>   return 0;
>   }
> diff --git a/arch/parisc/kernel/compat_audit.c 
> b/arch/parisc/kernel/compat_audit.c
> index 20c39c9d86a9..fc6d35918c44 100644
> --- a/arch/parisc/kernel/compat_audit.c
> +++ b/arch/parisc/kernel/compat_audit.c
> @@ -35,6 +35,8 @@ int parisc32_classify_syscall(unsigned syscall)
>   return 3;
>   case __NR_execve:
>   return 5;
> + case __NR_openat2:
> + return 6;
>   default:
>   return 1;
>   }
> diff --git a/arch/powerpc/kernel/audit.c b/arch/powerpc/kernel/audit.c
> index a27f3d09..8f32700b0baa 100644
> --- a/arch/powerpc/kernel/audit.c
> +++ b/arch/powerpc/kernel/audit.c
> @@ -54,6 +54,8 @@ int audit_classify_syscall(int abi, unsigned syscall)
>   return 4;
>   case __NR_execve:
>   return 5;
> + case __NR_openat2:
> + return 6;
>   default:
>   return 0;
>   }
> diff --git a/arch/powerpc/kernel/compat_audit.c 
> b/arch/powerpc/kernel/compat_audit.c
> index 55c6ccda0a85..ebe45534b1c9 100644
> --- a/arch/powerpc/kernel/compat_audit.c
> +++ b/arch/powerpc/kernel/compat_audit.c
> @@ -38,6 +38,8 @@ int ppc32_classify_syscall(unsigned syscall)
>   return 4;
>   case __NR_execve:
>   return 5;
> + case __NR_openat2:
> + return 6;
>   default:
>   return 1;
>   }
> diff --git a/arch/s390/kernel/audit.c b/arch/s390/kernel/audit.c
> index d395c6c9944c..d964cb94cfaf 100644
> --- a/arch/s390/kernel/audit.c
> +++ b/arch/s390/kernel/audit.c
> @@ -54,6 +54,8 @@ int audit_classify_syscall(int abi, unsigned syscall)
>   return 4;
>   case __NR_execve:
>   return 5;
> + case __NR_openat2:
> + return 6;
>   default:
>   return 0;
>   }
> diff --git a/arch/s390/kernel/compat_audit.c b/arch/s390/kernel/compat_audit.c
> index 444fb1f66944..f7b32933ce0e 100644
> --- a/arch/s390/kernel/compat_audit.c
> +++ b/arch/s390/kernel/compat_audit.c
> @@ -39,6 +39,8 @@ int s390_classify_syscall(unsigned syscall)
>   return 4;
>   case __NR_execve:
>   return 5;
> + case __NR_openat2:
> + return 6;
>   default:
>   return 1;
>   }
> diff --git a/arch/sparc/kernel/audit.c b/arch/sparc/kernel/audit.c
> index a6e91bf34d48..b6dcca9c6520 100644
> --- a/arch/sparc/kernel/audit.c
> +++ b/arch/sparc/kernel/audit.c
> @@ -55,6 +55,8 @@ int audit_classify_syscall(int abi, unsigned int syscall)
>

Re: [PATCH v3 1/3] binder: BINDER_FREEZE ioctl

2021-03-17 Thread Christian Brauner

On Mon, Mar 15, 2021 at 06:16:28PM -0700, Li Li wrote:
> From: Marco Ballesio 
> 
> Frozen tasks can't process binder transactions, so a way is required to
> inform transmitting ends of communication failures due to the frozen
> state of their receiving counterparts. Additionally, races are possible
> between transitions to frozen state and binder transactions enqueued to
> a specific process.
> 
> Implement BINDER_FREEZE ioctl for user space to inform the binder driver
> about the intention to freeze or unfreeze a process. When the ioctl is
> called, block the caller until any pending binder transactions toward
> the target process are flushed. Return an error to transactions to
> processes marked as frozen.
> 
> Signed-off-by: Marco Ballesio 
> Co-developed-by: Todd Kjos 
> Signed-off-by: Todd Kjos 
> Signed-off-by: Li Li 
> ---
>  drivers/android/binder.c| 139 ++--
>  drivers/android/binder_internal.h   |  12 +++
>  include/uapi/linux/android/binder.h |  13 +++
>  3 files changed, 154 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/android/binder.c b/drivers/android/binder.c
> index c119736ca56a..b93ca53bb90f 100644
> --- a/drivers/android/binder.c
> +++ b/drivers/android/binder.c
> @@ -1506,6 +1506,12 @@ static void binder_free_transaction(struct 
> binder_transaction *t)
>  
>   if (target_proc) {
>   binder_inner_proc_lock(target_proc);
> + target_proc->outstanding_txns--;
> + if (target_proc->outstanding_txns < 0)
> + pr_warn("%s: Unexpected outstanding_txns %d\n",
> + __func__, target_proc->outstanding_txns);
> + if (!target_proc->outstanding_txns && target_proc->is_frozen)
> + wake_up_interruptible_all(&target_proc->freeze_wait);
>   if (t->buffer)
>   t->buffer->transaction = NULL;
>   binder_inner_proc_unlock(target_proc);
> @@ -2331,10 +2337,11 @@ static int binder_fixup_parent(struct 
> binder_transaction *t,
>   * If the @thread parameter is not NULL, the transaction is always queued
>   * to the waitlist of that specific thread.
>   *
> - * Return:   true if the transactions was successfully queued
> - *   false if the target process or thread is dead
> + * Return:   0 if the transaction was successfully queued
> + *   BR_DEAD_REPLY if the target process or thread is dead
> + *   BR_FROZEN_REPLY if the target process or thread is frozen
>   */
> -static bool binder_proc_transaction(struct binder_transaction *t,
> +static int binder_proc_transaction(struct binder_transaction *t,
>   struct binder_proc *proc,
>   struct binder_thread *thread)
>  {
> @@ -2354,10 +2361,11 @@ static bool binder_proc_transaction(struct 
> binder_transaction *t,
>  
>   binder_inner_proc_lock(proc);
>  
> - if (proc->is_dead || (thread && thread->is_dead)) {
> + if ((proc->is_frozen && !oneway) || proc->is_dead ||
> + (thread && thread->is_dead)) {
>   binder_inner_proc_unlock(proc);
>   binder_node_unlock(node);
> - return false;
> + return proc->is_frozen ? BR_FROZEN_REPLY : BR_DEAD_REPLY;
>   }
>  
>   if (!thread && !pending_async)
> @@ -2373,10 +2381,11 @@ static bool binder_proc_transaction(struct 
> binder_transaction *t,
>   if (!pending_async)
>   binder_wakeup_thread_ilocked(proc, thread, !oneway /* sync */);
>  
> + proc->outstanding_txns++;
>   binder_inner_proc_unlock(proc);
>   binder_node_unlock(node);
>  
> - return true;
> + return 0;
>  }
>  
>  /**
> @@ -3013,13 +3022,16 @@ static void binder_transaction(struct binder_proc 
> *proc,
>   if (reply) {
>   binder_enqueue_thread_work(thread, tcomplete);
>   binder_inner_proc_lock(target_proc);
> - if (target_thread->is_dead) {
> + if (target_thread->is_dead || target_proc->is_frozen) {
> + return_error = target_thread->is_dead ?
> + BR_DEAD_REPLY : BR_FROZEN_REPLY;
>   binder_inner_proc_unlock(target_proc);
>   goto err_dead_proc_or_thread;
>   }
>   BUG_ON(t->buffer->async_transaction != 0);
>   binder_pop_transaction_ilocked(target_thread, in_reply_to);
>   binder_enqueue_thread_work_ilocked(target_thread, &t->work);
> + target_proc->outstanding_txns++;
>   binder_inner_proc_unlock(target_proc);
>   wake_up_interruptible_sync(&target_thread->wait);
>   binder_free_transaction(in_reply_to);
> @@ -3038,7 +3050,9 @@ static void binder_transaction(struct binder_proc *proc,
>   t->from_parent = thread->transaction_stack;
>   thread->transaction_stack = t;
>

Re: [PATCH v3 0/3] Binder: Enable App Freezing Capability

2021-03-17 Thread Christian Brauner

On Mon, Mar 15, 2021 at 06:16:27PM -0700, Li Li wrote:
> From: Li Li 
> 
> To improve the user experience when switching between recently used
> applications, the background applications which are not currently needed
> are cached in the memory. Normally, a well designed application will not
> consume valuable CPU resources in the background. However, it's possible
> some applications are not able or willing to behave as expected, wasting
> energy even after being cached.
> 
> It is a good idea to freeze those applications when they're only being
> kept alive for the sake of faster startup and energy saving. These kernel
> patches will provide the necessary infrastructure for user space framework
> to freeze and thaw a cached process, check the current freezing status and
> correctly deal with outstanding binder transactions to frozen processes.
> 
> Changes in v2: avoid panic by using pr_warn for unexpected cases.
> Changes in v3: improved errcode logic in binder_proc_transaction().
> 
> Marco Ballesio (3):
>   binder: BINDER_FREEZE ioctl
>   binder: use EINTR for interrupted wait for work
>   binder: BINDER_GET_FROZEN_INFO ioctl
> 
>  drivers/android/binder.c| 198 ++--
>  drivers/android/binder_internal.h   |  18 +++
>  include/uapi/linux/android/binder.h |  20 +++
>  3 files changed, 224 insertions(+), 12 deletions(-)

[+Cc Jann]

Christian

Re: [PATCH] kernel:fork: Fix typo issue

2021-03-17 Thread Christian Brauner

On Wed, Mar 17, 2021 at 04:20:31PM +0800, Xiaofeng Cao wrote:
> change 'ancestoral' to 'ancestral'
> change 'reuseable' to 'reusable'
> delete 'do' grammatically
> 
> Signed-off-by: Xiaofeng Cao 
> ---

Thanks!
Reviewed-by: Christian Brauner

Re: [PATCH] kernel:signal: Fix typo issue

2021-03-17 Thread Christian Brauner

On Wed, Mar 17, 2021 at 04:46:52PM +0800, Xiaofeng Cao wrote:
> change 'situration' to 'situation'
> change 'delievered' to 'delivered'
> change 'overriden' to 'overridden'
> 
> Signed-off-by: Xiaofeng Cao 
> ---

Thanks! (Self-detected or through some tool?)
Reviewed-by: Christian Brauner

[GIT PULL] detached mounts fix

2021-03-10 Thread Christian Brauner

08 15:18:43 +0100)

Please consider pulling these changes from the signed for-linus-2021-03-10 tag.

Thanks!
Christian


for-linus-2021-03-10

--------
Christian Brauner (1):
  mount: fix mounting of detached mounts onto targets that reside on shared 
mounts

 fs/pnode.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

[1]:
/* SPDX-License-Identifier: LGPL-2.1+ */

#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 

/*
 * open_tree() fallbacks.
 *
 * Each definition below is only supplied when the headers in use have not
 * already provided it: the two OPEN_TREE_* flag constants and the
 * per-architecture system call number.
 */
#if !defined(OPEN_TREE_CLONE)
# define OPEN_TREE_CLONE 1
#endif

#if !defined(OPEN_TREE_CLOEXEC)
# define OPEN_TREE_CLOEXEC O_CLOEXEC
#endif

#if !defined(__NR_open_tree)
# if defined(__alpha__)
#  define __NR_open_tree 538
# elif defined(_MIPS_SIM)
#  if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
#   define __NR_open_tree 4428
#  endif
#  if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */
#   define __NR_open_tree 6428
#  endif
#  if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
#   define __NR_open_tree 5428
#  endif
# elif defined(__ia64__)
#  define __NR_open_tree (428 + 1024)
# else
#  define __NR_open_tree 428
# endif
#endif

/*
 * move_mount() fallbacks.
 *
 * Each definition below is only supplied when the headers in use have not
 * already provided it: the MOVE_MOUNT_F_EMPTY_PATH flag and the
 * per-architecture system call number.
 */
#ifndef MOVE_MOUNT_F_EMPTY_PATH
#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */
#endif

#ifndef __NR_move_mount
#if defined __alpha__
#define __NR_move_mount 539
#elif defined _MIPS_SIM
#if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
#define __NR_move_mount 4429
#endif
#if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */
#define __NR_move_mount 6429
#endif
#if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
#define __NR_move_mount 5429
#endif
#elif defined __ia64__
/*
 * Every other branch here uses open_tree's number plus one; the ia64 value
 * was previously (428 + 1024), which is the open_tree() number.
 */
#define __NR_move_mount (429 + 1024)
#else
#define __NR_move_mount 429
#endif
#endif

/*
 * Invoke the open_tree() system call directly through syscall().
 *
 * dfd, filename and flags are handed to the kernel unchanged; the value
 * produced by syscall() is returned to the caller as an int.
 */
static inline int sys_open_tree(int dfd, const char *filename,
				unsigned int flags)
{
	long ret = syscall(__NR_open_tree, dfd, filename, flags);

	return ret;
}

/*
 * Invoke the move_mount() system call directly through syscall().
 *
 * All five arguments are handed to the kernel unchanged; the value produced
 * by syscall() is returned to the caller as an int.
 */
static inline int sys_move_mount(int from_dfd, const char *from_pathname,
				 int to_dfd, const char *to_pathname,
				 unsigned int flags)
{
	long ret = syscall(__NR_move_mount, from_dfd, from_pathname,
			   to_dfd, to_pathname, flags);

	return ret;
}

/*
 * Report whether the mount at @path is listed as shared.
 *
 * Scans /proc/self/mountinfo line by line. For each line the fifth
 * space-separated field (the mount point) is compared against @path; on a
 * match, the text following the next field is searched for a "shared:" tag.
 *
 * Returns true as soon as a matching entry with a "shared:" tag is found.
 * Returns false if no such entry exists or if mountinfo cannot be opened.
 */
static bool is_shared_mountpoint(const char *path)
{
	bool shared = false;
	FILE *f = NULL;
	char *line = NULL;
	int i;
	size_t len = 0;

	f = fopen("/proc/self/mountinfo", "re");
	if (!f)
		return false;

	/* getline() grows the buffer in line as needed; it is freed below. */
	while (getline(&line, &len, f) > 0) {
		char *slider1, *slider2;

		/* Advance to the fourth space: the mount point starts after it. */
		for (slider1 = line, i = 0; slider1 && i < 4; i++)
			slider1 = strchr(slider1 + 1, ' ');

		if (!slider1)
			continue;

		/* The space that ends the mount point field. */
		slider2 = strchr(slider1 + 1, ' ');
		if (!slider2)
			continue;

		/* Terminate the mount point in place so it can be compared. */
		*slider2 = '\0';
		if (strcmp(slider1 + 1, path) == 0) {
			/* This is the path. Is it shared? */
			slider1 = strchr(slider2 + 1, ' ');
			if (slider1 && strstr(slider1, "shared:")) {
				shared = true;
				break;
			}
		}
	}
	fclose(f);
	free(line);

	return shared;
}

/*
 * Write the usage synopsis to stderr, then terminate the process
 * immediately through _exit() with status EXIT_SUCCESS.
 */
static void usage(void)
{
	static const char synopsis[] = "mount-new [--recursive] \n";

	fputs(synopsis, stderr);
	_exit(EXIT_SUCCESS);
}

/*
 * Print a printf-style message followed by a newline to stderr, then call
 * usage(). Written as a statement expression with ##__VA_ARGS__, so it
 * also accepts a format string with no further arguments.
 */
#define exit_usage(format, ...)  \
({   \
fprintf(stderr, format "\n", ##__VA_ARGS__); \
usage(); \
})

/*
 * Print a printf-style message followed by a newline to stderr, then
 * terminate the process with exit(EXIT_FAILURE). Written as a statement
 * expression with ##__VA_ARGS__, so it also accepts a format string with no
 * further arguments.
 */
#define exit_log(format, ...)\
({   \
fprintf(stderr, format "\n", ##__VA_ARGS__); \
exit(EXIT_FAILURE);  \
})

/*
 * Long-option table: a single "help" entry that takes no argument and maps
 * to the value 'a', followed by the all-zero terminating entry.
 */
static const struct option longopts[] = {
	{ .name = "help", .has_arg = no_argument, .flag = NULL, .val = 'a' },
	{ .name = NULL,   .has_arg = no_argument, .flag = NULL, .val = 0   },
};

int main(int argc, char *argv[])
{
int exit_code = EXIT_SUCCESS, index = 0;
int dfd, fd_tree, new_argc, ret;
char

Re: [PATCH 2/9] fs: add an argument-less alloc_anon_inode

2021-03-10 Thread Christian Brauner

On Tue, Mar 09, 2021 at 04:53:41PM +0100, Christoph Hellwig wrote:
> Add a new alloc_anon_inode helper that allocates an inode on
> the anon_inode file system.
> 
> Signed-off-by: Christoph Hellwig 
> ---

Looks good!
Reviewed-by: Christian Brauner

Re: [PATCH 1/9] fs: rename alloc_anon_inode to alloc_anon_inode_sb

2021-03-10 Thread Christian Brauner

On Tue, Mar 09, 2021 at 04:53:40PM +0100, Christoph Hellwig wrote:
> Rename alloc_inode to free the name for a new variant that does not
> need boilerplate to create a super_block first.
> 
> Signed-off-by: Christoph Hellwig 
> ---

Looks good (with the mentioned fix in
https://lore.kernel.org/lkml/20210310083040.ga5...@lst.de)

Reviewed-by: Christian Brauner 

>  arch/powerpc/platforms/pseries/cmm.c | 2 +-
>  drivers/dma-buf/dma-buf.c| 2 +-
>  drivers/gpu/drm/drm_drv.c| 2 +-
>  drivers/misc/cxl/api.c   | 2 +-
>  drivers/misc/vmw_balloon.c   | 2 +-
>  drivers/scsi/cxlflash/ocxl_hw.c  | 2 +-
>  drivers/virtio/virtio_balloon.c  | 2 +-
>  fs/aio.c | 2 +-
>  fs/anon_inodes.c | 4 ++--
>  fs/libfs.c   | 2 +-
>  include/linux/fs.h   | 2 +-
>  kernel/resource.c| 2 +-
>  mm/z3fold.c  | 2 +-
>  mm/zsmalloc.c| 2 +-
>  14 files changed, 15 insertions(+), 15 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/cmm.c 
> b/arch/powerpc/platforms/pseries/cmm.c
> index 45a3a3022a85c9..6d36b858b14df1 100644
> --- a/arch/powerpc/platforms/pseries/cmm.c
> +++ b/arch/powerpc/platforms/pseries/cmm.c
> @@ -580,7 +580,7 @@ static int cmm_balloon_compaction_init(void)
>   return rc;
>   }
>  
> - b_dev_info.inode = alloc_anon_inode(balloon_mnt->mnt_sb);
> + b_dev_info.inode = alloc_anon_inode_sb(balloon_mnt->mnt_sb);
>   if (IS_ERR(b_dev_info.inode)) {
>   rc = PTR_ERR(b_dev_info.inode);
>   b_dev_info.inode = NULL;
> diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
> index f264b70c383eb4..dedcc9483352dc 100644
> --- a/drivers/dma-buf/dma-buf.c
> +++ b/drivers/dma-buf/dma-buf.c
> @@ -445,7 +445,7 @@ static inline int is_dma_buf_file(struct file *file)
>  static struct file *dma_buf_getfile(struct dma_buf *dmabuf, int flags)
>  {
>   struct file *file;
> - struct inode *inode = alloc_anon_inode(dma_buf_mnt->mnt_sb);
> + struct inode *inode = alloc_anon_inode_sb(dma_buf_mnt->mnt_sb);
>  
>   if (IS_ERR(inode))
>   return ERR_CAST(inode);
> diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
> index 20d22e41d7ce74..87e7214a8e3565 100644
> --- a/drivers/gpu/drm/drm_drv.c
> +++ b/drivers/gpu/drm/drm_drv.c
> @@ -519,7 +519,7 @@ static struct inode *drm_fs_inode_new(void)
>   return ERR_PTR(r);
>   }
>  
> - inode = alloc_anon_inode(drm_fs_mnt->mnt_sb);
> + inode = alloc_anon_inode_sb(drm_fs_mnt->mnt_sb);
>   if (IS_ERR(inode))
>   simple_release_fs(_fs_mnt, _fs_cnt);
>  
> diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c
> index b493de962153ba..2efbf6c98028ef 100644
> --- a/drivers/misc/cxl/api.c
> +++ b/drivers/misc/cxl/api.c
> @@ -73,7 +73,7 @@ static struct file *cxl_getfile(const char *name,
>   goto err_module;
>   }
>  
> - inode = alloc_anon_inode(cxl_vfs_mount->mnt_sb);
> + inode = alloc_anon_inode_sb(cxl_vfs_mount->mnt_sb);
>   if (IS_ERR(inode)) {
>   file = ERR_CAST(inode);
>   goto err_fs;
> diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c
> index b837e7eba5f7dc..5d057a05ddbee8 100644
> --- a/drivers/misc/vmw_balloon.c
> +++ b/drivers/misc/vmw_balloon.c
> @@ -1900,7 +1900,7 @@ static __init int vmballoon_compaction_init(struct 
> vmballoon *b)
>   return PTR_ERR(vmballoon_mnt);
>  
>   b->b_dev_info.migratepage = vmballoon_migratepage;
> - b->b_dev_info.inode = alloc_anon_inode(vmballoon_mnt->mnt_sb);
> + b->b_dev_info.inode = alloc_anon_inode_sb(vmballoon_mnt->mnt_sb);
>  
>   if (IS_ERR(b->b_dev_info.inode))
>   return PTR_ERR(b->b_dev_info.inode);
> diff --git a/drivers/scsi/cxlflash/ocxl_hw.c b/drivers/scsi/cxlflash/ocxl_hw.c
> index 244fc27215dc79..40184ed926b557 100644
> --- a/drivers/scsi/cxlflash/ocxl_hw.c
> +++ b/drivers/scsi/cxlflash/ocxl_hw.c
> @@ -88,7 +88,7 @@ static struct file *ocxlflash_getfile(struct device *dev, 
> const char *name,
>   goto err2;
>   }
>  
> - inode = alloc_anon_inode(ocxlflash_vfs_mount->mnt_sb);
> + inode = alloc_anon_inode_sb(ocxlflash_vfs_mount->mnt_sb);
>   if (IS_ERR(inode)) {
>   rc = PTR_ERR(inode);
>   dev_err(dev, "%s: alloc_anon_inode failed rc=%d\n",
> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
> index 8985fc2cea8615..cae76ee5bdd688 1006

Re: [GIT PULL] idmapped mounts for v5.12

2021-03-03 Thread Christian Brauner

On Wed, Mar 03, 2021 at 01:36:31PM -0600, Eric W. Biederman wrote:
> Christian Brauner  writes:
> 
> > Hi Linus,

Hi Eric,

> 
> > This series comes with an extensive xfstests suite covering both ext4 and 
> > xfs
> > https://git.kernel.org/brauner/xfstests-dev/h/idmapped_mounts
> > It covers truncation, creation, opening, xattrs, vfscaps, setid execution,
> > setgid inheritance and more both with idmapped and non-idmapped mounts.
> > It already helped to discover an unrelated xfs setgid inheritance bug which 
> > has
> > since been fixed in mainline. It will be sent for inclusion with the 
> > xfstests
> > project should you decide to merge this.
> 
> And yet chown is broken (details below), and in a very predictable way.

This is incredibly poor timing, there were 6 versions of this patchset
published over several months and you were Cced on all of them. All that
came from you during that time were a couple of odd comments.

If chown is broken please give us a specific reproducer for when it
yields the wrong ownership so we can fix it and add this as a test-case
to the testsuite so it never breaks again.

> 
> This is not considering that the entire concept is giving people a
> loaded footgun, that is very difficult to use safely.

The concept has seen a lot of interest by a lot of users during
development of this patchset and is already being integrated in
container runtimes and other tools by people who understand its behavior
and implication.
And fwiw, by this argument we simply should have never done user
namespaces or unprivileged filesystem mounts too.

> 
> 
> When the user namespace was implemented the two kinds of uids were very
> carefully separated from each other by type, so it would be take
> deliberate action to mix them.  These changes introduces a third type
> of uid and does not use the type system to keep them separate.  In just

I don't agree. This causes more confusion than it solves imho and is the
whole basis for your argument. This reads a bit confusing to me.

> a little bit of looking since I realized this problem I have found two
> bugs in chown where the wrong values are compared.
> 
> We now have the following types of uids and gids:
> - The userspace values.
> - The kernel values that are used for comparisons.
>   (The old fashioned kuid_t and kgid_t)
> - The values used for interfacing with the filesystems
>   underneath a mount.
>   (The beneath mount kuid_t and kgid_t)

I don't see why we would need yet another type for this. It is simply a
remapped or shifted kuid. A third type would introduce more confusion
most likely but I'm open to it if you have a clear idea what you want
and why you want it. It slightly feels like a strawman distinction to
push for the revert.

> - The values stored in the filesystem.
> 
> The third type is new, and the code mixes old fashioned kuid_t and
> kgid_t with the below mount kuid_t and kgid_t.
> 
> Starting with chown_common the code does:
> 
> int chown_common(const struct path *path, uid_t user, gid_t group)
> {
>   ...
>   uid = make_kuid(current_user_ns(), user);
>   gid = make_kgid(current_user_ns(), group);
> 
>   mnt_userns = mnt_user_ns(path->mnt);
>   uid = kuid_from_mnt(mnt_userns, uid);
>   gid = kgid_from_mnt(mnt_userns, gid);
> 
> retry_deleg:
>   newattrs.ia_valid =  ATTR_CTIME;
>   if (user != (uid_t) -1) {
>   if (!uid_valid(uid))
>   return -EINVAL;
>   newattrs.ia_valid |= ATTR_UID;
>   newattrs.ia_uid = uid;
>   }
>   if (group != (gid_t) -1) {
>   if (!gid_valid(gid))
>   return -EINVAL;
>   newattrs.ia_valid |= ATTR_GID;
>   newattrs.ia_gid = gid;
>   }
>   if (!S_ISDIR(inode->i_mode))
>   newattrs.ia_valid |=
>   ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
>   inode_lock(inode);
>   error = security_path_chown(path, uid, gid);
>   if (!error)
>   error = notify_change(mnt_userns, path->dentry, ,
> _inode);
>   inode_unlock(inode);
>   ...
> }
> 
> Here security_path_chown is expecting the old fashioned kuid_t and
> kgid_t but looking at the top of the function we can see that
> security_path_chown is getting the kuid_t and kgid_t from below the
> mount.

As it should. The idmapping of the mount is authoritative. This attack
only makes sense if you assume that there is a third type.

> 
> The Tomoyo lsm cares.

Please explain or send a patch to fix Tomoyo.

> 
> 
> Notice that ia_uid and ia_gid in struct newattrs are below mount values.
> 
> 
&

Re: [PATCH v6 09/40] xattr: handle idmapped mounts

2021-03-03 Thread Christian Brauner

On Wed, Mar 03, 2021 at 02:45:07PM +, David Howells wrote:
> Christian Brauner  wrote:
> 
> > In order to answer this more confidently I need to know a bit more about
> > how cachefiles are supposed to work.
> > 
> > From what I gather here it seemed what this code is trying to set here
> > is an internal "CacheFiles.cache" extended attribute on the indode. This
> > extended attribute doesn't store any uids and gids or filesystem
> > capabilities so the user namespace isn't relevant for that since there
> > doesn't need to be any conversion.
> > 
> > What I need to know is what information do you use for cachefiles to
> > determine whether someone can set that "Cachefiles.cache" extended
> > attribute on the inode:
> > - Is it the mnt_userns of a/the mount of the filesystem you're caching for?
> > - The mnt_userns of the mnt of struct cachefiles_cache?
> > - Or the stashed or current creds of the caller?
> 
> Mostly it's about permission checking.  The cache driver wants to do accesses
> onto the files in cache using the context of whatever process writes the
> "bind" command to /dev/cachefiles, not the context of whichever process issued
> a read or write, say, on an NFS file that is being cached.
> 
> This causes standard UNIX perm checking, SELinux checking, etc. all to be
> switched to the appropriate context.  It also controls what appears in the
> audit logs.

(Audit always translates from and to init_user_ns. The changes to make
it aware of user namespaces proper are delayed until the audit id thing
is merged as Paul pointed out to me.)

> 
> There is an exception to this: It also governs the ownership of new files and
> directories created in the cache and what security labels will be set on them.

So from our offline discussion I gather that cachefilesd creates a cache
on a local filesystem (ext4, xfs etc.) for a network filesystem. The way
this is done is by writing "bind" to /dev/cachefiles and pointing it to
a directory to use as the cache.

This directory can currently also be an idmapped mount, say:

mount --bind --idmap /mnt /mnt

and then pointing cachefilesd via a "bind" operation to

/mnt

What I would expect is for cachefilesd to now take that idmapping into
account when creating files in /mnt but as it stands now, it doesn't.
This could leave users confused as the ownership of the files wouldn't
match to what they expressed in the idmapping. Since you're reworking
cachefilesd currently anyway, I would suggest we port cachefilesd to
support idmapped mounts once as part of your rework. I can help there
and until then we do:

diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
index dfb14dbddf51..51f21beafad9 100644
--- a/fs/cachefiles/bind.c
+++ b/fs/cachefiles/bind.c
@@ -115,6 +115,12 @@ static int cachefiles_daemon_add_cache(struct 
cachefiles_cache *cache)
if (ret < 0)
goto error_open_root;

+   if (mnt_user_ns(path.mnt) != &init_user_ns) {
+   ret = -EPERM;
+   pr_err("Caches on idmapped mounts are currently not 
supported\n");
+   goto error_open_root;
+   }
+
cache->mnt = path.mnt;
root = path.dentry;

This is safe to do because if a mount is visible in the filesystem it
can't change its idmapping.

(Might even be worth if you add a helper at this point:

static inline bool mnt_is_idmapped(struct vfsmount *mnt)
{
return mnt_user_ns(mnt) != &init_user_ns;
}
)

Christian

Re: [PATCH v6 09/40] xattr: handle idmapped mounts

2021-03-03 Thread Christian Brauner

On Wed, Mar 03, 2021 at 01:24:02PM +, David Howells wrote:
> Christian Brauner  wrote:
> 
> > diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
> > index 72e42438f3d7..a591b5e09637 100644
> > --- a/fs/cachefiles/xattr.c
> > +++ b/fs/cachefiles/xattr.c
> > @@ -39,8 +39,8 @@ int cachefiles_check_object_type(struct cachefiles_object 
> > *object)
> > _enter("%p{%s}", object, type);
> >  
> > /* attempt to install a type label directly */
> > -   ret = vfs_setxattr(dentry, cachefiles_xattr_cache, type, 2,
> > -  XATTR_CREATE);
> > +   ret = vfs_setxattr(_user_ns, dentry, cachefiles_xattr_cache, type,
> > +  2, XATTR_CREATE);
> 

Hey David,

(Ok, recovered from my run-in with the swapfile bug. I even managed to
get my emails back.)

> Actually, on further consideration, this might be the wrong thing to do in
> cachefiles.  The creds are (or should be) overridden when accesses to the
> underlying filesystem are being made.
> 
> I wonder if this should be using current_cred()->user_ns or
> cache->cache_cred->user_ns instead.

Before I go into the second question please note that this is a no-op
change. So if this is wrong it was wrong before. Which is your point, I
guess.

Please also note that the mnt_userns is _never_ used for (capability)
permission checking, only for idmapping vfs objects and permission
checks based on the i_uid and i_gid. So if your argument about passing
one of those two user namespaces above has anything to do with
permission checking on caps it's most likely wrong. :)

In order to answer this more confidently I need to know a bit more about
how cachefiles are supposed to work.

>From what I gather here it seemed what this code is trying to set here
is an internal "CacheFiles.cache" extended attribute on the inode. This
extended attribute doesn't store any uids and gids or filesystem
capabilities so the user namespace isn't relevant for that since there
doesn't need to be any conversion.

What I need to know is what information do you use for cachefiles to
determine whether someone can set that "Cachefiles.cache" extended
attribute on the inode:
- Is it the mnt_userns of a/the mount of the filesystem you're caching for?
- The mnt_userns of the mnt of struct cachefiles_cache?
- Or the stashed or current creds of the caller?

Christian

Re: [PATCH bpf 2/4] nsfs: add an ioctl to discover the network namespace cookie

2021-03-02 Thread Christian Brauner

On Tue, Mar 02, 2021 at 09:47:10AM +, Lorenz Bauer wrote:
> On Mon, 1 Mar 2021 at 10:04, Christian Brauner
>  wrote:
> >
> > Hey Lorenz,
> >
> > Just to make sure: is it intentional that any user can retrieve the
> > cookie associated with any network namespace, i.e. you don't require any
> > form of permission checking in the owning user namespace of the network
> > namespace?
> >
> > Christian
> 
> Hi Christian,
> 
> I've decided to drop the patch set for now, but that was my intention, yes. Is
> there a downside I'm not aware of?

It depends on whether this cookie is in any way security or at least
information sensitive. For example, would leaking it between
unprivileged containers with different user+network namespace pairs
allow one container to gain access to information about the other
container that it shouldn't.

Christian

Re: seccomp: Delay filter activation

2021-03-01 Thread Christian Brauner

On Mon, Mar 01, 2021 at 12:09:09PM +0100, Christian Brauner wrote:
> On Sat, Feb 20, 2021 at 01:31:57AM -0800, Sargun Dhillon wrote:
> > We've run into a problem where attaching a filter can be quite messy
> > business because the filter itself intercepts sendmsg, and other
> > syscalls related to exfiltrating the listener FD. I believe that this
> > problem set has been brought up before, and although there are
> > "simpler" methods of exfiltrating the listener, like clone3 or
> > pidfd_getfd, but these are still less than ideal.
> 
> (You really like sending patches and discussion points in the middle of
> the merge window. :D I think everyone's panicked about getting their PRs
> in shape so it's not unlikely that this sometimes gets lost on the list. :))
> 
> It hasn't been a huge problem for us, especially since we added
> pidfd_getfd() this seemed like a straightforward problem to solve by
> selecting a fix fd number that is to be used for the listener. But I can
> see why it is annoying.
> 
> > 
> > One of the ideas that's been talked about (I want to say back at LSS
> > NA) is the idea of "delayed activation". I was thinking that it might
> > be nice to have a mechanism to do delayed attach, either activated on
> > execve / fork, or an ioctl on the listenerfd to activate the filter
> > and have a flag like SECCOMP_FILTER_FLAG_NEW_LISTENER_INACTIVE, which
> > indicates that the listener should be setup, but not enforcing, and
> > another ioctl to activate it.
> > 
> > The later approach is preferred due to simplicity, but I can see a
> > situation where you could accidentally get into a state where the
> > filter is not being enforced. Additionally, this may have unforeseen
> > implications with CRIU.
> 
> (If you were to expose an ioctl() that allows userspace to query the
> notifer state then CRIU shouldn't have a problem restoring the notifier
> in the correct state. Right now it doesn't do anyting fancy about the
> notifier, it just restores the task with the filter. It just has to
> learn about the new feature and that's fine imho.)
> 
> > 
> > I'm curious whether this is a problem others share, and whether any of
> > the aforementioned approaches seem reasonable.
> 
> So when I originally suggested the delayed activation I I had another
> related idea that I think I might have mentioned too: if we're already
> considering delaying filter activation I like to discuss the possibility
> of attaching a seccomp filter to a task.
> 
> Right now, if one task wants to attach to another task they need to
> recreate the whole seccomp filter and load it. That's not just pretty
> expensive but also only works if you have access to the rules that the
> filter was generated with. For container that's usually some sort of
> pseudo seccomp filter configuration language dumped into a config file
> from which it can be read.
> 
> So right now the status quo is:
> 
> struct sock_filter filter[] = {
> BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)),
> BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, nr, 0, 1),
> BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_USER_NOTIF), /* Get me a listener 
> fd */
> BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
> };
> struct sock_fprog prog = {
> .len = (unsigned short)ARRAY_SIZE(filter),
> .filter = filter,
> };
> int fd = seccomp(SECCOMP_SET_MODE_FILTER, flags, );
> 
> and then the caller must send the fd to the manager or the manager uses
> pidfd_getfd().
> 
> But, why not get a bit crazy^wcreative; especially since seccomp() is
> already a multiplexer. We introduce a new seccomp flag:
> 
> #define SECCOMP_FILTER_DETACHED
> 
> and a new seccomp command:
> 
> #define SECCOMP_ATTACH_FILTER
> 
> And now we could do something like:
> 
> pid_t pid = fork();
> if (pid < 0)
>   return;
> 
> if (pid == 0) {
>   // do stuff
>   BARRIER_WAKE_SETUP_DONE;
> 
>   // do more unrelated stuff
> 
>   BARRIER_WAIT_SECCOMP_FILTER;
>   execve(exec-something);
> } else {
>   
>   int fd_filter;
> 
>   struct sock_filter filter[] = {
>   BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, 
> nr)),
>   BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, nr, 0, 1),
>   BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
>   };
>   
>   struct sock_fprog prog = {
>   .len = (unsigned short)ARRAY_SIZE(filter),
>   .filter = filter,
>   };
>   
>   int fd_filter = seccomp(SECCOMP_SET_MODE_FILTER, 
> SECCOMP_FILTER_DETACHED, );
> 
>   BAR

Re: [PATCH 0/5] kernel-doc fixes to latest fs changes

2021-03-01 Thread Christian Brauner

On Thu, Feb 04, 2021 at 07:00:54PM +0100, Lukas Bulwahn wrote:
> this patchset was motivated by new warnings with make htmldocs appearing on
> linux-next in the last week.
> 
> Please apply this on top of your latest work in fs on top of the mount user
> namespace refactoring, cf. the commits referred in the individual commit
> messages.
> 
> 
> Lukas Bulwahn (5):
>   fs: turn some comments into kernel-doc
>   fs: update kernel-doc for vfs_rename()
>   fs: update kernel-doc for may_create_in_sticky()
>   fs: update kernel-doc for vfs_tmpfile()
>   fs: update kernel-doc for new mnt_userns argument
> 
>  fs/libfs.c |  1 +
>  fs/namei.c | 13 ++---
>  fs/xattr.c |  2 ++
>  include/linux/fs.h | 17 ++---
>  4 files changed, 19 insertions(+), 14 deletions(-)

Thanks for fixing this up, Lukas. Randy has fixed some of my missing
comment updates as well so we only needed

>   fs: turn some comments into kernel-doc
>   fs: update kernel-doc for vfs_rename()

Christian

Re: [PATCH -next] fs: libfs: fix kernel-doc for mnt_userns

2021-03-01 Thread Christian Brauner

On Tue, Feb 16, 2021 at 09:48:25AM +0100, Christoph Hellwig wrote:
> On Mon, Feb 15, 2021 at 08:29:27PM -0800, Randy Dunlap wrote:
> > Fix kernel-doc warning in libfs.c.
> > 
> > ../fs/libfs.c:498: warning: Function parameter or member 'mnt_userns' not 
> > described in 'simple_setattr'
> 
> Shouldn't the subject say simple_setattr instead of mnt_userns?
> 
> Otherwise looks good:
> 
> Reviewed-by: Christoph Hellwig 

So I've picked this up but just as an fyi b4 fell all over its face on
this series always giving me partial patches or prefixing it with v2 or
v3 or sm. Really strange, I'll report this to Konstantin.

Christian

Re: [PATCH] kernel: Return -EFAULT if copy_to_user() fails

2021-03-01 Thread Christian Brauner

On Mon, Mar 01, 2021 at 07:28:11PM +0800, Wang Qing wrote:
> The copy_to_user() function returns the number of bytes remaining to be
> copied, but we want to return -EFAULT if the copy doesn't complete.
> 
> Signed-off-by: Wang Qing 
> ---

Hey Wang,

Thanks for the patch. It looks like override_release() is called in
three places and all three places return -EFAULT correctly. They simply
treat any non-zero return value as EFAULT. So not sure this change buys us anything.

Christian

Re: seccomp: Delay filter activation

2021-03-01 Thread Christian Brauner

On Sat, Feb 20, 2021 at 01:31:57AM -0800, Sargun Dhillon wrote:
> We've run into a problem where attaching a filter can be quite messy
> business because the filter itself intercepts sendmsg, and other
> syscalls related to exfiltrating the listener FD. I believe that this
> problem set has been brought up before, and although there are
> "simpler" methods of exfiltrating the listener, like clone3 or
> pidfd_getfd, but these are still less than ideal.

(You really like sending patches and discussion points in the middle of
the merge window. :D I think everyone's panicked about getting their PRs
in shape so it's not unlikely that this sometimes gets lost on the list. :))

It hasn't been a huge problem for us, especially since we added
pidfd_getfd() this seemed like a straightforward problem to solve by
selecting a fix fd number that is to be used for the listener. But I can
see why it is annoying.

> 
> One of the ideas that's been talked about (I want to say back at LSS
> NA) is the idea of "delayed activation". I was thinking that it might
> be nice to have a mechanism to do delayed attach, either activated on
> execve / fork, or an ioctl on the listenerfd to activate the filter
> and have a flag like SECCOMP_FILTER_FLAG_NEW_LISTENER_INACTIVE, which
> indicates that the listener should be setup, but not enforcing, and
> another ioctl to activate it.
> 
> The later approach is preferred due to simplicity, but I can see a
> situation where you could accidentally get into a state where the
> filter is not being enforced. Additionally, this may have unforeseen
> implications with CRIU.

(If you were to expose an ioctl() that allows userspace to query the
notifier state then CRIU shouldn't have a problem restoring the notifier
in the correct state. Right now it doesn't do anything fancy about the
notifier, it just restores the task with the filter. It just has to
learn about the new feature and that's fine imho.)

> 
> I'm curious whether this is a problem others share, and whether any of
> the aforementioned approaches seem reasonable.

So when I originally suggested the delayed activation I had another
related idea that I think I might have mentioned too: if we're already
considering delaying filter activation I like to discuss the possibility
of attaching a seccomp filter to a task.

Right now, if one task wants to attach to another task they need to
recreate the whole seccomp filter and load it. That's not just pretty
expensive but also only works if you have access to the rules that the
filter was generated with. For container that's usually some sort of
pseudo seccomp filter configuration language dumped into a config file
from which it can be read.

So right now the status quo is:

struct sock_filter filter[] = {
BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)),
BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, nr, 0, 1),
BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_USER_NOTIF), /* Get me a listener 
fd */
BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
};
struct sock_fprog prog = {
.len = (unsigned short)ARRAY_SIZE(filter),
.filter = filter,
};
int fd = seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog);

and then the caller must send the fd to the manager or the manager uses
pidfd_getfd().

But, why not get a bit crazy^wcreative; especially since seccomp() is
already a multiplexer. We introduce a new seccomp flag:

#define SECCOMP_FILTER_DETACHED

and a new seccomp command:

#define SECCOMP_ATTACH_FILTER

And now we could do something like:

pid_t pid = fork();
if (pid < 0)
return;

if (pid == 0) {
// do stuff
BARRIER_WAKE_SETUP_DONE;

// do more unrelated stuff

BARRIER_WAIT_SECCOMP_FILTER;
execve(exec-something);
} else {

int fd_filter;

struct sock_filter filter[] = {
BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, 
nr)),
BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, nr, 0, 1),
BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
};

struct sock_fprog prog = {
.len = (unsigned short)ARRAY_SIZE(filter),
.filter = filter,
};

int fd_filter = seccomp(SECCOMP_SET_MODE_FILTER, 
SECCOMP_FILTER_DETACHED, &prog);

BARRIER_WAIT_SETUP_DONE;

int ret = seccomp(SECCOMP_ATTACH_FILTER, 0, INT_TO_PTR(fd_listener));

BARRIER_WAKE_SECCOMP_FILTER;
}

And now you have attached a filter to another task. This would be super
elegant for a container manager. The container manager could also stash
the filter fd and when attaching to a container the manager can send the
attaching task the fd and the attaching task can do:

int ret = seccomp(SECCOMP_ATTACH_FILTER, 0, INT_TO_PTR(fd_filter));

too and would be attached to the same filter as the target task.

And for the listener fd case a container manager could simply set
SECCOMP_RET_USER_NOTIF as before

struct sock_filter filter[] =

1 2 3 4 5 6 7 8 9 10 >

1 - 100 of 2432 matches

Mail list logo