Re: [PATCH v2 4/7] tracefs: dentry lookup crapectomy

2024-01-31 Thread Al Viro
On Wed, Jan 31, 2024 at 09:26:42PM -0500, Steven Rostedt wrote:

> > Huh?  Just return NULL and be done with that - you'll get an
> > unhashed negative dentry and let the caller turn that into
> > -ENOENT...
> 
> We had a problem here with just returning NULL. It leaves the negative
> dentry around and doesn't get refreshed.

Why would that dentry stick around?  And how would anyone find
it, anyway, when it's not hashed?

> I did this:
> 
>  # cd /sys/kernel/tracing
>  # ls events/kprobes/sched/
> ls: cannot access 'events/kprobes/sched/': No such file or directory
>  # echo 'p:sched schedule' >> kprobe_events
>  # ls events/kprobes/sched/
> ls: cannot access 'events/kprobes/sched/': No such file or directory
> 
> When it should have been:
> 
>  # ls events/kprobes/sched/
> enable  filter  format  hist  hist_debug  id  inject  trigger
> 
> Leaving the negative dentry there will have it fail when the directory
> exists the next time.

Then you have something very deeply fucked up.  NULL or ERR_PTR(-ENOENT)
from ->lookup() in the last component of open() would do exactly the
same thing: dput() whatever had been passed to ->lookup() and fail
open(2) with -ENOENT.
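For illustration, a minimal sketch (hypothetical helpers; not the actual
eventfs code) of the behaviour described above - when the name does not
exist, ->lookup() just returns NULL, the dentry stays negative and (on
Viro's reading) unhashed, so open(2) fails with -ENOENT and a later lookup
after the directory appears will go through ->lookup() again:

        static struct dentry *eventfs_root_lookup(struct inode *dir,
                                                  struct dentry *dentry,
                                                  unsigned int flags)
        {
                /* ti->private carries the eventfs_inode for directories */
                struct eventfs_inode *ei = get_tracefs(dir)->private;

                /* eventfs_find_entry() is a hypothetical helper */
                if (!ei || !eventfs_find_entry(ei, dentry->d_name.name))
                        return NULL;    /* caller turns this into -ENOENT */

                /* hypothetical: instantiate and d_add() the entry */
                return lookup_entry(dentry, ei);
        }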



Re: [PATCH v2 4/7] tracefs: dentry lookup crapectomy

2024-01-31 Thread Al Viro
On Wed, Jan 31, 2024 at 01:49:22PM -0500, Steven Rostedt wrote:

> @@ -329,32 +320,29 @@ static struct dentry *create_file(const char *name, 
> umode_t mode,
>  
>   ti = get_tracefs(inode);
>   ti->flags |= TRACEFS_EVENT_INODE;
> - d_instantiate(dentry, inode);
> +
> + d_add(dentry, inode);
>   fsnotify_create(dentry->d_parent->d_inode, dentry);

Seriously?  stat(2), have it go away from dcache on memory pressure,
lather, rinse, repeat...  Won't *snotify get confused by the stream
of creations of the same thing, with not a removal in sight?

> - return eventfs_end_creating(dentry);
> + return dentry;
>  };
>  
>  /**
> - * create_dir - create a dir in the tracefs filesystem
> + * lookup_dir_entry - look up a dir in the tracefs filesystem
> + * @dentry: the directory to look up
>   * @ei: the eventfs_inode that represents the directory to create
> - * @parent: parent dentry for this file.
>   *
> - * This function will create a dentry for a directory represented by
> + * This function will look up a dentry for a directory represented by
>   * a eventfs_inode.
>   */
> -static struct dentry *create_dir(struct eventfs_inode *ei, struct dentry 
> *parent)
> +static struct dentry *lookup_dir_entry(struct dentry *dentry,
> + struct eventfs_inode *pei, struct eventfs_inode *ei)
>  {
>   struct tracefs_inode *ti;
> - struct dentry *dentry;
>   struct inode *inode;
>  
> - dentry = eventfs_start_creating(ei->name, parent);
> - if (IS_ERR(dentry))
> - return dentry;
> -
>   inode = tracefs_get_inode(dentry->d_sb);
>   if (unlikely(!inode))
> - return eventfs_failed_creating(dentry);
> + return ERR_PTR(-ENOMEM);
>  
>   /* If the user updated the directory's attributes, use them */
>   update_inode_attr(dentry, inode, &ei->attr,
> @@ -371,11 +359,14 @@ static struct dentry *create_dir(struct eventfs_inode 
> *ei, struct dentry *parent
>   /* Only directories have ti->private set to an ei, not files */
>   ti->private = ei;
>  
> + dentry->d_fsdata = ei;
> +ei->dentry = dentry; // Remove me!
> +
>   inc_nlink(inode);
> - d_instantiate(dentry, inode);
> + d_add(dentry, inode);
>   inc_nlink(dentry->d_parent->d_inode);

What will happen when that thing gets evicted from dcache,
gets looked up again, and again, and...?

>   fsnotify_mkdir(dentry->d_parent->d_inode, dentry);

Same re snotify confusion...

> - return eventfs_end_creating(dentry);
> + return dentry;
>  }
>  
>  static void free_ei(struct eventfs_inode *ei)
> @@ -425,7 +416,7 @@ void eventfs_set_ei_status_free(struct tracefs_inode *ti, 
> struct dentry *dentry)
>  }
>  
>  /**
> - * create_file_dentry - create a dentry for a file of an eventfs_inode
> + * lookup_file_dentry - create a dentry for a file of an eventfs_inode
>   * @ei: the eventfs_inode that the file will be created under
>   * @idx: the index into the d_children[] of the @ei
>   * @parent: The parent dentry of the created file.
> @@ -438,157 +429,21 @@ void eventfs_set_ei_status_free(struct tracefs_inode 
> *ti, struct dentry *dentry)
>   * address located at @e_dentry.
>   */
>  static struct dentry *
> -create_file_dentry(struct eventfs_inode *ei, int idx,
> -struct dentry *parent, const char *name, umode_t mode, void 
> *data,
> +lookup_file_dentry(struct dentry *dentry,
> +struct eventfs_inode *ei, int idx,
> +umode_t mode, void *data,
>  const struct file_operations *fops)
>  {
>   struct eventfs_attr *attr = NULL;
>   struct dentry **e_dentry = &ei->d_children[idx];
> - struct dentry *dentry;
> -
> - WARN_ON_ONCE(!inode_is_locked(parent->d_inode));
>  
> - mutex_lock(&eventfs_mutex);
> - if (ei->is_freed) {
> - mutex_unlock(&eventfs_mutex);
> - return NULL;
> - }
> - /* If the e_dentry already has a dentry, use it */
> - if (*e_dentry) {
> - dget(*e_dentry);
> - mutex_unlock(&eventfs_mutex);
> - return *e_dentry;
> - }
> -
> - /* ei->entry_attrs are protected by SRCU */
>   if (ei->entry_attrs)
>   attr = &ei->entry_attrs[idx];
>  
> - mutex_unlock(&eventfs_mutex);
> -
> - dentry = create_file(name, mode, attr, parent, data, fops);
> -
> - mutex_lock(&eventfs_mutex);
> -
> - if (IS_ERR_OR_NULL(dentry)) {
> - /*
> -  * When the mutex was released, something else could have
> -  * created the dentry for this e_dentry. In which case
> -  * use that one.
> -  *
> -  * If ei->is_freed is set, the e_dentry is currently on its
> -  * way to being freed, don't return it. If e_dentry is NULL
> -  * it means it was already freed.
> -  */
> - if (ei->is_freed) {
> - dentry = NULL;
> - } else {
> - dentry = *e_dentry;
> -  

Re: [PATCH 6/6] eventfs: clean up dentry ops and add revalidate function

2024-01-30 Thread Al Viro
On Tue, Jan 30, 2024 at 07:39:55PM -0800, Linus Torvalds wrote:

> Why don't we, btw? It would be so much better if we did the
> d_release() from __d_free(). We have all that smarts in fs/dcache.c to
> decide if we need to RCU-delay it or not, and then we don't let
> filesystems use it.

Because we want to give filesystems a chance to do something useful
in it?  Something that might need access to the parent.  Or superblock,
for that matter...  By the time we are past the RCU delay there's
very little left; if you want to look at some per-superblock data,
you would have to do something like putting it into a refcounted
structure, with each dentry holding a counting reference, with obvious
effects...

We could add a separate method just for freeing stuff, but... we already
have 4 of them related to that path (->d_delete(), ->d_prune(), ->d_iput(),
->d_release()) and the things are pretty confusing as it is, without
throwing another one into the mix.

I'll look through the RCU pathwalk fixes (will repost the rebased set in
a couple of days, need to finish the re-audit of that stuff) and see how
much such a method would simplify things, but I don't think it would buy us
a lot.

> So I guess the "make low-level filesystems do their own kfree_rcu() is
> what we're doing.
> 
> In this case it's as simple as doing that
> 
> -   kfree(ei);
> +   kfree_rcu(ei, rcu);
> 
> and we'd just make the rcu entry a union with something that isn't
> that 'is_freed' field so that it doesn't take more space.
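A minimal sketch (simplified layout, not the actual eventfs_inode) of what
Linus describes above - the rcu_head shares space with a member that is
dead once the object is on its way out, so kfree_rcu() costs no extra
memory:

        struct eventfs_inode {
                const char *name;
                union {
                        struct list_head list;  /* used while the ei is live */
                        struct rcu_head rcu;    /* used only once it is freed */
                };
                unsigned int is_freed:1;        /* kept outside the union */
        };

        static void free_ei(struct eventfs_inode *ei)
        {
                kfree_rcu(ei, rcu);             /* RCU-delay the actual kfree() */
        }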



Re: [PATCH 6/6] eventfs: clean up dentry ops and add revalidate function

2024-01-30 Thread Al Viro
On Tue, Jan 30, 2024 at 06:37:49PM -0800, Linus Torvalds wrote:
> On Tue, 30 Jan 2024 at 17:12, Al Viro  wrote:
> >
> > > + *
> > > + * Note that d_revalidate is called potentially under RCU,
> > > + * so it can't take the eventfs mutex etc. It's fine - if
> > > + * we open a file just as it's marked dead, things will
> > > + * still work just fine, and just see the old stale case.
> >
> > Looks like use after free, unless freeing ei is RCU-delayed...
> 
> We hold the ref to the ei in the very dentry that is doing d_revalidate().

What's to stop ->d_revalidate() from being called in parallel with
__dentry_kill()?



Re: [PATCH 6/6] eventfs: clean up dentry ops and add revalidate function

2024-01-30 Thread Al Viro
On Tue, Jan 30, 2024 at 11:03:55AM -0800, Linus Torvalds wrote:

> +void eventfs_d_release(struct dentry *dentry)
>  {
> - struct eventfs_inode *ei;
> -
> - mutex_lock(&eventfs_mutex);
> - ei = dentry->d_fsdata;
> - if (ei) {
> - dentry->d_fsdata = NULL;
> - put_ei(ei);
> - }
> - mutex_unlock(&eventfs_mutex);
> + put_ei(dentry->d_fsdata);
>  }

I'd rather pass ->d_fsdata to that sucker (or exposed put_ei(),
for that matter).

> @@ -857,6 +847,5 @@ void eventfs_remove_events_dir(struct eventfs_inode *ei)
>* sticks around while the other ei->dentry are created
>* and destroyed dynamically.
>*/
> - simple_recursive_removal(dentry, NULL);

That also needs to move earlier in the series - bisect hazard.

> + *
> + * Note that d_revalidate is called potentially under RCU,
> + * so it can't take the eventfs mutex etc. It's fine - if
> + * we open a file just as it's marked dead, things will
> + * still work just fine, and just see the old stale case.

Looks like use after free, unless freeing ei is RCU-delayed...

> + return !(ei && ei->is_freed);



Re: [PATCH 5/6] eventfs: get rid of dentry pointers without refcounts

2024-01-30 Thread Al Viro
On Tue, Jan 30, 2024 at 11:03:54AM -0800, Linus Torvalds wrote:

>   inode = tracefs_get_inode(dentry->d_sb);
>   if (unlikely(!inode))
> - return eventfs_failed_creating(dentry);
> + return ERR_PTR(-ENOMEM);

That belongs in the lookup crapectomy patch - bisect hazard from stray dput().
Same for similar chunks below.



Re: [PATCH 3/6] tracefs: dentry lookup crapectomy

2024-01-30 Thread Al Viro
On Tue, Jan 30, 2024 at 11:03:52AM -0800, Linus Torvalds wrote:
>  {
>   struct eventfs_inode *ei;
>  
> - mutex_lock(&eventfs_mutex);
>   do {
>   // The parent is stable because we do not do renames
>   dentry = dentry->d_parent;
> @@ -247,7 +246,6 @@ static struct eventfs_inode *eventfs_find_events(struct 
> dentry *dentry)
>   }
>   // Walk upwards until you find the events inode
>   } while (!ei->is_events);
> - mutex_unlock(&eventfs_mutex);

Unless I'm missing something, you've just lost exclusion with
removals (not that the original hadn't been suspicious in that
respect - what's to protect ei past that mutex_unlock()?).



Re: [PATCH 3/6] tracefs: dentry lookup crapectomy

2024-01-30 Thread Al Viro
On Tue, Jan 30, 2024 at 06:55:36PM -0500, Steven Rostedt wrote:

> Actually it's the tracefs_start_creating() locks the inode, the
> eventfs_start_creating() doesn't.

Right.

> > 
> > >   if (unlikely(!inode))
> > >   return eventfs_failed_creating(dentry);  
> > 
> > ... and that still unlocks it.

This is still bogus, though - both the stray dput() and
dropping a reference to internal mount.

struct dentry *eventfs_failed_creating(struct dentry *dentry)
{
        dput(dentry);
        simple_release_fs(&tracefs_mount, &tracefs_mount_count);
        return NULL;
}



Re: [PATCH 3/6] tracefs: dentry lookup crapectomy

2024-01-30 Thread Al Viro
On Tue, Jan 30, 2024 at 11:03:52AM -0800, Linus Torvalds wrote:
> The dentry lookup for eventfs files was very broken, and had lots of
> signs of the old situation where the filesystem names were all created
> statically in the dentry tree, rather than being looked up dynamically
> based on the eventfs data structures.
> 
> You could see it in the naming - how it claimed to "create" dentries
> rather than just look up the dentries that were given it.
> 
> You could see it in various nonsensical and very incorrect operations,
> like using "simple_lookup()" on the dentries that were passed in, which
> only results in those dentries becoming negative dentries.  Which meant
> that any other lookup would possibly return ENOENT if it saw that
> negative dentry before the data was then later filled in.
> 
> You could see it in the immense amount of nonsensical code that didn't
> actually just do lookups.

> -static struct dentry *create_file(const char *name, umode_t mode,
> +static struct dentry *lookup_file(struct dentry *dentry,
> +   umode_t mode,
> struct eventfs_attr *attr,
> -   struct dentry *parent, void *data,
> +   void *data,
> const struct file_operations *fop)
>  {
>   struct tracefs_inode *ti;
> - struct dentry *dentry;
>   struct inode *inode;
>  
>   if (!(mode & S_IFMT))
> @@ -307,12 +304,6 @@ static struct dentry *create_file(const char *name, 
> umode_t mode,
>   if (WARN_ON_ONCE(!S_ISREG(mode)))
>   return NULL;
>  
> - WARN_ON_ONCE(!parent);
> - dentry = eventfs_start_creating(name, parent);

Used to lock the inode of parent.

>   if (unlikely(!inode))
>   return eventfs_failed_creating(dentry);

... and that still unlocks it.

> @@ -331,29 +322,25 @@ static struct dentry *create_file(const char *name, 
> umode_t mode,
>   ti->flags = TRACEFS_EVENT_INODE;
>   ti->private = NULL; // Directories have 'ei', files 
> not
>  
> - d_instantiate(dentry, inode);
> + d_add(dentry, inode);
>   fsnotify_create(dentry->d_parent->d_inode, dentry);
>   return eventfs_end_creating(dentry);

... and so does this.

>  };

Where has that inode_lock() gone and how could that possibly work?



Re: [linus:master] [eventfs] 852e46e239: BUG:unable_to_handle_page_fault_for_address

2024-01-29 Thread Al Viro
On Sun, Jan 28, 2024 at 08:36:12PM -0800, Linus Torvalds wrote:

[snip]

apologies for being MIA on that, will post tomorrow morning once I get some
sleep...



Re: [PATCH] tracefs/eventfs: Use root and instance inodes as default ownership

2024-01-03 Thread Al Viro
On Wed, Jan 03, 2024 at 09:25:06PM -0500, Steven Rostedt wrote:
> On Thu, 4 Jan 2024 01:48:37 +
> Al Viro  wrote:
> 
> > On Wed, Jan 03, 2024 at 08:32:46PM -0500, Steven Rostedt wrote:
> > 
> > > + /* Get the tracefs root from the parent */
> > > + inode = d_inode(dentry->d_parent);
> > > + inode = d_inode(inode->i_sb->s_root);  
> > 
> > That makes no sense.  First of all, for any positive dentry we have
> > dentry->d_sb == dentry->d_inode->i_sb.  And it's the same for all
> > dentries on given superblock.  So what's the point of that dance?
> > If you want the root inode, just go for d_inode(dentry->d_sb->s_root)
> > and be done with that...
> 
> That was more of thinking that the dentry and dentry->d_parent are
> different. As dentry is part of eventfs and dentry->d_parent is part of
> tracefs.

???

>Currently they both have the same superblock so yeah, I could just
> write it that way too and it would work. But in my head, I was thinking
> that they behave differently and maybe one day eventfs would get its own
> superblock which would not work.

->d_parent *always* points to the same filesystem; if you get an (automounted?)
mountpoint there, ->d_parent simply won't work - it will point to dentry itself.

> To explain this better:
> 
>   /sys/kernel/tracing/ is the parent of /sys/kernel/tracing/events
> 
> But everything but "events" in /sys/kernel/tracing/* is part of tracefs.
> Everything in /sys/kernel/tracing/events is part of eventfs.
> 
> That was my thought process. But as both tracefs and eventfs still use
> tracefs_get_inode(), it would work as you state.
> 
> I'll update that, as I don't foresee that eventfs will become its own file
> system.

There is no way to get to underlying mountpoint by dentry - simply because
the same fs instance can be mounted in any number of places.

A crude overview of taxonomy:

file_system_type: what filesystem instances belong to.  Not quite the same
thing as fs driver (one driver can provide several of those).  Usually
it's 1-to-1, but that's not required (e.g. NFS vs NFSv4, or ext[234], or...).

super_block: individual filesystem instance.  Hosts dentry tree (connected or
several disconnected parts - think NFSv4 or the state while trying to get
a dentry by fhandle, etc.).

dentry: object in a filesystem's directory tree(s).  Always belongs to
specific filesystem instance - that relationship never changes.  Tree
structure (and names) _within_ _filesystem_ belong on that level.
->d_parent is part of that tree structure; never NULL, root of a (sub)tree
has it pointing to itself.  Might be negative, might refer to a filesystem 
object
(file, directory, symlink, etc.).

inode: filesystem object (file, directory, etc.).  Always belongs to
specific filesystem instance.  Non-directory inodes might have any
number of dentry instances (aliases) referring to it; a directory one - no 
more than one.  Filesystem object contents belongs here; multiple hardlinks
have different dentries and the same inode.  Of course, filesystem type in
question might have no such thing as multiple hardlinks - that's up to
filesystem.  In general there is no way to find (or enumerate) such links;
e.g. a local filesystem might have an extra hardlink somewhere we had
never looked at and there won't be any dentries for such hardlinks and
no way to get them short of doing the full search of the entire tree.
The situation with e.g. NFS client is even worse, obviously.

mount: in a sense, mount to super_block is what dentry is to inode.  It
provides a view of (sub)tree hosted in given filesystem instance.  The
same filesystem may have any number of mounts, referring to its subtrees
(possibly the same subtree for each, possibly all different - up to
the callers of mount(2)).  They form mount tree(s) - that's where the
notions related to "this mounted on top of that" belong.  Note that
they can be moved around - with no telling the filesystem about that
happening.  Again, there's no such thing as "the mountpoint of given
filesystem instance" - it might be mounted in any number of places
at the same time.  Specific mount - sure, no problem, except that it
can move around.

namespace: mount tree.  Unlike everything prior, this one is a part of
process state - same as descriptor table, mappings, etc.

file: opened IO channel.  It does refer to specific mount and specific
dentry (and thus filesystem instance and an inode on it).  Current
IO position lives here, so does any per-open(2) state.

descriptor table: mapping from numbers to IO channels (opened files).
Again, a part of process state.  dup() creates a new entry, with
reference to the same file as the old one; multiple open() of the
same pathname will each yield a separate opened file.  _Some_ state
belongs here (close-on-exec, m

Re: [PATCH] tracefs/eventfs: Use root and instance inodes as default ownership

2024-01-03 Thread Al Viro
On Wed, Jan 03, 2024 at 08:32:46PM -0500, Steven Rostedt wrote:

> +static struct inode *instance_inode(struct dentry *parent, struct inode 
> *inode)
> +{
> + struct tracefs_inode *ti;
> + struct inode *root_inode;
> +
> + root_inode = d_inode(inode->i_sb->s_root);
> +
> + /* If parent is NULL then use root inode */
> + if (!parent)
> + return root_inode;
> +
> + /* Find the inode that is flagged as an instance or the root inode */
> + do {
> + inode = d_inode(parent);
> + if (inode == root_inode)
> + return root_inode;
> +
> + ti = get_tracefs(inode);
> +
> + if (ti->flags & TRACEFS_INSTANCE_INODE)
> + return inode;
> + } while ((parent = parent->d_parent));

*blink*

This is equivalent to
...
parent = parent->d_parent;
} while (true);

->d_parent is *never* NULL.  And what the hell is that loop supposed to do,
anyway?  Find the nearest ancestor tagged with TRACEFS_INSTANCE_INODE?

If root is not marked that way, I would suggest
        if (!parent)
                parent = inode->i_sb->s_root;
        while (!IS_ROOT(parent)) {
                struct tracefs_inode *ti = get_tracefs(parent->d_inode);
                if (ti->flags & TRACEFS_INSTANCE_INODE)
                        break;
                parent = parent->d_parent;
        }
        return parent->d_inode;
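For reference, a sketch (an assumption, not a tested patch) of how that
loop would slot into the instance_inode() signature quoted earlier:

        static struct inode *instance_inode(struct dentry *parent, struct inode *inode)
        {
                struct tracefs_inode *ti;

                if (!parent)
                        parent = inode->i_sb->s_root;

                while (!IS_ROOT(parent)) {
                        ti = get_tracefs(parent->d_inode);
                        if (ti->flags & TRACEFS_INSTANCE_INODE)
                                break;
                        parent = parent->d_parent;
                }
                return parent->d_inode;
        }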



Re: [PATCH] tracefs/eventfs: Use root and instance inodes as default ownership

2024-01-03 Thread Al Viro
On Wed, Jan 03, 2024 at 08:32:46PM -0500, Steven Rostedt wrote:

> + /* Get the tracefs root from the parent */
> + inode = d_inode(dentry->d_parent);
> + inode = d_inode(inode->i_sb->s_root);

That makes no sense.  First of all, for any positive dentry we have
dentry->d_sb == dentry->d_inode->i_sb.  And it's the same for all
dentries on given superblock.  So what's the point of that dance?
If you want the root inode, just go for d_inode(dentry->d_sb->s_root)
and be done with that...
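In other words (sketch), the whole dance collapses to:

        static struct inode *tracefs_root_inode(const struct dentry *dentry)
        {
                return d_inode(dentry->d_sb->s_root);
        }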



Re: [PATCH] eventfs: Stop using dcache_readdir() for getdents()

2024-01-03 Thread Al Viro
On Wed, Jan 03, 2024 at 01:54:36PM -0800, Linus Torvalds wrote:

> Again: UNTESTED, and meant as a "this is another way to avoid messing
> with the dentry tree manually, and just using the VFS interfaces we
> already have"

That would break chown(), though.  From conversation back in November:

17:50 #kernel: < viro> while we are at it, why not simply supply ->permission() 
and ->getattr() that would pick gid from superblock 
 and shove them into ->i_gid?
17:50 #kernel: < viro> and called the default variants
17:50 #kernel: < viro> no need to scan the tree, etc.
17:51 #kernel: < viro> how many place in VFS or VM give a fuck about GID of 
inode?
17:53 #kernel: < viro> stat() and permission checks
17:56 #kernel: < viro> but that boils down to "well, generic getattr and 
permission use that field and on-disk filesystems use it to keep track of what 
value to put on disk"
17:56 #kernel: < viro> you can trivially override the defaults for 
->permission() and ->getattr()
17:57 #kernel: < viro> and have them set the right ->i_gid whenever called

17:58 #kernel: < viro> what do you want to happen for chown() + remount?
17:58 #kernel: < viro> any group changes from the former lost on the latter?
18:00 #kernel: < viro> if you want the current semantics, slap generation 
counter in superblock (bumped on remount)
18:00 #kernel: < viro> sample it into inode on ->setattr()
18:01 #kernel: < viro> and have ->permission() and ->getattr() compare inode 
and superblock gen counts, picking ->i_gid from superblock if it's more recent 
there
18:02 #kernel: < viro> if you want the result of chown() to stick, have it 
stuff ~0U into inode's gen counter instead of sampling the superblock's counter 
there

18:17 #kernel: < viro> OK... so we need to filter SB_I_VERSION out of flags on 
mount/remount, lest the timestamp updates start playing silly buggers with it
18:18 #kernel: < viro> and use inode_..._iversion_raw() for access
18:19 #kernel: < viro> or use ->i_generation, perhaps...
18:20 #kernel: < viro> 32bit, but if somebody does 4G mount -o remount, they 
are deliberately asking for trouble

21:37 #kernel: < viro> hmm...
21:37 #kernel: < viro> ->d_revalidate() as well, probably
21:39 #kernel: < viro> rostedt: my apologies, looks like I had been too 
optimistic

I have the beginnings of a patch along those lines stuck in the local tree, but
the problem had been that ->d_revalidate() is not always called on the way to
some places where ->i_uid/->i_gid is accessed ;-/

I can resurrect the analysis, but that'll take a few hours.
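A rough sketch of the generation-counter scheme from the log above (this is
not Viro's actual patch; the *_gen fields, default_gid and the fs_info
layout are hypothetical): remount bumps the superblock counter, ->setattr()
either samples it or stuffs ~0U so that chown() sticks, and ->getattr()
refreshes ->i_gid when the superblock counter is more recent:

        static int tracefs_getattr(struct mnt_idmap *idmap,
                                   const struct path *path, struct kstat *stat,
                                   u32 request_mask, unsigned int query_flags)
        {
                struct inode *inode = d_inode(path->dentry);
                struct tracefs_fs_info *fsi = inode->i_sb->s_fs_info;
                struct tracefs_inode *ti = get_tracefs(inode);

                /* ~0U in ti->gid_gen means chown() pinned the value */
                if (ti->gid_gen != ~0U && ti->gid_gen < fsi->gid_gen) {
                        inode->i_gid = fsi->default_gid;
                        ti->gid_gen = fsi->gid_gen;
                }
                generic_fillattr(idmap, request_mask, inode, stat);
                return 0;
        }

A matching ->permission() would do the same refresh before falling back to
generic_permission().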



Re: [PATCH] eventfs: Process deletion of dentry more thoroughly

2023-10-31 Thread Al Viro
On Tue, Oct 31, 2023 at 02:47:03PM -0400, Steven Rostedt wrote:
> From: "Steven Rostedt (Google)" 
> 
> Looking at how dentry is removed via the tracefs system, I found that
> eventfs does not do everything that it did under tracefs. The tracefs
> removal of a dentry calls simple_recursive_removal() that does a lot more
> than a simple d_invalidate().

Umm...  Is there any reason not to use simple_recursive_removal() there?



Re: [PATCH 03/19] fs: release anon dev_t in deactivate_locked_super

2023-09-15 Thread Al Viro
On Fri, Sep 15, 2023 at 03:28:14PM +0100, Al Viro wrote:
> On Fri, Sep 15, 2023 at 04:12:07PM +0200, Christian Brauner wrote:
> > +   static void some_fs_kill_sb(struct super_block *sb)
> > +   {
> > +   struct some_fs_info *info = sb->s_fs_info;
> > +
> > +   kill_*_super(sb);
> > +   kfree(info);
> > +   }
> > +
> > +It's best practice to never deviate from this pattern.
> 
> The last part is flat-out incorrect.  If e.g. fatfs or cifs ever switches
> to that pattern, you'll get UAF - they need freeing of ->s_fs_info
> of anything that ever had been mounted done with RCU delay; moreover,
> unload_nls() in fatfs needs to be behind the same.
> 
> Lifetime rules for fs-private parts of superblock are really private to
> filesystem; their use by sget/sget_fc callbacks might impose restrictions
> on those, but that again is none of the VFS business.

PS: and no, we don't want to impose such RCU delay on every filesystem
out there; what's more, there's nothing to prohibit e.g. having ->s_fs_info
pointing to a refcounted fs-private object (possibly shared by various
superblocks), so freeing might very well be "drop the reference and destroy
if refcount has reached 0".


Re: [PATCH 03/19] fs: release anon dev_t in deactivate_locked_super

2023-09-15 Thread Al Viro
On Fri, Sep 15, 2023 at 04:12:07PM +0200, Christian Brauner wrote:
> + static void some_fs_kill_sb(struct super_block *sb)
> + {
> + struct some_fs_info *info = sb->s_fs_info;
> +
> + kill_*_super(sb);
> + kfree(info);
> + }
> +
> +It's best practice to never deviate from this pattern.

The last part is flat-out incorrect.  If e.g. fatfs or cifs ever switches
to that pattern, you'll get UAF - they need freeing of ->s_fs_info
of anything that ever had been mounted done with RCU delay; moreover,
unload_nls() in fatfs needs to be behind the same.

Lifetime rules for fs-private parts of superblock are really private to
filesystem; their use by sget/sget_fc callbacks might impose restrictions
on those, but that again is none of the VFS business.


Re: [PATCH 03/19] fs: release anon dev_t in deactivate_locked_super

2023-09-14 Thread Al Viro
On Thu, Sep 14, 2023 at 05:58:05PM +0100, Al Viro wrote:

> Incidentally, I'm going to add a (belated by 10 years) chunk in porting.rst
> re making sure that anything in superblock that might be needed by methods
> called in RCU mode should *not* be freed without an RCU delay...  Should've
> done that back in 3.12 merge window when RCU'd vfsmounts went in; as it
> is, today we have several filesystems with exact same kind of breakage.
> hfsplus and affs breakage had been there in 3.13 (missed those two), exfat
> and ntfs3 - introduced later, by initial merges of filesystems in question.
> Missed on review...
> 
> Hell knows - perhaps Documentation/filesystems/whack-a-mole might be a good
> idea...

Actually, utf8 casefolding stuff also has the same problem, so ext4 and f2fs
with casefolding are also affected ;-/


Re: [PATCH 03/19] fs: release anon dev_t in deactivate_locked_super

2023-09-14 Thread Al Viro
On Thu, Sep 14, 2023 at 04:02:25PM +0200, Christian Brauner wrote:

> Yes, you're right that making the superblock and not the filesytem type
> the bd_holder changes the logic and we are aware of that of course. And
> it requires changes such as moving additional block device closing from
> where some callers currently do it.

Details, please?

> But the filesytem type is not a very useful holder itself and has other
> drawbacks. The obvious one being that it requires us to wade through all
> superblocks on the system trying to find the superblock associated with
> a given block device continously grabbing and dropping sb_lock and
> s_umount. None of that is very pleasant nor elegant and it is for sure
> not very easy to understand (Plus, it's broken for btrfs freezing and
> syncing via block level ioctls.).

"Constantly" is a bit of a stretch - IIRC, we grabbed sb_lock once, then
went through the list comparing ->s_bdev (without any extra locking),
then bumped ->s_count on the found superblock, dropped sb_lock,
grabbed ->s_umount on the sucker and verified it's still alive.

Repeated grabbing of any lock happened only on a race with fs shutdown;
normal case is one spin_lock, one spin_unlock, one down_read().

Oh, well...

> Using the superblock as holder makes this go away and is overall a lot
> more useful and intuitive and can be extended to filesystems with
> multiple devices (Of which we apparently are bound to get more.).
>
> So I think this change is worth the pain.
> 
> It's a fair point that these lifetime rules should be documented in
> Documentation/filesystems/. The old lifetime documentation is too sparse
> to be useful though.

What *are* these lifetime rules?  Seriously, you have 3 chunks of
fs-dependent actions at the moment:
* the things needed to get rid of internal references pinning
inodes/dentries + whatever else we need done before generic_shutdown_super()
* the stuff to be done between generic_shutdown_super() and
making the sucker invisible to sget()/sget_fc()
* the stuff that must be done after we are sure that sget
callbacks won't be looking at this instance.

Note that Christoph's series has mashed (2) and (3) together, resulting
in UAF in a bunch of places.  And I'm dead serious about
Documentation/filesystems/porting being the right place; any development
tree of any filesystem (in-tree one or not) will have to go through the
changes and figure out WTF to do with their existing code.  We are
going to play whack-a-mole for at least several years as development
branches get rebased and merged.

Incidentally, I'm going to add a (belated by 10 years) chunk in porting.rst
re making sure that anything in superblock that might be needed by methods
called in RCU mode should *not* be freed without an RCU delay...  Should've
done that back in 3.12 merge window when RCU'd vfsmounts went in; as it
is, today we have several filesystems with exact same kind of breakage.
hfsplus and affs breakage had been there in 3.13 (missed those two), exfat
and ntfs3 - introduced later, by initial merges of filesystems in question.
Missed on review...

Hell knows - perhaps Documentation/filesystems/whack-a-mole might be a good
idea...


Re: [PATCH 03/19] fs: release anon dev_t in deactivate_locked_super

2023-09-13 Thread Al Viro
On Thu, Sep 14, 2023 at 03:37:05AM +0100, Al Viro wrote:
> On Thu, Sep 14, 2023 at 12:27:12AM +0100, Al Viro wrote:
> > On Wed, Sep 13, 2023 at 08:09:57AM -0300, Christoph Hellwig wrote:
> > > Releasing an anon dev_t is a very common thing when freeing a
> > > super_block, as that's done for basically any not block based file
> > > system (modulo the odd mtd special case).  So instead of requiring
> > > a special ->kill_sb helper and a lot of boilerplate in more complicated
> > > file systems, just release the anon dev_t in deactivate_locked_super if
> > > the super_block was using one.
> > > 
> > > As the freeing is done after the main call to kill_super_notify, this
> > > removes the need for having two slightly different call sites for it.
> > 
> > Huh?  At this stage in your series freeing is still in ->kill_sb()
> > instances, after the calls of kill_anon_super() you've turned into
> > the calls of generic_shutdown_super().
> > 
> > You do split it off into a separate method later in the series, but
> > at this point you are reopening the same UAF that had been dealt with
> > in dc3216b14160 "super: ensure valid info".
> > 
> > Either move the introduction of ->free_sb() before that one, or
> > split it into lifting put_anon_bdev() (left here) and getting rid
> > of kill_anon_super() (after ->free_sb() introduction).
> 
> Actually, looking at the final stage in the series, you still have
> kill_super_notify() done *AFTER* ->free_sb() call.  So the problem
> persists until the very end...

It's worse - look at the rationale for 2c18a63b760a "super: wait until
we passed kill super".  Basically, "don't remove from the lists
until after block device closing".  IOW, we have

* stuff that needs to be done before generic_shutdown_super() (things
like pinned dentries on ramfs, etc.)
* generic_shutdown_super() itself (dentry/inode eviction, optionally
->put_super())
* stuff that needs to be done before eviction from the lists (block
device closing, since 2c18a63b760a)
* eviction from the lists
* stuff that needs to be done *after* eviction from the lists.

BTW, this part of commit message in 2c18a63b760a is rather confused:
Recent rework moved block device closing out of sb->put_super() and into
sb->kill_sb() to avoid deadlocks as s_umount is held in put_super() and
blkdev_put() can end up taking s_umount again.

That was *NOT* what a recent rework had done.  Block device closing had never
been inside ->put_super() - at no point since that (closing, that is) had been
introduced back in 0.97 ;-)  ->put_super() predates it (0.95c+).

The race is real, but the cause is not some kind of move of blkdev_put().
Your 2ea6f68932f7 "fs: use the super_block as holder when mounting file
systems" is where it actually came from.

Christoph, could you explain what the hell we need that for?  It does
create the race in question and AFAICS 2c18a63b760a (and followups trying
to plug holes in it) had been nothing but headache.

Old logic: if a mount attempt with a different fs type happens, -EBUSY
is precisely correct - we would've gotten just that if mount() came
before umount().  If the type matches, we might
1) come before deactivate_locked_super() by umount(2).
No problem, we succeed.
2) come after the beginning of shutdown, but before the
removal from the list; fine, we'll wait for the sucker to be
unlocked (which happens in the end of generic_shutdown_super()),
notice it's dead and create a new superblock.  Since the only
part left on the umount side is closing the device, we are
just fine.
3) come after the removal from the list.  So we won't
wait for the old superblock to be unlocked, other than that
it's exactly the same as (2).  It doesn't matter whether we
open the device before or after close by umount - same owner
anyway, no -EBUSY.

Your "owner shall be the superblock" breaks that...

If you want to mess with _three_-way split of ->kill_sb(),
please start with writing down the rules re what should
go into each of those parts; such writeup should go into
Documentation/filesystems/porting anyway, even if the
split is a two-way one, BTW.


Re: [PATCH 03/19] fs: release anon dev_t in deactivate_locked_super

2023-09-13 Thread Al Viro
On Thu, Sep 14, 2023 at 12:27:12AM +0100, Al Viro wrote:
> On Wed, Sep 13, 2023 at 08:09:57AM -0300, Christoph Hellwig wrote:
> > Releasing an anon dev_t is a very common thing when freeing a
> > super_block, as that's done for basically any not block based file
> > system (modulo the odd mtd special case).  So instead of requiring
> > a special ->kill_sb helper and a lot of boilerplate in more complicated
> > file systems, just release the anon dev_t in deactivate_locked_super if
> > the super_block was using one.
> > 
> > As the freeing is done after the main call to kill_super_notify, this
> > removes the need for having two slightly different call sites for it.
> 
> Huh?  At this stage in your series freeing is still in ->kill_sb()
> instances, after the calls of kill_anon_super() you've turned into
> the calls of generic_shutdown_super().
> 
> You do split it off into a separate method later in the series, but
> at this point you are reopening the same UAF that had been dealt with
> in dc3216b14160 "super: ensure valid info".
> 
> Either move the introduction of ->free_sb() before that one, or
> split it into lifting put_anon_bdev() (left here) and getting rid
> of kill_anon_super() (after ->free_sb() introduction).

Actually, looking at the final stage in the series, you still have
kill_super_notify() done *AFTER* ->free_sb() call.  So the problem
persists until the very end...


Re: [PATCH 13/19] fs: convert kill_block_super to block_free_sb

2023-09-13 Thread Al Viro
On Wed, Sep 13, 2023 at 08:10:07AM -0300, Christoph Hellwig wrote:

> -static void affs_kill_sb(struct super_block *sb)
> +static void affs_free_sb(struct super_block *sb)
>  {
>   struct affs_sb_info *sbi = AFFS_SB(sb);
> - kill_block_super(sb);
> +
> + block_free_sb(sb);
>   if (sbi) {
>   affs_free_bitmap(sb);
>   affs_brelse(sbi->s_root_bh);



Yep, that's printk + brelse()...  Could we have that
block_free_sb() (an awful name aside) done after the
if (sbi) { ... } there?


Re: [PATCH 11/19] fs: add new shutdown_sb and free_sb methods

2023-09-13 Thread Al Viro
On Wed, Sep 13, 2023 at 08:10:05AM -0300, Christoph Hellwig wrote:
> Currently super_blocks are shut down using the ->kill_sb method, which
> must call generic_shutdown_super, but allows the file system to
> add extra work before or after the call to generic_shutdown_super.
> 
> File systems tend to get rather confused by this, so add an alternative
> shutdown sequence where generic_shutdown_super is called by the core
> code, and there are extra ->shutdown_sb and ->free_sb hooks before and
> after it.  To remove the amount of boilerplate code ->shutdown_sb is only
> called if the super has finished initialization and ->d_root is set.

The last sentence doesn't match the patchset.  That aside, there
is an issue with method names.

->shutdown_sb() is... odd.  ->begin_shutdown_sb(), perhaps?  For the
majority of filesystems it's NULL, after all...

Worse, ->free_sb() is seriously misguiding - the name implies that
we are, well, freeing a superblock passed to it.  Which is not what is
happening here - superblock itself is freed only when all passive
references go away.  It's asking for trouble down the road.

We already have more than enough confusion in the area.  Note, BTW,
that there's a delicate issue around RCU accesses and freeing stuff -
->d_compare() can bloody well be called when superblock is getting
shut down.  For anything that might be needed by it (or by other
RCU'd methods) we must arrange for RCU-delayed destruction.
E.g. in case of fatfs we have sbi freeing done via call_rcu() (from
fat_put_super(), called by generic_shutdown_super()).
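A minimal sketch of that pattern for a hypothetical filesystem ("myfs"):
anything an RCU-mode method such as ->d_compare() might still dereference
is freed from an RCU callback rather than directly in ->put_super():

        struct myfs_sb_info {
                struct nls_table *nls_io;
                struct rcu_head rcu;
        };

        static void myfs_free_sbi(struct rcu_head *head)
        {
                struct myfs_sb_info *sbi =
                        container_of(head, struct myfs_sb_info, rcu);

                unload_nls(sbi->nls_io);
                kfree(sbi);
        }

        static void myfs_put_super(struct super_block *sb)
        {
                struct myfs_sb_info *sbi = sb->s_fs_info;

                call_rcu(&sbi->rcu, myfs_free_sbi);
        }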



Oh, bugger...  AFAICS, exfat has a problem - exfat_free_sbi() is called
directly from exfat_kill_sb(), without any concern for this:
static int exfat_utf8_d_cmp(const struct dentry *dentry, unsigned int len,
                            const char *str, const struct qstr *name)
{
        struct super_block *sb = dentry->d_sb;
        unsigned int alen = exfat_striptail_len(name->len, name->name,
                                                EXFAT_SB(sb)->options.keep_last_dots);

That kfree() needs to be RCU-delayed...  While we are at it, there's
this:
static int exfat_d_hash(const struct dentry *dentry, struct qstr *qstr)
{
        struct super_block *sb = dentry->d_sb;
        struct nls_table *t = EXFAT_SB(sb)->nls_io;
and we need this
unload_nls(sbi->nls_io);
in exfat_put_super() RCU-delayed as well.  And I suspect that
exfat_free_upcase_table(sbi);
right after it needs the same treatment.

AFFS: similar problem, wants ->s_fs_info freeing RCU-delayed.

hfsplus: similar, including non-delayed unlock_nls() calls.

ntfs3:
        /*
         * Try slow way with current upcase table
         */
        sbi = dentry->d_sb->s_fs_info;
        uni1 = __getname();
        if (!uni1)
                return -ENOMEM;
__getname().  "Give me a page and you might block, while you are
at it".  Done from ->d_compare().  Called under dentry->d_lock
and rcu_read_lock().  OK, any further investigation of that
one is... probably not worth bothering with at that point.

Other in-tree instances appear to be correct.  I'll push fixes for
those (well, ntfs3 aside) out tomorrow.


Re: [PATCH 09/19] zonefs: remove duplicate cleanup in zonefs_fill_super

2023-09-13 Thread Al Viro
On Wed, Sep 13, 2023 at 08:10:03AM -0300, Christoph Hellwig wrote:
> When ->fill_super fails, ->kill_sb is called which already cleans up
> the inodes and zgroups.

Ugh...  The use of "->" strongly suggests that you are talking about
a method; 'fill_super' here actually refers to callback passed to
mount_bdev().  Have a pity for those who'll be trying to parse it
- that might be yourself a couple of years down the road...

Something like

"If zonefs_fill_super() returns an error, its caller (mount_bdev()) will
make sure to call zonefs_kill_super(), which already cleans up
the inodes and zgroups.", perhaps?

> 
> Drop the extra cleanup code in zonefs_fill_super.
> 
> Signed-off-by: Christoph Hellwig 
> ---
>  fs/zonefs/super.c | 21 +
>  1 file changed, 5 insertions(+), 16 deletions(-)
> 
> diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
> index 9d1a9808fbbba6..35b2554ce2ac2e 100644
> --- a/fs/zonefs/super.c
> +++ b/fs/zonefs/super.c
> @@ -1309,13 +1309,12 @@ static int zonefs_fill_super(struct super_block *sb, 
> void *data, int silent)
>   /* Initialize the zone groups */
>   ret = zonefs_init_zgroups(sb);
>   if (ret)
> - goto cleanup;
> + return ret;
>  
>   /* Create the root directory inode */
> - ret = -ENOMEM;
>   inode = new_inode(sb);
>   if (!inode)
> - goto cleanup;
> + return -ENOMEM;
>  
>   inode->i_ino = bdev_nr_zones(sb->s_bdev);
>   inode->i_mode = S_IFDIR | 0555;
> @@ -1333,7 +1332,7 @@ static int zonefs_fill_super(struct super_block *sb, 
> void *data, int silent)
>  
>   sb->s_root = d_make_root(inode);
>   if (!sb->s_root)
> - goto cleanup;
> + return -ENOMEM;
>  
>   /*
>* Take a reference on the zone groups directory inodes
> @@ -1341,19 +1340,9 @@ static int zonefs_fill_super(struct super_block *sb, 
> void *data, int silent)
>*/
>   ret = zonefs_get_zgroup_inodes(sb);
>   if (ret)
> - goto cleanup;
> -
> - ret = zonefs_sysfs_register(sb);
> - if (ret)
> - goto cleanup;
> -
> - return 0;
> -
> -cleanup:
> - zonefs_release_zgroup_inodes(sb);
> - zonefs_free_zgroups(sb);
> + return ret;
>  
> - return ret;
> + return zonefs_sysfs_register(sb);
>  }
>  
>  static struct dentry *zonefs_mount(struct file_system_type *fs_type,
> -- 
> 2.39.2
> 


Re: [PATCH 05/19] fs: assign an anon dev_t in common code

2023-09-13 Thread Al Viro
On Wed, Sep 13, 2023 at 08:09:59AM -0300, Christoph Hellwig wrote:

> diff --git a/fs/super.c b/fs/super.c
> index bbe55f0651cca4..5c685b4944c2d6 100644
> --- a/fs/super.c
> +++ b/fs/super.c
> @@ -787,7 +787,7 @@ struct super_block *sget_fc(struct fs_context *fc,
>   struct super_block *s = NULL;
>   struct super_block *old;
>   struct user_namespace *user_ns = fc->global ? &init_user_ns : 
> fc->user_ns;
> - int err;
> + int err = 0;
>  
>  retry:
>   spin_lock(&sb_lock);
> @@ -806,14 +806,26 @@ struct super_block *sget_fc(struct fs_context *fc,
>   }
>  
>   s->s_fs_info = fc->s_fs_info;
> - err = set(s, fc);
> - if (err) {
> - s->s_fs_info = NULL;
> - spin_unlock(&sb_lock);
> - destroy_unused_super(s);
> - return ERR_PTR(err);
> + if (set) {
> + err = set(s, fc);
> + if (err) {
> + s->s_fs_info = NULL;

Pointless (as the original had been); destroy_unused_super() doesn't
even look at ->s_fs_info.

> + goto unlock_and_destroy;
> + }
>   }
>   fc->s_fs_info = NULL;

Here we are transferring the ownership from fc to superblock; it used to sit
right next to insertion into lists and all failure exits past that point must
go through deactivate_locked_super(), so ->kill_sb() would be called on those
and it would take care of s->s_fs_info.  However, your variant has that
ownership transfer done at the point before get_anon_bdev(), and that got
you a new failure exit where you are still calling destroy_unused_super():

> + if (!s->s_dev) {
> + /*
> +  * If the file system didn't set a s_dev (which is usually only
> +  * done for block based file systems), set an anonymous dev_t
> +  * here now so that we always have a valid ->s_dev.
> +  */
> + err = get_anon_bdev(&s->s_dev);
> + if (err)
> + goto unlock_and_destroy;

This.  And that's a leak - fc has no reference left in it, and your
unlock_and_destroy won't even look at what's in ->s_fs_info, let alone know
what to do with it.

IOW, clearing fc->s_fs_info should've been done after that chunk.

And looking at the change in sget(),

> + if (set) {
> + err = set(s, data);
> + if (err)
> + goto unlock_and_destroy;
>   }
> +
> + if (!s->s_dev) {
> + err = get_anon_bdev(&s->s_dev);
> + if (err)
> + goto unlock_and_destroy;
> + }

I'd rather express it (both there and in sget_fc() as well) as
        if (set)
                err = set(s, data);
        if (!err && !s->s_dev)
                err = get_anon_bdev(&s->s_dev);
        if (err)
                goto unlock_and_destroy;

That's really what your transformation does - you are lifting the
calls of get_anon_bdev() (in guise of set_anon_super()) from the
tails of 'set' callbacks into the caller, making them conditional
upon the lack of other errors from 'set' and upon ->s_dev being left
zero, and allowing a NULL 'set' for the case when that was all that
had been there.

The only place where you do something different is this:

> @@ -1191,7 +1191,6 @@ static struct dentry *ceph_real_mount(struct 
> ceph_fs_client *fsc,
>  static int ceph_set_super(struct super_block *s, struct fs_context *fc)
>  {
>   struct ceph_fs_client *fsc = s->s_fs_info;
> - int ret;
>  
>   dout("set_super %p\n", s);
>  
> @@ -1211,11 +1210,7 @@ static int ceph_set_super(struct super_block *s, 
> struct fs_context *fc)
>   s->s_flags |= SB_NODIRATIME | SB_NOATIME;
>  
>   ceph_fscrypt_set_ops(s);
> -
> - ret = set_anon_super_fc(s, fc);
> - if (ret != 0)
> - fsc->sb = NULL;
> - return ret;
> + return 0;

fsc->sb = NULL has disappeared here; it *is* OK (the caller won't look at
fsc->sb after failed sget_fc()), but that's worth a mention somewhere.
A separate commit removing that clearing fsc->sb in ceph_set_super(),
perhaps?


Re: [PATCH 03/19] fs: release anon dev_t in deactivate_locked_super

2023-09-13 Thread Al Viro
On Wed, Sep 13, 2023 at 08:09:57AM -0300, Christoph Hellwig wrote:
> Releasing an anon dev_t is a very common thing when freeing a
> super_block, as that's done for basically any not block based file
> system (modulo the odd mtd special case).  So instead of requiring
> a special ->kill_sb helper and a lot of boilerplate in more complicated
> file systems, just release the anon dev_t in deactivate_locked_super if
> the super_block was using one.
> 
> As the freeing is done after the main call to kill_super_notify, this
> removes the need for having two slightly different call sites for it.

Huh?  At this stage in your series freeing is still in ->kill_sb()
instances, after the calls of kill_anon_super() you've turned into
the calls of generic_shutdown_super().

You do split it off into a separate method later in the series, but
at this point you are reopening the same UAF that had been dealt with
in dc3216b14160 "super: ensure valid info".

Either move the introduction of ->free_sb() before that one, or
split it into lifting put_anon_bdev() (left here) and getting rid
of kill_anon_super() (after ->free_sb() introduction).


Re: [RFC][PATCH] fix short copy handling in copy_mc_pipe_to_iter()

2022-06-14 Thread Al Viro
On Tue, Jun 14, 2022 at 01:53:50AM +0100, Al Viro wrote:

> FWIW, I've got quite a bit of cleanups in the local tree; reordering and
> cleaning that queue up at the moment, will post tonight or tomorrow.
> 
> I've looked into doing allocations page-by-page (instead of single
> push_pipe(), followed by copying into those).  Doable, but it ends
> up being much messier.

Hmm...  Maybe not - a possible interface would be
        append_pipe(iter, size, &off)

that would either do kmap_local_page() on the last buffer (if it's
anonymous and has space in it) or allocate and map a page and add a
new buffer, returning the mapped address and offset from it.
Then these loops would look like this:

        while (left) {
                p = append_pipe(iter, left, &off);
                if (!p)
                        break;
                chunk = min(left, PAGE_SIZE - off);
                rem = copy(p + off, whatever, chunk);
                chunk -= rem;
                kunmap_local(p);

                copied += chunk;
                left -= chunk;

                if (unlikely(rem)) {
                        pipe_revert(i, rem);
                        break;
                }
        }
        return copied;

with no push_pipe() used at all.  For operations that can't fail,
the things are simplified in an obvious way (rem is always 0).

Or we could have append_pipe() return a struct page * and leave
kmap_local_page() to the caller...

struct page *append_pipe(struct iov_iter *i, size_t size, unsigned *off)
{
        struct pipe_inode_info *pipe = i->pipe;
        unsigned offset = i->iov_offset;
        struct pipe_buffer *buf;
        struct page *page;

        if (offset && offset < PAGE_SIZE) {
                // some space in the last buffer; can we add to it?
                buf = pipe_buf(pipe, pipe->head - 1);
                if (allocated(buf)) {
                        size = min(size, PAGE_SIZE - offset);
                        buf->len += size;
                        i->iov_offset += size;
                        i->count -= size;
                        *off = offset;
                        return buf->page;       // or kmap_local_page(...)
                }
        }
        // OK, we need a new buffer
        size = min(size, PAGE_SIZE);
        if (pipe_full(.))
                return NULL;
        page = alloc_page(GFP_USER);
        if (!page)
                return NULL;
        // got it...
        buf = pipe_buf(pipe, pipe->head++);
        *buf = (struct pipe_buffer){ .ops = &default_pipe_buf_ops,
                                     .page = page, .len = size };
        i->head = pipe->head - 1;
        i->iov_offset = size;
        i->count -= size;
        *off = 0;
        return page;            // or kmap_local_page(...)
}

(matter of fact, the last part could use another helper in my tree - there
the tail would be
        // OK, we need a new buffer
        size = min(size, PAGE_SIZE);
        page = push_anon(pipe, size);
        if (!page)
                return NULL;
        i->head = pipe->head - 1;
        i->iov_offset = size;
        i->count -= size;
        *off = 0;
        return page;
)

Would that be readable enough from your POV?  That way push_pipe()
loses almost all callers and after the "make iov_iter_get_pages()
advancing" part of the series it simply goes away...

It's obviously too intrusive for backports, though - there I'd very much
prefer the variant I posted.

Comments?

PS: re local helpers:

static inline struct pipe_buffer *pipe_buf(const struct pipe_inode_info *pipe,
                                           unsigned int slot)
{
        return &pipe->bufs[slot & (pipe->ring_size - 1)];
}

pretty much all places where we cache pipe->ring_size - 1 had been
absolutely pointless; there are several exceptions, but back in 2019
"pipe: Use head and tail pointers for the ring, not cursor and length"
went overboard with microoptimizations...



Re: [RFC][PATCH] fix short copy handling in copy_mc_pipe_to_iter()

2022-06-14 Thread Al Viro
On Tue, Jun 14, 2022 at 07:36:19AM +0100, David Howells wrote:
> Al Viro  wrote:
> 
> > What's wrong with
> > p_occupancy = pipe_occupancy(head, tail);
> > if (p_occupancy >= pipe->max_usage)
> > return 0;
> > else
> > return pipe->max_usage - p_occupancy;
> 
> Because "pipe->max_usage - p_occupancy" can be negative.

Sure can.  And in that case you return 0; no problem wiht that.
It's what happens when occupancy is below max_usage that is weird.

> post_one_notification() is limited by pipe->ring_size, not pipe->max_usage.
> 
> The idea is to allow some slack in a watch pipe for the watch_queue code to
> use that userspace can't.

Sure.  And if this function is supposed to report how many times would
userspace be able to grab a slot, it's returning the wrong value.

Look: 32-slot ring.  max_usage is 16.  14 slots are already occupied.
Userland (sure as hell, anything in iov_iter.c) will be able to occupy
two more before it runs into the pipe_full().  And your function returns
min(32 - 14, 16), i.e. 16.

What am I missing here?
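A standalone check of the numbers above (plain userspace C, just to
illustrate; old_space() mirrors the current pipe_space_for_user(),
new_space() the suggested max_usage - occupancy form):

        #include <stdio.h>

        static unsigned int old_space(unsigned int occ, unsigned int ring,
                                      unsigned int max)
        {
                unsigned int space;

                if (occ >= max)
                        return 0;
                space = ring - occ;
                return space > max ? max : space;
        }

        static unsigned int new_space(unsigned int occ, unsigned int max)
        {
                return occ >= max ? 0 : max - occ;
        }

        int main(void)
        {
                /* 32-slot ring, max_usage 16, 14 slots occupied */
                printf("old: %u, new: %u\n", old_space(14, 32, 16),
                       new_space(14, 16));
                /* prints "old: 16, new: 2" - only two more slots fit */
                return 0;
        }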



Re: [RFC][PATCH] fix short copy handling in copy_mc_pipe_to_iter()

2022-06-13 Thread Al Viro
On Mon, Jun 13, 2022 at 11:28:34PM +0100, Al Viro wrote:
> On Mon, Jun 13, 2022 at 10:54:36AM -0700, Linus Torvalds wrote:
> > On Sun, Jun 12, 2022 at 5:10 PM Al Viro  wrote:
> > >
> > > Unlike other copying operations on ITER_PIPE, copy_mc_to_iter() can
> > > result in a short copy.  In that case we need to trim the unused
> > > buffers, as well as the length of partially filled one - it's not
> > > enough to set ->head, ->iov_offset and ->count to reflect how
> > > much had we copied.  Not hard to fix, fortunately...
> > >
> > > I'd put a helper (pipe_discard_from(pipe, head)) into pipe_fs_i.h,
> > > rather than iov_iter.c -
> > 
> > Actually, since this "copy_mc_xyz()" stuff is going to be entirely
> > impossible to debug and replicate for any normal situation, I would
> > suggest we take the approach that we (long ago) used to take with
> > copy_from_user(): zero out the destination buffer, so that developers
> > that can't test the faulting behavior don't have to worry about it.
> > 
> > And then the existing code is fine: it will break out of the loop, but
> > it won't do the odd revert games and the "randomnoise.len -= rem"
> > thing that I can't wrap my head around.
> > 
> > Hmm?
> 
> Not really - we would need to zero the rest of those pages somehow.
> They are already allocated and linked into pipe; leaving them
> there (and subsequent ones hadn't seen any stores whatsoever - they
> are fresh out of alloc_page(GFP_USER)) is a non-starter.
> 
> We could do allocation as we go, but that's a much more intrusive
> change...

FWIW, I've got quite a bit of cleanups in the local tree; reordering and
cleaning that queue up at the moment, will post tonight or tomorrow.

I've looked into doing allocations page-by-page (instead of single
push_pipe(), followed by copying into those).  Doable, but it ends
up being much messier.

IMO this "truncate on failure" approach is saner.



Re: [RFC][PATCH] fix short copy handling in copy_mc_pipe_to_iter()

2022-06-13 Thread Al Viro
On Tue, Jun 14, 2022 at 12:25:03AM +0100, Al Viro wrote:

> The more I'm looking at that thing, the more it smells like a bug;
> it had the same 3 callers since the time it had been introduced.
> 
> 1) pipe_get_pages().  We are about to try and allocate up to that
> many pipe buffers.  Allocation (done in push_pipe()) is done only
> if we have !pipe_full(pipe->head, pipe->tail, pipe->max_usage).
> 
> It simply won't give you more than max_usage - occupancy.
> Your function returns min(ring_size - occupancy, max_usage), which
> is always greater than or equal to that (ring_size >= max_usage).
> 
> 2) pipe_get_pages_alloc().  Same story, same push_pipe() being
> called, same "we'll never get that much - it'll hit the limit
> first".
> 
> 3) iov_iter_npages() in case of ITER_PIPE.  Again, the value
> is bogus - it should not be greater than the amount of pages
> we would be able to write there.
> 
> AFAICS, 6718b6f855a0 "pipe: Allow pipes to have kernel-reserved slots"
> broke it for cases when ring_size != max_usage...

Unless I'm missing something, the following would do the right thing.
Dave?

diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 4ea496924106..c22173d6e500 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -165,15 +165,10 @@ static inline bool pipe_full(unsigned int head, unsigned 
int tail,
 static inline unsigned int pipe_space_for_user(unsigned int head, unsigned int 
tail,
   struct pipe_inode_info *pipe)
 {
-   unsigned int p_occupancy, p_space;
-
-   p_occupancy = pipe_occupancy(head, tail);
+   unsigned int p_occupancy = pipe_occupancy(head, tail);
if (p_occupancy >= pipe->max_usage)
return 0;
-   p_space = pipe->ring_size - p_occupancy;
-   if (p_space > pipe->max_usage)
-   p_space = pipe->max_usage;
-   return p_space;
+   return pipe->max_usage - p_occupancy;
 }
 
 /**



Re: [RFC][PATCH] fix short copy handling in copy_mc_pipe_to_iter()

2022-06-13 Thread Al Viro
On Mon, Jun 13, 2022 at 11:28:34PM +0100, Al Viro wrote:

> Dave, could you explain what's going on there?  Note that pipe_write()
> does *not* use that thing at all; it's only splice (i.e. ITER_PIPE
> stuff) that is using it.
> 
> What's wrong with
> p_occupancy = pipe_occupancy(head, tail);
> if (p_occupancy >= pipe->max_usage)
> return 0;
>   else
>   return pipe->max_usage - p_occupancy;
> 
> which would match the way you are using ->max_usage in pipe_write()
> et.al.  Including the use in copy_page_to_iter_pipe(), BTW...

The more I'm looking at that thing, the more it smells like a bug;
it had the same 3 callers since the time it had been introduced.

1) pipe_get_pages().  We are about to try and allocate up to that
many pipe buffers.  Allocation (done in push_pipe()) is done only
if we have !pipe_full(pipe->head, pipe->tail, pipe->max_usage).

It simply won't give you more than max_usage - occupancy.
Your function returns min(ring_size - occupancy, max_usage), which
is always greater than or equal to that (ring_size >= max_usage).

2) pipe_get_pages_alloc().  Same story, same push_pipe() being
called, same "we'll never get that much - it'll hit the limit
first".

3) iov_iter_npages() in case of ITER_PIPE.  Again, the value
is bogus - it should not be greater than the amount of pages
we would be able to write there.

AFAICS, 6718b6f855a0 "pipe: Allow pipes to have kernel-reserved slots"
broke it for cases when ring_size != max_usage...



Re: [RFC][PATCH] fix short copy handling in copy_mc_pipe_to_iter()

2022-06-13 Thread Al Viro
On Mon, Jun 13, 2022 at 10:54:36AM -0700, Linus Torvalds wrote:
> On Sun, Jun 12, 2022 at 5:10 PM Al Viro  wrote:
> >
> > Unlike other copying operations on ITER_PIPE, copy_mc_to_iter() can
> > result in a short copy.  In that case we need to trim the unused
> > buffers, as well as the length of partially filled one - it's not
> > enough to set ->head, ->iov_offset and ->count to reflect how
> > much had we copied.  Not hard to fix, fortunately...
> >
> > I'd put a helper (pipe_discard_from(pipe, head)) into pipe_fs_i.h,
> > rather than iov_iter.c -
> 
> Actually, since this "copy_mc_xyz()" stuff is going to be entirely
> impossible to debug and replicate for any normal situation, I would
> suggest we take the approach that we (long ago) used to take with
> copy_from_user(): zero out the destination buffer, so that developers
> that can't test the faulting behavior don't have to worry about it.
> 
> And then the existing code is fine: it will break out of the loop, but
> it won't do the odd revert games and the "randomnoise.len -= rem"
> thing that I can't wrap my head around.
> 
> Hmm?

Not really - we would need to zero the rest of those pages somehow.
They are already allocated and linked into the pipe; leaving them
there (and the subsequent ones haven't seen any stores whatsoever - they
are fresh out of alloc_page(GFP_USER)) is a non-starter.

We could do allocation as we go, but that's a much more intrusive
change...

BTW, speaking of pipes:
static inline unsigned int pipe_space_for_user(unsigned int head, unsigned int 
tail,
   struct pipe_inode_info *pipe)
{
unsigned int p_occupancy, p_space;

p_occupancy = pipe_occupancy(head, tail);
if (p_occupancy >= pipe->max_usage)
return 0;
p_space = pipe->ring_size - p_occupancy;
if (p_space > pipe->max_usage)
p_space = pipe->max_usage;
return p_space;
}

OK, if head - tail >= max_usage, we get 0.  Fair enough, since
pipe_full() callers will get "it's full, sod off" in that situation.
But...  what the hell is the rest doing?  p_space is the amount of
slots not in use.  So we return the lesser of it and max_usage?

Suppose we have 128 slots in the ring, with max_usage being below
that (e.g. 64).  63 slots are in use; you can add at most one.
And p_space is 65, so this sucker will return 64.
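
To make that arithmetic easy to poke at, here is a standalone userspace
sketch of the two formulas - plain integers, nothing kernel-specific, with
the numbers from the example above:

	/* illustration only; ring_size/max_usage/occupancy are just ints here */
	#include <stdio.h>

	static unsigned int old_space(unsigned int occupancy,
				      unsigned int ring_size,
				      unsigned int max_usage)
	{
		unsigned int p_space;

		if (occupancy >= max_usage)
			return 0;
		p_space = ring_size - occupancy;	/* 128 - 63 = 65 */
		if (p_space > max_usage)
			p_space = max_usage;		/* clamped to 64 */
		return p_space;
	}

	static unsigned int new_space(unsigned int occupancy,
				      unsigned int max_usage)
	{
		if (occupancy >= max_usage)
			return 0;
		return max_usage - occupancy;		/* 64 - 63 = 1 */
	}

	int main(void)
	{
		printf("%u vs %u\n", old_space(63, 128, 64), new_space(63, 64));
		return 0;
	}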

Dave, could you explain what's going on there?  Note that pipe_write()
does *not* use that thing at all; it's only splice (i.e. ITER_PIPE
stuff) that is using it.

What's wrong with
p_occupancy = pipe_occupancy(head, tail);
if (p_occupancy >= pipe->max_usage)
return 0;
else
return pipe->max_usage - p_occupancy;

which would match the way you are using ->max_usage in pipe_write()
et.al.  Including the use in copy_page_to_iter_pipe(), BTW...



[RFC][PATCH] fix short copy handling in copy_mc_pipe_to_iter()

2022-06-12 Thread Al Viro
[commit in question sits in vfs.git#fixes]

Unlike other copying operations on ITER_PIPE, copy_mc_to_iter() can
result in a short copy.  In that case we need to trim the unused
buffers, as well as the length of partially filled one - it's not
enough to set ->head, ->iov_offset and ->count to reflect how
much we had copied.  Not hard to fix, fortunately...

I'd put a helper (pipe_discard_from(pipe, head)) into pipe_fs_i.h,
rather than iov_iter.c - it has nothing to do with iov_iter and
having it will allow us to avoid an ugly kludge in fs/splice.c.
We could put it into lib/iov_iter.c for now and move it later,
but I don't see the point going that way...

Fixes: ca146f6f091e "lib/iov_iter: Fix pipe handling in _copy_to_iter_mcsafe()"
Signed-off-by: Al Viro 
---
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index cb0fd633a610..4ea496924106 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -229,6 +229,15 @@ static inline bool pipe_buf_try_steal(struct 
pipe_inode_info *pipe,
return buf->ops->try_steal(pipe, buf);
 }
 
+static inline void pipe_discard_from(struct pipe_inode_info *pipe,
+   unsigned int old_head)
+{
+   unsigned int mask = pipe->ring_size - 1;
+
+   while (pipe->head > old_head)
+   pipe_buf_release(pipe, &pipe->bufs[--pipe->head & mask]);
+}
+
 /* Differs from PIPE_BUF in that PIPE_SIZE is the length of the actual
memory allocation, whereas PIPE_BUF makes atomicity guarantees.  */
 #define PIPE_SIZE  PAGE_SIZE
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 0b64695ab632..2bf20b48a04a 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -689,6 +689,7 @@ static size_t copy_mc_pipe_to_iter(const void *addr, size_t 
bytes,
struct pipe_inode_info *pipe = i->pipe;
unsigned int p_mask = pipe->ring_size - 1;
unsigned int i_head;
+   unsigned int valid = pipe->head;
size_t n, off, xfer = 0;
 
if (!sanity(i))
@@ -702,11 +703,17 @@ static size_t copy_mc_pipe_to_iter(const void *addr, 
size_t bytes,
rem = copy_mc_to_kernel(p + off, addr + xfer, chunk);
chunk -= rem;
kunmap_local(p);
-   i->head = i_head;
-   i->iov_offset = off + chunk;
-   xfer += chunk;
-   if (rem)
+   if (chunk) {
+   i->head = i_head;
+   i->iov_offset = off + chunk;
+   xfer += chunk;
+   valid = i_head + 1;
+   }
+   if (rem) {
+   pipe->bufs[i_head & p_mask].len -= rem;
+   pipe_discard_from(pipe, valid);
break;
+   }
n -= chunk;
off = 0;
i_head++;



Re: Invalid License ID: GPL-3.0 for arch/mips/boot/dts/loongson/loongson64-2k1000.dtsi

2021-04-20 Thread Al Viro
On Tue, Apr 20, 2021 at 11:39:04AM +0200, Lukas Bulwahn wrote:
> Dear Qing,
> 
> ./scripts/spdxcheck.py reports:
> 
> arch/mips/boot/dts/loongson/loongson64-2k1000.dtsi: 1:28 Invalid
> License ID: GPL-3.0
> 
> You have contributed this file with commit b1a792601f26 ("MIPS:
> Loongson64: DeviceTree for Loongson-2K1000") to the current
> linux-next.
> 
> Do you intend to contribute this under this non-default license
> (GPL-3.0) for the kernel project or did you simply mean the default
> license GPL-2.0?

Ouch...  That's quite likely to be impossible to distribute, since the
requirements of v2 and v3 are mutually incompatible and quite a bit of
the kernel is under v2-only, not v2-or-any-later-version.

Seriously, folks - talk to lawyers; if the result is *NOT* a mere aggregation
(i.e. is derived work wrt the kernel proper), you are in a copyright violation.

Moreover, if anything in that file is derived from others' work, you can't
use GPL-3.0 unless the terms for everything it's derived from allow
redistribution under GPL-3.0.  Anything derived from others' GPL-2.0 work
(as opposed to GPL-2.0-or-later one) can't be relicensed to GPL-3.0 at
all.


Re: [PATCH 2/3] nios2: Cleanup deprecated function strlen_user

2021-04-20 Thread Al Viro
On Tue, Apr 20, 2021 at 04:32:33PM +0200, Arnd Bergmann wrote:
> On Tue, Apr 20, 2021 at 3:37 PM  wrote:
> >
> > From: Guo Ren 
> >
> > $ grep strlen_user * -r
> > arch/csky/include/asm/uaccess.h:#define strlen_user(str) strnlen_user(str, 
> > 32767)
> > arch/csky/lib/usercopy.c: * strlen_user: - Get the size of a string in user 
> > space.
> > arch/ia64/lib/strlen.S: // Please note that in the case of strlen() as 
> > opposed to strlen_user()
> > arch/mips/lib/strnlen_user.S: *  make strlen_user and strnlen_user access 
> > the first few KSEG0
> > arch/nds32/include/asm/uaccess.h:extern __must_check long strlen_user(const 
> > char __user * str);
> > arch/nios2/include/asm/uaccess.h:extern __must_check long strlen_user(const 
> > char __user *str);
> > arch/riscv/include/asm/uaccess.h:extern long __must_check strlen_user(const 
> > char __user *str);
> > kernel/trace/trace_probe_tmpl.h:static nokprobe_inline int 
> > fetch_store_strlen_user(unsigned long addr);
> > kernel/trace/trace_probe_tmpl.h:ret += 
> > fetch_store_strlen_user(val + code->offset);
> > kernel/trace/trace_uprobe.c:fetch_store_strlen_user(unsigned long addr)
> > kernel/trace/trace_kprobe.c:fetch_store_strlen_user(unsigned long addr)
> > kernel/trace/trace_kprobe.c:return 
> > fetch_store_strlen_user(addr);
> 
> I would suggest using "grep strlen_user * -rw", to let the whole-word match
> filter out the irrelevant ones for the changelog.
> 
> > See grep result, nobody uses it.
> >
> > Signed-off-by: Guo Ren 
> > Cc: Arnd Bergmann 
> 
> All three patches
> 
> Reviewed-by: Arnd Bergmann 
> 
> Do you want me to pick them up in the asm-generic tree?

Might make sense to check -next from time to time...  See
a0d8d552783b "whack-a-mole: kill strlen_user() (again)" in there


Re: [PATCH] net/rds: Avoid potential use after free in rds_send_remove_from_sock

2021-04-19 Thread Al Viro
On Tue, Apr 06, 2021 at 07:09:12PM -0500, Aditya Pakki wrote:

> --- a/net/rds/send.c
> +++ b/net/rds/send.c
> @@ -665,7 +665,7 @@ static void rds_send_remove_from_sock(struct list_head 
> *messages, int status)
>  unlock_and_drop:
>   spin_unlock_irqrestore(&rm->m_rs_lock, flags);
>   rds_message_put(rm);
> - if (was_on_sock)
> + if (was_on_sock && rm)
>   rds_message_put(rm);

Look at the code immediately prior to the site of your "fix".
Think for a second what will happen if we *could* get there with rm
equal to NULL (with your patch applied, that is).  Now, try to construct
the sequence of events that would lead to that situation.  Either you
will arrive at a real bug (in which case your fix does not actually
fix anything) *OR* you will get closer to the realization that "defensive
programming" tends to be worthless garbage.  In both cases the result
would be useful...

Incidentally, locate the place where that variable is
last modified and find the precondition required for rm == NULL
downstream of that.
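
For context, the quoted hunk sits at the bottom of a list iteration in
rds_send_remove_from_sock() that looks roughly like this (paraphrased from
memory rather than quoted verbatim - treat the details as approximate):

	list_for_each_entry_safe(rm, tmp, messages, m_conn_item) {
		/* ... work out whether the message was on a socket,
		 * setting was_on_sock accordingly ... */
	unlock_and_drop:
		spin_unlock_irqrestore(&rm->m_rs_lock, flags);
		rds_message_put(rm);
		if (was_on_sock)
			rds_message_put(rm);
	}

rm is the iterator of that loop and has already been dereferenced a line
above the added check, so a NULL test at that point cannot do anything
useful.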

Plainly put, the patch demonstrates either complete lack of
understanding or somebody not acting in good faith.  If it's the latter[1],
may I suggest the esteemed sociologists to fuck off and stop
testing the reviewers with deliberately spewed excrements?

[1] 
https://github.com/QiushiWu/QiushiWu.github.io/blob/main/papers/OpenSourceInsecurity.pdf


Re: [PATCH] x86/uaccess: small optimization in unsafe_copy_to_user()

2021-04-17 Thread Al Viro
On Sat, Apr 17, 2021 at 03:11:49PM -0700, Linus Torvalds wrote:
> On Sat, Apr 17, 2021 at 1:35 PM Al Viro  wrote:
> >
> > No, wait - we have non-NULL buf->prev_reclen, so we'll hit
> > with buf->error completely ignored.  Nevermind.
> 
> Yeah, I'm pretty sure I even tested that -EINTR case at one point.
> 
> Of course, it could easily have gotten broken again, so that's not a
> strong argument.
> 
> That said, the "buf->err" handling has always been very confusing, and
> it would be lovely to get rid of that confusion.
> 
> I don't remember why we did it that way, but I think it's because
> low-level filesystems ended up changing the error that the filldir()
> function returned or something like that.

Propagating errors from e.g. filldir() out through ->iterate() instances 
turns out to be painful.  If anything, considering that there are a lot
more ->iterate/->iterate_shared instances than ->actor ones, it would
make sense to change the calling conventions for the latter.  IOW,
stop pretending that they return an error value and just have them
return a bool: "should the caller keep going".

Here's what I've got sitting in a local branch; not sure if it would
be better to invert the rules, though - I went for "should we keep
going", but we could do "should we stop" instead.

Change calling conventions for filldir_t

filldir_t instances (directory iterators callbacks) used to return 0 for
"OK, keep going" or -E... for "stop".  Note that it's *NOT* how the
error values are reported - the rules for those are callback-dependent
and ->iterate{,_shared}() instances only care about zero vs. non-zero
(look at emit_dir() and friends).

So let's just return bool - it's less confusing that way.

Signed-off-by: Al Viro 
---
diff --git a/Documentation/filesystems/porting.rst 
b/Documentation/filesystems/porting.rst
index 0302035781be..268c2ac616d1 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -890,3 +890,14 @@ been called or returned with non -EIOCBQUEUED code.
 
 mnt_want_write_file() can now only be paired with mnt_drop_write_file(),
 whereas previously it could be paired with mnt_drop_write() as well.
+
+---
+
+*mandatory*
+
+filldir_t (readdir callbacks) calling conventions have changed.  Instead of
+returning 0 or -E... it returns bool now.  true means "keep going" (as 0 used
+to) and false - "no more" (as -E... in old calling conventions).  Rationale:
+callers never looked at specific -E... values anyway.  ->iterate() and
+->iterate_shared() instances require no changes at all, all filldir_t ones in
+the tree converted.
diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index d5367a1c6300..6864794b663a 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -107,7 +107,7 @@ struct osf_dirent_callback {
int error;
 };
 
-static int
+static bool
 osf_filldir(struct dir_context *ctx, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
 {
@@ -119,11 +119,11 @@ osf_filldir(struct dir_context *ctx, const char *name, 
int namlen,
 
buf->error = -EINVAL;   /* only used if we fail */
if (reclen > buf->count)
-   return -EINVAL;
+   return false;
d_ino = ino;
if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
buf->error = -EOVERFLOW;
-   return -EOVERFLOW;
+   return false;
}
if (buf->basep) {
if (put_user(offset, buf->basep))
@@ -140,10 +140,10 @@ osf_filldir(struct dir_context *ctx, const char *name, 
int namlen,
dirent = (void __user *)dirent + reclen;
buf->dirent = dirent;
buf->count -= reclen;
-   return 0;
+   return true;
 Efault:
buf->error = -EFAULT;
-   return -EFAULT;
+   return false;
 }
 
 SYSCALL_DEFINE4(osf_getdirentries, unsigned int, fd,
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 714fcca9af99..aa3bdf389d47 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -24,9 +24,9 @@ static int afs_readdir(struct file *file, struct dir_context 
*ctx);
 static int afs_d_revalidate(struct dentry *dentry, unsigned int flags);
 static int afs_d_delete(const struct dentry *dentry);
 static void afs_d_iput(struct dentry *dentry, struct inode *inode);
-static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name, 
int nlen,
+static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name, 
int nlen,
  loff_t fpos, u64 ino, unsigned dtype);
-static int afs_lookup_filldir(struct dir_context *ctx, const char *name, int 
nlen,
+static bool afs_lookup_filldir(struct dir_context *ctx, const char *name, int 
nlen,
  loff_t fpos,

Re: [PATCH] x86/uaccess: small optimization in unsafe_copy_to_user()

2021-04-17 Thread Al Viro
On Sat, Apr 17, 2021 at 08:30:42PM +, Al Viro wrote:

> and that thing is still there.  However, it does *NOT* do what it might
> appear to be doing; it ends up with getdents() returning -EINVAL instead.
> What we need is
>   buf->error = -EINTR;
> in addition to that return (in all 3 such places).  Do you have any problems 
> with
> the following delta?

No, wait - we have non-NULL buf->prev_reclen, so we'll hit
if (buf.prev_reclen) {
struct compat_linux_dirent __user * lastdirent;
lastdirent = (void __user *)buf.current_dir - buf.prev_reclen;

if (put_user(buf.ctx.pos, &lastdirent->d_off))
error = -EFAULT;
else
error = count - buf.count;
}
with buf->error completely ignored.  Nevermind.


Re: [PATCH] x86/uaccess: small optimization in unsafe_copy_to_user()

2021-04-17 Thread Al Viro
[tytso Cc'd]

On Sat, Apr 17, 2021 at 06:09:44PM +, Al Viro wrote:

> > Al - fairly trivial patch applied, comments?
> 
> Should be fine...  FWIW, I've a patch in the same area, making those suckers
> return bool.  Seeing that they are only ever called via dir_emit(), 
> dir_emit_dot()
> and dir_emit_dotdot() and all of those return ->actor(...) == 0...
> 
> Anyway, that'd be trivial to rebase on top of yours.

Actually... looking through that patch now I've noticed something fishy:
in 1f60fbe72749 ("ext4: allow readdir()'s of large empty directories to
be interrupted" we had
@@ -169,6 +169,8 @@ static int filldir(struct dir_context *ctx, const char 
*name, int namlen,
}
dirent = buf->previous;
if (dirent) {
+   if (signal_pending(current))
+   return -EINTR;

and that thing is still there.  However, it does *NOT* do what it might
appear to be doing; it ends up with getdents() returning -EINVAL instead.
What we need is
buf->error = -EINTR;
in addition to that return (in all 3 such places).  Do you have any problems 
with
the following delta?

diff --git a/fs/readdir.c b/fs/readdir.c
index 19434b3c982c..c742db935847 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -239,8 +239,10 @@ static int filldir(struct dir_context *ctx, const char 
*name, int namlen,
return -EOVERFLOW;
}
prev_reclen = buf->prev_reclen;
-   if (prev_reclen && signal_pending(current))
+   if (prev_reclen && signal_pending(current)) {
+   buf->error = -EINTR;
return -EINTR;
+   }
dirent = buf->current_dir;
prev = (void __user *) dirent - prev_reclen;
if (!user_write_access_begin(prev, reclen + prev_reclen))
@@ -321,8 +323,10 @@ static int filldir64(struct dir_context *ctx, const char 
*name, int namlen,
if (reclen > buf->count)
return -EINVAL;
prev_reclen = buf->prev_reclen;
-   if (prev_reclen && signal_pending(current))
+   if (prev_reclen && signal_pending(current)) {
+   buf->error = -EINTR;
return -EINTR;
+   }
dirent = buf->current_dir;
prev = (void __user *)dirent - prev_reclen;
if (!user_write_access_begin(prev, reclen + prev_reclen))
@@ -488,8 +492,10 @@ static int compat_filldir(struct dir_context *ctx, const 
char *name, int namlen,
return -EOVERFLOW;
}
prev_reclen = buf->prev_reclen;
-   if (prev_reclen && signal_pending(current))
+   if (prev_reclen && signal_pending(current)) {
+   buf->error = -EINTR;
return -EINTR;
+   }
dirent = buf->current_dir;
prev = (void __user *) dirent - prev_reclen;
if (!user_write_access_begin(prev, reclen + prev_reclen))


Re: [PATCH] x86/uaccess: small optimization in unsafe_copy_to_user()

2021-04-17 Thread Al Viro
On Sat, Apr 17, 2021 at 09:27:04AM -0700, Linus Torvalds wrote:
> On Sat, Apr 17, 2021 at 9:08 AM Linus Torvalds
>  wrote:
> >
> > Side note: I'm, looking at the readdir cases that I wrote, and I have
> > to just say that is broken too. So "stones and glass houses" etc, and
> > I'll have to fix that too.
> 
> In particular, the very very old OLD_READDIR interface that only fills
> in one dirent at a time didn't call verify_dirent_name(). Same for the
> compat version.
> 
> This requires a corrupt filesystem to be an issue (and even then,
> most/all would have the length of a directory entry in an 'unsigned
> char', so even corrupt filesystems would generally never have a
> negative name length).
> 
> So I don't think it's an issue in _practice_, but at the same time it
> is very much an example of the same issue that put_cmsg() has in
> net-next: unsafe user copies should be fully guarded and not have some
> "but this would never happen because callers would never do anything
> bad".
> 
> Al - fairly trivial patch applied, comments?

Should be fine...  FWIW, I've a patch in the same area, making those suckers
return bool.  Seeing that they are only ever called via dir_emit(), 
dir_emit_dot()
and dir_emit_dotdot() and all of those return ->actor(...) == 0...

Anyway, that'd be trivial to rebase on top of yours.


Re: [PATCH 04/13] Kbuild: Rust support

2021-04-16 Thread Al Viro
On Sat, Apr 17, 2021 at 12:04:16AM +0200, Willy Tarreau wrote:

> Yep but I kept it just to have comparable output code since in C
> you'd simply use "goto leave" and not have this function call to
> do the cleanup.

... or use any number of other techniques; the real question was how
many of the cleanups would be skipped by that syntax sugar.

IME anything that interacts with flow control should be as explicit
and unambiguous as possible.  _Especially_ concerning how large
a scope are we leaving.

There's a bunch of disciplines that make use of that kind of tools
and do it more or less safely, but they need to be well-specified
and very well understood.  And some tools (C++-style exceptions,
for example) simply need to be taken out and shot, but that's
a separate story^Wflamewar...


Re: [PATCH] x86/uaccess: small optimization in unsafe_copy_to_user()

2021-04-16 Thread Al Viro
On Fri, Apr 16, 2021 at 12:24:13PM -0700, Eric Dumazet wrote:
> From: Eric Dumazet 
> 
> We have to loop only to copy u64 values.
> After this first loop, we copy at most one u32, one u16 and one byte.

Does it actually yield a better code?

FWIW, this
void bar(unsigned);
void foo(unsigned n)
{
while (n >= 8) {
bar(n);
n -= 8;
}
while (n >= 4) {
bar(n);
n -= 4;
}
while (n >= 2) {
bar(n);
n -= 2;
}
while (n >= 1) {
bar(n);
n -= 1;
}
}

will compile (with -O2) to
pushq   %rbp
pushq   %rbx
movl%edi, %ebx
subq$8, %rsp
cmpl$7, %edi
jbe .L2
movl%edi, %ebp
.L3:
movl%ebp, %edi
subl$8, %ebp
callbar@PLT
cmpl$7, %ebp
ja  .L3
andl$7, %ebx
.L2:
cmpl$3, %ebx
jbe .L4
movl%ebx, %edi
andl$3, %ebx
callbar@PLT
.L4:
cmpl$1, %ebx
jbe .L5
movl%ebx, %edi
andl$1, %ebx
callbar@PLT
.L5:
testl   %ebx, %ebx
je  .L1
addq$8, %rsp
movl$1, %edi
popq%rbx
popq%rbp
jmp bar@PLT
.L1:
addq$8, %rsp
popq%rbx
popq%rbp
ret

i.e. loop + if + if + if...


Re: [PATCH 04/13] Kbuild: Rust support

2021-04-16 Thread Al Viro
On Fri, Apr 16, 2021 at 07:47:32PM +0200, Miguel Ojeda wrote:
> On Fri, Apr 16, 2021 at 7:05 PM Linus Torvalds
>  wrote:
> >
> > Typical Rust error handling should match the regular kernel
> > IS_ERR/ERR_PTR/PTR_ERR model fairly well, although the syntax is
> > fairly different (and it's not limited to pointers).
> 
> Yeah, exactly. We already have a `KernelResult` type which is a
> `Result`, where `Error` is a wrapper for the usual kernel
> int errors.
> 
> So, for instance, a function that can either fail or return `Data`
> would have a declaration like:
> 
> pub fn foo() -> KernelResult
> 
> A caller that needs to handle the error can use pattern matching or
> one of the methods in `Result`. And if they only need to bubble the
> error up, they can use the ? operator:
> 
> pub fn bar() -> KernelResult {
> let data = foo()?;
> 
> // `data` is already a `Data` here, not a `KernelResult`
> }

Umm...  A fairly common situation is

foo() returns a pointer to struct foo instance or ERR_PTR()
bar() returns a pointer to struct bar instance of ERR_PTR()

bar()
{
struct foo *p;
struct bar *res;
 // do some work, grab a mutex, etc.
p = foo();
if (IS_ERR(p))
res = ERR_CAST(p); // (void *)p, internally; conceptually it's
   // ERR_PTR(PTR_ERR(p));
else
res = blah();
 // matching cleanup
return res;
}

How well would ? operator fit that pattern?  _If_ it's just a syntax sugar
along the lines of "if argument matches Err(_), return Err(_)", the types
shouldn't be an issue, but that might need some fun with releasing resources,
etc.  If it's something more elaborate... details, please.


Re: [PATCH] hfsplus: prevent negative dentries when casefolded

2021-04-16 Thread Al Viro
On Sat, Apr 17, 2021 at 01:20:12AM +0800, Chung-Chiang Cheng wrote:
> hfsplus uses the case-insensitive filenames by default, but VFS negative
> dentries are incompatible with case-insensitive. For example, the
> following instructions will get a cached filename 'aaa' which isn't
> expected. There is no such problem in macOS.

Look into the way vfat et.al. are dealing with that (see their
->d_revalidate()).
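
For a rough idea of the pattern: the case-insensitive FAT variants install a
->d_revalidate() that refuses to trust cached negative dentries, so the next
lookup goes back to the filesystem.  A minimal sketch of that idea (function
name made up here, and deliberately simpler than what vfat actually does
about creation intent):

	static int ci_d_revalidate(struct dentry *dentry, unsigned int flags)
	{
		if (flags & LOOKUP_RCU)
			return -ECHILD;

		/* positive dentries went through a real lookup - keep them */
		if (d_really_is_positive(dentry))
			return 1;

		/*
		 * Never trust a cached negative dentry here: "AAA" may well
		 * exist on disk even though a lookup of "aaa" once failed.
		 * Returning 0 forces a fresh ->lookup().
		 */
		return 0;
	}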


Re: [PATCH] lsm:fix a missing-check bug in smack_sb_eat_lsm_opts()

2021-04-16 Thread Al Viro
On Fri, Apr 16, 2021 at 05:53:03PM +0800,  Zhongjun Tan wrote:

> @@ -710,13 +711,14 @@ static int smack_sb_eat_lsm_opts(char *options, void 
> **mnt_opts)
>   token = match_opt_prefix(from, len, &arg);
>   if (token != Opt_error) {
>   arg = kmemdup_nul(arg, from + len - arg, GFP_KERNEL);
> + if (!arg) {
> + rc = -ENOMEM;
> + goto free_mnt_opts;
>   rc = smack_add_opt(token, arg, mnt_opts);

if (arg)
rc = smack_add_opt(token, arg, mnt_opts);
else
rc = -ENOMEM;

and no other changes are needed anywhere...


Re: [PATCH 00/13] [RFC] Rust support

2021-04-15 Thread Al Viro
On Fri, Apr 16, 2021 at 03:22:16AM +0100, Wedson Almeida Filho wrote:

> > HTML is not a valid documentation format. Heck, markdown itself is
> > barely readable.
> 
> Are you stating [what you perceive as] a fact or just venting? If the former,
> would you mind enlightening us with some evidence?

How about "not everyone uses a browser as a part of their workflow"?
I realize that it might sound ridiculous for folks who spent a while
around Mozilla, but it's really true and kernel community actually
has quite a few of such freaks.  And as one of those freaks I can tell
you where exactly I would like you to go and what I would like you to do
with implicit suggestions to start a browser when I need to read some
in-tree documentation.

Linus might have different reasons, obviously.


Re: [PATCH] fs: split receive_fd_replace from __receive_fd

2021-04-15 Thread Al Viro
On Fri, Apr 02, 2021 at 12:01:05PM -0700, Kees Cook wrote:
> On Thu, Mar 25, 2021 at 09:22:09AM +0100, Christoph Hellwig wrote:
> > receive_fd_replace shares almost no code with the general case, so split
> > it out.  Also remove the "Bump the sock usage counts" comment from
> > both copies, as that is now what __receive_sock actually does.
> > 
> > Signed-off-by: Christoph Hellwig 
> 
> I'm okay with repeating code in fs/file.c. What I wanted to avoid was
> open coded combinations in various callers.

... and that got you a lovely userland ABI, where you have

(1) newfd >= 0, SECCOMP_ADDFD_FLAG_SETFD is present => replace
(2) newfd < 0, SECCOMP_ADDFD_FLAG_SETFD is present => insert
(3) newfd == 0, SECCOMP_ADDFD_FLAG_SETFD not present => insert
(4) newfd != 0, SECCOMP_ADDFD_FLAG_SETFD not present => -EINVAL

IMO (2) is a bug.  Whether we still can fix it or not... no idea, depends
on whether the actual userland has come to depend upon it.

I suggest turning (2) into an error (-EBADF is what you'd get from
attempt to set something at such descriptor) and seeing if anything
breaks.  And having SECCOMP_ADDFD_FLAG_SETFD status passed into kaddfd
explicitly, with explicit check in seccomp_handle_addfd().  As in

commit 42eb0d54c08a0331d6d295420f602237968d792b
Author: Christoph Hellwig 
Date:   Thu Mar 25 09:22:09 2021 +0100

fs: split receive_fd_replace from __receive_fd

receive_fd_replace shares almost no code with the general case, so split
it out.  Also remove the "Bump the sock usage counts" comment from
both copies, as that is now what __receive_sock actually does.

[AV: ... and make the only user of receive_fd_replace() choose between
it and receive_fd() according to what userland had passed to it in
flags]

Signed-off-by: Christoph Hellwig 
Signed-off-by: Al Viro 

diff --git a/fs/file.c b/fs/file.c
index f3a4bac2cbe9..d8ccb95a7f41 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -1068,8 +1068,6 @@ int replace_fd(unsigned fd, struct file *file, unsigned 
flags)
 
 /**
  * __receive_fd() - Install received file into file descriptor table
- *
- * @fd: fd to install into (if negative, a new fd will be allocated)
  * @file: struct file that was received from another process
  * @ufd: __user pointer to write new fd number to
  * @o_flags: the O_* flags to apply to the new fd entry
@@ -1083,7 +1081,7 @@ int replace_fd(unsigned fd, struct file *file, unsigned 
flags)
  *
  * Returns newly install fd or -ve on error.
  */
-int __receive_fd(int fd, struct file *file, int __user *ufd, unsigned int 
o_flags)
+int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
 {
int new_fd;
int error;
@@ -1092,32 +1090,33 @@ int __receive_fd(int fd, struct file *file, int __user 
*ufd, unsigned int o_flag
if (error)
return error;
 
-   if (fd < 0) {
-   new_fd = get_unused_fd_flags(o_flags);
-   if (new_fd < 0)
-   return new_fd;
-   } else {
-   new_fd = fd;
-   }
+   new_fd = get_unused_fd_flags(o_flags);
+   if (new_fd < 0)
+   return new_fd;
 
if (ufd) {
error = put_user(new_fd, ufd);
if (error) {
-   if (fd < 0)
-   put_unused_fd(new_fd);
+   put_unused_fd(new_fd);
return error;
}
}
 
-   if (fd < 0) {
-   fd_install(new_fd, get_file(file));
-   } else {
-   error = replace_fd(new_fd, file, o_flags);
-   if (error)
-   return error;
-   }
+   fd_install(new_fd, get_file(file));
+   __receive_sock(file);
+   return new_fd;
+}
 
-   /* Bump the sock usage counts, if any. */
+int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags)
+{
+   int error;
+
+   error = security_file_receive(file);
+   if (error)
+   return error;
+   error = replace_fd(new_fd, file, o_flags);
+   if (error)
+   return error;
__receive_sock(file);
return new_fd;
 }
diff --git a/include/linux/file.h b/include/linux/file.h
index 225982792fa2..2de2e4613d7b 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -92,23 +92,20 @@ extern void put_unused_fd(unsigned int fd);
 
 extern void fd_install(unsigned int fd, struct file *file);
 
-extern int __receive_fd(int fd, struct file *file, int __user *ufd,
+extern int __receive_fd(struct file *file, int __user *ufd,
unsigned int o_flags);
 static inline int receive_fd_user(struct file *file, int __user *ufd,
  unsigned int o_flags)
 {
if (ufd == NULL)
return -EFAULT;
-   return __receive_fd(

Re: [PATCH] fs: split receive_fd_replace from __receive_fd

2021-04-15 Thread Al Viro
On Thu, Mar 25, 2021 at 09:22:09AM +0100, Christoph Hellwig wrote:
> receive_fd_replace shares almost no code with the general case, so split
> it out.  Also remove the "Bump the sock usage counts" comment from
> both copies, as that is now what __receive_sock actually does.

Nice, except that you've misread that, er, lovely API.  This

> -static inline int receive_fd_replace(int fd, struct file *file, unsigned int 
> o_flags)
> -{
> - return __receive_fd(fd, file, NULL, o_flags);
> + return __receive_fd(file, NULL, o_flags);
>  }

can get called with negative fd (in which case it turns into an alias for
receive_fd(), of course).  As the result, that ioctl got broken in case
when SECCOMP_ADDFD_FLAG_SETFD is not set.  Trivially fixed by having the
only caller check the damn condition and call either receive_fd_replace()
or receive_fd().
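
The caller-side dispatch being suggested would be something along these lines
in seccomp_handle_addfd(); the field names are guesses based on the discussion
and the unrelated bookkeeping is omitted - a sketch, not the final patch:

	static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd)
	{
		/* decide based on what userland actually asked for, instead
		 * of overloading "fd < 0" inside __receive_fd() */
		if (addfd->setfd)
			addfd->ret = receive_fd_replace(addfd->fd, addfd->file,
							addfd->flags);
		else
			addfd->ret = receive_fd(addfd->file, addfd->flags);
		complete(&addfd->completion);
	}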


Re: [PATCH] direct-io: use read lock for DIO_LOCKING flag

2021-04-15 Thread Al Viro
On Thu, Apr 15, 2021 at 12:24:13PM +0200, Jan Kara wrote:
> On Thu 15-04-21 17:43:32, Chao Yu wrote:
> > 9902af79c01a ("parallel lookups: actual switch to rwsem") changes inode
> > lock from mutex to rwsem, however, we forgot to adjust lock for
> > DIO_LOCKING flag in do_blockdev_direct_IO(),

The change in question had nothing to do with the use of ->i_mutex for
regular files data access.

> > so let's change to hold read
> > lock to mitigate performance regression in the case of read DIO vs read DIO,
> > meanwhile it still keeps original functionality of avoiding buffered access
> > vs direct access.
> > 
> > Signed-off-by: Chao Yu 
> 
> Thanks for the patch but this is not safe. Originally we had exclusive lock
> (with i_mutex), switching to rwsem doesn't change that requirement. It may
> be OK for some filesystems to actually use shared acquisition of rwsem for
> DIO reads but it is not clear that is fine for all filesystems (and I
> suspect those filesystems that actually do care already don't use
> DIO_LOCKING flag or were already converted to iomap_dio_rw()). So unless
> you do audit of all filesystems using do_blockdev_direct_IO() with
> DIO_LOCKING flag and make sure they are all fine with inode lock in shared
> mode, this is a no-go.

Aye.  Frankly, I would expect that anyone bothering with that kind of
analysis for given filesystem (and there are fairly unpleasant ones in the
list) would just use the fruits of those efforts to convert it over to
iomap.

"Read DIO" does not mean that accesses to private in-core data structures used
by a given filesystem can be safely done in parallel.  So a blanket patch like
that is not safe at all.


Re: [PATCH v2 0/2] Fix binfmt_flat loader for RISC-V

2021-04-15 Thread Al Viro
On Thu, Apr 15, 2021 at 07:56:05AM +0200, Christoph Hellwig wrote:
> binfmt_flat tends to go through Greg's uclinux tree, adding him and
> the list.

FWIW, my involvement with binfmt_flat had been pretty much nil -
the least trivial had been "binfmt_flat: flat_{get,put}_addr_from_rp()
should be able to fail" about 4 years ago and that fell out of hunting
for places where __get_user() had been used without checking error values.

It's in fs/*, but I've no way to test it and I have pretty much
zero familiarity with the guts of that one, so I can't give any useful
feedback on that series.  So consider the Christoph's comment seconded -
you want it reviewed by gerg et.al., and it probably ought to go via
gerg/uclinux.git tree.

I'm reasonably familiar with binfmt_{elf,misc,script}; anything
else gets touched as part of larger series and only with sanity checks
from other folks, if the changes are not entirely trivial.


Re: linux-next: build warning after merge of the vfs tree

2021-04-15 Thread Al Viro
On Mon, Apr 12, 2021 at 03:07:56PM +0200, Miklos Szeredi wrote:
> Hi Al,
> 
> Fixed fileattr branch pushed to:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/vfs.git fileattr_v6

Merged and pushed out...


Re: [PATCH] Fix 'assignment to __be16' warning

2021-04-15 Thread Al Viro
On Mon, Apr 12, 2021 at 11:53:02AM +, Bence Csókás wrote:
> While the preamble field _is_ technically big-endian, its value is always 
> 0x2A2A,
> which is the same in either endianness, therefore it should be u16 instead.

Just replace the assignment with htons(0x2A2A) and be done with that - it's
a constant expression and the compiler will yield the same assembler.
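
A tiny standalone illustration of the point (struct and field names invented
for the example):

	#include <arpa/inet.h>
	#include <stdint.h>

	struct frame {
		uint16_t preamble;	/* big-endian on the wire */
	};

	static void frame_init(struct frame *f)
	{
		/* htons(0x2A2A) is a constant expression, so this compiles
		 * to the same store as a plain 0x2A2A assignment while still
		 * saying "this field is big-endian" at the assignment site */
		f->preamble = htons(0x2A2A);
	}

	int main(void)
	{
		struct frame f;

		frame_init(&f);
		return f.preamble == htons(0x2A2A) ? 0 : 1;
	}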


Re: [PATCH RFC 1/6] dcache: sweep cached negative dentries to the end of list of siblings

2021-04-15 Thread Al Viro
On Wed, Apr 14, 2021 at 03:00:48AM +, Al Viro wrote:

> Ugh...  So when dput() drives the refcount down to 0 you hit lock_parent()
> and only then bother to check if the sucker had been negative in the first
  ^
  had zero refcount, of course.
> place?

> > @@ -1970,6 +2021,8 @@ void d_instantiate(struct dentry *entry, struct inode 
> > * inode)
> >  {
> > BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
> > if (inode) {
> > +   if (d_is_tail_negative(entry))
> > +   recycle_negative(entry);
> > security_d_instantiate(entry, inode);
> > spin_lock(&inode->i_lock);
> > __d_instantiate(entry, inode);
> 
> Wait a bloody minute.  What about d_instantiate_new() right next to it?

Another fun question: where's the proof that __d_add(dentry, non_NULL_inode)
won't happen to dentry marked tail-negative?  From a quick grep I see at
least one such place - on success cifs_do_create() does
d_drop(direntry);
d_add(direntry, newinode);
and it would bloody well evade what you are doing in d_instantiate().
Same seems to be true for nfs_link()...


Re: [PATCH RFC 1/6] dcache: sweep cached negative dentries to the end of list of siblings

2021-04-15 Thread Al Viro
On Wed, Apr 14, 2021 at 03:41:10AM +, Al Viro wrote:

> > +   if (!d_is_tail_negative(dentry)) {
> > +   parent = lock_parent(dentry);
> > +   if (!parent)
> > +   return;
> 
> Wait a minute.  It's not a good environment for calling lock_parent().
> Who said that dentry won't get freed right under it?

[snip]

FWIW, in that state (dentry->d_lock held) we have
* stable ->d_flags
* stable ->d_count
* stable ->d_inode
IOW, we can bloody well check ->d_count *before* bothering with lock_parent().
It does not get rid of the problems with lifetimes, though.  We could
do something along the lines of

rcu_read_lock()
if retain_dentry()
parent = NULL
if that dentry might need to be moved in list
parent = lock_parent()
// if reached __dentry_kill(), d_count() will be -128,
// so the check below will exclude those
if that dentry does need to be moved
move it to the end of list
unlock dentry and parent (if any)
rcu_read_unlock()
return
here, but your other uses of lock_parent() also need attention.  And
recursive call of dput() in trim_negative() (#6/6) is really asking
for trouble.


Re: [PATCH RFC 6/6] dcache: prevent flooding with negative dentries

2021-04-13 Thread Al Viro
On Thu, Jan 21, 2021 at 06:49:45PM +0530, Gautham Ananthakrishna wrote:

> + spin_lock(&victim->d_lock);
> + parent = lock_parent(victim);
> +
> + rcu_read_unlock();

Similar story.  As soon as you hit that rcu_read_unlock(), the memory
pointed to by victim might be reused.  If you have hit __lock_parent(),
victim->d_lock had been dropped and regained.  Which means that freeing
might've been already scheduled.  Unlike #1/6, here you won't get
memory corruption in lock_parent() itself, but...

> +
> + if (d_count(victim) || !d_is_negative(victim) ||
> + (victim->d_flags & DCACHE_REFERENCED)) {
> + if (parent)
> + spin_unlock(&parent->d_lock);
> + spin_unlock(&victim->d_lock);

... starting from here you just might.


Re: [PATCH RFC 1/6] dcache: sweep cached negative dentries to the end of list of siblings

2021-04-13 Thread Al Viro
On Thu, Jan 21, 2021 at 06:49:40PM +0530, Gautham Ananthakrishna wrote:

> +static void sweep_negative(struct dentry *dentry)
> +{
> + struct dentry *parent;
> +
> + if (!d_is_tail_negative(dentry)) {
> + parent = lock_parent(dentry);
> + if (!parent)
> + return;

Wait a minute.  It's not a good environment for calling lock_parent().
Who said that dentry won't get freed right under it?

Right now callers of __lock_parent() either hold a reference to dentry
*or* are called for a positive dentry, with inode->i_lock held.
You are introducing something very different - 

>   if (likely(retain_dentry(dentry))) {
> + if (d_is_negative(dentry))
> + sweep_negative(dentry);
>   spin_unlock(&dentry->d_lock);

Here we can be called for a negative dentry with refcount already *NOT*
held by us.  Look:

static inline struct dentry *lock_parent(struct dentry *dentry)
{
struct dentry *parent = dentry->d_parent;
if (IS_ROOT(dentry))
return NULL;
isn't a root

if (likely(spin_trylock(&parent->d_lock)))
return parent;

no such luck - someone's already holding parent's ->d_lock

return __lock_parent(dentry);
and here we have
static struct dentry *__lock_parent(struct dentry *dentry)
{
struct dentry *parent;
rcu_read_lock();  

OK, anything we see in its ->d_parent is guaranteed to stay
allocated until we get to matching rcu_read_unlock()

spin_unlock(&dentry->d_lock);
dropped the spinlock, now it's fair game for d_move(), d_drop(), etc.

again:
parent = READ_ONCE(dentry->d_parent);
dentry couldn't have been reused, so it's the last value stored there.
Points to still allocated struct dentry instance, so we can...

spin_lock(&parent->d_lock);
grab its ->d_lock.

/*
 * We can't blindly lock dentry until we are sure
 * that we won't violate the locking order.
 * Any changes of dentry->d_parent must have
 * been done with parent->d_lock held, so
 * spin_lock() above is enough of a barrier
 * for checking if it's still our child.
 */
if (unlikely(parent != dentry->d_parent)) {
spin_unlock(&parent->d_lock);
goto again;
}
Nevermind, it's still equal to our ->d_parent.  So we have
the last valid parent's ->d_lock held

rcu_read_unlock();
What's to hold dentry allocated now?  IF we held its refcount - no
problem, it can't go away.  If we held its ->d_inode->i_lock - ditto
(it wouldn't get to __dentry_kill() until we drop that, since all
callers do acquire that lock and it couldn't get scheduled for
freeing until it gets through most of __dentry_kill()).

IOW, we are free to grab dentry->d_lock again.
if (parent != dentry)
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
else
parent = NULL;
return parent;
}

With your patch, though, you've got a call site where neither condition
is guaranteed.  Current kernel is fine - we are holding ->d_lock there,
and we don't touch dentry after it gets dropped.  Again, it can't get
scheduled for freeing until after we drop ->d_lock, so we are safe.
With that change, however, you've got a hard-to-hit memory corruptor
there...


Re: [PATCH RFC 1/6] dcache: sweep cached negative dentries to the end of list of siblings

2021-04-13 Thread Al Viro
On Thu, Jan 21, 2021 at 06:49:40PM +0530, Gautham Ananthakrishna wrote:
> From: Konstantin Khlebnikov 
> 
> For disk filesystems result of every negative lookup is cached, content of
> directories is usually cached too. Production of negative dentries isn't
> limited with disk speed. It's really easy to generate millions of them if
> system has enough memory. Negative dentries are linked into siblings list
> along with normal positive dentries. Some operations walks dcache tree but
> looks only for positive dentries: most important is fsnotify/inotify.
> 
> This patch moves negative dentries to the end of list at final dput() and
> marks with flag which tells that all following dentries are negative too.
> Reverse operation is required before instantiating negative dentry.

> +static void sweep_negative(struct dentry *dentry)
> +{
> + struct dentry *parent;
> +
> + if (!d_is_tail_negative(dentry)) {
> + parent = lock_parent(dentry);
> + if (!parent)
> + return;
> +
> + if (!d_count(dentry) && d_is_negative(dentry) &&
> + !d_is_tail_negative(dentry)) {
> + dentry->d_flags |= DCACHE_TAIL_NEGATIVE;
> + list_move_tail(&dentry->d_child, &parent->d_subdirs);
> + }
> +
> + spin_unlock(&parent->d_lock);
> + }
> +}

Ugh...  So when dput() drives the refcount down to 0 you hit lock_parent()
and only then bother to check if the sucker had been negative in the first
place?

> @@ -1970,6 +2021,8 @@ void d_instantiate(struct dentry *entry, struct inode * 
> inode)
>  {
>   BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
>   if (inode) {
> + if (d_is_tail_negative(entry))
> + recycle_negative(entry);
>   security_d_instantiate(entry, inode);
>   spin_lock(&inode->i_lock);
>   __d_instantiate(entry, inode);

Wait a bloody minute.  What about d_instantiate_new() right next to it?


Re: [PATCH RFC 0/6] fix the negative dentres bloating system memory usage

2021-04-13 Thread Al Viro
On Thu, Jan 21, 2021 at 06:49:39PM +0530, Gautham Ananthakrishna wrote:

> We tested this patch set recently and found it limiting negative dentry to a
> small part of total memory. The following is the test result we ran on two
> types of servers, one is 256G memory with 24 CPUS and another is 3T memory
> with 384 CPUS. The test case is using a lot of processes to generate negative
> dentry in parallel, the following is the test result after 72 hours, the
> negative dentry number is stable around that number even after running longer
> for much longer time. Without the patch set, in less than half an hour 197G 
> was
> taken by negative dentry on 256G system, in 1 day 2.4T was taken on 3T system.
> 
> system memory   neg-dentry-number   neg-dentry-mem-usage
> 256G5525908410.6G
> 3T  202306756   38.8G
> 
> For perf test, we ran the following, and no regression found.
> 
> 1. create 1M negative dentry and then touch them to convert them to positive
>dentry
> 
> 2. create 10K/100K/1M files
> 
> 3. remove 10K/100K/1M files
> 
> 4. kernel compile

Good for you; how would that work for thinner boxen, though?  I agree that if 
you
have 8M hash buckets your "no more than 3 unused negatives per bucket" is 
generous
enough for everything, but that's less obvious for something with e.g. 4 or 8
gigs.
And believe it or not, there are real-world boxen like that ;-)


Re: [GIT PULL] fileattr API

2021-04-08 Thread Al Viro
On Fri, Apr 09, 2021 at 01:52:11AM +, Al Viro wrote:
> On Wed, Apr 07, 2021 at 09:22:52PM +0200, Miklos Szeredi wrote:
> > Hi Al,
> > 
> > Please pull from:
> > 
> >   git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/vfs.git fileattr_v4
> > 
> > Convert all (with the exception of CIFS) filesystems from handling
> > FS_IOC_[GS]ETFLAGS and FS_IOC_FS[GS]ETXATTR themselves to new i_ops and
> > common code moved into the VFS for these ioctls.  This removes boilerplate
> > from filesystems, and allows these operations to be properly stacked in
> > overlayfs.
> 
> Umm...  v4 or v5?

Looks like they only differ in making a couple of fuse helpers static.
Grabbed and merged into #for-next; will push if it passes the smoke
test...


Re: [GIT PULL] fileattr API

2021-04-08 Thread Al Viro
On Wed, Apr 07, 2021 at 09:22:52PM +0200, Miklos Szeredi wrote:
> Hi Al,
> 
> Please pull from:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/vfs.git fileattr_v4
> 
> Convert all (with the exception of CIFS) filesystems from handling
> FS_IOC_[GS]ETFLAGS and FS_IOC_FS[GS]ETXATTR themselves to new i_ops and
> common code moved into the VFS for these ioctls.  This removes boilerplate
> from filesystems, and allows these operations to be properly stacked in
> overlayfs.

Umm...  v4 or v5?


Re: [PATCH v3 2/4] kernfs: use VFS negative dentry caching

2021-04-08 Thread Al Viro
On Fri, Apr 09, 2021 at 09:15:06AM +0800, Ian Kent wrote:
> + parent = kernfs_dentry_node(dentry->d_parent);
> + if (parent) {
> + const void *ns = NULL;
> +
> + if (kernfs_ns_enabled(parent))
> + ns = kernfs_info(dentry->d_parent->d_sb)->ns;

For any dentry d, we have d->d_parent->d_sb == d->d_sb.  All the time.
If you ever run into the case where that would not be true, you've found
a critical bug.
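
Concretely, that just means the quoted line can read the superblock off the
dentry itself; a sketch of the simplification, with everything around it left
as in the patch:

	if (kernfs_ns_enabled(parent))
		ns = kernfs_info(dentry->d_sb)->ns;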

> + kn = kernfs_find_ns(parent, dentry->d_name.name, ns);
> + if (kn)
> + goto out_bad;
> + }

Umm...  What's to prevent a race with successful rename(2)?  IOW, what's
there to stabilize ->d_parent and ->d_name while we are in that function?


Re: [PATCH v6 01/30] iov_iter: Add ITER_XARRAY

2021-04-08 Thread Al Viro
On Thu, Apr 08, 2021 at 03:04:07PM +0100, David Howells wrote:
> Add an iterator, ITER_XARRAY, that walks through a set of pages attached to
> an xarray, starting at a given page and offset and walking for the
> specified amount of bytes.  The iterator supports transparent huge pages.
> 
> The iterate_xarray() macro calls the helper function with rcu_access()
> helped.  I think that this is only a problem for iov_iter_for_each_range()
> - and that returns an error for ITER_XARRAY (also, this function does not
> appear to be called).

Unused since lustre had gone away.

> +#define iterate_all_kinds(i, n, v, I, B, K, X) { \

Do you have any users that would pass different B and X?

> @@ -1440,7 +1665,7 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
>   return v.bv_len;
>   }),({
>   return -EFAULT;
> - })
> + }), 0

Correction - users that might get that flavour.  This one explicitly checks
for xarray and doesn't get to iterate_... in that case.


Re: [RFC bpf-next 1/1] bpf: Introduce iter_pagecache

2021-04-08 Thread Al Viro
On Thu, Apr 08, 2021 at 01:49:35PM -0700, Daniel Xu wrote:

> Ah right, sorry. Nobody will clean up the super_block.
> 
> > IOW, NAK.  The objects you are playing with have non-trivial lifecycle
> > and poking into the guts of data structures without bothering to
> > understand it is not a good idea.
> > 
> > Rule of the thumb: if your code ends up using fields that are otherwise
> > handled by a small part of codebase, the odds are that you need to be
> > bloody careful.  In particular, ->ns_lock has 3 users - all in
> > fs/namespace.c.  ->list/->mnt_list: all users in fs/namespace.c and
> > fs/pnode.c.  ->s_active: majority in fs/super.c, with several outliers
> > in filesystems and safety of those is not trivial.
> > 
> > Any time you see that kind of pattern, you are risking to reprise
> > a scene from The Modern Times - the one with Charlie taking a trip
> > through the guts of machinery.
> 
> I'll take a closer look at the lifetime semantics.
> 
> Hopefully the overall goal of the patch is ok. Happy to iterate on the
> implementation details until it's correct.

That depends.  Note that bumping ->s_active means that umount of that
sucker will *NOT* shut it down - that would happen only on the thread
doing the final deactivation.  What's more, having e.g. a USB stick
mounted, doing umount(1), having it complete successfully, pulling the
damn thing out and getting writes lost would make for a nasty surprise
for users.

With your approach it seems to be inevitable.  Holding namespace_sem
through the entire thing would prevent that, but's it's a non-starter
for other reasons (starting with "it's a system-wide lock, so that'd
be highly antisocial").  Are there any limits on what could be done
to the pages, anyway?  Because if it's "anything user wanted to do",
it's *really* not feasible.


Re: [RFC bpf-next 1/1] bpf: Introduce iter_pagecache

2021-04-08 Thread Al Viro
On Wed, Apr 07, 2021 at 02:46:11PM -0700, Daniel Xu wrote:

> +static void fini_seq_pagecache(void *priv_data)
> +{
> + struct bpf_iter_seq_pagecache_info *info = priv_data;
> + struct radix_tree_iter iter;
> + struct super_block *sb;
> + void **slot;
> +
> + radix_tree_for_each_slot(slot, &info->superblocks, &iter, 0) {
> + sb = (struct super_block *)iter.index;
> + atomic_dec(&sb->s_active);
> + radix_tree_delete(&info->superblocks, iter.index);
> + }

... and if in the meanwhile all other contributors to ->s_active have
gone away, that will result in...?

IOW, NAK.  The objects you are playing with have non-trivial lifecycle
and poking into the guts of data structures without bothering to
understand it is not a good idea.

Rule of the thumb: if your code ends up using fields that are otherwise
handled by a small part of codebase, the odds are that you need to be
bloody careful.  In particular, ->ns_lock has 3 users - all in
fs/namespace.c.  ->list/->mnt_list: all users in fs/namespace.c and
fs/pnode.c.  ->s_active: majority in fs/super.c, with several outliers
in filesystems and safety of those is not trivial.

Any time you see that kind of pattern, you are risking to reprise
a scene from The Modern Times - the one with Charlie taking a trip
through the guts of machinery.


[git pull] breakage in LOOKUP_MOUNTPOINT handling

2021-04-07 Thread Al Viro
Brown paperbag time: dumb braino in the series that went into
5.7 broke the "don't step into ->d_weak_revalidate() when umount(2)
looks the victim up" behaviour.  Spotted only now - saw
if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) {
err = handle_lookup_down(nd);
nd->flags &= ~LOOKUP_JUMPED; // no d_weak_revalidate(), 
please...
}
went "why do we clear that flag here - nothing below that point is going
to check it anyway" / "wait a minute, what is it doing *after* complete_walk()
(which is where we check that flag and call ->d_weak_revalidate())" / "how could
that possibly _not_ break?", followed by reproducing the breakage and verifying
that the obvious fix of that braino does, indeed, fix it.

FWIW, reproducer is (assuming that $DIR exists and is exported r/w to localhost)
mkdir $DIR/a
mkdir /tmp/foo
mount --bind /tmp/foo /tmp/foo
mkdir /tmp/foo/a
mkdir /tmp/foo/b
mount -t nfs4 localhost:$DIR/a /tmp/foo/a
mount -t nfs4 localhost:$DIR /tmp/foo/b
rmdir /tmp/foo/b/a
umount /tmp/foo/b
umount /tmp/foo/a
umount -l /tmp/foo  # will get everything under /tmp/foo, no matter what

Correct behaviour is successful umount; broken kernels (5.7-rc1 and later) get
umount.nfs4: /tmp/foo/a: Stale file handle
Note that bind mount is there to be able to recover - on broken kernels we'd
get stuck with impossible-to-umount filesystem if not for that.

FWIW, that braino had been posted for review back then, at least
twice.  Unfortunately, the call of complete_walk() was outside of diff
context, so the bogosity hadn't been immediately obvious from the patch
alone ;-/

The following changes since commit 7d01ef7585c07afaf487759a48486228cd065726:

  Make sure nd->path.mnt and nd->path.dentry are always valid pointers 
(2021-04-06 12:33:07 -0400)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git fixes

for you to fetch changes up to 4f0ed93fb92d3528c73c80317509df3f800a222b:

  LOOKUP_MOUNTPOINT: we are cleaning "jumped" flag too late (2021-04-06 
20:33:00 -0400)


Al Viro (1):
  LOOKUP_MOUNTPOINT: we are cleaning "jumped" flag too late

 fs/namei.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)


[git pull] a couple of fixes in vfs.git

2021-04-06 Thread Al Viro
Fairly old hostfs bug (in setups that are not used
by anyone, apparently) + fix for this cycle regression:
extra dput/mntput in LOOKUP_CACHED failure handling.

The following changes since commit a38fd8748464831584a19438cbb3082b5a2dab15:

  Linux 5.12-rc2 (2021-03-05 17:33:41 -0800)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git fixes

for you to fetch changes up to 7d01ef7585c07afaf487759a48486228cd065726:

  Make sure nd->path.mnt and nd->path.dentry are always valid pointers 
(2021-04-06 12:33:07 -0400)

--------
Al Viro (2):
  hostfs: fix memory handling in follow_link()
  Make sure nd->path.mnt and nd->path.dentry are always valid pointers

 fs/hostfs/hostfs_kern.c | 7 +++
 fs/namei.c  | 6 --
 2 files changed, 7 insertions(+), 6 deletions(-)


Re: [syzbot] WARNING in mntput_no_expire (2)

2021-04-06 Thread Al Viro
On Tue, Apr 06, 2021 at 02:15:01PM +, Al Viro wrote:

> I'm referring to the fact that your diff is with an already modified 
> path_lookupat()
> _and_ those modifications have managed to introduce a bug your patch reverts.
> No terminate_walk() paired with that path_init() failure, i.e. path_init() is
> responsible for cleanups on its (many) failure exits...

I can't tell without seeing the variant your diff is against, but at a guess
it had a non-trivial amount of trouble with missed rcu_read_unlock() in
cases when path_init() fails after having done rcu_read_lock().  For trivial
testcase, consider passing -1 for dfd, so that it would fail with -EBADF.
Or passing 0 for dfd and "blah" for name (assuming your stdin is not a 
directory).
Sure, you could handle those in path_init() (or delay grabbing rcu_read_lock()
in there, spreading it in a bunch of branches), but duplicated cleanup logics
for a bunch of failure exits is asking for trouble.


Re: [syzbot] WARNING in mntput_no_expire (2)

2021-04-06 Thread Al Viro
On Tue, Apr 06, 2021 at 03:22:05PM +0200, Christian Brauner wrote:

> Why is a another function in charge of checking the return value of an
> initialization function. If something like path_init() fails why is the
> next caller responsible for rejecting it's return value and then we're
> passing that failure value through the whole function with if (!err)
> ladders but as I said it's mostly style preferences.

Because otherwise you either need *all* paths leading to link_path_walk()
duplicate the logics (and get hurt whenever you miss one) or have "well,
in some cases link_path_walk() handles ERR_PTR() given to it, in some
cases its caller do" mess.

> > > s = path_init(nd, flags);
> > > -   if (IS_ERR(s))
> > > -   return PTR_ERR(s);
> > 
> > Where has that come from, BTW?  Currently path_lookupat() does no such 
> > thing.
> 
> Hm? Are you maybe overlooking path_init() which assigns straight into
> the variable declaration? Or are you referring to sm else?

I'm referring to the fact that your diff is with an already modified 
path_lookupat()
_and_ those modifications have managed to introduce a bug your patch reverts.
No terminate_walk() paired with that path_init() failure, i.e. path_init() is
responsible for cleanups on its (many) failure exits...


Re: [syzbot] WARNING in mntput_no_expire (2)

2021-04-06 Thread Al Viro
On Tue, Apr 06, 2021 at 02:35:05PM +0200, Christian Brauner wrote:

> And while we're at it might I bring up the possibility of an additional
> cleanup of how we currently call path_init().
> Right now we pass the return value from path_init() directly into e.g.
> link_path_walk() which as a first thing checks for error. Which feels
> rather wrong and has always confused me when looking at these codepaths.

Why?

> I get that it might make sense for reasons unrelated to path_init() that
> link_path_walk() checks its first argument for error but path_init()
> should be checked for error right away especially now that we return
> early when LOOKUP_CACHED is set without LOOKUP_RCU.

But you are making the _callers_ of path_init() do that, for no good
reason.

> thing especially in longer functions such as path_lookupat() where it
> gets convoluted pretty quickly. I think it would be cleaner to have
> something like [1]. The early exists make the code easier to reason
> about imho. But I get that that's a style discussion.

Your variant is a lot more brittle, actually.

> @@ -2424,33 +2424,49 @@ static int path_lookupat(struct nameidata *nd, 
> unsigned flags, struct path *path
> int err;
> 
> s = path_init(nd, flags);
> -   if (IS_ERR(s))
> -   return PTR_ERR(s);

Where has that come from, BTW?  Currently path_lookupat() does no such thing.


Re: [syzbot] WARNING in mntput_no_expire (2)

2021-04-05 Thread Al Viro
On Tue, Apr 06, 2021 at 01:38:39AM +, Al Viro wrote:
> On Mon, Apr 05, 2021 at 10:07:37PM +0200, Christian Brauner wrote:
> 
> > > diff --git a/fs/namei.c b/fs/namei.c
> > > index 216f16e74351..82344f1139ff 100644
> > > --- a/fs/namei.c
> > > +++ b/fs/namei.c
> > > @@ -2289,6 +2289,9 @@ static const char *path_init(struct nameidata *nd, 
> > > unsigned flags)
> > >   int error;
> > >   const char *s = nd->name->name;
> > >  
> > > + nd->path.mnt = NULL;
> > > + nd->path.dentry = NULL;
> > > +
> > >   /* LOOKUP_CACHED requires RCU, ask caller to retry */
> > >   if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)
> > >   return ERR_PTR(-EAGAIN);
> > > @@ -2322,8 +2325,6 @@ static const char *path_init(struct nameidata *nd, 
> > > unsigned flags)
> > >   }
> > >  
> > >   nd->root.mnt = NULL;
> > > - nd->path.mnt = NULL;
> > > - nd->path.dentry = NULL;
> > >  
> > >   /* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */
> > >   if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) {
> > 
> > Bingo. That fixes it.
> 
> *grumble*
> 
> OK, I suppose it'll do for backports, but longer term... I don't like how
> convoluted the rules for nameidata fields' validity are.  In particular,
> for nd->path I would rather have it
>   * cleared in set_nameidata()
>   * cleared when it becomes invalid.  That would be
>   * places that drop rcu_read_lock() without having legitimized 
> the sucker
> (already done, except for terminate_walk())
>   * terminate_walk() in non-RCU case after path_put(&nd->path)
> 
> OTOH... wait a sec - the whole thing is this cycle regression, so...

BTW, there's another piece of brittleness that almost accidentally doesn't
end up biting us - nd->depth should be cleared in set_nameidata(), not
in path_init().  In this case we are saved by the fact that the only
really early failure in path_init() can't happen on the first call,
so if we do leave the sucker before zeroing nd->depth, we are saved by
the fact that terminate_walk() has just been called and it *does*
clear nd->depth.  It's obviously cleaner to have it initialized from
the very beginning and not bother with it in path_init() at all.
Separate patch, though - this is not, strictly speaking, a bug.
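
IOW, something like this (untested sketch; assumes the nd->path
initialization from the patch elsewhere in this thread is already in
set_nameidata()):

	static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
	{
		struct nameidata *old = current->nameidata;

		p->stack = p->internal;
		p->depth = 0;			/* done once, here, not in path_init() */
		p->dfd = dfd;
		p->name = name;
		p->path.mnt = NULL;
		p->path.dentry = NULL;
		p->total_link_count = old ? old->total_link_count : 0;
		p->saved = old;
		current->nameidata = p;
	}

with the corresponding nd->depth = 0 in path_init() dropped.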


Re: [syzbot] WARNING in mntput_no_expire (2)

2021-04-05 Thread Al Viro
On Mon, Apr 05, 2021 at 10:07:37PM +0200, Christian Brauner wrote:

> > diff --git a/fs/namei.c b/fs/namei.c
> > index 216f16e74351..82344f1139ff 100644
> > --- a/fs/namei.c
> > +++ b/fs/namei.c
> > @@ -2289,6 +2289,9 @@ static const char *path_init(struct nameidata *nd, 
> > unsigned flags)
> > int error;
> > const char *s = nd->name->name;
> >  
> > +   nd->path.mnt = NULL;
> > +   nd->path.dentry = NULL;
> > +
> > /* LOOKUP_CACHED requires RCU, ask caller to retry */
> > if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)
> > return ERR_PTR(-EAGAIN);
> > @@ -2322,8 +2325,6 @@ static const char *path_init(struct nameidata *nd, 
> > unsigned flags)
> > }
> >  
> > nd->root.mnt = NULL;
> > -   nd->path.mnt = NULL;
> > -   nd->path.dentry = NULL;
> >  
> > /* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */
> > if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) {
> 
> Bingo. That fixes it.

*grumble*

OK, I suppose it'll do for backports, but longer term... I don't like how
convoluted the rules for nameidata fields' validity are.  In particular,
for nd->path I would rather have it
* cleared in set_nameidata()
* cleared when it becomes invalid.  That would be
* places that drop rcu_read_lock() without having legitimized 
the sucker
  (already done, except for terminate_walk())
* terminate_walk() in non-RCU case after path_put(&nd->path)

OTOH... wait a sec - the whole thing is this cycle regression, so...

Could you verify that the variant below fixes that crap?

Make sure nd->path.mnt and nd->path.dentry are always valid pointers

Initialize them in set_nameidata() and make sure that terminate_walk() clears
them once the pointers become potentially invalid (i.e. we leave RCU mode or
drop them in non-RCU one).  Currently we have "path_init() always initializes
them and nobody accesses them outside of path_init()/terminate_walk()
segments", which is asking for trouble.

With that change we would have nd->path.{mnt,dentry}
1) always valid - NULL or pointing to currently allocated objects.
2) non-NULL while we are successfully walking
3) NULL when we are not walking at all
4) contributing to refcounts whenever non-NULL outside of RCU mode.

Hopefully-fixes: 6c6ec2b0a3e0 ("fs: add support for LOOKUP_CACHED")
Signed-off-by: Al Viro 
---
diff --git a/fs/namei.c b/fs/namei.c
index 216f16e74351..fc8760d4314e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -579,6 +579,8 @@ static void set_nameidata(struct nameidata *p, int dfd, 
struct filename *name)
p->stack = p->internal;
p->dfd = dfd;
p->name = name;
+   p->path.mnt = NULL;
+   p->path.dentry = NULL;
p->total_link_count = old ? old->total_link_count : 0;
p->saved = old;
current->nameidata = p;
@@ -652,6 +654,8 @@ static void terminate_walk(struct nameidata *nd)
rcu_read_unlock();
}
nd->depth = 0;
+   nd->path.mnt = NULL;
+   nd->path.dentry = NULL;
 }
 
 /* path_put is needed afterwards regardless of success or failure */
@@ -2322,8 +2326,6 @@ static const char *path_init(struct nameidata *nd, 
unsigned flags)
}
 
nd->root.mnt = NULL;
-   nd->path.mnt = NULL;
-   nd->path.dentry = NULL;
 
/* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */
if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) {


Re: [syzbot] WARNING in mntput_no_expire (2)

2021-04-05 Thread Al Viro
On Mon, Apr 05, 2021 at 06:23:49PM +, Al Viro wrote:
> On Mon, Apr 05, 2021 at 07:08:01PM +0200, Christian Brauner wrote:
> 
> > Ah dentry count of -127 looks... odd.
> 
> dead + 1...
> 
> void lockref_mark_dead(struct lockref *lockref)
> {
> 	assert_spin_locked(&lockref->lock);
> 	lockref->count = -128;
> }
> 
> IOW, a leaked (uncounted) reference to dentry, that got dget() called on
> it after dentry had been freed.
> 
>   IOW, current->fs->pwd.dentry happens to point to an already freed
> struct dentry here.  Joy...
> 
>   Could you slap
> 
> 	spin_lock(&current->fs->lock);
> 	WARN_ON(d_count(current->fs->pwd.dentry) < 0);
> 	spin_unlock(&current->fs->lock);
> 
> before and after calls of io_issue_sqe() and see if it triggers?  We 
> definitely
> are seeing buggered dentry refcounting here.

Check if this helps, please.

diff --git a/fs/namei.c b/fs/namei.c
index 216f16e74351..82344f1139ff 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2289,6 +2289,9 @@ static const char *path_init(struct nameidata *nd, 
unsigned flags)
int error;
const char *s = nd->name->name;
 
+   nd->path.mnt = NULL;
+   nd->path.dentry = NULL;
+
/* LOOKUP_CACHED requires RCU, ask caller to retry */
if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)
return ERR_PTR(-EAGAIN);
@@ -2322,8 +2325,6 @@ static const char *path_init(struct nameidata *nd, 
unsigned flags)
}
 
nd->root.mnt = NULL;
-   nd->path.mnt = NULL;
-   nd->path.dentry = NULL;
 
/* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */
if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) {


Re: [syzbot] WARNING in mntput_no_expire (2)

2021-04-05 Thread Al Viro
On Mon, Apr 05, 2021 at 07:08:01PM +0200, Christian Brauner wrote:

> Ah dentry count of -127 looks... odd.

dead + 1...

void lockref_mark_dead(struct lockref *lockref)
{
	assert_spin_locked(&lockref->lock);
lockref->count = -128;
}

IOW, a leaked (uncounted) reference to dentry, that got dget() called on
it after dentry had been freed.

IOW, current->fs->pwd.dentry happens to point to an already freed
struct dentry here.  Joy...

Could you slap

	spin_lock(&current->fs->lock);
	WARN_ON(d_count(current->fs->pwd.dentry) < 0);
	spin_unlock(&current->fs->lock);

before and after calls of io_issue_sqe() and see if it triggers?  We definitely
are seeing buggered dentry refcounting here.


Re: [syzbot] WARNING in mntput_no_expire (2)

2021-04-05 Thread Al Viro
On Mon, Apr 05, 2021 at 01:44:37PM +0200, Christian Brauner wrote:
> On Sun, Apr 04, 2021 at 08:17:21PM +0000, Al Viro wrote:
> > On Sun, Apr 04, 2021 at 06:50:10PM +, Al Viro wrote:
> > 
> > > > Yeah, I have at least namei.o
> > > > 
> > > > https://drive.google.com/file/d/1AvO1St0YltIrA86DXjp1Xg3ojtS9owGh/view?usp=sharing
> > > 
> > > *grumble*
> > > 
> > > Is it reproducible without KASAN?  Would be much easier to follow the 
> > > produced
> > > asm...
> > 
> > Looks like inode_permission(_, NULL, _) from may_lookup(nd).  I.e.
> > nd->inode == NULL.
> 
> Yeah, I already saw that.
> 
> > 
> > Mind slapping BUG_ON(!nd->inode) right before may_lookup() call in
> > link_path_walk() and trying to reproduce that oops?
> 
> Yep, no problem. If you run the reproducer in a loop for a little while
> you eventually trigger the BUG_ON() and then you get the following splat
> (and then an endless loop) in [1] with nd->inode NULL.
> 
> _But_ I managed to debug this further and was able to trigger the BUG_ON()
> directly in path_init() in the AT_FDCWD branch (after all its 
> AT_FDCWD(./file0)
> with the patch in [3] (it's in LOOKUP_RCU) the corresponding splat is in [2].
> So the crash happens for a PF_IO_WORKER thread with a NULL nd->inode for the
> PF_IO_WORKER's pwd (The PF_IO_WORKER seems to be in async context.).

So we find current->fs->pwd.dentry negative, with current->fs->seq sampled
equal before and after that?  Lovely...  The only places where we assign
anything to ->pwd.dentry are
void set_fs_pwd(struct fs_struct *fs, const struct path *path)
{
	struct path old_pwd;

	path_get(path);
	spin_lock(&fs->lock);
	write_seqcount_begin(&fs->seq);
	old_pwd = fs->pwd;
	fs->pwd = *path;
	write_seqcount_end(&fs->seq);
	spin_unlock(&fs->lock);

	if (old_pwd.dentry)
		path_put(&old_pwd);
}
where we have ->seq bumped between dget new/assignment/ dput old,
copy_fs_struct() where we have
	spin_lock(&old->lock);
	fs->root = old->root;
	path_get(&fs->root);
	fs->pwd = old->pwd;
	path_get(&fs->pwd);
	spin_unlock(&old->lock);
fs being a freshly allocated instance that couldn't have been observed
by anyone, and chroot_fs_refs(), where we have
	spin_lock(&fs->lock);
	write_seqcount_begin(&fs->seq);
	hits += replace_path(&fs->root, old_root, new_root);
	hits += replace_path(&fs->pwd, old_root, new_root);
	write_seqcount_end(&fs->seq);
	while (hits--) {
		count++;
		path_get(new_root);
	}
	spin_unlock(&fs->lock);
...
static inline int replace_path(struct path *p, const struct path *old,
			       const struct path *new)
{
	if (likely(p->dentry != old->dentry || p->mnt != old->mnt))
		return 0;
	*p = *new;
	return 1;
}
Here we have new_root->dentry pinned from the very beginning,
and assignments are wrapped into bumps of ->seq.  Moreover,
we are holding ->lock through that sequence (as all writers
do), so these references can't be dropped before path_get()
bumps new_root->dentry refcount.

chroot_fs_refs() is called only by pivot_root(2):
	chroot_fs_refs(&root, &new);
and there new is set by
	error = user_path_at(AT_FDCWD, new_root,
			     LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new);
if (error)
goto out0;
which pins new.dentry *and* verifies that it's positive and a directory,
at that.  Since pinned positive dentry can't be made negative by anybody
else, we know it will remain in that state until
	path_put(&new);
well downstream of chroot_fs_refs().  In copy_fs_struct() we are
copying someone's ->pwd, so it's also pinned positive.  And it
won't be dropped outside of old->lock, so by the time somebody
manages to drop the reference in old, path_get() effects will be
visible (old->lock serving as a barrier).

That leaves set_fs_pwd() calls:
fs/init.c:54:	set_fs_pwd(current->fs, &path);
	init_chdir(), path set by LOOKUP_DIRECTORY pathwalk.  Pinned positive.
fs/namespace.c:4207:	set_fs_pwd(current->fs, &root);
	init_mount_tree(), root.dentry being ->mnt_root of rootfs.  Pinned
positive (and it would've oopsed much earlier had that been it)
fs/namespace.c:4485:	set_fs_pwd(fs, &root);
	mntns_install(), root filled by successful LOOKUP_DOWN for "/"
from mnt_ns->root.  Should be pinned positive.
fs/open.c:501:	set_fs_pwd(current->fs, &path);
	chdir(2), path set by LOOKUP_DIRECTORY pathwalk.

Re: [syzbot] WARNING in mntput_no_expire (2)

2021-04-04 Thread Al Viro
On Sun, Apr 04, 2021 at 06:50:10PM +, Al Viro wrote:

> > Yeah, I have at least namei.o
> > 
> > https://drive.google.com/file/d/1AvO1St0YltIrA86DXjp1Xg3ojtS9owGh/view?usp=sharing
> 
> *grumble*
> 
> Is it reproducible without KASAN?  Would be much easier to follow the produced
> asm...

Looks like inode_permission(_, NULL, _) from may_lookup(nd).  I.e.
nd->inode == NULL.

Mind slapping BUG_ON(!nd->inode) right before may_lookup() call in
link_path_walk() and trying to reproduce that oops?


Re: [syzbot] WARNING in mntput_no_expire (2)

2021-04-04 Thread Al Viro
On Sun, Apr 04, 2021 at 07:05:13PM +0200, Christian Brauner wrote:
> On Sun, Apr 04, 2021 at 04:44:06PM +0000, Al Viro wrote:
> > On Sun, Apr 04, 2021 at 06:40:40PM +0200, Christian Brauner wrote:
> > 
> > > > Very interesting.  What happens if you call loop() twice?  And now I 
> > > > wonder
> > > > whether it's root or cwd, actually...  Hmm...
> > > > 
> > > > How about this:
> > > > fd = open("/proc/self/mountinfo", 0);
> > > > mkdir("./newroot/foo", 0777);
> > > > mount("./newroot/foo", "./newroot/foo", 0, MS_BIND, NULL);
> > > > chroot("./newroot");
> > > > chdir("/foo");
> > > > while (1) {
> > > > static char buf[4096];
> > > > int n = read(fd, buf, 4096);
> > > > if (n <= 0)
> > > > break;
> > > > write(1, buf, n);
> > > > }
> > > > close(fd);
> > > > drop_caps();
> > > > loop();
> > > > as the end of namespace_sandbox_proc(), instead of
> > > > chroot("./newroot");
> > > > chdir("/");
> > > > drop_caps();
> > > > loop();
> > > > sequence we have there?
> > > 
> > > Uhum, well then we oops properly with a null-deref.
> > 
> > Cute...  Could you dump namei.o (ideally - with namei.s) from your build
> > someplace public?
> 
> Yeah, I have at least namei.o
> 
> https://drive.google.com/file/d/1AvO1St0YltIrA86DXjp1Xg3ojtS9owGh/view?usp=sharing

*grumble*

Is it reproducible without KASAN?  Would be much easier to follow the produced
asm...


Re: [syzbot] WARNING in mntput_no_expire (2)

2021-04-04 Thread Al Viro
On Sun, Apr 04, 2021 at 06:40:40PM +0200, Christian Brauner wrote:

> > Very interesting.  What happens if you call loop() twice?  And now I wonder
> > whether it's root or cwd, actually...  Hmm...
> > 
> > How about this:
> > fd = open("/proc/self/mountinfo", 0);
> > mkdir("./newroot/foo", 0777);
> > mount("./newroot/foo", "./newroot/foo", 0, MS_BIND, NULL);
> > chroot("./newroot");
> > chdir("/foo");
> > while (1) {
> > static char buf[4096];
> > int n = read(fd, buf, 4096);
> > if (n <= 0)
> > break;
> > write(1, buf, n);
> > }
> > close(fd);
> > drop_caps();
> > loop();
> > as the end of namespace_sandbox_proc(), instead of
> > chroot("./newroot");
> > chdir("/");
> > drop_caps();
> > loop();
> > sequence we have there?
> 
> Uhum, well then we oops properly with a null-deref.

Cute...  Could you dump namei.o (ideally - with namei.s) from your build
someplace public?


Re: [syzbot] WARNING in mntput_no_expire (2)

2021-04-04 Thread Al Viro
On Sun, Apr 04, 2021 at 01:34:45PM +0200, Christian Brauner wrote:

> Sorry for not replying to your earlier mail but I've been debugging this
> too. My current theory is that it's related to LOOKUP_ROOT_GRABBED when
> LOOKUP_CACHED is specified _possibly_ with an interaction how
> create_io_thread() is created with CLONE_FS. The reproducer requires you
> either have called pivot_root() or chroot() in order for the failure to
> happen. So I think the fact that we skip legitimize_root() when
> LOOKUP_CACHED is set might figure into this. I can keep digging.
> 

> Funny enough I already placed a printk statement into the place you
> wanted one too so I just amended mine. Here's what you get:
> 
> If pivot_root() is used before the chroot() you get:
> 
> [  637.464555] : count(-1) | mnt_mntpoint(/) | mnt->mnt.mnt_root(/) | 
> id(579) | dev(tmpfs)
> 
> if you only call chroot, i.e. make the pivot_root() branch a simple
> if (true) you get:
> 
> [  955.206117] : count(-2) | mnt_mntpoint(/) | mnt->mnt.mnt_root(/) | 
> id(580) | dev(tmpfs)

Very interesting.  What happens if you call loop() twice?  And now I wonder
whether it's root or cwd, actually...  Hmm...

How about this:
fd = open("/proc/self/mountinfo", 0);
mkdir("./newroot/foo", 0777);
mount("./newroot/foo", "./newroot/foo", 0, MS_BIND, NULL);
chroot("./newroot");
chdir("/foo");
while (1) {
static char buf[4096];
int n = read(fd, buf, 4096);
if (n <= 0)
break;
write(1, buf, n);
}
close(fd);
drop_caps();
loop();
as the end of namespace_sandbox_proc(), instead of
chroot("./newroot");
chdir("/");
drop_caps();
loop();
sequence we have there?

> The cat /proc/self/mountinfo is for the id(579) below:
... and it misses the damn thing, since we call it before the mount
in question had been created ;-/  So we'd probably be better off not
trying to be clever and just doing that as explicit (and completely
untested) read-write loop above.


Re: [syzbot] WARNING in mntput_no_expire (2)

2021-04-03 Thread Al Viro
On Sun, Apr 04, 2021 at 02:34:08AM +, Al Viro wrote:

> I really wonder what mount is it happening to.  BTW, how painful would
> it be to teach syzcaller to turn those cascades of
>   NONFAILING(*(uint8_t*)0x2080 = 0x12);
>   NONFAILING(*(uint8_t*)0x2081 = 0);
>   NONFAILING(*(uint16_t*)0x2082 = 0);
>   NONFAILING(*(uint32_t*)0x2084 = 0xff9c);
>   NONFAILING(*(uint64_t*)0x2088 = 0);
>   NONFAILING(*(uint64_t*)0x2090 = 0x2180);
>   NONFAILING(memcpy((void*)0x2180, "./file0\000", 8));
>   NONFAILING(*(uint32_t*)0x2098 = 0);
>   NONFAILING(*(uint32_t*)0x209c = 0x80);
>   NONFAILING(*(uint64_t*)0x20a0 = 0x23456);
>   
>   NONFAILING(syz_io_uring_submit(r[1], r[2], 0x2080, 0));
> into something more readable?  Bloody annoyance every time...  Sure, I can
> manually translate it into
>   struct io_uring_sqe *sqe = (void *)0x2080;
>   char *s = (void *)0x2180;
>   memset(sqe, '\0', sizeof(*sqe));
>   sqe->opcode = 0x12; // IORING_OP_OPENAT?
>   sqe->fd = -100; // AT_FDCWD?
>   sqe->addr = s;
>   strcpy(s, "./file0");
>   sqe->open_flags = 0x80; // O_EXCL???
>   sqe->user_data = 0x23456;   // random tag?
>   syz_io_uring_submit(r[1], r[2], (unsigned long)p, 0);
> but it's really annoying as hell, especially since syz_io_uring_submit()
> comes from syzcaller and the damn thing _knows_ that the third argument
> is sodding io_uring_sqe, and never passed to anything other than
> memcpy() in there, at that, so the exact address can't matter.

... especially since the native syzcaller reproducer clearly *does* have
that information.  Simply putting that into comments side-by-side with
what gets put into C reproducer would be nice, especially if it goes with
field names...


Re: [syzbot] WARNING in mntput_no_expire (2)

2021-04-03 Thread Al Viro
On Thu, Apr 01, 2021 at 07:11:12PM +, Al Viro wrote:

> > I _think_ I see what the issue is. It seems that an assumption made in
> > this commit might be wrong and we're missing a mnt_add_count() bump that
> > we would otherwise have gotten if we've moved the failure handling into
> > the unlazy helpers themselves.
> > 
> > Al, does that sound plausible?
> 
> mnt_add_count() on _what_?  Failure in legitimize_links() ends up with
> nd->path.mnt zeroed, in both callers.  So which vfsmount would be
> affected?

Could you turn that WARN_ON(count < 0) into
if (WARN_ON(count < 0))
printk(KERN_ERR "id = %d, dev = %s, count = %d\n",
mnt->mnt_id,
mnt->mnt_sb->s_id,
count);
add system("cat /proc/self/mountinfo"); right after sandbox_common()
call and try to reproduce that?

I really wonder what mount is it happening to.  BTW, how painful would
it be to teach syzcaller to turn those cascades of
NONFAILING(*(uint8_t*)0x2080 = 0x12);
NONFAILING(*(uint8_t*)0x2081 = 0);
NONFAILING(*(uint16_t*)0x2082 = 0);
NONFAILING(*(uint32_t*)0x2084 = 0xff9c);
NONFAILING(*(uint64_t*)0x2088 = 0);
NONFAILING(*(uint64_t*)0x2090 = 0x2180);
NONFAILING(memcpy((void*)0x2180, "./file0\000", 8));
NONFAILING(*(uint32_t*)0x2098 = 0);
NONFAILING(*(uint32_t*)0x209c = 0x80);
NONFAILING(*(uint64_t*)0x20a0 = 0x23456);

NONFAILING(syz_io_uring_submit(r[1], r[2], 0x2080, 0));
into something more readable?  Bloody annoyance every time...  Sure, I can
manually translate it into
struct io_uring_sqe *sqe = (void *)0x2080;
char *s = (void *)0x2180;
memset(sqe, '\0', sizeof(*sqe));
sqe->opcode = 0x12; // IORING_OP_OPENAT?
sqe->fd = -100; // AT_FDCWD?
sqe->addr = s;
strcpy(s, "./file0");
sqe->open_flags = 0x80; // O_EXCL???
sqe->user_data = 0x23456;   // random tag?
syz_io_uring_submit(r[1], r[2], (unsigned long)p, 0);
but it's really annoying as hell, especially since syz_io_uring_submit()
comes from syzcaller and the damn thing _knows_ that the third argument
is sodding io_uring_sqe, and never passed to anything other than
memcpy() in there, at that, so the exact address can't matter.

Incidentally, solitary O_EXCL (without O_CREAT) is... curious.  Does that
sucker still trigger without it?  I.e. with store to 0x209c replaced
with storing 0?


Re: [PATCH] Where we are for this patch?

2021-04-01 Thread Al Viro
On Thu, Apr 01, 2021 at 12:17:44PM -0700, Roy Yang wrote:
> Both Android and Chrome OS really want this feature; For Container-Optimized 
> OS, we have customers
> interested in the defense too.
> 
> Thank you very much.
> 
> Change-Id: I1eb1b726007aa8f9c374b934cc1c690fb4924aa3

You forgot to tell what patch you are refering to.  Your
Change-Id (whatever the hell that is) doesn't help at all.  Don't
assume that keys in your internal database make sense for the
rest of the world, especially when they appear to contain a hash
of something...


Re: [syzbot] WARNING in mntput_no_expire (2)

2021-04-01 Thread Al Viro
On Thu, Apr 01, 2021 at 07:59:19PM +0200, Christian Brauner wrote:

> I _think_ I see what the issue is. It seems that an assumption made in
> this commit might be wrong and we're missing a mnt_add_count() bump that
> we would otherwise have gotten if we've moved the failure handling into
> the unlazy helpers themselves.
> 
> Al, does that sound plausible?

mnt_add_count() on _what_?  Failure in legitimize_links() ends up with
nd->path.mnt zeroed, in both callers.  So which vfsmount would be
affected?

Rules:
in RCU mode: no mounts pinned
out of RCU mode: nd->path.mnt and all nd->stack[i].link.mnt for
i below nd->depth are either NULL or pinned

Transition from RCU to non-RCU mode happens in try_to_unlazy() and
try_to_unlazy_next().

References (if any) are dropped by eventual terminate_walk() (after
that the contents of nameidata is junk).

__legitimize_mnt() is the primitive for pinning.  Return values:
0 -- successfully pinned (or given NULL as an argument)
1 -- failed, refcount not affected
-1 -- failed, refcount bumped.
It stays in RCU mode in all cases.

One user is __legitimize_path(); it also stays in RCU mode.  If it
fails to legitimize path->mnt, it will zero it *IF* __legitimize_mnt()
reports that refcount hadn't been taken.  In all other cases,
path->mnt is pinned.  IOW, the caller is responsible for path_put()
regardless of the outcome.
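
Spelled out, the zeroing rule looks roughly like this (sketch from memory,
not the verbatim source):

	static bool __legitimize_path(struct path *path, unsigned seq, unsigned mseq)
	{
		int res = __legitimize_mnt(path->mnt, mseq);

		if (unlikely(res)) {
			if (res > 0)		/* failed, refcount NOT bumped */
				path->mnt = NULL;
			/* res < 0: failed with refcount bumped - ->mnt is left in
			 * place so that the eventual path_put() drops that reference */
			path->dentry = NULL;
			return false;
		}
		if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
			path->dentry = NULL;
			return false;
		}
		return !read_seqcount_retry(&path->dentry->d_seq, seq);
	}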

Another user is legitimize_mnt().  _That_ will make sure that
refcount is unaffected in case of failure (IOW, if __legitimize_mnt()
reports failure with refcount bumped, we drop out of RCU mode,
do mntput() and go back).

On failure in legitimize_links() we either leave nd->depth equal to zero
(in which case all nd->stack[...].link.mnt are to be ignored) or
we set it one higher than the last attempted legitimize_path() in there.
In the latter case, all entries in nd->stack below the value we put into
nd->depth had legitimize_path() called (and thus have ->mnt either NULL
or pinned) and everything starting from nd->depth is to be ignored.

nd->path handling:
1) Callers of legitimize_links() are responsible for zeroing nd->path.mnt
on legitimize_links() failure.  Both do that, AFAICS.
2) in try_to_unlazy() we proceed to call legitimize_path() on nd->path.
Once that call is done, we have nd->path.mnt pinned or NULL, so nothing
further is needed with it.
3) in try_to_unlazy_next() we use legitimize_mnt() instead.  Failure
of that is handled by zeroing nd->path.mnt; success means that nd->path.mnt
is pinned and should be left alone.

We could use __legitimize_mnt() in try_to_unlazy_next() (basically,
substitute the body of legitimize_mnt() there and massage it a bit),
but that ends up being harder to follow:
res = __legitimize_mnt(nd->path.mnt, nd->m_seq);
if (unlikely(res)) {
if (res < 0)// pinned, leave it there
goto out1;
else// not pinned, zero it
goto out2;
}
instead of
if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq)))
goto out2;
we have now.


Re: [PATCH v5 00/27] Memory Folios

2021-03-31 Thread Al Viro
On Tue, Mar 30, 2021 at 10:09:29PM +0100, Matthew Wilcox wrote:

> That's a very Intel-centric way of looking at it.  Other architectures
> support a multitude of page sizes, from the insane ia64 (4k, 8k, 16k, then
> every power of four up to 4GB) to more reasonable options like (4k, 32k,
> 256k, 2M, 16M, 128M).  But we (in software) shouldn't constrain ourselves
> to thinking in terms of what the hardware currently supports.  Google
> have data showing that for their workloads, 32kB is the goldilocks size.
> I'm sure for some workloads, it's much higher and for others it's lower.
> But for almost no workload is 4kB the right choice any more, and probably
> hasn't been since the late 90s.

Out of curiosity I looked at the distribution of file sizes in the
kernel tree:
71455 files total
0--4Kb       36702
4--8Kb       11820
8--16Kb      10066
16--32Kb      6984
32--64Kb      3804
64--128Kb     1498
128--256Kb     393
256--512Kb     108
512Kb--1Mb      35
1--2Mb          25
2--4Mb           5
4--6Mb           7
6--8Mb           4
12Mb             2
14Mb             1
16Mb             1

... incidentally, everything bigger than 1.2Mb lives^Wshambles under
drivers/gpu/drm/amd/include/asic_reg/

Page size   Footprint
4Kb 1128Mb
8Kb 1324Mb
16Kb1764Mb
32Kb2739Mb
64Kb4832Mb
128Kb   9191Mb
256Kb   18062Mb
512Kb   35883Mb
1Mb 71570Mb
2Mb 142958Mb

So for kernel builds (as well as grep over the tree, etc.) uniform 2Mb pages
would be... interesting.


Re: [PATCH v2] fs: Improve eventpoll logging to stop indicting timerfd

2021-03-31 Thread Al Viro
On Wed, Mar 31, 2021 at 07:16:45PM -0700, Manish Varma wrote:
> timerfd doesn't create any wakelocks, but eventpoll can.  When it does,
> it names them after the underlying file descriptor, and since all
> timerfd file descriptors are named "[timerfd]" (which saves memory on
> systems like desktops with potentially many timerfd instances), all
> wakesources created as a result of using the eventpoll-on-timerfd idiom
> are called... "[timerfd]".
> 
> However, it becomes impossible to tell which "[timerfd]" wakesource is
> affiliated with which process and hence troubleshooting is difficult.
> 
> This change addresses this problem by changing the way eventpoll
> wakesources are named:
> 
> 1) the top-level per-process eventpoll wakesource is now named "epoll:P"
> (instead of just "eventpoll"), where P, is the PID of the creating
> process.
> 2) individual per-underlying-filedescriptor eventpoll wakesources are
> now named "epollitemN:P.F", where N is a unique ID token and P is PID
> of the creating process and F is the name of the underlying file
> descriptor.
> 
> Altogether that should be split up into a change to eventpoll and
> timerfd (or other file descriptors).

FWIW, it smells like a variant of wakeup_source_register() that would
take printf format + arguments would be a good idea.  I.e. something
like

> + snprintf(buf, sizeof(buf), "epoll:%d", task_pid);
> + epi->ep->ws = wakeup_source_register(NULL, buf);

... = wakeup_source_register(NULL, "epoll:%d", task_pid);

etc.
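
Completely untested sketch, and the name wakeup_source_register_fmt() is made
up; assuming wakeup_source_register() keeps copying the name it's given (it
goes through kstrdup_const() last I looked), something like

	struct wakeup_source *wakeup_source_register_fmt(struct device *dev,
							 const char *fmt, ...)
	{
		struct wakeup_source *ws;
		va_list args;
		char *name;

		va_start(args, fmt);
		name = kvasprintf(GFP_KERNEL, fmt, args);
		va_end(args);
		if (!name)
			return NULL;

		ws = wakeup_source_register(dev, name);
		kfree(name);	/* the callee has made its own copy */
		return ws;
	}

and the callers become one-liners:

	epi->ep->ws = wakeup_source_register_fmt(NULL, "epoll:%d", task_pid);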


Re: [PATCH 1/2] fs/namespace: corrent/improve kernel-doc notation

2021-03-31 Thread Al Viro
On Wed, Mar 31, 2021 at 02:24:18PM -0600, Jonathan Corbet wrote:
> Randy Dunlap  writes:
> 
> > Fix kernel-doc warnings in fs/namespace.c:
> >
> > ./fs/namespace.c:1379: warning: Function parameter or member 'm' not 
> > described in 'may_umount_tree'
> > ./fs/namespace.c:1379: warning: Excess function parameter 'mnt' description 
> > in 'may_umount_tree'
> > ./fs/namespace.c:1950: warning: Function parameter or member 'path' not 
> > described in 'clone_private_mount'
> >
> > Also convert path_is_mountpoint() comments to kernel-doc.
> >
> > Signed-off-by: Randy Dunlap 
> > Cc: Al Viro 
> > Cc: Jonathan Corbet 
> > Cc: linux-...@vger.kernel.org
> > ---
> > Jon, Al has OK-ed you to merge this patch (and the next one, please).
> >
> >  fs/namespace.c |   14 --
> >  1 file changed, 8 insertions(+), 6 deletions(-)
> 
> An actual acked-by would have been nice, oh well.  Meanwhile, I've
> applied them with fixes to the typos in both changelogs :)

Generally speaking, I'm only glad to leave handling kernel-doc patches
to somebody else, especially when they are that trivial ;-)

Anyway,
Acked-by: Al Viro 


Re: [PATCH v31 07/12] landlock: Support filesystem access-control

2021-03-31 Thread Al Viro
On Wed, Mar 31, 2021 at 07:33:50PM +0200, Mickaël Salaün wrote:

> > +static inline u64 unmask_layers(
> > +   const struct landlock_ruleset *const domain,
> > +   const struct path *const path, const u32 access_request,
> > +   u64 layer_mask)
> > +{
> > +   const struct landlock_rule *rule;
> > +   const struct inode *inode;
> > +   size_t i;
> > +
> > +   if (d_is_negative(path->dentry))
> > +   /* Continues to walk while there is no mapped inode. */
 ^
Odd comment, that...

> > +static int check_access_path(const struct landlock_ruleset *const domain,
> > +   const struct path *const path, u32 access_request)
> > +{

> > +   walker_path = *path;
> > +   path_get(&walker_path);

> > +   while (true) {
> > +   struct dentry *parent_dentry;
> > +
> > +   layer_mask = unmask_layers(domain, &walker_path,
> > +   access_request, layer_mask);
> > +   if (layer_mask == 0) {
> > +   /* Stops when a rule from each layer grants access. */
> > +   allowed = true;
> > +   break;
> > +   }
> > +
> > +jump_up:
> > +   if (walker_path.dentry == walker_path.mnt->mnt_root) {
> > +   if (follow_up(&walker_path)) {
> > +   /* Ignores hidden mount points. */
> > +   goto jump_up;
> > +   } else {
> > +   /*
> > +* Stops at the real root.  Denies access
> > +* because not all layers have granted access.
> > +*/
> > +   allowed = false;
> > +   break;
> > +   }
> > +   }
> > +   if (unlikely(IS_ROOT(walker_path.dentry))) {
> > +   /*
> > +* Stops at disconnected root directories.  Only allows
> > +* access to internal filesystems (e.g. nsfs, which is
> > +* reachable through /proc/<pid>/ns/).
> > +*/
> > +   allowed = !!(walker_path.mnt->mnt_flags & MNT_INTERNAL);
> > +   break;
> > +   }
> > +   parent_dentry = dget_parent(walker_path.dentry);
> > +   dput(walker_path.dentry);
> > +   walker_path.dentry = parent_dentry;
> > +   }
> > +   path_put(&walker_path);
> > +   return allowed ? 0 : -EACCES;

That's a whole lot of grabbing/dropping references...  I realize that it's
an utterly tactless question, but... how costly it is?  IOW, do you have
profiling data?

> > +/*
> > + * pivot_root(2), like mount(2), changes the current mount namespace.  It 
> > must
> > + * then be forbidden for a landlocked process.

... and cross-directory rename(2) can change the tree topology.  Do you ban that
as well?

[snip]

> > +static int hook_path_rename(const struct path *const old_dir,
> > +   struct dentry *const old_dentry,
> > +   const struct path *const new_dir,
> > +   struct dentry *const new_dentry)
> > +{
> > +   const struct landlock_ruleset *const dom =
> > +   landlock_get_current_domain();
> > +
> > +   if (!dom)
> > +   return 0;
> > +   /* The mount points are the same for old and new paths, cf. EXDEV. */
> > +   if (old_dir->dentry != new_dir->dentry)
> > +   /* For now, forbids reparenting. */
> > +   return -EACCES;

You do, apparently, and not in a way that would have the userland fall
back to copy+unlink.  Lovely...  Does e.g. git survive such restriction?
Same question for your average package build...


Re: [PATCH v5 1/1] fs: Allow no_new_privs tasks to call chroot(2)

2021-03-31 Thread Al Viro
On Tue, Mar 30, 2021 at 11:03:10PM -0700, Kees Cook wrote:

> Regardless, I still endorse this change because it doesn't make things
> _worse_, since without this, a compromised process wouldn't need ANY
> tricks to escape a chroot because it wouldn't be in one. :) It'd be nice
> if there were some way to make future openat() calls be unable to
> resolve outside the chroot, but I view that as an enhancement.
> 
> But, as it stands, I think this makes sense and I stand by my
> Reviewed-by tag. If Al is too busy to take it, and James would rather
> not take VFS, perhaps akpm would carry it? That's where other similar
> VFS security work has landed.

Frankly, I'm less than fond of that thing, but right now I'm buried
under all kinds of crap (->d_revalidate() joy, mostly).  I'll post
a review, but for now it's very definitely does *not* get an implicit
ACK from me.


Re: [PATCH] jffs2: Hook up splice_write callback

2021-03-30 Thread Al Viro
On Tue, Mar 30, 2021 at 06:31:00PM +, Al Viro wrote:
> On Tue, Mar 30, 2021 at 06:17:15PM +0200, Christoph Hellwig wrote:
> > On Wed, Mar 31, 2021 at 12:15:37AM +1030, Joel Stanley wrote:
> > > overlayfs using jffs2 as the upper filesystem would fail in some cases
> > > since moving to v5.10. The test case used was to run 'touch' on a file
> > > that exists in the lower fs, causing the modification time to be
> > > updated. It returns EINVAL when the bug is triggered.
> > > 
> > > A bisection showed this was introduced in v5.9-rc1, with commit
> > > 36e2c7421f02 ("fs: don't allow splice read/write without explicit ops").
> > > Reverting that commit restores the expected behaviour.
> > > 
> > > Some digging showed that this was due to jffs2 lacking an implementation
> > > of splice_write. (For unknown reasons the warn_unsupported that should
> > > trigger was not displaying any output).
> > > 
> > > Adding this patch resolved the issue and the test now passes.
> > 
> > Looks good:
> > 
> > Reviewed-by: Christoph Hellwig 
> 
> The same goes for quite a few other filesystems, actually - at least
> adfs, affs, bfs, hfs, hfsplus, hostfs, hpfs, minix, omfs, sysv, ufs 
> and vboxsf are in the same boat, and I suspect that ecryptfs and ntfs
> might be too.
> 
> Christoph, do you see any problems with doing the same thing for that
> bunch as well?

coda and udf as well, by the look of it...


Re: [PATCH] jffs2: Hook up splice_write callback

2021-03-30 Thread Al Viro
On Tue, Mar 30, 2021 at 06:17:15PM +0200, Christoph Hellwig wrote:
> On Wed, Mar 31, 2021 at 12:15:37AM +1030, Joel Stanley wrote:
> > overlayfs using jffs2 as the upper filesystem would fail in some cases
> > since moving to v5.10. The test case used was to run 'touch' on a file
> > that exists in the lower fs, causing the modification time to be
> > updated. It returns EINVAL when the bug is triggered.
> > 
> > A bisection showed this was introduced in v5.9-rc1, with commit
> > 36e2c7421f02 ("fs: don't allow splice read/write without explicit ops").
> > Reverting that commit restores the expected behaviour.
> > 
> > Some digging showed that this was due to jffs2 lacking an implementation
> > of splice_write. (For unknown reasons the warn_unsupported that should
> > trigger was not displaying any output).
> > 
> > Adding this patch resolved the issue and the test now passes.
> 
> Looks good:
> 
> Reviewed-by: Christoph Hellwig 

The same goes for quite a few other filesystems, actually - at least
adfs, affs, bfs, hfs, hfsplus, hostfs, hpfs, minix, omfs, sysv, ufs 
and vboxsf are in the same boat, and I suspect that ecryptfs and ntfs
might be too.

Christoph, do you see any problems with doing the same thing for that
bunch as well?


Re: [PATCH v3 03/18] ovl: stack fileattr ops

2021-03-28 Thread Al Viro
On Thu, Mar 25, 2021 at 08:37:40PM +0100, Miklos Szeredi wrote:
> Add stacking for the fileattr operations.
> 
> Add hack for calling security_file_ioctl() for now.  Probably better to
> have a pair of specific hooks for these operations.

Umm...  Shouldn't you remove the old code from their ->ioctl() instance?


Re: [PATCH v3 01/18] vfs: add fileattr ops

2021-03-28 Thread Al Viro
On Thu, Mar 25, 2021 at 08:37:38PM +0100, Miklos Szeredi wrote:

> +int vfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
> +{
> + struct inode *inode = d_inode(dentry);
> +
> + if (d_is_special(dentry))
> + return -ENOTTY;

FWIW - why?  For uses via ioctl() you simply won't get there with
device nodes et.al. - they have file_operations of their own.
If we add syscall(s) for getting/setting those, there's no reason
for e.g. a device node not to have those attributes...

> +static int ioctl_getflags(struct file *file, void __user *argp)

unsigned int __user *argp, surely?

> +{
> + struct fileattr fa = { .flags_valid = true }; /* hint only */
> + unsigned int flags;
> + int err;
> +
> + err = vfs_fileattr_get(file->f_path.dentry, &fa);
> + if (!err) {
> + flags = fa.flags;
> + if (copy_to_user(argp, &flags, sizeof(flags)))
> + err = -EFAULT;

... and put_user() here.

> + }
> + return err;
> +}
> +
> +static int ioctl_setflags(struct file *file, void __user *argp)
> +{
> + struct fileattr fa;
> + unsigned int flags;
> + int err;
> +
> + if (copy_from_user(&flags, argp, sizeof(flags)))
> + return -EFAULT;
> +
> + err = mnt_want_write_file(file);
> + if (!err) {
> + fileattr_fill_flags(&fa, flags);
> + err = vfs_fileattr_set(file_mnt_user_ns(file), file_dentry(file), &fa);
> + mnt_drop_write_file(file);
> + }
> + return err;
> +}

Similar here.
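
IOW, something like this pair (untested sketch):

	static int ioctl_getflags(struct file *file, unsigned int __user *argp)
	{
		struct fileattr fa = { .flags_valid = true };	/* hint only */
		int err;

		err = vfs_fileattr_get(file->f_path.dentry, &fa);
		if (!err)
			err = put_user(fa.flags, argp);
		return err;
	}

	static int ioctl_setflags(struct file *file, unsigned int __user *argp)
	{
		struct fileattr fa;
		unsigned int flags;
		int err;

		if (get_user(flags, argp))
			return -EFAULT;

		err = mnt_want_write_file(file);
		if (!err) {
			fileattr_fill_flags(&fa, flags);
			err = vfs_fileattr_set(file_mnt_user_ns(file),
					       file_dentry(file), &fa);
			mnt_drop_write_file(file);
		}
		return err;
	}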


Re: [PATCH 1/3] fs/dcache: Add d_clear_dir_neg_dentries()

2021-03-28 Thread Al Viro
On Sun, Mar 28, 2021 at 11:43:54AM -0300, André Almeida wrote:

> +/**
> + * d_clear_dir_neg_dentries - Remove negative dentries in an inode
> + * @dir: Directory to clear negative dentries
> + *
> + * For directories with negative dentries that are becoming case-insensitive
> + * dirs, we need to remove all those negative dentries, otherwise they will
> + * become dangling dentries. During the creation of a new file, if a d_hash
> + * collision happens and the names match in a case-insensitive, the name of
> + * the file will be the name defined at the negative dentry, that can be
> + * different from the specified by the user. To prevent this from happening, 
> we
> + * need to remove all dentries in a directory. Given that the directory must 
> be
> + * empty before we call this function we are sure that all dentries there 
> will
> + * be negative.
> + */
> +void d_clear_dir_neg_dentries(struct inode *dir)
> +{
> + struct dentry *alias, *dentry;
> +
> + hlist_for_each_entry(alias, &dir->i_dentry, d_u.d_alias) {
> + list_for_each_entry(dentry, &alias->d_subdirs, d_child) {
> + d_drop(dentry);
> + dput(dentry);
> + }
> + }
> +}

That makes no sense whatsoever.
1) directories can never have more than one alias
2) what the hell are you doing to refcounts on those children?


Re: [syzbot] KASAN: null-ptr-deref Read in filp_close (2)

2021-03-27 Thread Al Viro
On Fri, Mar 26, 2021 at 02:50:11PM +0100, Christian Brauner wrote:
> @@ -632,6 +632,7 @@ EXPORT_SYMBOL(close_fd); /* for ksys_close() */
>  static inline void __range_cloexec(struct files_struct *cur_fds,
>  unsigned int fd, unsigned int max_fd)
>  {
> + unsigned int cur_max;
>   struct fdtable *fdt;
>  
>   if (fd > max_fd)
> @@ -639,7 +640,12 @@ static inline void __range_cloexec(struct files_struct 
> *cur_fds,
>  
>   spin_lock(&cur_fds->file_lock);
>   fdt = files_fdtable(cur_fds);
> - bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1);
> + /* make very sure we're using the correct maximum value */
> + cur_max = fdt->max_fds;
> + cur_max--;
> + cur_max = min(max_fd, cur_max);
> + if (fd <= cur_max)
> + bitmap_set(fdt->close_on_exec, fd, cur_max - fd + 1);
>   spin_unlock(&cur_fds->file_lock);
>  }

Umm...  That's harder to follow than it ought to be.  What's the point of
having
max_fd = min(max_fd, cur_max);
done in the caller, anyway?  Note that in __range_close() you have to
compare with re-fetched ->max_fds (look at pick_file()), so...
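
IOW, do the clamping against the re-fetched value under ->file_lock and be
done with it; something like this (untested sketch):

	static inline void __range_cloexec(struct files_struct *cur_fds,
					   unsigned int fd, unsigned int max_fd)
	{
		struct fdtable *fdt;

		spin_lock(&cur_fds->file_lock);
		fdt = files_fdtable(cur_fds);
		/* ->max_fds re-fetched under the lock, against the table we are
		 * actually going to modify; it's never zero, so no underflow */
		max_fd = min(max_fd, fdt->max_fds - 1);
		if (fd <= max_fd)
			bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1);
		spin_unlock(&cur_fds->file_lock);
	}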

BTW, I really wonder if the cost of jerking ->file_lock up and down
in that loop in __range_close() is negligible.  What values do we
typically get from callers and how sparse does descriptor table tend
to be for those?


Re: [PATCH 6/9] debugfs: Implement debugfs_create_str()

2021-03-27 Thread Al Viro
On Fri, Mar 26, 2021 at 11:33:58AM +0100, Peter Zijlstra wrote:

> +again:
> + rcu_read_lock();
> + str = rcu_dereference(*(char **)file->private_data);
> + len = strlen(str) + 1;
> +
> + if (!copy || copy_len < len) {
> + rcu_read_unlock();
> + kfree(copy);
> + copy = kmalloc(len + 1, GFP_KERNEL);
> + if (!copy) {
> + debugfs_file_put(dentry);
> + return -ENOMEM;
> + }
> + copy_len = len;
> + goto again;
> + }
> +
> + strncpy(copy, str, len);
> + copy[len] = '\n';
> + copy[len+1] = '\0';
> + rcu_read_unlock();

*Ow*

If the string can't change under you, what is RCU use about?
And if it can, any use of string functions is asking for serious
trouble; they are *not* guaranteed to be safe when any of the strings
involved might be modified under them.
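
If the string really is meant to be replaceable, the sane pattern is to never
touch it in place - an update allocates a fresh copy, publishes the new
pointer and frees the old one after a grace period, so readers under
rcu_read_lock() always see a stable, fully formed string.  A minimal sketch
(not what the patch does):

	struct str_holder {
		char __rcu *str;	/* always a complete, NUL-terminated copy */
	};

	/* writers must serialize among themselves */
	static int set_string(struct str_holder *h, const char *val)
	{
		char *new = kstrdup(val, GFP_KERNEL);
		char *old;

		if (!new)
			return -ENOMEM;
		old = rcu_dereference_protected(h->str, true);
		rcu_assign_pointer(h->str, new);
		synchronize_rcu();	/* wait out readers of the old copy */
		kfree(old);
		return 0;
	}

	static ssize_t read_string(struct str_holder *h, char *buf, size_t size)
	{
		ssize_t len;

		rcu_read_lock();
		len = strscpy(buf, rcu_dereference(h->str), size);
		rcu_read_unlock();
		return len;
	}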


Re: [PATCH v1 1/1] kernel.h: Drop inclusion in bitmap.h

2021-03-27 Thread Al Viro
On Fri, Mar 26, 2021 at 07:03:47PM +0200, Andy Shevchenko wrote:
> The bitmap.h header is used in a lot of code around the kernel.
> Besides that it includes kernel.h which sometimes makes a loop.

How much of the kernel does *not* end up pulling kernel.h anyway,
making all subsequent includes fast no-ops?

Realistically, any C compiler is going to recognize the case when
included file has contents of form

#ifndef <symbol>
#define <symbol> <whatever>
<contents>
#endif

where <contents> contain no exposed #else, #elif or #endif and
remember that subsequent includes of that file should be ignored
unless <symbol> becomes undefined.

AFAICS, on e.g. allmodconfig amd64 build it's (at most - there
might've been false positives) 131 files out of ~3; on
defconfig it's 55 out of ~2300.  How much does your patch
change that?


Re: [PATCH 3/4] exec: simplify the compat syscall handling

2021-03-26 Thread Al Viro
On Fri, Mar 26, 2021 at 03:38:30PM +0100, Christoph Hellwig wrote:

> -static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr)
> +static const char __user *
> +get_user_arg_ptr(const char __user *const __user *argv, int nr)
>  {
> - const char __user *native;
> -
> -#ifdef CONFIG_COMPAT
> - if (unlikely(argv.is_compat)) {
> + if (in_compat_syscall()) {
> + const compat_uptr_t __user *compat_argv =
> + compat_ptr((unsigned long)argv);
>   compat_uptr_t compat;
>  
> - if (get_user(compat, argv.ptr.compat + nr))
> + if (get_user(compat, compat_argv + nr))
>   return ERR_PTR(-EFAULT);
> -
>   return compat_ptr(compat);
> - }
> -#endif
> -
> - if (get_user(native, argv.ptr.native + nr))
> - return ERR_PTR(-EFAULT);
> + } else {
> + const char __user *native;
>  
> - return native;
> + if (get_user(native, argv + nr))
> + return ERR_PTR(-EFAULT);
> + return native;
> + }
>  }

Yecchhh  So you have in_compat_syscall() called again and again, for
each argument in the list?  I agree that current version is fucking ugly,
but I really hate that approach ;-/
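
If the point is to kill the union, at least sample it once per execve() and
pass the result down, e.g. (untested sketch):

	static const char __user *
	get_user_arg_ptr(const char __user *const __user *argv, int nr, bool compat)
	{
		if (compat) {
			const compat_uptr_t __user *compat_argv =
				compat_ptr((unsigned long)argv);
			compat_uptr_t p;

			if (get_user(p, compat_argv + nr))
				return ERR_PTR(-EFAULT);
			return compat_ptr(p);
		} else {
			const char __user *native;

			if (get_user(native, argv + nr))
				return ERR_PTR(-EFAULT);
			return native;
		}
	}

with the callers (count(), copy_strings() et.al.) doing
bool compat = in_compat_syscall(); once and passing it along.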


Re: [PATCH 01/18] vfs: add miscattr ops

2021-03-25 Thread Al Viro
On Thu, Mar 25, 2021 at 03:52:28PM +0100, Miklos Szeredi wrote:
> On Tue, Mar 23, 2021 at 1:24 AM Eric Biggers  wrote:
> 
> > > +int vfs_miscattr_set(struct dentry *dentry, struct miscattr *ma)
> > > +{
> > > + struct inode *inode = d_inode(dentry);
> > > + struct miscattr old_ma = {};
> > > + int err;
> > > +
> > > + if (d_is_special(dentry))
> > > + return -ENOTTY;
> > > +
> > > + if (!inode->i_op->miscattr_set)
> > > + return -ENOIOCTLCMD;
> > > +
> > > + if (!inode_owner_or_capable(inode))
> > > + return -EPERM;
> >
> > Shouldn't this be EACCES, not EPERM?
> 
> $ git diff master.. | grep -C1 inode_owner_or_capable | grep
> "^-.*\(EPERM\|EACCES\)" | cut -d- -f3 | sort | uniq -c
>  12 EACCES;
>   4 EPERM;
> 
> So EACCES would win if this was a democracy.  However:
> 
> "[EACCES]
> Permission denied. An attempt was made to access a file in a way
> forbidden by its file access permissions."
> 
> "[EPERM]
> Operation not permitted. An attempt was made to perform an operation
> limited to processes with appropriate privileges or to the owner of a
> file or other resource."
> 
> The EPERM description matches the semantics of
> inode_owner_or_capable() exactly.  It's a pretty clear choice.

Except that existing implementation (e.g. for ext2) gives -EACCES here...
OTOH, EPERM matches the behaviour of chown(2), as well as that of
*BSD chflags(2), which is the best match to functionality (setting and
clearing immutable/append-only/etc.)

So I'd probably go with EPERM, and watch for userland breakage;
if something *does* rely upon the historical EACCES here, we might
have to restore that.


Re: [PATCH v2 01/18] vfs: add miscattr ops

2021-03-24 Thread Al Viro
On Wed, Mar 24, 2021 at 09:45:02AM +0100, Miklos Szeredi wrote:
> On Wed, Mar 24, 2021 at 6:03 AM Al Viro  wrote:
> >
> > On Mon, Mar 22, 2021 at 03:48:59PM +0100, Miklos Szeredi wrote:
> >
> > minor nit: copy_fsxattr_{to,from}_user() might be better.
> >
> > > +int fsxattr_copy_to_user(const struct miscattr *ma, struct fsxattr 
> > > __user *ufa)
> > > +{
> > > + struct fsxattr fa = {
> > > + .fsx_xflags = ma->fsx_xflags,
> > > + .fsx_extsize= ma->fsx_extsize,
> > > + .fsx_nextents   = ma->fsx_nextents,
> > > + .fsx_projid = ma->fsx_projid,
> > > + .fsx_cowextsize = ma->fsx_cowextsize,
> > > + };
> >
> > That wants a comment along the lines of "guaranteed to be gap-free",
> > since otherwise you'd need memset() to avoid an infoleak.
> 
> Isn't structure initialization supposed to zero everything not
> explicitly initialized?

All fields, but not the padding...
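
To spell it out: a designated initializer is only guaranteed to zero the
named and omitted *members*; padding bytes may contain whatever junk was on
the stack, and copy_to_user() will happily ship that junk to userland.  So
it's either "this struct is known to be gap-free" (with a comment to that
effect) or the boring variant (sketch):

	int fsxattr_copy_to_user(const struct miscattr *ma, struct fsxattr __user *ufa)
	{
		struct fsxattr fa;

		memset(&fa, 0, sizeof(fa));	/* zeroes padding as well as members */
		fa.fsx_xflags     = ma->fsx_xflags;
		fa.fsx_extsize    = ma->fsx_extsize;
		fa.fsx_nextents   = ma->fsx_nextents;
		fa.fsx_projid     = ma->fsx_projid;
		fa.fsx_cowextsize = ma->fsx_cowextsize;

		return copy_to_user(ufa, &fa, sizeof(fa)) ? -EFAULT : 0;
	}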

> The one in io_uring() seems wrong also, as a beast needing
> file_dentry() should never get out of overlayfs and into io_uring:

That one would be wrong in overlayfs as well - we'd better had the
same names in all layers...

> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -9297,7 +9297,7 @@ static void __io_uring_show_fdinfo(struct
> io_ring_ctx *ctx, struct seq_file *m)
> struct file *f = *io_fixed_file_slot(ctx->file_data, i);
> 
> if (f)
> -   seq_printf(m, "%5u: %s\n", i, 
> file_dentry(f)->d_iname);
> +   seq_printf(m, "%5u: %pD\n", i, f);
> else
> seq_printf(m, "%5u: \n", i);
> }
> 
> 
> Thanks,
> Miklos


Re: [PATCH v2 03/18] ovl: stack miscattr ops

2021-03-23 Thread Al Viro
On Wed, Mar 24, 2021 at 05:09:59AM +, Al Viro wrote:
> On Mon, Mar 22, 2021 at 03:49:01PM +0100, Miklos Szeredi wrote:

> Umm...  No equivalents of
> /*  
>  * Prevent copy up if immutable and has no CAP_LINUX_IMMUTABLE
>  * capability.
>  */ 
> ret = -EPERM;
> if (!ovl_has_upperdata(inode) && IS_IMMUTABLE(inode) &&
> !capable(CAP_LINUX_IMMUTABLE))
> goto unlock;
> 

Nevermind, you take care of that in the caller...


Re: [PATCH v2 03/18] ovl: stack miscattr ops

2021-03-23 Thread Al Viro
On Mon, Mar 22, 2021 at 03:49:01PM +0100, Miklos Szeredi wrote:

> +int ovl_miscattr_set(struct user_namespace *mnt_userns,
> +  struct dentry *dentry, struct miscattr *ma)
> +{
> + struct inode *inode = d_inode(dentry);
> + struct dentry *upperdentry;
> + const struct cred *old_cred;
> + int err;
> +
> + err = ovl_want_write(dentry);
> + if (err)
> + goto out;
> +
> + err = ovl_copy_up(dentry);
> + if (!err) {
> + upperdentry = ovl_dentry_upper(dentry);
> +
> + old_cred = ovl_override_creds(inode->i_sb);
> + err = ovl_security_miscattr(dentry, ma, true);
> + if (!err)
> + err = vfs_miscattr_set(&init_user_ns, upperdentry, ma);
> + revert_creds(old_cred);
> + ovl_copyflags(ovl_inode_real(inode), inode);
> + }
> + ovl_drop_write(dentry);
> +out:
> + return err;
> +}

Umm...  No equivalents of
/*  
 * Prevent copy up if immutable and has no CAP_LINUX_IMMUTABLE
 * capability.
 */ 
ret = -EPERM;
if (!ovl_has_upperdata(inode) && IS_IMMUTABLE(inode) &&
!capable(CAP_LINUX_IMMUTABLE))
goto unlock;

ret = ovl_maybe_copy_up(file_dentry(file), O_WRONLY);
if (ret)
goto unlock;
in the current tree?


  1   2   3   4   5   6   7   8   9   10   >