On 2/4/26 10:03 PM, Waiman Long wrote:
On 2/4/26 3:18 PM, Al Viro wrote:
On Wed, Feb 04, 2026 at 01:16:15PM -0500, Waiman Long wrote:
Thanks for the detailed explanation. I am thinking about something like
the code diff below. Of course, there are other corner cases like
unshare(2)
that still needs to be handled. Do you think something like this is
viable?
Deadlocks aside, the immediate problem here is that consensus number
is too
low. Take three threads sharing the same fs_struct instance. The
first one
calls your get_fs_pwd_share(); then the remaining two threads call
set_fs_pwd()
(e.g. by calling chdir(2) in userland code). The reference stored into
fs->pwd_waiter by the first of those two gets overwritten by that stored
by the second. When the caller of get_fs_pwd_share() gets to
put_fs_pwd_share(),
only one of the sleepers gets woken up...
And it's very easy to end up with something as simple as chdir("foo")
deadlocking -
we start with resolving the relative pathname we'd been given, audit
wants to
record the current directory, on the theory that relative pathname is
none too
useful in logs without knowing what had it been relative to. Then, in
the
same thread, you call set_fs_pwd() - after all, that's the main
effect of chdir(2).
Deadlock...
IOW, it's not just unshare(2) that needs to be taken care of -
chdir(2) would need
to be treated differently.
Now I realize that there is indeed a deadlock problem. Scrap that. Now
I have a simpler idea that shouldn't have this type of deadlock
problem. So what do you think about the sample code below?
Well, a complete change log is as follows.
Cheers,
Longman
=======================[ Cut here ]================================
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index b8c46c5a38a0..67e08d8db058 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -32,15 +32,19 @@ void set_fs_root(struct fs_struct *fs, const struct
path *p>
void set_fs_pwd(struct fs_struct *fs, const struct path *path)
{
struct path old_pwd;
+ int count;
path_get(path);
write_seqlock(&fs->seq);
old_pwd = fs->pwd;
fs->pwd = *path;
+ count = fs->pwd_xrefs + 1;
+ fs->pwd_xrefs = 0;
write_sequnlock(&fs->seq);
if (old_pwd.dentry)
- path_put(&old_pwd);
+ while (count--)
+ path_put(&old_pwd);
}
static inline int replace_path(struct path *p, const struct path *old,
const s>
@@ -70,6 +74,8 @@ void chroot_fs_refs(const struct path *old_root, const
struct>
count++;
path_get(new_root);
}
+ count += fs->pwd_xrefs;
+ fs->pwd_xrefs = 0;
write_sequnlock(&fs->seq);
}
task_unlock(p);
@@ -81,8 +87,11 @@ void chroot_fs_refs(const struct path *old_root,
const struc>
void free_fs_struct(struct fs_struct *fs)
{
+ int count = fs->pwd_xrefs + 1;
+
path_put(&fs->root);
- path_put(&fs->pwd);
+ while (count--)
+ path_put(&fs->pwd);
kmem_cache_free(fs_cachep, fs);
}
@@ -110,6 +119,7 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
if (fs) {
fs->users = 1;
fs->in_exec = 0;
+ fs->pwd_xrefs = 0;
seqlock_init(&fs->seq);
fs->umask = old->umask;
diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h
index 0070764b790a..0d79d51de240 100644
--- a/include/linux/fs_struct.h
+++ b/include/linux/fs_struct.h
@@ -8,10 +8,11 @@
#include <linux/seqlock.h>
struct fs_struct {
- int users;
seqlock_t seq;
+ int users;
int umask;
int in_exec;
+ int pwd_xrefs; /* Extra references of pwd */
struct path root, pwd;
} __randomize_layout;
@@ -40,6 +41,31 @@ static inline void get_fs_pwd(struct fs_struct *fs,
struct p>
read_sequnlock_excl(&fs->seq);
}
+static inline void get_fs_pwd_share(struct fs_struct *fs, struct path *pwd)
+{
+ read_seqlock_excl(&fs->seq);
+ *pwd = fs->pwd;
+ if (fs->pwd_xrefs)
+ fs->pwd_xrefs--;
+ else
+ path_get(pwd);
+ read_sequnlock_excl(&fs->seq);
+}
+
+static inline void put_fs_pwd_share(struct fs_struct *fs, struct path *pwd)
+{
+ bool put = false;
+
+ read_seqlock_excl(&fs->seq);
+ if ((fs->pwd.dentry == pwd->dentry) && (fs->pwd.mnt == pwd->mnt))
+ fs->pwd_xrefs++;
+ else
+ put = true;
+ read_sequnlock_excl(&fs->seq);
+ if (put)
+ path_put(pwd);
+}
+
extern bool current_chrooted(void);
static inline int current_umask(void)