The most obvious syscall to add support for the new LOOKUP_* scoping
flags would be openat(2). However, there are a few reasons to not do
this:

 * The new LOOKUP_* flags are intended to be security features, and
   openat(2) will silently ignore all unknown flags. This means that
   users would need to avoid foot-gunning themselves constantly when
   using this interface if it were part of openat(2).

 * Resolution scoping feels like a different operation to the existing
   O_* flags. And since openat(2) has limited flag space, it seems to be
   quite wasteful to clutter it with 5 flags that are all
   resolution-related. Arguably O_NOFOLLOW is also a resolution flag but
   its entire purpose is to error out if you encounter a trailing
   symlink -- not to scope resolution.

 * Other systems would be able to reimplement this syscall, allowing for
   cross-OS standardisation. If the feature were instead hidden amongst
   O_* flags, it might not be used by all the parties that might want to
   use it (file servers, web servers, container runtimes, etc).

 * It gives us the opportunity to iterate on the O_PATH interface. In
   particular, the new @how->upgrade_mask field for fd re-opening is
   only possible because we have a clean slate without needing to re-use
   the ACC_MODE flag design nor the existing openat(2) @mode semantics.

To this end, we introduce the openat2(2) syscall. It provides all of the
features of openat(2) through the @how->flags argument, but
also provides a new @how->resolve argument which exposes RESOLVE_* flags
that map to our new LOOKUP_* flags. It also eliminates the long-standing
ugliness of variadic-open(2) by embedding it in a struct.

In order to allow for userspace to lock down their usage of file
descriptor re-opening, openat2(2) has the ability for users to disallow
certain re-opening modes through @how->upgrade_mask. At the moment,
there is no UPGRADE_NOEXEC.

Co-developed-by: Christian Brauner <christ...@brauner.io>
Signed-off-by: Aleksa Sarai <cyp...@cyphar.com>
---
 arch/alpha/kernel/syscalls/syscall.tbl      |   1 +
 arch/arm/tools/syscall.tbl                  |   1 +
 arch/arm64/include/asm/unistd.h             |   2 +-
 arch/arm64/include/asm/unistd32.h           |   2 +
 arch/ia64/kernel/syscalls/syscall.tbl       |   1 +
 arch/m68k/kernel/syscalls/syscall.tbl       |   1 +
 arch/microblaze/kernel/syscalls/syscall.tbl |   1 +
 arch/mips/kernel/syscalls/syscall_n32.tbl   |   1 +
 arch/mips/kernel/syscalls/syscall_n64.tbl   |   1 +
 arch/mips/kernel/syscalls/syscall_o32.tbl   |   1 +
 arch/parisc/kernel/syscalls/syscall.tbl     |   1 +
 arch/powerpc/kernel/syscalls/syscall.tbl    |   1 +
 arch/s390/kernel/syscalls/syscall.tbl       |   1 +
 arch/sh/kernel/syscalls/syscall.tbl         |   1 +
 arch/sparc/kernel/syscalls/syscall.tbl      |   1 +
 arch/x86/entry/syscalls/syscall_32.tbl      |   1 +
 arch/x86/entry/syscalls/syscall_64.tbl      |   1 +
 arch/xtensa/kernel/syscalls/syscall.tbl     |   1 +
 fs/open.c                                   | 136 ++++++++++++++------
 include/linux/fcntl.h                       |  15 ++-
 include/linux/fs.h                          |   4 +-
 include/linux/syscalls.h                    |  14 +-
 include/uapi/asm-generic/unistd.h           |   5 +-
 include/uapi/linux/fcntl.h                  |  38 ++++++
 24 files changed, 186 insertions(+), 46 deletions(-)

diff --git a/arch/alpha/kernel/syscalls/syscall.tbl 
b/arch/alpha/kernel/syscalls/syscall.tbl
index 9e7704e44f6d..1703d048c141 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -461,6 +461,7 @@
 530    common  getegid                         sys_getegid
 531    common  geteuid                         sys_geteuid
 532    common  getppid                         sys_getppid
+533    common  openat2                         sys_openat2
 # all other architectures have common numbers for new syscall, alpha
 # is the exception.
 534    common  pidfd_send_signal               sys_pidfd_send_signal
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index aaf479a9e92d..4ad262698396 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -447,3 +447,4 @@
 431    common  fsconfig                        sys_fsconfig
 432    common  fsmount                         sys_fsmount
 433    common  fspick                          sys_fspick
+434    common  openat2                         sys_openat2
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index c9f8dd421c5f..0d4aa3e5389e 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -33,7 +33,7 @@
 #define __ARM_NR_compat_set_tls                (__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END            (__ARM_NR_COMPAT_BASE + 0x800)
 
-#define __NR_compat_syscalls           434
+#define __NR_compat_syscalls           435
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h 
b/arch/arm64/include/asm/unistd32.h
index aa995920bd34..b134419c0421 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -875,6 +875,8 @@ __SYSCALL(__NR_fsconfig, sys_fsconfig)
 __SYSCALL(__NR_fsmount, sys_fsmount)
 #define __NR_fspick 433
 __SYSCALL(__NR_fspick, sys_fspick)
+#define __NR_openat2 434
+__SYSCALL(__NR_openat2, sys_openat2)
 
 /*
  * Please add new compat syscalls above this comment and update
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl 
b/arch/ia64/kernel/syscalls/syscall.tbl
index e01df3f2f80d..28d954acf214 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -354,3 +354,4 @@
 431    common  fsconfig                        sys_fsconfig
 432    common  fsmount                         sys_fsmount
 433    common  fspick                          sys_fspick
+434    common  openat2                         sys_openat2
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl 
b/arch/m68k/kernel/syscalls/syscall.tbl
index 7e3d0734b2f3..b744b1a1c80e 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -433,3 +433,4 @@
 431    common  fsconfig                        sys_fsconfig
 432    common  fsmount                         sys_fsmount
 433    common  fspick                          sys_fspick
+434    common  openat2                         sys_openat2
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl 
b/arch/microblaze/kernel/syscalls/syscall.tbl
index 26339e417695..bee07b73a898 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -439,3 +439,4 @@
 431    common  fsconfig                        sys_fsconfig
 432    common  fsmount                         sys_fsmount
 433    common  fspick                          sys_fspick
+434    common  openat2                         sys_openat2
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl 
b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 0e2dd68ade57..a3ec5e27630a 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -372,3 +372,4 @@
 431    n32     fsconfig                        sys_fsconfig
 432    n32     fsmount                         sys_fsmount
 433    n32     fspick                          sys_fspick
+434    n32     openat2                         sys_openat2
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl 
b/arch/mips/kernel/syscalls/syscall_n64.tbl
index 5eebfa0d155c..3503ac6ef482 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -348,3 +348,4 @@
 431    n64     fsconfig                        sys_fsconfig
 432    n64     fsmount                         sys_fsmount
 433    n64     fspick                          sys_fspick
+434    n64     openat2                         sys_openat2
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl 
b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 3cc1374e02d0..e901367371c4 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -421,3 +421,4 @@
 431    o32     fsconfig                        sys_fsconfig
 432    o32     fsmount                         sys_fsmount
 433    o32     fspick                          sys_fspick
+434    o32     openat2                         sys_openat2                     
sys_openat2
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl 
b/arch/parisc/kernel/syscalls/syscall.tbl
index c9e377d59232..5758b0826e4d 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -430,3 +430,4 @@
 431    common  fsconfig                        sys_fsconfig
 432    common  fsmount                         sys_fsmount
 433    common  fspick                          sys_fspick
+434    common  openat2                         sys_openat2
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl 
b/arch/powerpc/kernel/syscalls/syscall.tbl
index 103655d84b4b..9e8377d5b2f6 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -515,3 +515,4 @@
 431    common  fsconfig                        sys_fsconfig
 432    common  fsmount                         sys_fsmount
 433    common  fspick                          sys_fspick
+434    common  openat2                         sys_openat2
diff --git a/arch/s390/kernel/syscalls/syscall.tbl 
b/arch/s390/kernel/syscalls/syscall.tbl
index e822b2964a83..d6e8eaa20d44 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -436,3 +436,4 @@
 431  common    fsconfig                sys_fsconfig                    
sys_fsconfig
 432  common    fsmount                 sys_fsmount                     
sys_fsmount
 433  common    fspick                  sys_fspick                      
sys_fspick
+434  common    openat2                 sys_openat2                     
sys_openat2
diff --git a/arch/sh/kernel/syscalls/syscall.tbl 
b/arch/sh/kernel/syscalls/syscall.tbl
index 016a727d4357..dc38733d264b 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -436,3 +436,4 @@
 431    common  fsconfig                        sys_fsconfig
 432    common  fsmount                         sys_fsmount
 433    common  fspick                          sys_fspick
+434    common  openat2                         sys_openat2
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl 
b/arch/sparc/kernel/syscalls/syscall.tbl
index e047480b1605..cfeb24ac5299 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -479,3 +479,4 @@
 431    common  fsconfig                        sys_fsconfig
 432    common  fsmount                         sys_fsmount
 433    common  fspick                          sys_fspick
+434    common  openat2                 sys_openat2
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl 
b/arch/x86/entry/syscalls/syscall_32.tbl
index ad968b7bac72..1d76a0e84f42 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -438,3 +438,4 @@
 431    i386    fsconfig                sys_fsconfig                    
__ia32_sys_fsconfig
 432    i386    fsmount                 sys_fsmount                     
__ia32_sys_fsmount
 433    i386    fspick                  sys_fspick                      
__ia32_sys_fspick
+434    i386    openat2                 sys_openat2                     
__ia32_sys_openat2
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl 
b/arch/x86/entry/syscalls/syscall_64.tbl
index b4e6f9e6204a..828bade2e505 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -355,6 +355,7 @@
 431    common  fsconfig                __x64_sys_fsconfig
 432    common  fsmount                 __x64_sys_fsmount
 433    common  fspick                  __x64_sys_fspick
+434    common  openat2                 __x64_sys_openat2
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl 
b/arch/xtensa/kernel/syscalls/syscall.tbl
index 5fa0ee1c8e00..78ed6e97d3ae 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -404,3 +404,4 @@
 431    common  fsconfig                        sys_fsconfig
 432    common  fsmount                         sys_fsmount
 433    common  fspick                          sys_fspick
+434    common  openat2                         sys_openat2
diff --git a/fs/open.c b/fs/open.c
index bdca45528524..63278294d1d4 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -928,24 +928,32 @@ struct file *open_with_fake_path(const struct path *path, 
int flags,
 }
 EXPORT_SYMBOL(open_with_fake_path);
 
-static inline int build_open_flags(int flags, umode_t mode, struct open_flags 
*op)
+static inline int build_open_flags(struct open_how how, struct open_flags *op)
 {
        int lookup_flags = 0;
-       int acc_mode = ACC_MODE(flags);
+       int opath_mask = 0;
+       int acc_mode = ACC_MODE(how.flags);
+
+       if (how.resolve & ~VALID_RESOLVE_FLAGS)
+               return -EINVAL;
+       if (!(how.flags & (O_PATH | O_CREAT | __O_TMPFILE)) && how.mode != 0)
+               return -EINVAL;
+       if (memchr_inv(how.reserved, 0, sizeof(how.reserved)))
+               return -EINVAL;
 
        /*
         * Clear out all open flags we don't know about so that we don't report
         * them in fcntl(F_GETFD) or similar interfaces.
         */
-       flags &= VALID_OPEN_FLAGS;
+       how.flags &= VALID_OPEN_FLAGS;
 
-       if (flags & (O_CREAT | __O_TMPFILE))
-               op->mode = (mode & S_IALLUGO) | S_IFREG;
+       if (how.flags & (O_CREAT | __O_TMPFILE))
+               op->mode = (how.mode & S_IALLUGO) | S_IFREG;
        else
                op->mode = 0;
 
        /* Must never be set by userspace */
-       flags &= ~FMODE_NONOTIFY & ~O_CLOEXEC;
+       how.flags &= ~FMODE_NONOTIFY & ~O_CLOEXEC;
 
        /*
         * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
@@ -953,51 +961,70 @@ static inline int build_open_flags(int flags, umode_t 
mode, struct open_flags *o
         * always set instead of having to deal with possibly weird behaviour
         * for malicious applications setting only __O_SYNC.
         */
-       if (flags & __O_SYNC)
-               flags |= O_DSYNC;
+       if (how.flags & __O_SYNC)
+               how.flags |= O_DSYNC;
 
-       if (flags & __O_TMPFILE) {
-               if ((flags & O_TMPFILE_MASK) != O_TMPFILE)
+       if (how.flags & __O_TMPFILE) {
+               if ((how.flags & O_TMPFILE_MASK) != O_TMPFILE)
                        return -EINVAL;
                if (!(acc_mode & MAY_WRITE))
                        return -EINVAL;
-       } else if (flags & O_PATH) {
+       } else if (how.flags & O_PATH) {
                /*
                 * If we have O_PATH in the open flag. Then we
                 * cannot have anything other than the below set of flags
                 */
-               flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
+               how.flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
                acc_mode = 0;
+
+               /* Allow userspace to restrict the re-opening of O_PATH fds. */
+               if (how.upgrade_mask & ~VALID_UPGRADE_FLAGS)
+                       return -EINVAL;
+               if (!(how.upgrade_mask & UPGRADE_NOREAD))
+                       opath_mask |= FMODE_PATH_READ;
+               if (!(how.upgrade_mask & UPGRADE_NOWRITE))
+                       opath_mask |= FMODE_PATH_WRITE;
        }
 
-       op->open_flag = flags;
+       op->open_flag = how.flags;
 
        /* O_TRUNC implies we need access checks for write permissions */
-       if (flags & O_TRUNC)
+       if (how.flags & O_TRUNC)
                acc_mode |= MAY_WRITE;
 
        /* Allow the LSM permission hook to distinguish append
           access from general write access. */
-       if (flags & O_APPEND)
+       if (how.flags & O_APPEND)
                acc_mode |= MAY_APPEND;
 
        op->acc_mode = acc_mode;
-       op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;
-       /* For O_PATH backwards-compatibility we default to an all-set mask. */
-       op->opath_mask = FMODE_PATH_READ | FMODE_PATH_WRITE;
+       op->intent = how.flags & O_PATH ? 0 : LOOKUP_OPEN;
+       op->opath_mask = opath_mask;
 
-       if (flags & O_CREAT) {
+       if (how.flags & O_CREAT) {
                op->intent |= LOOKUP_CREATE;
-               if (flags & O_EXCL)
+               if (how.flags & O_EXCL)
                        op->intent |= LOOKUP_EXCL;
        }
 
-       if (flags & O_DIRECTORY)
+       if (how.flags & O_DIRECTORY)
                lookup_flags |= LOOKUP_DIRECTORY;
-       if (!(flags & O_NOFOLLOW))
+       if (!(how.flags & O_NOFOLLOW))
                lookup_flags |= LOOKUP_FOLLOW;
-       if (flags & O_EMPTYPATH)
+       if (how.flags & O_EMPTYPATH)
                lookup_flags |= LOOKUP_EMPTY;
+
+       if (how.resolve & RESOLVE_NO_XDEV)
+               lookup_flags |= LOOKUP_XDEV;
+       if (how.resolve & RESOLVE_NO_MAGICLINKS)
+               lookup_flags |= LOOKUP_NO_MAGICLINKS;
+       if (how.resolve & RESOLVE_NO_SYMLINKS)
+               lookup_flags |= LOOKUP_NO_SYMLINKS;
+       if (how.resolve & RESOLVE_BENEATH)
+               lookup_flags |= LOOKUP_BENEATH;
+       if (how.resolve & RESOLVE_IN_ROOT)
+               lookup_flags |= LOOKUP_IN_ROOT;
+
        op->lookup_flags = lookup_flags;
        return 0;
 }
@@ -1016,8 +1043,14 @@ static inline int build_open_flags(int flags, umode_t 
mode, struct open_flags *o
 struct file *file_open_name(struct filename *name, int flags, umode_t mode)
 {
        struct open_flags op;
-       int err = build_open_flags(flags, mode, &op);
-       return err ? ERR_PTR(err) : do_filp_open(AT_FDCWD, name, &op);
+       struct open_how how = {
+               .flags = flags,
+               .mode = OPENHOW_MODE(flags, mode),
+       };
+       int err = build_open_flags(how, &op);
+       if (err)
+               return ERR_PTR(err);
+       return do_filp_open(AT_FDCWD, name, &op);
 }
 
 /**
@@ -1048,17 +1081,22 @@ struct file *file_open_root(struct dentry *dentry, 
struct vfsmount *mnt,
                            const char *filename, int flags, umode_t mode)
 {
        struct open_flags op;
-       int err = build_open_flags(flags, mode, &op);
+       struct open_how how = {
+               .flags = flags,
+               .mode = OPENHOW_MODE(flags, mode),
+       };
+       int err = build_open_flags(how, &op);
        if (err)
                return ERR_PTR(err);
        return do_file_open_root(dentry, mnt, filename, &op);
 }
 EXPORT_SYMBOL(file_open_root);
 
-long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
+long do_sys_open(int dfd, const char __user *filename,
+                struct open_how *how)
 {
        struct open_flags op;
-       int fd = build_open_flags(flags, mode, &op);
+       int fd = build_open_flags(*how, &op);
        int empty = 0;
        struct filename *tmp;
 
@@ -1071,7 +1109,7 @@ long do_sys_open(int dfd, const char __user *filename, 
int flags, umode_t mode)
        if (!empty)
                op.open_flag &= ~O_EMPTYPATH;
 
-       fd = get_unused_fd_flags(flags);
+       fd = get_unused_fd_flags(how->flags);
        if (fd >= 0) {
                struct file *f = do_filp_open(dfd, tmp, &op);
                if (IS_ERR(f)) {
@@ -1088,19 +1126,35 @@ long do_sys_open(int dfd, const char __user *filename, 
int flags, umode_t mode)
 
 SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
 {
-       if (force_o_largefile())
-               flags |= O_LARGEFILE;
-
-       return do_sys_open(AT_FDCWD, filename, flags, mode);
+       return ksys_open(filename, flags, mode);
 }
 
 SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
                umode_t, mode)
 {
+       struct open_how how = {
+               .flags = flags,
+               .mode = OPENHOW_MODE(flags, mode),
+       };
+
+       if (force_o_largefile())
+               how.flags |= O_LARGEFILE;
+
+       return do_sys_open(dfd, filename, &how);
+}
+
+SYSCALL_DEFINE3(openat2, int, dfd, const char __user *, filename,
+               const struct open_how __user *, how)
+{
+       struct open_how tmp;
+
+       if (copy_from_user(&tmp, how, sizeof(tmp)))
+               return -EFAULT;
+
        if (force_o_largefile())
-               flags |= O_LARGEFILE;
+               tmp.flags |= O_LARGEFILE;
 
-       return do_sys_open(dfd, filename, flags, mode);
+       return do_sys_open(dfd, filename, &tmp);
 }
 
 #ifdef CONFIG_COMPAT
@@ -1110,7 +1164,11 @@ SYSCALL_DEFINE4(openat, int, dfd, const char __user *, 
filename, int, flags,
  */
 COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, 
umode_t, mode)
 {
-       return do_sys_open(AT_FDCWD, filename, flags, mode);
+       struct open_how how = {
+               .flags = flags,
+               .mode = OPENHOW_MODE(flags, mode),
+       };
+       return do_sys_open(AT_FDCWD, filename, &how);
 }
 
 /*
@@ -1119,7 +1177,11 @@ COMPAT_SYSCALL_DEFINE3(open, const char __user *, 
filename, int, flags, umode_t,
  */
 COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, 
flags, umode_t, mode)
 {
-       return do_sys_open(dfd, filename, flags, mode);
+       struct open_how how = {
+               .flags = flags,
+               .mode = OPENHOW_MODE(flags, mode),
+       };
+       return do_sys_open(dfd, filename, &how);
 }
 #endif
 
diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
index 2868ae6c8fc1..e59917292213 100644
--- a/include/linux/fcntl.h
+++ b/include/linux/fcntl.h
@@ -4,13 +4,26 @@
 
 #include <uapi/linux/fcntl.h>
 
-/* list of all valid flags for the open/openat flags argument: */
+/* Legacy open(2)-style wrappers: keep @mode only for O_CREAT/__O_TMPFILE. */
+#define OPENHOW_MODE(flags, mode) \
+       (((flags) & (O_CREAT | __O_TMPFILE)) ? (mode) : 0)
+
+/* List of all valid flags for the open/openat flags argument: */
 #define VALID_OPEN_FLAGS \
        (O_RDONLY | O_WRONLY | O_RDWR | O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC | 
\
         O_APPEND | O_NDELAY | O_NONBLOCK | O_NDELAY | __O_SYNC | O_DSYNC | \
         FASYNC | O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | \
         O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE | O_EMPTYPATH)
 
+/* List of all valid flags for the how->upgrade_mask argument: */
+#define VALID_UPGRADE_FLAGS \
+       (UPGRADE_NOWRITE | UPGRADE_NOREAD)
+
+/* List of all valid flags for the how->resolve argument: */
+#define VALID_RESOLVE_FLAGS \
+       (RESOLVE_NO_XDEV | RESOLVE_NO_MAGICLINKS | RESOLVE_NO_SYMLINKS | \
+        RESOLVE_BENEATH | RESOLVE_IN_ROOT)
+
 #ifndef force_o_largefile
 #define force_o_largefile() (!IS_ENABLED(CONFIG_ARCH_32BIT_OFF_T))
 #endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f7df213405ea..a3aede2b3a91 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2515,8 +2515,8 @@ extern int do_truncate(struct dentry *, loff_t start, 
unsigned int time_attrs,
                       struct file *filp);
 extern int vfs_fallocate(struct file *file, int mode, loff_t offset,
                        loff_t len);
-extern long do_sys_open(int dfd, const char __user *filename, int flags,
-                       umode_t mode);
+extern long do_sys_open(int dfd, const char __user *filename,
+                       struct open_how *how);
 extern struct file *file_open_name(struct filename *, int, umode_t);
 extern struct file *filp_open(const char *, int, umode_t);
 extern struct file *file_open_root(struct dentry *, struct vfsmount *,
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 2bcef4c70183..227303532bb7 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1369,15 +1369,21 @@ static inline int ksys_close(unsigned int fd)
        return __close_fd(current->files, fd);
 }
 
-extern long do_sys_open(int dfd, const char __user *filename, int flags,
-                       umode_t mode);
+extern long do_sys_open(int dfd, const char __user *filename,
+                       struct open_how *how);
 
 static inline long ksys_open(const char __user *filename, int flags,
                             umode_t mode)
 {
+       struct open_how how = {
+               .flags = flags,
+               .mode = OPENHOW_MODE(flags, mode),
+       };
+
        if (force_o_largefile())
-               flags |= O_LARGEFILE;
-       return do_sys_open(AT_FDCWD, filename, flags, mode);
+               how.flags |= O_LARGEFILE;
+
+       return do_sys_open(AT_FDCWD, filename, &how);
 }
 
 extern long do_sys_truncate(const char __user *pathname, loff_t length);
diff --git a/include/uapi/asm-generic/unistd.h 
b/include/uapi/asm-generic/unistd.h
index a87904daf103..67486188918b 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -845,8 +845,11 @@ __SYSCALL(__NR_fsmount, sys_fsmount)
 #define __NR_fspick 433
 __SYSCALL(__NR_fspick, sys_fspick)
 
+#define __NR_openat2 434
+__SYSCALL(__NR_openat2, sys_openat2)
+
 #undef __NR_syscalls
-#define __NR_syscalls 434
+#define __NR_syscalls 435
 
 /*
  * 32 bit systems traditionally used different
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 1d338357df8a..b7c904f0fca9 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -93,5 +93,43 @@
 
 #define AT_RECURSIVE           0x8000  /* Apply to the entire subtree */
 
+/**
+ * Arguments for how openat2(2) should open the target path. If @resolve is zero,
+ * then openat2(2) is identical to openat(2).
+ *
+ * @flags: O_* flags (unknown flags ignored).
+ * @mode: O_CREAT/O_TMPFILE file mode (must be zero otherwise, unless O_PATH).
+ * @upgrade_mask: O_PATH-only re-open restrictions (shares storage with @mode).
+ * @resolve: RESOLVE_* flags (-EINVAL on unknown flags).
+ * @reserved: reserved for future extensions, must be zeroed.
+ */
+struct open_how {
+       __u32 flags;
+       union {
+               __u16 mode;
+               __u16 upgrade_mask;
+       };
+       __u16 resolve;
+       __u64 reserved[7]; /* must be zeroed */
+};
+
+/* how->resolve flags for openat2(2). */
+#define RESOLVE_NO_XDEV                0x01 /* Block mount-point crossings
+                                       (includes bind-mounts). */
+#define RESOLVE_NO_MAGICLINKS  0x02 /* Block traversal through procfs-style
+                                       "magic-links". */
+#define RESOLVE_NO_SYMLINKS    0x04 /* Block traversal through all symlinks
+                                       (implies RESOLVE_NO_MAGICLINKS) */
+#define RESOLVE_BENEATH                0x08 /* Block "lexical" trickery like
+                                       "..", symlinks, and absolute
+                                       paths which escape the dirfd. */
+#define RESOLVE_IN_ROOT                0x10 /* Make all jumps to "/" and ".."
+                                       be scoped inside the dirfd
+                                       (similar to chroot(2)). */
+
+/* how->upgrade_mask flags for openat2(2). */
+/* First bit is reserved for a future UPGRADE_NOEXEC flag. */
+#define UPGRADE_NOREAD         0x02 /* Block re-opening with MAY_READ. */
+#define UPGRADE_NOWRITE                0x04 /* Block re-opening with 
MAY_WRITE. */
 
 #endif /* _UAPI_LINUX_FCNTL_H */
-- 
2.22.0

Reply via email to