Ok, sorry for the long delay, I was distracted (and hoping that Al
would come up with a patch).

Anyway, attached is the patch I think we should do for this issue. It
is fairly simple:

 - it adds a "f_pos_mutex" to the "struct file".

 - it adds a new FMODE_ATOMIC_POS flag to the file mode flags to mark
things that need atomic f_pos updates

 - it makes the "struct fd" flags be two flags rather than one: the
second flag is for "unlock f_pos_mutex when done"

 - it introduces "fd[get,put]_pos()" which gets the f_pos_mutex when required

 - it makes read/write/lseek use that.

It's pretty damn straightforward, I think, and is minimally serializing.

Al, comments? Yongzhi Pan, this is pretty much untested, but it's
pretty simple and it does fix your test-case.

                       Linus

On Thu, Feb 20, 2014 at 9:14 AM, Linus Torvalds
<[email protected]> wrote:
> Yes, I do think we violate POSIX here because of how we handle f_pos
> (the earlier thread from 2006 you point to talks about the "thread
> safe" part, the point here about the actual wording of "atomic" is a
> separate issue).
>
> Long long ago we used to just pass in the pointer to f_pos directly,
> and then the low-level write would update it all under the inode
> semaphore, and all was good.
>
> And then we ended up having tons of problems with non-regular files
> and drivers accessing f_pos and having nasty races with it because
> they did *not* have any locking (and very fundamentally didn't want
> any), and we broke the serialization of f_pos. We still do the *IO*
> atomically, but yes, the f_pos access itself is outside the lock.
>
> Ho humm.. Al, any ideas of how to fix this?
 fs/file_table.c      |  1 +
 fs/open.c            |  4 ++++
 fs/read_write.c      | 46 ++++++++++++++++++++++++++++++++++++----------
 include/linux/file.h |  6 +++---
 include/linux/fs.h   |  6 +++++-
 5 files changed, 49 insertions(+), 14 deletions(-)

diff --git a/fs/file_table.c b/fs/file_table.c
index 5fff9030be34..5b24008ea4f6 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -135,6 +135,7 @@ struct file *get_empty_filp(void)
        atomic_long_set(&f->f_count, 1);
        rwlock_init(&f->f_owner.lock);
        spin_lock_init(&f->f_lock);
+       mutex_init(&f->f_pos_lock);
        eventpoll_init_file(f);
        /* f->f_version: 0 */
        return f;
diff --git a/fs/open.c b/fs/open.c
index 4b3e1edf2fe4..b9ed8b25c108 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -705,6 +705,10 @@ static int do_dentry_open(struct file *f,
                return 0;
        }
 
+       /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
+       if (S_ISREG(inode->i_mode))
+               f->f_mode |= FMODE_ATOMIC_POS;
+
        f->f_op = fops_get(inode->i_fop);
        if (unlikely(WARN_ON(!f->f_op))) {
                error = -ENODEV;
diff --git a/fs/read_write.c b/fs/read_write.c
index edc5746a902a..9273ec92f07e 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -264,10 +264,36 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int 
whence)
 }
 EXPORT_SYMBOL(vfs_llseek);
 
+/*
+ * We only lock f_pos if we have threads (in which case "need_put"
+ * will be set) or if the file might be shared with another process
+ * (in which case the file count is elevated).
+ */
+static struct fd fdget_pos(int fd)
+{
+       struct fd f = fdget(fd);
+       struct file *file = f.file;
+
+       if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
+               if (f.need_put || file_count(file) > 1) {
+                       f.need_pos_unlock = 1;
+                       mutex_lock(&file->f_pos_lock);
+               }
+       }
+       return f;
+}
+
+static void fdput_pos(struct fd f)
+{
+       if (f.need_pos_unlock)
+               mutex_unlock(&f.file->f_pos_lock);
+       fdput(f);
+}
+
 SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
 {
        off_t retval;
-       struct fd f = fdget(fd);
+       struct fd f = fdget_pos(fd);
        if (!f.file)
                return -EBADF;
 
@@ -278,7 +304,7 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, 
unsigned int, whence)
                if (res != (loff_t)retval)
                        retval = -EOVERFLOW;    /* LFS: should only happen on 
32 bit platforms */
        }
-       fdput(f);
+       fdput_pos(f);
        return retval;
 }
 
@@ -498,7 +524,7 @@ static inline void file_pos_write(struct file *file, loff_t 
pos)
 
 SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 {
-       struct fd f = fdget(fd);
+       struct fd f = fdget_pos(fd);
        ssize_t ret = -EBADF;
 
        if (f.file) {
@@ -506,7 +532,7 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, 
size_t, count)
                ret = vfs_read(f.file, buf, count, &pos);
                if (ret >= 0)
                        file_pos_write(f.file, pos);
-               fdput(f);
+               fdput_pos(f);
        }
        return ret;
 }
@@ -514,7 +540,7 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, 
size_t, count)
 SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
                size_t, count)
 {
-       struct fd f = fdget(fd);
+       struct fd f = fdget_pos(fd);
        ssize_t ret = -EBADF;
 
        if (f.file) {
@@ -522,7 +548,7 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user 
*, buf,
                ret = vfs_write(f.file, buf, count, &pos);
                if (ret >= 0)
                        file_pos_write(f.file, pos);
-               fdput(f);
+               fdput_pos(f);
        }
 
        return ret;
@@ -797,7 +823,7 @@ EXPORT_SYMBOL(vfs_writev);
 SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
                unsigned long, vlen)
 {
-       struct fd f = fdget(fd);
+       struct fd f = fdget_pos(fd);
        ssize_t ret = -EBADF;
 
        if (f.file) {
@@ -805,7 +831,7 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct 
iovec __user *, vec,
                ret = vfs_readv(f.file, vec, vlen, &pos);
                if (ret >= 0)
                        file_pos_write(f.file, pos);
-               fdput(f);
+               fdput_pos(f);
        }
 
        if (ret > 0)
@@ -817,7 +843,7 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct 
iovec __user *, vec,
 SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
                unsigned long, vlen)
 {
-       struct fd f = fdget(fd);
+       struct fd f = fdget_pos(fd);
        ssize_t ret = -EBADF;
 
        if (f.file) {
@@ -825,7 +851,7 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct 
iovec __user *, vec,
                ret = vfs_writev(f.file, vec, vlen, &pos);
                if (ret >= 0)
                        file_pos_write(f.file, pos);
-               fdput(f);
+               fdput_pos(f);
        }
 
        if (ret > 0)
diff --git a/include/linux/file.h b/include/linux/file.h
index cbacf4faf447..636634a09c92 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -28,7 +28,7 @@ static inline void fput_light(struct file *file, int 
fput_needed)
 
 struct fd {
        struct file *file;
-       int need_put;
+       unsigned need_put:1, need_pos_unlock:1;
 };
 
 static inline void fdput(struct fd fd)
@@ -44,7 +44,7 @@ static inline struct fd fdget(unsigned int fd)
 {
        int b;
        struct file *f = fget_light(fd, &b);
-       return (struct fd){f,b};
+       return (struct fd){f,b,0};
 }
 
 extern struct file *fget_raw(unsigned int fd);
@@ -54,7 +54,7 @@ static inline struct fd fdget_raw(unsigned int fd)
 {
        int b;
        struct file *f = fget_raw_light(fd, &b);
-       return (struct fd){f,b};
+       return (struct fd){f,b,0};
 }
 
 extern int f_dupfd(unsigned int from, struct file *file, unsigned flags);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 60829565e552..ebfde04bca06 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -123,6 +123,9 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t 
offset,
 /* File is opened with O_PATH; almost nothing can be done with it */
 #define FMODE_PATH             ((__force fmode_t)0x4000)
 
+/* File needs atomic accesses to f_pos */
+#define FMODE_ATOMIC_POS       ((__force fmode_t)0x8000)
+
 /* File was opened by fanotify and shouldn't generate fanotify events */
 #define FMODE_NONOTIFY         ((__force fmode_t)0x1000000)
 
@@ -780,13 +783,14 @@ struct file {
        const struct file_operations    *f_op;
 
        /*
-        * Protects f_ep_links, f_flags, f_pos vs i_size in lseek SEEK_CUR.
+        * Protects f_ep_links, f_flags.
         * Must not be taken from IRQ context.
         */
        spinlock_t              f_lock;
        atomic_long_t           f_count;
        unsigned int            f_flags;
        fmode_t                 f_mode;
+       struct mutex            f_pos_lock;
        loff_t                  f_pos;
        struct fown_struct      f_owner;
        const struct cred       *f_cred;

Reply via email to