Instead of storing all the file pointers in a single array, use an
IDR.  It is RCU-safe, and does not need to be reallocated when the
fd array grows.  It also handles allocation of new file descriptors.

Signed-off-by: Sandhya Bankar <bankarsandhya...@gmail.com>
[mawil...@microsoft.com: fixes]
Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 fs/file.c               | 180 ++++++++++++++++++++----------------------------
 include/linux/fdtable.h |  10 +--
 2 files changed, 79 insertions(+), 111 deletions(-)

diff --git a/fs/file.c b/fs/file.c
index ad6f094..1c000d8 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -47,7 +47,6 @@ static void *alloc_fdmem(size_t size)
 
 static void __free_fdtable(struct fdtable *fdt)
 {
-       kvfree(fdt->fd);
        kvfree(fdt->open_fds);
        kfree(fdt);
 }
@@ -89,15 +88,7 @@ static void copy_fd_bitmaps(struct fdtable *nfdt, struct 
fdtable *ofdt,
  */
 static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
 {
-       unsigned int cpy, set;
-
        BUG_ON(nfdt->max_fds < ofdt->max_fds);
-
-       cpy = ofdt->max_fds * sizeof(struct file *);
-       set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
-       memcpy(nfdt->fd, ofdt->fd, cpy);
-       memset((char *)nfdt->fd + cpy, 0, set);
-
        copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds);
 }
 
@@ -131,15 +122,11 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
        if (!fdt)
                goto out;
        fdt->max_fds = nr;
-       data = alloc_fdmem(nr * sizeof(struct file *));
-       if (!data)
-               goto out_fdt;
-       fdt->fd = data;
 
        data = alloc_fdmem(max_t(size_t,
                                 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), 
L1_CACHE_BYTES));
        if (!data)
-               goto out_arr;
+               goto out_fdt;
        fdt->open_fds = data;
        data += nr / BITS_PER_BYTE;
        fdt->close_on_exec = data;
@@ -148,8 +135,6 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
 
        return fdt;
 
-out_arr:
-       kvfree(fdt->fd);
 out_fdt:
        kfree(fdt);
 out:
@@ -170,6 +155,7 @@ static int expand_fdtable(struct files_struct *files, 
unsigned int nr)
        struct fdtable *new_fdt, *cur_fdt;
 
        spin_unlock(&files->file_lock);
+       idr_preload_end();
        new_fdt = alloc_fdtable(nr);
 
        /* make sure all __fd_install() have seen resize_in_progress
@@ -178,6 +164,7 @@ static int expand_fdtable(struct files_struct *files, 
unsigned int nr)
        if (atomic_read(&files->count) > 1)
                synchronize_sched();
 
+       idr_preload(GFP_KERNEL);
        spin_lock(&files->file_lock);
        if (!new_fdt)
                return -ENOMEM;
@@ -228,8 +215,10 @@ static int expand_files(struct files_struct *files, 
unsigned int nr)
 
        if (unlikely(files->resize_in_progress)) {
                spin_unlock(&files->file_lock);
+               idr_preload_end();
                expanded = 1;
                wait_event(files->resize_wait, !files->resize_in_progress);
+               idr_preload(GFP_KERNEL);
                spin_lock(&files->file_lock);
                goto repeat;
        }
@@ -290,8 +279,8 @@ static unsigned int count_open_files(struct fdtable *fdt)
 struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 {
        struct files_struct *newf;
-       struct file **old_fds, **new_fds;
        unsigned int open_files, i;
+       struct file *f;
        struct fdtable *old_fdt, *new_fdt;
 
        *errorp = -ENOMEM;
@@ -302,6 +291,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int 
*errorp)
        atomic_set(&newf->count, 1);
 
        spin_lock_init(&newf->file_lock);
+       idr_init(&newf->fd_idr);
        newf->resize_in_progress = false;
        init_waitqueue_head(&newf->resize_wait);
        newf->next_fd = 0;
@@ -310,8 +300,9 @@ struct files_struct *dup_fd(struct files_struct *oldf, int 
*errorp)
        new_fdt->close_on_exec = newf->close_on_exec_init;
        new_fdt->open_fds = newf->open_fds_init;
        new_fdt->full_fds_bits = newf->full_fds_bits_init;
-       new_fdt->fd = &newf->fd_array[0];
 
+restart:
+       idr_copy_preload(&oldf->fd_idr, GFP_KERNEL);
        spin_lock(&oldf->file_lock);
        old_fdt = files_fdtable(oldf);
        open_files = count_open_files(old_fdt);
@@ -321,6 +312,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int 
*errorp)
         */
        while (unlikely(open_files > new_fdt->max_fds)) {
                spin_unlock(&oldf->file_lock);
+               idr_preload_end();
 
                if (new_fdt != &newf->fdtab)
                        __free_fdtable(new_fdt);
@@ -343,41 +335,50 @@ struct files_struct *dup_fd(struct files_struct *oldf, 
int *errorp)
                 * who knows it may have a new bigger fd table. We need
                 * the latest pointer.
                 */
+               idr_copy_preload(&oldf->fd_idr, GFP_KERNEL);
                spin_lock(&oldf->file_lock);
                old_fdt = files_fdtable(oldf);
                open_files = count_open_files(old_fdt);
        }
 
+       if (!idr_check_preload(&oldf->fd_idr)) {
+               spin_unlock(&oldf->file_lock);
+               idr_preload_end();
+               goto restart;
+       }
+
        copy_fd_bitmaps(new_fdt, old_fdt, open_files);
 
-       old_fds = old_fdt->fd;
-       new_fds = new_fdt->fd;
-
-       for (i = open_files; i != 0; i--) {
-               struct file *f = *old_fds++;
-               if (f) {
-                       get_file(f);
-               } else {
-                       /*
-                        * The fd may be claimed in the fd bitmap but not yet
-                        * instantiated in the files array if a sibling thread
-                        * is partway through open().  So make sure that this
-                        * fd is available to the new process.
-                        */
-                       __clear_open_fd(open_files - i, new_fdt);
+       idr_for_each_entry(&oldf->fd_idr, f, i) {
+               int err;
+
+               get_file(f);
+               err = idr_alloc(&newf->fd_idr, f, i, i + 1, GFP_NOWAIT);
+               if (WARN(err != i, "Could not allocate %d: %d", i, err)) {
+                       spin_unlock(&oldf->file_lock);
+                       goto out;
                }
-               rcu_assign_pointer(*new_fds++, f);
        }
+
        spin_unlock(&oldf->file_lock);
+       idr_preload_end();
 
-       /* clear the remainder */
-       memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file 
*));
+       /*
+        * The fd may be claimed in the fd bitmap but not yet
+        * instantiated in the files array if a sibling thread
+        * is partway through open().
+        */
+       for_each_set_bit(i, new_fdt->open_fds, new_fdt->max_fds) {
+               if (!idr_find(&newf->fd_idr, i))
+                       __clear_bit(i, new_fdt->open_fds);
+       }
 
        rcu_assign_pointer(newf->fdt, new_fdt);
 
        return newf;
 
 out_release:
+       idr_destroy(&newf->fd_idr);
        kmem_cache_free(files_cachep, newf);
 out:
        return NULL;
@@ -401,7 +402,8 @@ static struct fdtable *close_files(struct files_struct * 
files)
                set = fdt->open_fds[j++];
                while (set) {
                        if (set & 1) {
-                               struct file * file = xchg(&fdt->fd[i], NULL);
+                               struct file *file;
+                               file = idr_remove(&files->fd_idr, i);
                                if (file) {
                                        filp_close(file, files);
                                        cond_resched_rcu_qs();
@@ -469,28 +471,14 @@ struct files_struct init_files = {
        .fdt            = &init_files.fdtab,
        .fdtab          = {
                .max_fds        = NR_OPEN_DEFAULT,
-               .fd             = &init_files.fd_array[0],
                .close_on_exec  = init_files.close_on_exec_init,
                .open_fds       = init_files.open_fds_init,
                .full_fds_bits  = init_files.full_fds_bits_init,
        },
        .file_lock      = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
+       .fd_idr         = IDR_INIT,
 };
 
-static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
-{
-       unsigned int maxfd = fdt->max_fds;
-       unsigned int maxbit = maxfd / BITS_PER_LONG;
-       unsigned int bitbit = start / BITS_PER_LONG;
-
-       bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * 
BITS_PER_LONG;
-       if (bitbit > maxfd)
-               return maxfd;
-       if (bitbit > start)
-               start = bitbit;
-       return find_next_zero_bit(fdt->open_fds, maxfd, start);
-}
-
 /*
  * allocate a file descriptor, mark it busy.
  */
@@ -501,54 +489,37 @@ int __alloc_fd(struct files_struct *files,
        int error;
        struct fdtable *fdt;
 
+       idr_preload(GFP_KERNEL);
        spin_lock(&files->file_lock);
-repeat:
-       fdt = files_fdtable(files);
-       fd = start;
-       if (fd < files->next_fd)
-               fd = files->next_fd;
 
-       if (fd < fdt->max_fds)
-               fd = find_next_fd(fdt, fd);
-
-       /*
-        * N.B. For clone tasks sharing a files structure, this test
-        * will limit the total number of files that can be opened.
-        */
-       error = -EMFILE;
-       if (fd >= end)
+       error = idr_alloc(&files->fd_idr, NULL, start, end, GFP_NOWAIT);
+       if (error == -ENOSPC) {
+               error = -EMFILE;
                goto out;
+       }
+       BUG_ON(error < 0);
+       fd = error;
 
        error = expand_files(files, fd);
-       if (error < 0)
+       if (error < 0) {
+               idr_remove(&files->fd_idr, fd);
                goto out;
-
-       /*
-        * If we needed to expand the fs array we
-        * might have blocked - try again.
-        */
-       if (error)
-               goto repeat;
+       }
 
        if (start <= files->next_fd)
                files->next_fd = fd + 1;
 
+       fdt = files_fdtable(files);
        __set_open_fd(fd, fdt);
        if (flags & O_CLOEXEC)
                __set_close_on_exec(fd, fdt);
        else
                __clear_close_on_exec(fd, fdt);
        error = fd;
-#if 1
-       /* Sanity check */
-       if (rcu_access_pointer(fdt->fd[fd]) != NULL) {
-               printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
-               rcu_assign_pointer(fdt->fd[fd], NULL);
-       }
-#endif
 
 out:
        spin_unlock(&files->file_lock);
+       idr_preload_end();
        return error;
 }
 
@@ -575,6 +546,7 @@ void put_unused_fd(unsigned int fd)
 {
        struct files_struct *files = current->files;
        spin_lock(&files->file_lock);
+       BUG_ON(idr_remove(&files->fd_idr, fd));
        __put_unused_fd(files, fd);
        spin_unlock(&files->file_lock);
 }
@@ -604,22 +576,9 @@ void put_unused_fd(unsigned int fd)
 void __fd_install(struct files_struct *files, unsigned int fd,
                struct file *file)
 {
-       struct fdtable *fdt;
-
-       might_sleep();
-       rcu_read_lock_sched();
-
-       while (unlikely(files->resize_in_progress)) {
-               rcu_read_unlock_sched();
-               wait_event(files->resize_wait, !files->resize_in_progress);
-               rcu_read_lock_sched();
-       }
-       /* coupled with smp_wmb() in expand_fdtable() */
-       smp_rmb();
-       fdt = rcu_dereference_sched(files->fdt);
-       BUG_ON(fdt->fd[fd] != NULL);
-       rcu_assign_pointer(fdt->fd[fd], file);
-       rcu_read_unlock_sched();
+       rcu_read_lock();
+       BUG_ON(idr_replace(&files->fd_idr, file, fd));
+       rcu_read_unlock();
 }
 
 void fd_install(unsigned int fd, struct file *file)
@@ -641,10 +600,9 @@ int __close_fd(struct files_struct *files, unsigned fd)
        fdt = files_fdtable(files);
        if (fd >= fdt->max_fds)
                goto out_unlock;
-       file = fdt->fd[fd];
+       file = idr_remove(&files->fd_idr, fd);
        if (!file)
                goto out_unlock;
-       rcu_assign_pointer(fdt->fd[fd], NULL);
        __clear_close_on_exec(fd, fdt);
        __put_unused_fd(files, fd);
        spin_unlock(&files->file_lock);
@@ -676,10 +634,9 @@ void do_close_on_exec(struct files_struct *files)
                        struct file *file;
                        if (!(set & 1))
                                continue;
-                       file = fdt->fd[fd];
+                       file = idr_remove(&files->fd_idr, fd);
                        if (!file)
                                continue;
-                       rcu_assign_pointer(fdt->fd[fd], NULL);
                        __put_unused_fd(files, fd);
                        spin_unlock(&files->file_lock);
                        filp_close(file, files);
@@ -842,17 +799,27 @@ static int do_dup2(struct files_struct *files,
         * tables and this condition does not arise without those.
         */
        fdt = files_fdtable(files);
-       tofree = fdt->fd[fd];
+       tofree = idr_find(&files->fd_idr, fd);
        if (!tofree && fd_is_open(fd, fdt))
                goto Ebusy;
        get_file(file);
-       rcu_assign_pointer(fdt->fd[fd], file);
+       if (tofree) {
+               idr_replace(&files->fd_idr, file, fd);
+       } else {
+               int err = idr_alloc(&files->fd_idr, file, fd, fd + 1,
+                                                               GFP_NOWAIT);
+               if (err != fd) {
+                       WARN(1, "do_dup2 got %d for fd %d\n", err, fd);
+                       goto Ebusy;
+               }
+       }
        __set_open_fd(fd, fdt);
        if (flags & O_CLOEXEC)
                __set_close_on_exec(fd, fdt);
        else
                __clear_close_on_exec(fd, fdt);
        spin_unlock(&files->file_lock);
+       idr_preload_end();
 
        if (tofree)
                filp_close(tofree, files);
@@ -861,6 +828,7 @@ static int do_dup2(struct files_struct *files,
 
 Ebusy:
        spin_unlock(&files->file_lock);
+       idr_preload_end();
        return -EBUSY;
 }
 
@@ -875,6 +843,7 @@ int replace_fd(unsigned fd, struct file *file, unsigned 
flags)
        if (fd >= rlimit(RLIMIT_NOFILE))
                return -EBADF;
 
+       idr_preload(GFP_KERNEL);
        spin_lock(&files->file_lock);
        err = expand_files(files, fd);
        if (unlikely(err < 0))
@@ -883,6 +852,7 @@ int replace_fd(unsigned fd, struct file *file, unsigned 
flags)
 
 out_unlock:
        spin_unlock(&files->file_lock);
+       idr_preload_end();
        return err;
 }
 
@@ -901,6 +871,7 @@ int replace_fd(unsigned fd, struct file *file, unsigned 
flags)
        if (newfd >= rlimit(RLIMIT_NOFILE))
                return -EBADF;
 
+       idr_preload(GFP_KERNEL);
        spin_lock(&files->file_lock);
        err = expand_files(files, newfd);
        file = fcheck(oldfd);
@@ -917,6 +888,7 @@ int replace_fd(unsigned fd, struct file *file, unsigned 
flags)
        err = -EBADF;
 out_unlock:
        spin_unlock(&files->file_lock);
+       idr_preload_end();
        return err;
 }
 
@@ -974,7 +946,7 @@ int iterate_fd(struct files_struct *files, unsigned n,
        spin_lock(&files->file_lock);
        for (fdt = files_fdtable(files); n < fdt->max_fds; n++) {
                struct file *file;
-               file = rcu_dereference_check_fdtable(files, fdt->fd[n]);
+               file = __fcheck_files(files, n);
                if (!file)
                        continue;
                res = f(p, file, n);
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 6e84b2cae..4072f24 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -12,6 +12,7 @@
 #include <linux/types.h>
 #include <linux/init.h>
 #include <linux/fs.h>
+#include <linux/idr.h>
 
 #include <linux/atomic.h>
 
@@ -23,7 +24,6 @@
 
 struct fdtable {
        unsigned int max_fds;
-       struct file __rcu **fd;      /* current fd array */
        unsigned long *close_on_exec;
        unsigned long *open_fds;
        unsigned long *full_fds_bits;
@@ -51,6 +51,7 @@ struct files_struct {
        bool resize_in_progress;
        wait_queue_head_t resize_wait;
 
+       struct idr fd_idr;
        struct fdtable __rcu *fdt;
        struct fdtable fdtab;
   /*
@@ -61,7 +62,6 @@ struct files_struct {
        unsigned long close_on_exec_init[1];
        unsigned long open_fds_init[1];
        unsigned long full_fds_bits_init[1];
-       struct file __rcu * fd_array[NR_OPEN_DEFAULT];
 };
 
 struct file_operations;
@@ -79,11 +79,7 @@ struct files_struct {
  */
 static inline struct file *__fcheck_files(struct files_struct *files, unsigned 
int fd)
 {
-       struct fdtable *fdt = rcu_dereference_raw(files->fdt);
-
-       if (fd < fdt->max_fds)
-               return rcu_dereference_raw(fdt->fd[fd]);
-       return NULL;
+       return idr_find(&files->fd_idr, fd);
 }
 
 static inline struct file *fcheck_files(struct files_struct *files, unsigned 
int fd)
-- 
1.8.3.1

Reply via email to