From: Dmitry Monakhov <dmonak...@openvz.org> Add the EXT4_IOC_MFSYNC ioctl, which allows syncing a given set of files in an optimized way (in the best case only one barrier is required).
https://jira.sw.ru/browse/PSBM-18567 Signed-off-by: Dmitry Monakhov <dmonak...@openvz.org> +++ Comment on rebasing to rh7 kernel-3.10.0-229.7.2.el7: 1) compile fix for ext4-add-mfsync-support ext4_flush_unwritten_io was removed in rh7-3.10.0-229.7.2 https://jira.sw.ru/browse/PSBM-34909 2) compile fix for ext4-add-mfsync-support part2 __sync_inode was removed in rh7-3.10.0-229.7.2 It is honest to simply disable mfsync in nojournal mode since we do not test nojournal mode at all. https://jira.sw.ru/browse/PSBM-34910 Signed-off-by: Dmitry Monakhov <dmonak...@openvz.org> Rebase to vz8 kernel note: mutex_unlock(&inode->i_mutex) -> inode_lock_shared(inode) Signed-off-by: Konstantin Khorenko <khore...@virtuozzo.com> Signed-off-by: Kirill Tkhai <ktk...@virtuozzo.com> --- fs/ext4/ext4.h | 7 +++ fs/ext4/fsync.c | 108 +++++++++++++++++++++++++++++++++++++++++++ fs/ext4/ioctl.c | 60 ++++++++++++++++++++++++ include/trace/events/ext4.h | 54 ++++++++++++++++++++++ 4 files changed, 229 insertions(+) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index df46d5586ca1..5f6fdd5514b2 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -615,6 +615,11 @@ struct compat_ext4_new_group_input { }; #endif +struct ext4_ioc_mfsync_info { + __u32 size; + __u32 fd[0]; +}; + /* The struct ext4_new_group_input in kernel space, with free_blocks_count */ struct ext4_new_group_data { __u32 group; @@ -722,6 +727,7 @@ enum { #define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap) #define EXT4_IOC_OPEN_BALLOON _IO('f', 42) #define EXT4_IOC_CHECKPOINT _IOW('f', 43, __u32) +#define EXT4_IOC_MFSYNC _IO('f', 43) #define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32) @@ -2814,6 +2820,7 @@ extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, /* fsync.c */ extern int ext4_sync_file(struct file *, loff_t, loff_t, int); +extern int ext4_sync_files(struct file **, unsigned int *, unsigned int); /* hash.c */ extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len, diff --git 
a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 027a7d7037a0..8179066765bd 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -185,3 +185,111 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trace_ext4_sync_file_exit(inode, ret); return ret; } + +int ext4_sync_files(struct file **files, unsigned int *flags, unsigned int nr_files) +{ + struct super_block *sb; + journal_t *journal; + int err = 0, err2 = 0, i = 0, j = 0; + int force_commit = 0, datawriteback = 0; + tid_t commit_tid = 0; + int need_barrier = 0; + + J_ASSERT(ext4_journal_current_handle() == NULL); + if (!nr_files) + return 0; + + sb = files[0]->f_mapping->host->i_sb; + journal = EXT4_SB(sb)->s_journal; + if (sb->s_flags & SB_RDONLY) { + /* Make shure that we read updated s_mount_flags value */ + smp_rmb(); + if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) + return -EROFS; + return 0; + } + for (i = 0; i < nr_files; i++) { + struct address_space * mapping = files[i]->f_mapping; + struct inode *inode = mapping->host; + + BUG_ON(sb != inode->i_sb); + if (!mapping->nrpages) + continue; + + err = filemap_fdatawrite(mapping); + if (err) + break; + + } + /* + * Even if the above returned error, the pages may be + * written partially (e.g. -ENOSPC), so we wait for it. + * But the -EIO is special case, it may indicate the worst + * thing (e.g. bug) happened, so we avoid waiting for it. + */ + if (err == -EIO) + goto out; + + for (j = 0; j < i; j++) { + struct address_space * mapping = files[j]->f_mapping; + struct inode *inode = mapping->host; + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned int datasync = flags[j]; + tid_t tid; + + if (mapping->nrpages) { + err2 = filemap_fdatawait(mapping); + if (!err || err2 == -EIO) + err = err2; + } + + inode_lock_shared(inode); + force_commit |= ext4_should_journal_data(inode); + datawriteback |= ext4_should_writeback_data(inode); + tid = datasync ? 
ei->i_datasync_tid : ei->i_sync_tid; + inode_unlock_shared(inode); + trace_ext4_sync_files_iterate(files[j]->f_path.dentry, tid, datasync); + if (j == 0 || !tid_geq(commit_tid, tid)) + commit_tid = tid; + } + + /* Ext4 specific stuff starts here */ + if (!journal) { + return -ENOTSUPP; + } else if (force_commit) { + /* data=journal: + * filemap_fdatawrite won't do anything (the buffers are clean). + * ext4_force_commit will write the file data into the journal and + * will wait on that. + * filemap_fdatawait() will encounter a ton of newly-dirtied pages + * (they were dirtied by commit). But that's OK - the blocks are + * safe in-journal, which is all fsync() needs to ensure. + */ + err2 = ext4_force_commit(sb); + } else { + /* + * data=writeback,ordered: + * The caller's filemap_fdatawrite()/wait will sync the data. + * Metadata is in the journal, we wait for proper transaction to + * commit here. + */ + if (journal->j_flags & JBD2_BARRIER && + !jbd2_trans_will_send_data_barrier(journal, commit_tid)) + need_barrier = true; + + err2 = jbd2_complete_transaction(journal, commit_tid); + /* Even if we had to wait for commit completion, it does not + * mean a flush has been issued after data demanded by this + * fsync were written back. Commit could be in state after + * it is already done, but not yet in state where we should + * not wait. 
+ */ + if (need_barrier) + err2 = blkdev_issue_flush(sb->s_bdev); + } +out: + trace_ext4_sync_files_exit(files[0]->f_path.dentry, commit_tid, need_barrier); + if (!err || err2 == -EIO) + err = err2; + return err; +} diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 6e2be4859571..0bac68174793 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -903,6 +903,63 @@ static int ext4_open_balloon(struct super_block *sb, struct vfsmount *mnt) return err; } +static int ext4_mfsync(unsigned long arg) +{ + struct ext4_ioc_mfsync_info mfsync; + struct file **filpp; + unsigned int *flags; + __u32 __user *usr_fd; + int i, err; + + if (!ve_is_super(get_exec_env())) + return -ENOTSUPP; + if (copy_from_user(&mfsync, (struct ext4_ioc_mfsync_info *)arg, + sizeof(mfsync))) + return -EFAULT; + + if (mfsync.size == 0) + return 0; + if (mfsync.size > NR_FILE) + return -ENFILE; + + usr_fd = (__u32 __user *) (arg + sizeof(__u32)); + + filpp = kzalloc(mfsync.size * sizeof(*filpp), GFP_KERNEL); + if (!filpp) + return -ENOMEM; + flags = kzalloc(mfsync.size * sizeof(*flags), GFP_KERNEL); + if (!flags) { + kfree(filpp); + return -ENOMEM; + } + for (i = 0; i < mfsync.size; i++) { + int fd; + int ret; + + err = -EFAULT; + ret = get_user(fd, usr_fd + i); + if (ret) + goto mfsync_fput; + + /* negative fd means fdata_sync */ + flags[i] = (fd & (1<< 31)) != 0; + fd &= ~(1<< 31); + + err = -EBADF; + filpp[i] = fget(fd); + if (!filpp[i]) + goto mfsync_fput; + } + err = ext4_sync_files(filpp, flags, mfsync.size); +mfsync_fput: + for (i = 0; i < mfsync.size; i++) + if (filpp[i]) + fput(filpp[i]); + kfree(filpp); + kfree(flags); + return err; +} + static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -1298,6 +1355,9 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case EXT4_IOC_SHUTDOWN: return ext4_shutdown(sb, arg); + case EXT4_IOC_MFSYNC: + return ext4_mfsync(arg); + case FS_IOC_ENABLE_VERITY: 
if (!ext4_has_feature_verity(sb)) return -EOPNOTSUPP; diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 0ea36b2b0662..d441a01335df 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -966,6 +966,60 @@ TRACE_EVENT(ext4_sync_file_exit, __entry->ret) ); +TRACE_EVENT(ext4_sync_files_iterate, + TP_PROTO(struct dentry *dentry, tid_t tid, int datasync), + + TP_ARGS(dentry, tid, datasync), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( ino_t, parent ) + __field( int, datasync ) + __field( unsigned int, tid ) + ), + + TP_fast_assign( + __entry->dev = dentry->d_inode->i_sb->s_dev; + __entry->ino = dentry->d_inode->i_ino; + __entry->datasync = datasync; + __entry->parent = dentry->d_parent->d_inode->i_ino; + __entry->tid = tid; + ), + + TP_printk("dev %d,%d ino %ld parent %ld datasync %d tid %u", + MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, + (unsigned long) __entry->parent, __entry->datasync, + __entry->tid) +); + +TRACE_EVENT(ext4_sync_files_exit, + TP_PROTO(struct dentry *dentry, tid_t tid, int barrier), + + TP_ARGS(dentry, tid, barrier), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( ino_t, parent ) + __field( int, barrier ) + __field( unsigned int, tid ) + ), + + TP_fast_assign( + __entry->dev = dentry->d_inode->i_sb->s_dev; + __entry->ino = dentry->d_inode->i_ino; + __entry->parent = dentry->d_parent->d_inode->i_ino; + __entry->tid = tid; + __entry->barrier = barrier; + ), + + TP_printk("dev %d,%d ino %ld parent %ld explicit_barrier %d tid %u", + MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, + (unsigned long) __entry->parent, __entry->barrier, + __entry->tid) +); + TRACE_EVENT(ext4_sync_fs, TP_PROTO(struct super_block *sb, int wait), _______________________________________________ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel