Hi,

On Thu, Oct 22, 2015 at 07:59:14PM +0800, Chao Yu wrote:
> This patch introduces a new ioctl F2FS_IOC_DEFRAGMENT to support file
> defragment in a specified range of regular file.
> 
> This ioctl can be used in very limited workload: if user expects high
> sequential read performance in randomly written file, this interface
> can be used for defragmentation, after that file can be written as
> continuous as possible in the device.
> 
> Meanwhile, it has side-effect, it will make holes in segments where
> blocks located originally, so it's better to trigger GC to eliminate
> fragment in segments.
> 
> Signed-off-by: Chao Yu <chao2...@samsung.com>
> ---
>  fs/f2fs/data.c |   6 +-
>  fs/f2fs/f2fs.h |   8 +++
>  fs/f2fs/file.c | 200 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 213 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
> index 972eab7..5bb375a 100644
> --- a/fs/f2fs/data.c
> +++ b/fs/f2fs/data.c
> @@ -566,7 +566,7 @@ out:
>   *     b. do not use extent cache for better performance
>   *     c. give the block addresses to blockdev
>   */
> -static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
> +int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
>                                               int create, int flag)
>  {
>       unsigned int maxblocks = map->m_len;
> @@ -1354,6 +1354,10 @@ static int f2fs_write_data_pages(struct address_space *mapping,
>                       available_free_memory(sbi, DIRTY_DENTS))
>               goto skip_write;
>  
> +     /* skip writing during file defragment */
> +     if (is_inode_flag_set(F2FS_I(inode), FI_DO_DEFRAG))
> +             goto skip_write;
> +
>       /* during POR, we don't need to trigger writepage at all. */
>       if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
>               goto skip_write;
> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> index 9db5500..068813c 100644
> --- a/fs/f2fs/f2fs.h
> +++ b/fs/f2fs/f2fs.h
> @@ -234,6 +234,7 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
>  #define F2FS_IOC_ABORT_VOLATILE_WRITE        _IO(F2FS_IOCTL_MAGIC, 5)
>  #define F2FS_IOC_GARBAGE_COLLECT     _IO(F2FS_IOCTL_MAGIC, 6)
>  #define F2FS_IOC_WRITE_CHECKPOINT    _IO(F2FS_IOCTL_MAGIC, 7)
> +#define F2FS_IOC_DEFRAGMENT          _IO(F2FS_IOCTL_MAGIC, 8)
>  
>  #define F2FS_IOC_SET_ENCRYPTION_POLICY                                       \
>               _IOR('f', 19, struct f2fs_encryption_policy)
> @@ -260,6 +261,11 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
>  #define F2FS_IOC32_SETFLAGS             FS_IOC32_SETFLAGS
>  #endif
>  
> +struct f2fs_defragment {
> +     u64 start;
> +     u64 len;
> +};
> +
>  /*
>   * For INODE and NODE manager
>   */
> @@ -1416,6 +1422,7 @@ enum {
>       FI_DROP_CACHE,          /* drop dirty page cache */
>       FI_DATA_EXIST,          /* indicate data exists */
>       FI_INLINE_DOTS,         /* indicate inline dot dentries */
> +     FI_DO_DEFRAG,           /* indicate defragment is running */
>  };
>  
>  static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
> @@ -1847,6 +1854,7 @@ struct page *find_data_page(struct inode *, pgoff_t);
>  struct page *get_lock_data_page(struct inode *, pgoff_t, bool);
>  struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
>  int do_write_data_page(struct f2fs_io_info *);
> +int f2fs_map_blocks(struct inode *, struct f2fs_map_blocks *, int, int);
>  int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64);
>  void f2fs_invalidate_page(struct page *, unsigned int, unsigned int);
>  int f2fs_release_page(struct page *, gfp_t);
> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> index a197215..ad59694 100644
> --- a/fs/f2fs/file.c
> +++ b/fs/f2fs/file.c
> @@ -1646,6 +1646,204 @@ static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg)
>       return 0;
>  }
>  
> +static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
> +                                     struct file *filp,
> +                                     struct f2fs_defragment *range)
> +{
> +     struct inode *inode = file_inode(filp);
> +     struct f2fs_map_blocks map;
> +     struct extent_info ei;
> +     pgoff_t pg_start, pg_end;
> +     unsigned int blk_per_seg = 1 << sbi->log_blocks_per_seg;
> +     unsigned int total = 0, sec_num;
> +     unsigned int pages_per_sec = sbi->segs_per_sec *
> +                                     (1 << sbi->log_blocks_per_seg);
> +     block_t blk_end = 0;
> +     bool fragmented = false;
> +     int err = 0;
> +
> +     pg_start = range->start >> PAGE_CACHE_SHIFT;
> +     pg_end = (range->start + range->len) >> PAGE_CACHE_SHIFT;
> +
> +     f2fs_balance_fs(sbi);
> +
> +     mutex_lock(&inode->i_mutex);
> +
> +     /* writeback all dirty pages in the range */
> +     err = filemap_write_and_wait_range(inode->i_mapping, range->start,
> +                                             range->start + range->len);
> +     if (err)
> +             goto out;
> +
> +     /*
> +      * lookup mapping info in extent cache, skip defragmenting if physical
> +      * block addresses are continuous.
> +      */
> +     if (f2fs_lookup_extent_cache(inode, pg_start, &ei)) {
> +             if (ei.fofs + ei.len >= pg_end)
> +                     goto out;
> +     }
> +
> +     map.m_lblk = pg_start;
> +     map.m_len = pg_end - pg_start;
> +
> +     /*
> +      * lookup mapping info in dnode page cache, skip defragmenting if all
> +      * physical block addresses are continuous even if there are hole(s)
> +      * in logical blocks.
> +      */
> +     while (map.m_lblk < pg_end) {
> +             map.m_flags = 0;
> +             err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ);

How about using f2fs_fiemap to get the extent information?

> +             if (err)
> +                     goto out;
> +
> +             if (!(map.m_flags & F2FS_MAP_FLAGS)) {
> +                     map.m_lblk++;
> +                     map.m_len--;
> +                     continue;
> +             }
> +
> +             if (blk_end && blk_end != map.m_pblk) {
> +                     fragmented = true;
> +                     break;
> +             }
> +             blk_end = map.m_pblk + map.m_len;
> +
> +             map.m_lblk += map.m_len;
> +             map.m_len = pg_end - map.m_lblk;
> +     }
> +
> +     if (!fragmented)
> +             goto out;
> +
> +     map.m_lblk = pg_start;
> +     map.m_len = pg_end - pg_start;
> +
> +     sec_num = (map.m_len + pages_per_sec - 1) / pages_per_sec;
> +
> +     if (has_not_enough_free_secs(sbi, sec_num))

Won't ->writepage handle this later anyway?

> +             goto out;
> +
> +     while (map.m_lblk < pg_end) {
> +             pgoff_t idx;
> +             int cnt = 0;

What about something like this instead of the loop below?

        for_each_extents(extent_info) {
                page = get_lock_data_page(inode, idx, true);

                set_page_dirty(page);
        }
        filemap_fdatawrite();

Thanks,

> +
> +do_map:
> +             map.m_flags = 0;
> +             err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ);
> +             if (err)
> +                     goto out;
> +
> +             if (!(map.m_flags & F2FS_MAP_FLAGS)) {
> +                     map.m_lblk++;
> +                     continue;
> +             }
> +
> +             set_inode_flag(F2FS_I(inode), FI_DO_DEFRAG);
> +
> +             idx = map.m_lblk;
> +             while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) {
> +                     struct address_space *mapping = inode->i_mapping;
> +                     struct page *page;
> +
> +                     page = find_or_create_page(mapping, idx, GFP_NOFS);
> +                     if (!page) {
> +                             err = -ENOMEM;
> +                             goto out;
> +                     }
> +
> +                     f2fs_wait_on_page_writeback(page, DATA);
> +
> +                     if (!PageUptodate(page)) {
> +                             err = mapping->a_ops->readpage(filp, page);
> +                             if (unlikely(err)) {
> +                                     f2fs_put_page(page, 0);
> +                                     goto out;
> +                             }
> +
> +                             lock_page_killable(page);
> +
> +                             if (!PageUptodate(page)) {
> +                                     f2fs_put_page(page, 1);
> +                                     err = -EIO;
> +                                     goto out;
> +                             }
> +                     }
> +                     set_page_dirty(page);
> +                     f2fs_put_page(page, 1);
> +
> +                     idx++;
> +                     cnt++;
> +                     total++;
> +             }
> +
> +             map.m_lblk = idx;
> +             map.m_len = pg_end - idx;
> +
> +             if (idx < pg_end && cnt < blk_per_seg)
> +                     goto do_map;
> +
> +             clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG);
> +
> +             err = filemap_fdatawrite(inode->i_mapping);
> +             if (err)
> +                     goto out;
> +     }
> +out:
> +     mutex_unlock(&inode->i_mutex);
> +     if (!err)
> +             range->len = (u64)total << PAGE_CACHE_SHIFT;
> +     return err;
> +}
> +
> +static int f2fs_ioc_defragment(struct file *filp, unsigned long arg)
> +{
> +     struct inode *inode = file_inode(filp);
> +     struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
> +     struct f2fs_defragment range;
> +     int err;
> +
> +     if (!capable(CAP_SYS_ADMIN))
> +             return -EPERM;
> +
> +     if (!S_ISREG(inode->i_mode))
> +             return -EINVAL;
> +
> +     err = mnt_want_write_file(filp);
> +     if (err)
> +             return err;
> +
> +     if (f2fs_readonly(sbi->sb)) {
> +             err = -EROFS;
> +             goto out;
> +     }
> +
> +     if (copy_from_user(&range, (struct f2fs_defragment __user *)arg,
> +                                                     sizeof(range))) {
> +             err = -EFAULT;
> +             goto out;
> +     }
> +
> +     /* verify alignment of offset & size */
> +     if (range.start & (F2FS_BLKSIZE - 1) ||
> +             range.len & (F2FS_BLKSIZE - 1)) {
> +             err = -EINVAL;
> +             goto out;
> +     }
> +
> +     err = f2fs_defragment_range(sbi, filp, &range);
> +     if (err < 0)
> +             goto out;
> +
> +     if (copy_to_user((struct f2fs_defragment __user *)arg, &range,
> +                                                     sizeof(range)))
> +             err = -EFAULT;
> +out:
> +     mnt_drop_write_file(filp);
> +     return err;
> +}
> +
>  long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
>  {
>       switch (cmd) {
> @@ -1679,6 +1877,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
>               return f2fs_ioc_gc(filp, arg);
>       case F2FS_IOC_WRITE_CHECKPOINT:
>               return f2fs_ioc_write_checkpoint(filp, arg);
> +     case F2FS_IOC_DEFRAGMENT:
> +             return f2fs_ioc_defragment(filp, arg);
>       default:
>               return -ENOTTY;
>       }
> -- 
> 2.6.1

------------------------------------------------------------------------------
_______________________________________________
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

Reply via email to