[PATCH 3/3] f2fs: dax: Implement direct access
This patch implements Direct Access (DAX) in F2FS, including: - a mount option to enable DAX - read/write and mmap of regular files in the DAX way - zero-out of non-aligned partial blocks in the DAX way - garbage collection of DAX files - incompatibility of DAX with inline data, atomic or volatile write TODO: We may need new implementation of f2fs_collapse/insert_range() for DAX files, as filemap_write_and_wait_range() does not work for DAX files, and thus the data pages cannot be moved correctly. Signed-off-by: Qiuyang Sun <sunqiuyang@huawei.com> --- fs/f2fs/f2fs.h | 8 +++ fs/f2fs/file.c | 197 ++- fs/f2fs/gc.c | 69 +-- fs/f2fs/inline.c | 4 ++ fs/f2fs/inode.c | 88 + fs/f2fs/namei.c | 7 ++ fs/f2fs/super.c | 16 + 7 files changed, 383 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f7957ca..d0e8af5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -87,6 +87,11 @@ struct f2fs_fault_info { #define F2FS_MOUNT_FAULT_INJECTION 0x0001 #define F2FS_MOUNT_ADAPTIVE 0x0002 #define F2FS_MOUNT_LFS 0x0004 +#ifdef CONFIG_FS_DAX +#define F2FS_MOUNT_DAX 0x0008 /* Direct Access */ +#else +#define F2FS_MOUNT_DAX 0 +#endif #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) @@ -2063,6 +2068,9 @@ int f2fs_getattr(const struct path *path, struct kstat *stat, int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc); void f2fs_evict_inode(struct inode *inode); void handle_failed_inode(struct inode *inode); +#ifdef CONFIG_FS_DAX +extern struct iomap_ops f2fs_iomap_ops; +#endif /* * namei.c diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 165acbf..4eeb17b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -23,6 +23,8 @@ #include <linux/uio.h> #include <linux/uuid.h> #include <linux/file.h> +#include <linux/dax.h> +#include <linux/iomap.h> #include "f2fs.h" #include "node.h" @@ -106,6 +108,64 @@ static int f2fs_vm_page_mkwrite(struct vm_fault *vmf) .page_mkwrite = f2fs_vm_page_mkwrite, }; +#ifdef CONFIG_FS_DAX +static int 
f2fs_dax_huge_fault(struct vm_fault *vmf, + enum page_entry_size pe_size) +{ + int result; + struct inode *inode = file_inode(vmf->vma->vm_file); + struct super_block *sb = inode->i_sb; + bool write = vmf->flags & FAULT_FLAG_WRITE; + + if (write) { + sb_start_pagefault(sb); + file_update_time(vmf->vma->vm_file); + } + down_read(&F2FS_I(inode)->i_mmap_sem); + result = dax_iomap_fault(vmf, pe_size, &f2fs_iomap_ops); + up_read(&F2FS_I(inode)->i_mmap_sem); + if (write) + sb_end_pagefault(sb); + + return result; +} + +static int f2fs_dax_fault(struct vm_fault *vmf) +{ + return f2fs_dax_huge_fault(vmf, PE_SIZE_PTE); +} + +static int f2fs_dax_pfn_mkwrite(struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vmf->vma->vm_file); + struct super_block *sb = inode->i_sb; + loff_t size; + int ret; + + sb_start_pagefault(sb); + file_update_time(vmf->vma->vm_file); + down_read(&F2FS_I(inode)->i_mmap_sem); + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (vmf->pgoff >= size) + ret = VM_FAULT_SIGBUS; + else + ret = dax_pfn_mkwrite(vmf); + up_read(&F2FS_I(inode)->i_mmap_sem); + sb_end_pagefault(sb); + + return ret; +} + +static const struct vm_operations_struct f2fs_dax_vm_ops = { + .fault = f2fs_dax_fault, + .huge_fault = f2fs_dax_huge_fault, + .page_mkwrite = f2fs_dax_fault, + .pfn_mkwrite = f2fs_dax_pfn_mkwrite, +}; +#else +#define f2fs_dax_vm_ops f2fs_file_vm_ops +#endif + static int get_parent_ino(struct inode *inode, nid_t *pino) { struct dentry *dentry; @@ -434,7 +494,13 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) return err; file_accessed(file); - vma->vm_ops = &f2fs_file_vm_ops; + if (IS_DAX(file_inode(file))) { + vma->vm_ops = &f2fs_dax_vm_ops; + vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + } else { + vma->vm_ops = &f2fs_file_vm_ops; + } + return 0; } @@ -518,6 +584,18 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, if (!offset && !cache_only) return 0; +#ifdef CONFIG_FS_DAX + if (IS_DAX(inode)) { + int ret; + + 
down_read(&F2FS_I(inode)->dio_rwsem[WRITE]); + ret = iomap_zero_range(inode, from, PAGE_SIZE - offset, + NULL, &f2fs_iomap_ops); + up_read(&F2FS_I(inode)->dio_rwsem[WRITE]); + return ret; + } +#endif + if (cache_only) { page =
[PATCH 3/3] f2fs: dax: Implement direct access
This patch implements Direct Access (DAX) in F2FS, including: - a mount option to enable DAX - read/write and mmap of regular files in the DAX way - zero-out of non-aligned partial blocks in the DAX way - garbage collection of DAX files - incompatibility of DAX with inline data, atomic or volatile write TODO: We may need new implementation of f2fs_collapse/insert_range() for DAX files, as filemap_write_and_wait_range() does not work for DAX files, and thus the data pages cannot be moved correctly. Signed-off-by: Qiuyang Sun <sunqiuyang@huawei.com> --- fs/f2fs/f2fs.h | 8 +++ fs/f2fs/file.c | 197 ++- fs/f2fs/gc.c | 69 +-- fs/f2fs/inline.c | 4 ++ fs/f2fs/inode.c | 88 + fs/f2fs/namei.c | 7 ++ fs/f2fs/super.c | 16 + 7 files changed, 383 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f7957ca..d0e8af5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -87,6 +87,11 @@ struct f2fs_fault_info { #define F2FS_MOUNT_FAULT_INJECTION 0x0001 #define F2FS_MOUNT_ADAPTIVE 0x0002 #define F2FS_MOUNT_LFS 0x0004 +#ifdef CONFIG_FS_DAX +#define F2FS_MOUNT_DAX 0x0008 /* Direct Access */ +#else +#define F2FS_MOUNT_DAX 0 +#endif #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) @@ -2063,6 +2068,9 @@ int f2fs_getattr(const struct path *path, struct kstat *stat, int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc); void f2fs_evict_inode(struct inode *inode); void handle_failed_inode(struct inode *inode); +#ifdef CONFIG_FS_DAX +extern struct iomap_ops f2fs_iomap_ops; +#endif /* * namei.c diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 165acbf..4eeb17b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -23,6 +23,8 @@ #include <linux/uio.h> #include <linux/uuid.h> #include <linux/file.h> +#include <linux/dax.h> +#include <linux/iomap.h> #include "f2fs.h" #include "node.h" @@ -106,6 +108,64 @@ static int f2fs_vm_page_mkwrite(struct vm_fault *vmf) .page_mkwrite = f2fs_vm_page_mkwrite, }; +#ifdef CONFIG_FS_DAX +static int 
f2fs_dax_huge_fault(struct vm_fault *vmf, + enum page_entry_size pe_size) +{ + int result; + struct inode *inode = file_inode(vmf->vma->vm_file); + struct super_block *sb = inode->i_sb; + bool write = vmf->flags & FAULT_FLAG_WRITE; + + if (write) { + sb_start_pagefault(sb); + file_update_time(vmf->vma->vm_file); + } + down_read(&F2FS_I(inode)->i_mmap_sem); + result = dax_iomap_fault(vmf, pe_size, &f2fs_iomap_ops); + up_read(&F2FS_I(inode)->i_mmap_sem); + if (write) + sb_end_pagefault(sb); + + return result; +} + +static int f2fs_dax_fault(struct vm_fault *vmf) +{ + return f2fs_dax_huge_fault(vmf, PE_SIZE_PTE); +} + +static int f2fs_dax_pfn_mkwrite(struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vmf->vma->vm_file); + struct super_block *sb = inode->i_sb; + loff_t size; + int ret; + + sb_start_pagefault(sb); + file_update_time(vmf->vma->vm_file); + down_read(&F2FS_I(inode)->i_mmap_sem); + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (vmf->pgoff >= size) + ret = VM_FAULT_SIGBUS; + else + ret = dax_pfn_mkwrite(vmf); + up_read(&F2FS_I(inode)->i_mmap_sem); + sb_end_pagefault(sb); + + return ret; +} + +static const struct vm_operations_struct f2fs_dax_vm_ops = { + .fault = f2fs_dax_fault, + .huge_fault = f2fs_dax_huge_fault, + .page_mkwrite = f2fs_dax_fault, + .pfn_mkwrite = f2fs_dax_pfn_mkwrite, +}; +#else +#define f2fs_dax_vm_ops f2fs_file_vm_ops +#endif + static int get_parent_ino(struct inode *inode, nid_t *pino) { struct dentry *dentry; @@ -434,7 +494,13 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) return err; file_accessed(file); - vma->vm_ops = &f2fs_file_vm_ops; + if (IS_DAX(file_inode(file))) { + vma->vm_ops = &f2fs_dax_vm_ops; + vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + } else { + vma->vm_ops = &f2fs_file_vm_ops; + } + return 0; } @@ -518,6 +584,18 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, if (!offset && !cache_only) return 0; +#ifdef CONFIG_FS_DAX + if (IS_DAX(inode)) { + int ret; + + 
down_read(&F2FS_I(inode)->dio_rwsem[WRITE]); + ret = iomap_zero_range(inode, from, PAGE_SIZE - offset, + NULL, &f2fs_iomap_ops); + up_read(&F2FS_I(inode)->dio_rwsem[WRITE]); + return ret; + } +#endif + if (cache_only) { page = find_lock_page(mapping, index);