[PATCH 3/3] f2fs:dax: Implement direct access

2017-05-03 Thread sunqiuyang
This patch implements Direct Access (DAX) in F2FS, including:
 - a mount option to enable DAX
 - read/write and mmap of regular files in the DAX way
 - zero-out of non-aligned partial blocks in the DAX way
 - garbage collection of DAX files
 - incompatibility of DAX with inline data, atomic or volatile write

TODO: We may need new implementation of f2fs_collapse/insert_range() for 
DAX files, as filemap_write_and_wait_range() does not work for DAX files, 
and thus the data pages cannot be moved correctly.

Signed-off-by: Qiuyang Sun 
---
 fs/f2fs/f2fs.h   |   8 +++
 fs/f2fs/file.c   | 197 ++-
 fs/f2fs/gc.c |  69 +--
 fs/f2fs/inline.c |   4 ++
 fs/f2fs/inode.c  |  88 +
 fs/f2fs/namei.c  |   7 ++
 fs/f2fs/super.c  |  16 +
 7 files changed, 383 insertions(+), 6 deletions(-)

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index f7957ca..d0e8af5 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -87,6 +87,11 @@ struct f2fs_fault_info {
 #define F2FS_MOUNT_FAULT_INJECTION 0x0001
 #define F2FS_MOUNT_ADAPTIVE 0x0002
 #define F2FS_MOUNT_LFS 0x0004
+#ifdef CONFIG_FS_DAX
+#define F2FS_MOUNT_DAX 0x0008 /* Direct Access */
+#else
+#define F2FS_MOUNT_DAX 0
+#endif
 
 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
 #define set_opt(sbi, option)   (sbi->mount_opt.opt |= F2FS_MOUNT_##option)
@@ -2063,6 +2068,9 @@ int f2fs_getattr(const struct path *path, struct kstat *stat,
 int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc);
 void f2fs_evict_inode(struct inode *inode);
 void handle_failed_inode(struct inode *inode);
+#ifdef CONFIG_FS_DAX
+extern struct iomap_ops f2fs_iomap_ops;
+#endif
 
 /*
  * namei.c
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 165acbf..4eeb17b 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -23,6 +23,8 @@
 #include 
 #include 
 #include 
+#include <linux/dax.h>
+#include <linux/iomap.h>
 
 #include "f2fs.h"
 #include "node.h"
@@ -106,6 +108,64 @@ static int f2fs_vm_page_mkwrite(struct vm_fault *vmf)
.page_mkwrite   = f2fs_vm_page_mkwrite,
 };
 
+#ifdef CONFIG_FS_DAX
+static int f2fs_dax_huge_fault(struct vm_fault *vmf,
+   enum page_entry_size pe_size)
+{
+   int result;
+   struct inode *inode = file_inode(vmf->vma->vm_file);
+   struct super_block *sb = inode->i_sb;
+   bool write = vmf->flags & FAULT_FLAG_WRITE;
+
+   if (write) {
+   sb_start_pagefault(sb);
+   file_update_time(vmf->vma->vm_file);
+   }
+   down_read(&F2FS_I(inode)->i_mmap_sem);
+   result = dax_iomap_fault(vmf, pe_size, &f2fs_iomap_ops);
+   up_read(&F2FS_I(inode)->i_mmap_sem);
+   if (write)
+   sb_end_pagefault(sb);
+
+   return result;
+}
+
+static int f2fs_dax_fault(struct vm_fault *vmf)
+{
+   return f2fs_dax_huge_fault(vmf, PE_SIZE_PTE);
+}
+
+static int f2fs_dax_pfn_mkwrite(struct vm_fault *vmf)
+{
+   struct inode *inode = file_inode(vmf->vma->vm_file);
+   struct super_block *sb = inode->i_sb;
+   loff_t size;
+   int ret;
+
+   sb_start_pagefault(sb);
+   file_update_time(vmf->vma->vm_file);
+   down_read(&F2FS_I(inode)->i_mmap_sem);
+   size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+   if (vmf->pgoff >= size)
+   ret = VM_FAULT_SIGBUS;
+   else
+   ret = dax_pfn_mkwrite(vmf);
+   up_read(&F2FS_I(inode)->i_mmap_sem);
+   sb_end_pagefault(sb);
+
+   return ret;
+}
+
+static const struct vm_operations_struct f2fs_dax_vm_ops = {
+   .fault  = f2fs_dax_fault,
+   .huge_fault = f2fs_dax_huge_fault,
+   .page_mkwrite   = f2fs_dax_fault,
+   .pfn_mkwrite    = f2fs_dax_pfn_mkwrite,
+};
+#else
+#define f2fs_dax_vm_ops f2fs_file_vm_ops
+#endif
+
 static int get_parent_ino(struct inode *inode, nid_t *pino)
 {
struct dentry *dentry;
@@ -434,7 +494,13 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
return err;
 
file_accessed(file);
-   vma->vm_ops = &f2fs_file_vm_ops;
+   if (IS_DAX(file_inode(file))) {
+   vma->vm_ops = &f2fs_dax_vm_ops;
+   vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+   } else {
+   vma->vm_ops = &f2fs_file_vm_ops;
+   }
+
return 0;
 }
 
@@ -518,6 +584,18 @@ static int truncate_partial_data_page(struct inode *inode, u64 from,
if (!offset && !cache_only)
return 0;
 
+#ifdef CONFIG_FS_DAX
+   if (IS_DAX(inode)) {
+   int ret;
+
+   down_read(&F2FS_I(inode)->dio_rwsem[WRITE]);
+   ret = iomap_zero_range(inode, from, PAGE_SIZE - offset,
+   NULL, &f2fs_iomap_ops);
+   up_read(&F2FS_I(inode)->dio_rwsem[WRITE]);
+   return ret;
+   }
+#endif
+
if (cache_only) {
page = 

[PATCH 3/3] f2fs:dax: Implement direct access

2017-05-03 Thread sunqiuyang
This patch implements Direct Access (DAX) in F2FS, including:
 - a mount option to enable DAX
 - read/write and mmap of regular files in the DAX way
 - zero-out of non-aligned partial blocks in the DAX way
 - garbage collection of DAX files
 - incompatibility of DAX with inline data, atomic or volatile write

TODO: We may need new implementation of f2fs_collapse/insert_range() for 
DAX files, as filemap_write_and_wait_range() does not work for DAX files, 
and thus the data pages cannot be moved correctly.

Signed-off-by: Qiuyang Sun 
---
 fs/f2fs/f2fs.h   |   8 +++
 fs/f2fs/file.c   | 197 ++-
 fs/f2fs/gc.c |  69 +--
 fs/f2fs/inline.c |   4 ++
 fs/f2fs/inode.c  |  88 +
 fs/f2fs/namei.c  |   7 ++
 fs/f2fs/super.c  |  16 +
 7 files changed, 383 insertions(+), 6 deletions(-)

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index f7957ca..d0e8af5 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -87,6 +87,11 @@ struct f2fs_fault_info {
 #define F2FS_MOUNT_FAULT_INJECTION 0x0001
 #define F2FS_MOUNT_ADAPTIVE 0x0002
 #define F2FS_MOUNT_LFS 0x0004
+#ifdef CONFIG_FS_DAX
+#define F2FS_MOUNT_DAX 0x0008 /* Direct Access */
+#else
+#define F2FS_MOUNT_DAX 0
+#endif
 
 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
 #define set_opt(sbi, option)   (sbi->mount_opt.opt |= F2FS_MOUNT_##option)
@@ -2063,6 +2068,9 @@ int f2fs_getattr(const struct path *path, struct kstat *stat,
 int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc);
 void f2fs_evict_inode(struct inode *inode);
 void handle_failed_inode(struct inode *inode);
+#ifdef CONFIG_FS_DAX
+extern struct iomap_ops f2fs_iomap_ops;
+#endif
 
 /*
  * namei.c
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 165acbf..4eeb17b 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -23,6 +23,8 @@
 #include 
 #include 
 #include 
+#include <linux/dax.h>
+#include <linux/iomap.h>
 
 #include "f2fs.h"
 #include "node.h"
@@ -106,6 +108,64 @@ static int f2fs_vm_page_mkwrite(struct vm_fault *vmf)
.page_mkwrite   = f2fs_vm_page_mkwrite,
 };
 
+#ifdef CONFIG_FS_DAX
+static int f2fs_dax_huge_fault(struct vm_fault *vmf,
+   enum page_entry_size pe_size)
+{
+   int result;
+   struct inode *inode = file_inode(vmf->vma->vm_file);
+   struct super_block *sb = inode->i_sb;
+   bool write = vmf->flags & FAULT_FLAG_WRITE;
+
+   if (write) {
+   sb_start_pagefault(sb);
+   file_update_time(vmf->vma->vm_file);
+   }
+   down_read(&F2FS_I(inode)->i_mmap_sem);
+   result = dax_iomap_fault(vmf, pe_size, &f2fs_iomap_ops);
+   up_read(&F2FS_I(inode)->i_mmap_sem);
+   if (write)
+   sb_end_pagefault(sb);
+
+   return result;
+}
+
+static int f2fs_dax_fault(struct vm_fault *vmf)
+{
+   return f2fs_dax_huge_fault(vmf, PE_SIZE_PTE);
+}
+
+static int f2fs_dax_pfn_mkwrite(struct vm_fault *vmf)
+{
+   struct inode *inode = file_inode(vmf->vma->vm_file);
+   struct super_block *sb = inode->i_sb;
+   loff_t size;
+   int ret;
+
+   sb_start_pagefault(sb);
+   file_update_time(vmf->vma->vm_file);
+   down_read(&F2FS_I(inode)->i_mmap_sem);
+   size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+   if (vmf->pgoff >= size)
+   ret = VM_FAULT_SIGBUS;
+   else
+   ret = dax_pfn_mkwrite(vmf);
+   up_read(&F2FS_I(inode)->i_mmap_sem);
+   sb_end_pagefault(sb);
+
+   return ret;
+}
+
+static const struct vm_operations_struct f2fs_dax_vm_ops = {
+   .fault  = f2fs_dax_fault,
+   .huge_fault = f2fs_dax_huge_fault,
+   .page_mkwrite   = f2fs_dax_fault,
+   .pfn_mkwrite    = f2fs_dax_pfn_mkwrite,
+};
+#else
+#define f2fs_dax_vm_ops f2fs_file_vm_ops
+#endif
+
 static int get_parent_ino(struct inode *inode, nid_t *pino)
 {
struct dentry *dentry;
@@ -434,7 +494,13 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
return err;
 
file_accessed(file);
-   vma->vm_ops = &f2fs_file_vm_ops;
+   if (IS_DAX(file_inode(file))) {
+   vma->vm_ops = &f2fs_dax_vm_ops;
+   vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+   } else {
+   vma->vm_ops = &f2fs_file_vm_ops;
+   }
+
return 0;
 }
 
@@ -518,6 +584,18 @@ static int truncate_partial_data_page(struct inode *inode, u64 from,
if (!offset && !cache_only)
return 0;
 
+#ifdef CONFIG_FS_DAX
+   if (IS_DAX(inode)) {
+   int ret;
+
+   down_read(&F2FS_I(inode)->dio_rwsem[WRITE]);
+   ret = iomap_zero_range(inode, from, PAGE_SIZE - offset,
+   NULL, &f2fs_iomap_ops);
+   up_read(&F2FS_I(inode)->dio_rwsem[WRITE]);
+   return ret;
+   }
+#endif
+
if (cache_only) {
page = find_lock_page(mapping, index);