[RFC v2 70/83] File operation: Inplace write.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

If the user specifies inplace updates, or the file is mmaped,
NOVA performs inplace writes.

The trick is dax page fault can occur concurrently with inplace writes,
and allocate new blocks. Also, inplace write memcpy may trigger page fault 
(xfstests 248).
Since page fault may take the write lock to modify the tree, write routine
cannot take tree lock during the memcpy.
As a result we perform inplace write in the following way:

1. Take the tree read lock, check existing entries or holes.
2. Release the read lock. Allocate new data pages if needed;
   allocate and initialize file write item, add to the list and perform memcpy.
3. With the list of file write items, take the tree write lock and perform 
commit:
   Due to concurrent page fault, the hole returned in step 1 may be filled by
   page fault handlers. In this case, NOVA copies the data from the file write 
item
   to the pages allocated by page fault handler, and free the data blocks 
allocated
   in step 2. This guarantees application can see the write via mmaped region.

Step 3 actually formats a new list of write items and reuses the CoW commit
routine to commit the items.

Signed-off-by: Andiry Xu 
---
 fs/nova/dax.c  | 472 +
 fs/nova/file.c |  10 +-
 fs/nova/nova.h |   4 +
 3 files changed, 484 insertions(+), 2 deletions(-)

diff --git a/fs/nova/dax.c b/fs/nova/dax.c
index 9561d8e..8624ce4 100644
--- a/fs/nova/dax.c
+++ b/fs/nova/dax.c
@@ -259,3 +259,475 @@ void nova_init_file_write_item(struct super_block *sb,
 
entry->size = file_size;
 }
+
+/*
+ * Check if there is an existing entry or hole for target page offset.
+ * Used for inplace write, DAX-mmap and fallocate.
+ */
+unsigned long nova_check_existing_entry(struct super_block *sb,
+   struct inode *inode, unsigned long num_blocks, unsigned long start_blk,
+   struct nova_file_write_entry **ret_entry,
+   int check_next, u64 epoch_id,
+   int *inplace)
+{
+   struct nova_inode_info *si = NOVA_I(inode);
+   struct nova_inode_info_header *sih = &si->header;
+   struct nova_file_write_entry *entry;
+   unsigned long next_pgoff;
+   unsigned long ent_blks = 0;
+   timing_t check_time;
+
+   NOVA_START_TIMING(check_entry_t, check_time);
+
+   *ret_entry = NULL;
+   *inplace = 0;
+   entry = nova_get_write_entry(sb, sih, start_blk);
+
+   if (entry) {
+   *ret_entry = entry;
+
+   /* We can do inplace write. Find contiguous blocks */
+   if (entry->reassigned == 0)
+   ent_blks = entry->num_pages -
+   (start_blk - entry->pgoff);
+   else
+   ent_blks = 1;
+
+   if (ent_blks > num_blocks)
+   ent_blks = num_blocks;
+
+   if (entry->epoch_id == epoch_id)
+   *inplace = 1;
+
+   } else if (check_next) {
+   /* Possible Hole */
+   entry = nova_find_next_entry(sb, sih, start_blk);
+   if (entry) {
+   next_pgoff = entry->pgoff;
+   if (next_pgoff <= start_blk) {
+   nova_err(sb, "iblock %lu, entry pgoff %lu, num pages %lu\n",
+  start_blk, next_pgoff, entry->num_pages);
+   nova_print_inode_log(sb, inode);
+   dump_stack();
+   ent_blks = num_blocks;
+   goto out;
+   }
+   ent_blks = next_pgoff - start_blk;
+   if (ent_blks > num_blocks)
+   ent_blks = num_blocks;
+   } else {
+   /* File grow */
+   ent_blks = num_blocks;
+   }
+   }
+
+   if (entry && ent_blks == 0) {
+   nova_dbg("%s: %d\n", __func__, check_next);
+   dump_stack();
+   }
+
+out:
+   NOVA_END_TIMING(check_entry_t, check_time);
+   return ent_blks;
+}
+
+/* Memcpy from newly allocated data blocks to existing data blocks */
+static int nova_inplace_memcpy(struct super_block *sb, struct inode *inode,
+   struct nova_file_write_entry *from, struct nova_file_write_entry *to,
+   unsigned long num_blocks, loff_t pos, size_t len)
+{
+   struct nova_inode_info *si = NOVA_I(inode);
+   struct nova_inode_info_header *sih = &si->header;
+   struct nova_log_entry_info entry_info;
+   unsigned long pgoff;
+   unsigned long from_nvmm, to_nvmm;
+   void *from_addr, *to_addr = NULL;
+   loff_t base, start, end, offset;
+
+   pgoff = le64_to_cpu(from->pgoff);
+   base = start = pgoff << PAGE_SHIFT;
+   end = (pgoff + num_blocks) << PAGE_SHIFT;
+
+   if (start < pos)
+   start = pos;

[RFC v2 70/83] File operation: Inplace write.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

If the user specifies inplace updates, or the file is mmaped,
NOVA performs inplace writes.

The trick is dax page fault can occur concurrently with inplace writes,
and allocate new blocks. Also, inplace write memcpy may trigger page fault 
(xfstests 248).
Since page fault may take the write lock to modify the tree, write routine
cannot take tree lock during the memcpy.
As a result we perform inplace write in the following way:

1. Take the tree read lock, check existing entries or holes.
2. Release the read lock. Allocate new data pages if needed;
   allocate and initialize file write item, add to the list and perform memcpy.
3. With the list of file write items, take the tree write lock and perform 
commit:
   Due to concurrent page fault, the hole returned in step 1 may be filled by
   page fault handlers. In this case, NOVA copies the data from the file write 
item
   to the pages allocated by page fault handler, and free the data blocks 
allocated
   in step 2. This guarantees application can see the write via mmaped region.

Step 3 actually formats a new list of write items and reuses the CoW commit
routine to commit the items.

Signed-off-by: Andiry Xu 
---
 fs/nova/dax.c  | 472 +
 fs/nova/file.c |  10 +-
 fs/nova/nova.h |   4 +
 3 files changed, 484 insertions(+), 2 deletions(-)

diff --git a/fs/nova/dax.c b/fs/nova/dax.c
index 9561d8e..8624ce4 100644
--- a/fs/nova/dax.c
+++ b/fs/nova/dax.c
@@ -259,3 +259,475 @@ void nova_init_file_write_item(struct super_block *sb,
 
entry->size = file_size;
 }
+
+/*
+ * Check if there is an existing entry or hole for target page offset.
+ * Used for inplace write, DAX-mmap and fallocate.
+ */
+unsigned long nova_check_existing_entry(struct super_block *sb,
+   struct inode *inode, unsigned long num_blocks, unsigned long start_blk,
+   struct nova_file_write_entry **ret_entry,
+   int check_next, u64 epoch_id,
+   int *inplace)
+{
+   struct nova_inode_info *si = NOVA_I(inode);
+   struct nova_inode_info_header *sih = &si->header;
+   struct nova_file_write_entry *entry;
+   unsigned long next_pgoff;
+   unsigned long ent_blks = 0;
+   timing_t check_time;
+
+   NOVA_START_TIMING(check_entry_t, check_time);
+
+   *ret_entry = NULL;
+   *inplace = 0;
+   entry = nova_get_write_entry(sb, sih, start_blk);
+
+   if (entry) {
+   *ret_entry = entry;
+
+   /* We can do inplace write. Find contiguous blocks */
+   if (entry->reassigned == 0)
+   ent_blks = entry->num_pages -
+   (start_blk - entry->pgoff);
+   else
+   ent_blks = 1;
+
+   if (ent_blks > num_blocks)
+   ent_blks = num_blocks;
+
+   if (entry->epoch_id == epoch_id)
+   *inplace = 1;
+
+   } else if (check_next) {
+   /* Possible Hole */
+   entry = nova_find_next_entry(sb, sih, start_blk);
+   if (entry) {
+   next_pgoff = entry->pgoff;
+   if (next_pgoff <= start_blk) {
+   nova_err(sb, "iblock %lu, entry pgoff %lu, num pages %lu\n",
+  start_blk, next_pgoff, entry->num_pages);
+   nova_print_inode_log(sb, inode);
+   dump_stack();
+   ent_blks = num_blocks;
+   goto out;
+   }
+   ent_blks = next_pgoff - start_blk;
+   if (ent_blks > num_blocks)
+   ent_blks = num_blocks;
+   } else {
+   /* File grow */
+   ent_blks = num_blocks;
+   }
+   }
+
+   if (entry && ent_blks == 0) {
+   nova_dbg("%s: %d\n", __func__, check_next);
+   dump_stack();
+   }
+
+out:
+   NOVA_END_TIMING(check_entry_t, check_time);
+   return ent_blks;
+}
+
+/* Memcpy from newly allocated data blocks to existing data blocks */
+static int nova_inplace_memcpy(struct super_block *sb, struct inode *inode,
+   struct nova_file_write_entry *from, struct nova_file_write_entry *to,
+   unsigned long num_blocks, loff_t pos, size_t len)
+{
+   struct nova_inode_info *si = NOVA_I(inode);
+   struct nova_inode_info_header *sih = &si->header;
+   struct nova_log_entry_info entry_info;
+   unsigned long pgoff;
+   unsigned long from_nvmm, to_nvmm;
+   void *from_addr, *to_addr = NULL;
+   loff_t base, start, end, offset;
+
+   pgoff = le64_to_cpu(from->pgoff);
+   base = start = pgoff << PAGE_SHIFT;
+   end = (pgoff + num_blocks) << PAGE_SHIFT;
+
+   if (start < pos)
+   start = pos;
+
+   if (end > pos + len)
+