[RFC v2 43/83] Log operation: in-place update log entry

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

To update a log entry in place, NOVA starts a lite transaction
to journal the log entry, then performs the update and commits the transaction.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/inode.h |  12 
 fs/nova/log.c   | 183 
 fs/nova/log.h   |   9 +++
 3 files changed, 204 insertions(+)

diff --git a/fs/nova/inode.h b/fs/nova/inode.h
index 943f77f..6970872 100644
--- a/fs/nova/inode.h
+++ b/fs/nova/inode.h
@@ -5,6 +5,7 @@ struct nova_inode_info_header;
 struct nova_inode;
 
 #include "super.h"
+#include "log.h"
 
 enum nova_new_inode_type {
TYPE_CREATE = 0,
@@ -143,6 +144,17 @@ static inline void nova_update_tail(struct nova_inode *pi, 
u64 new_tail)
NOVA_END_TIMING(update_tail_t, update_time);
 }
 
+/*
+ * Advance the inode's log tail to update->tail: mirror it into the
+ * in-memory header (sih->log_tail) and then pass it to nova_update_tail()
+ * for the nova_inode itself.
+ */
+static inline void nova_update_inode(struct super_block *sb,
+	struct inode *inode, struct nova_inode *pi,
+	struct nova_inode_update *update)
+{
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode_info_header *sih = &si->header;
+
+	sih->log_tail = update->tail;
+	nova_update_tail(pi, update->tail);
+}
+
 static inline
 struct inode_table *nova_get_inode_table(struct super_block *sb, int cpu)
 {
diff --git a/fs/nova/log.c b/fs/nova/log.c
index 4638ccf..c8b7d2e 100644
--- a/fs/nova/log.c
+++ b/fs/nova/log.c
@@ -218,6 +218,35 @@ static int nova_append_log_entry(struct super_block *sb,
return 0;
 }
 
+/*
+ * Perform a lite transaction to atomically update a log entry in place.
+ *
+ * Under the journal lock selected by the current CPU index, the entry is
+ * first journaled by nova_create_logentry_transaction(), then rewritten
+ * from entry_info by nova_update_log_entry(); after a persistent barrier
+ * the transaction is committed.  Always returns 0.
+ */
+static int nova_inplace_update_log_entry(struct super_block *sb,
+	struct inode *inode, void *entry,
+	struct nova_log_entry_info *entry_info)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	enum nova_entry_type type = entry_info->type;
+	u64 journal_tail;
+	size_t size;
+	int cpu;
+	timing_t update_time;
+
+	NOVA_START_TIMING(update_entry_t, update_time);
+	size = nova_get_log_entry_size(sb, type);
+
+	/*
+	 * NOTE(review): smp_processor_id() is sampled before the spinlock is
+	 * taken; confirm callers are non-preemptible or that using a stale
+	 * CPU index as the journal selector is acceptable.
+	 */
+	cpu = smp_processor_id();
+	spin_lock(&sbi->journal_locks[cpu]);
+	journal_tail = nova_create_logentry_transaction(sb, entry, type, cpu);
+	nova_update_log_entry(sb, inode, entry, entry_info);
+
+	/* Barrier between the in-place update and the commit below. */
+	PERSISTENT_BARRIER();
+
+	nova_commit_lite_transaction(sb, journal_tail, cpu);
+	spin_unlock(&sbi->journal_locks[cpu]);
+
+	NOVA_END_TIMING(update_entry_t, update_time);
+	return 0;
+}
+
 /* Returns new tail after append */
 static int nova_append_setattr_entry(struct super_block *sb,
struct nova_inode *pi, struct inode *inode, struct iattr *attr,
@@ -250,6 +279,125 @@ static int nova_append_setattr_entry(struct super_block 
*sb,
return ret;
 }
 
+/*
+ * Decide whether the inode's most recent setattr log entry may be
+ * overwritten in place.
+ *
+ * Returns 1 only when sih->last_setattr is set, that entry does not carry
+ * ATTR_SIZE, and its epoch matches epoch_id; returns 0 otherwise.
+ */
+static int nova_can_inplace_update_setattr(struct super_block *sb,
+	struct nova_inode_info_header *sih, u64 epoch_id)
+{
+	u64 last_log = 0;
+	struct nova_setattr_logentry *entry = NULL;
+
+	last_log = sih->last_setattr;
+	if (last_log) {
+		entry = (struct nova_setattr_logentry *)nova_get_block(sb,
+							last_log);
+		/* Do not overwrite setsize entry */
+		if (entry->attr & ATTR_SIZE)
+			return 0;
+		/* The entry keeps epoch_id little-endian; compare in CPU order. */
+		if (le64_to_cpu(entry->epoch_id) == epoch_id)
+			return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Rewrite the inode's last setattr log entry (sih->last_setattr) in place
+ * with the attributes in attr, tagged with epoch_id and sih->trans_id.
+ * Returns the result of nova_inplace_update_log_entry().
+ */
+static int nova_inplace_update_setattr_entry(struct super_block *sb,
+	struct inode *inode, struct nova_inode_info_header *sih,
+	struct iattr *attr, u64 epoch_id)
+{
+	struct nova_setattr_logentry *entry = NULL;
+	struct nova_log_entry_info entry_info;
+	u64 last_log = 0;
+
+	nova_dbgv("%s : Modifying last log entry for inode %lu\n",
+			__func__, inode->i_ino);
+	last_log = sih->last_setattr;
+	entry = (struct nova_setattr_logentry *)nova_get_block(sb,
+							last_log);
+
+	entry_info.type = SET_ATTR;
+	entry_info.attr = attr;
+	entry_info.epoch_id = epoch_id;
+	entry_info.trans_id = sih->trans_id;
+
+	return nova_inplace_update_log_entry(sb, inode, entry,
+					&entry_info);
+}
+
+int nova_handle_setattr_operation(struct super_block *sb, struct inode *inode,
+   struct nova_inode *pi, unsigned int ia_valid, struct iattr *attr,
+   u64 epoch_id)
+{
+   struct nova_inode_info *si = NOVA_I(inode);
+   struct nova_inode_info_header *sih = >header;
+   struct nova_inode_update update;
+   u64 last_setattr = 0;
+   int ret;
+
+   if (ia_valid & ATTR_MODE)
+   sih->i_mode = inode->i_mode;
+
+   /*
+* Let's try to do inplace update.
+*/
+   if (!(ia_valid & ATTR_SIZE) &&
+   nova_can

[RFC v2 43/83] Log operation: in-place update log entry

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

To update a log entry in place, NOVA starts a lite transaction
to journal the log entry, then performs the update and commits the transaction.

Signed-off-by: Andiry Xu 
---
 fs/nova/inode.h |  12 
 fs/nova/log.c   | 183 
 fs/nova/log.h   |   9 +++
 3 files changed, 204 insertions(+)

diff --git a/fs/nova/inode.h b/fs/nova/inode.h
index 943f77f..6970872 100644
--- a/fs/nova/inode.h
+++ b/fs/nova/inode.h
@@ -5,6 +5,7 @@ struct nova_inode_info_header;
 struct nova_inode;
 
 #include "super.h"
+#include "log.h"
 
 enum nova_new_inode_type {
TYPE_CREATE = 0,
@@ -143,6 +144,17 @@ static inline void nova_update_tail(struct nova_inode *pi, 
u64 new_tail)
NOVA_END_TIMING(update_tail_t, update_time);
 }
 
+/*
+ * Advance the inode's log tail to update->tail: mirror it into the
+ * in-memory header (sih->log_tail) and then pass it to nova_update_tail()
+ * for the nova_inode itself.
+ */
+static inline void nova_update_inode(struct super_block *sb,
+	struct inode *inode, struct nova_inode *pi,
+	struct nova_inode_update *update)
+{
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode_info_header *sih = &si->header;
+
+	sih->log_tail = update->tail;
+	nova_update_tail(pi, update->tail);
+}
+
 static inline
 struct inode_table *nova_get_inode_table(struct super_block *sb, int cpu)
 {
diff --git a/fs/nova/log.c b/fs/nova/log.c
index 4638ccf..c8b7d2e 100644
--- a/fs/nova/log.c
+++ b/fs/nova/log.c
@@ -218,6 +218,35 @@ static int nova_append_log_entry(struct super_block *sb,
return 0;
 }
 
+/*
+ * Perform a lite transaction to atomically update a log entry in place.
+ *
+ * Under the journal lock selected by the current CPU index, the entry is
+ * first journaled by nova_create_logentry_transaction(), then rewritten
+ * from entry_info by nova_update_log_entry(); after a persistent barrier
+ * the transaction is committed.  Always returns 0.
+ */
+static int nova_inplace_update_log_entry(struct super_block *sb,
+	struct inode *inode, void *entry,
+	struct nova_log_entry_info *entry_info)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	enum nova_entry_type type = entry_info->type;
+	u64 journal_tail;
+	size_t size;
+	int cpu;
+	timing_t update_time;
+
+	NOVA_START_TIMING(update_entry_t, update_time);
+	size = nova_get_log_entry_size(sb, type);
+
+	/*
+	 * NOTE(review): smp_processor_id() is sampled before the spinlock is
+	 * taken; confirm callers are non-preemptible or that using a stale
+	 * CPU index as the journal selector is acceptable.
+	 */
+	cpu = smp_processor_id();
+	spin_lock(&sbi->journal_locks[cpu]);
+	journal_tail = nova_create_logentry_transaction(sb, entry, type, cpu);
+	nova_update_log_entry(sb, inode, entry, entry_info);
+
+	/* Barrier between the in-place update and the commit below. */
+	PERSISTENT_BARRIER();
+
+	nova_commit_lite_transaction(sb, journal_tail, cpu);
+	spin_unlock(&sbi->journal_locks[cpu]);
+
+	NOVA_END_TIMING(update_entry_t, update_time);
+	return 0;
+}
+
 /* Returns new tail after append */
 static int nova_append_setattr_entry(struct super_block *sb,
struct nova_inode *pi, struct inode *inode, struct iattr *attr,
@@ -250,6 +279,125 @@ static int nova_append_setattr_entry(struct super_block 
*sb,
return ret;
 }
 
+/*
+ * Decide whether the inode's most recent setattr log entry may be
+ * overwritten in place.
+ *
+ * Returns 1 only when sih->last_setattr is set, that entry does not carry
+ * ATTR_SIZE, and its epoch matches epoch_id; returns 0 otherwise.
+ */
+static int nova_can_inplace_update_setattr(struct super_block *sb,
+	struct nova_inode_info_header *sih, u64 epoch_id)
+{
+	u64 last_log = 0;
+	struct nova_setattr_logentry *entry = NULL;
+
+	last_log = sih->last_setattr;
+	if (last_log) {
+		entry = (struct nova_setattr_logentry *)nova_get_block(sb,
+							last_log);
+		/* Do not overwrite setsize entry */
+		if (entry->attr & ATTR_SIZE)
+			return 0;
+		/* The entry keeps epoch_id little-endian; compare in CPU order. */
+		if (le64_to_cpu(entry->epoch_id) == epoch_id)
+			return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Rewrite the inode's last setattr log entry (sih->last_setattr) in place
+ * with the attributes in attr, tagged with epoch_id and sih->trans_id.
+ * Returns the result of nova_inplace_update_log_entry().
+ */
+static int nova_inplace_update_setattr_entry(struct super_block *sb,
+	struct inode *inode, struct nova_inode_info_header *sih,
+	struct iattr *attr, u64 epoch_id)
+{
+	struct nova_setattr_logentry *entry = NULL;
+	struct nova_log_entry_info entry_info;
+	u64 last_log = 0;
+
+	nova_dbgv("%s : Modifying last log entry for inode %lu\n",
+			__func__, inode->i_ino);
+	last_log = sih->last_setattr;
+	entry = (struct nova_setattr_logentry *)nova_get_block(sb,
+							last_log);
+
+	entry_info.type = SET_ATTR;
+	entry_info.attr = attr;
+	entry_info.epoch_id = epoch_id;
+	entry_info.trans_id = sih->trans_id;
+
+	return nova_inplace_update_log_entry(sb, inode, entry,
+					&entry_info);
+}
+
+int nova_handle_setattr_operation(struct super_block *sb, struct inode *inode,
+   struct nova_inode *pi, unsigned int ia_valid, struct iattr *attr,
+   u64 epoch_id)
+{
+   struct nova_inode_info *si = NOVA_I(inode);
+   struct nova_inode_info_header *sih = >header;
+   struct nova_inode_update update;
+   u64 last_setattr = 0;
+   int ret;
+
+   if (ia_valid & ATTR_MODE)
+   sih->i_mode = inode->i_mode;
+
+   /*
+* Let's try to do inplace update.
+*/
+   if (!(ia_valid & ATTR_SIZE) &&
+   nova_can_inplace_upd

[RFC v2 35/83] Journal: Lite journal helper routines.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/Makefile  |   2 +-
 fs/nova/journal.c | 108 ++
 2 files changed, 109 insertions(+), 1 deletion(-)
 create mode 100644 fs/nova/journal.c

diff --git a/fs/nova/Makefile b/fs/nova/Makefile
index b3638a4..4aeadea 100644
--- a/fs/nova/Makefile
+++ b/fs/nova/Makefile
@@ -4,4 +4,4 @@
 
 obj-$(CONFIG_NOVA_FS) += nova.o
 
-nova-y := balloc.o bbuild.o inode.o log.o rebuild.o stats.o super.o
+nova-y := balloc.o bbuild.o inode.o journal.o log.o rebuild.o stats.o super.o
diff --git a/fs/nova/journal.c b/fs/nova/journal.c
new file mode 100644
index 000..75d590f
--- /dev/null
+++ b/fs/nova/journal.c
@@ -0,0 +1,108 @@
+/*
+ * NOVA journaling facility.
+ *
+ * This file contains journaling code to guarantee the atomicity of directory
+ * operations that span multiple inodes (unlink, rename, etc).
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu <jix...@cs.ucsd.edu>
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.storne...@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
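+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/vfs.h>
+#include <linux/uaccess.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>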
+#include "nova.h"
+#include "journal.h"
+
+/**************************** Lite journal ******************************/
+
+/* Dump the type, data words and checksum of one lite journal entry. */
+static inline void
+nova_print_lite_transaction(struct nova_lite_journal_entry *entry)
+{
+	/* One record per line: no newline in the middle of the message. */
+	nova_dbg("Entry %p: Type %llu, data1 0x%llx, data2 0x%llx, checksum %u\n",
+			entry, entry->type,
+			entry->data1, entry->data2, entry->csum);
+}
+
+/*
+ * Compute the CRC32C of a lite journal entry over all but its last
+ * sizeof(__le32) bytes, store it in entry->csum and flush the entry.
+ * Always returns 0.
+ */
+static inline int nova_update_journal_entry_csum(struct super_block *sb,
+	struct nova_lite_journal_entry *entry)
+{
+	const size_t covered = sizeof(*entry) - sizeof(__le32);
+	u32 sum = nova_crc32c(~0, (__u8 *)entry, covered);
+
+	entry->csum = cpu_to_le32(sum);
+	nova_flush_buffer(entry, sizeof(*entry), 0);
+	return 0;
+}
+
+/*
+ * Recompute the CRC32C of a lite journal entry (all but its last
+ * sizeof(__le32) bytes) and compare it with the stored entry->csum.
+ * Returns 0 on a match and 1 on a mismatch.
+ */
+static inline int nova_check_entry_integrity(struct super_block *sb,
+	struct nova_lite_journal_entry *entry)
+{
+	const size_t covered = sizeof(*entry) - sizeof(__le32);
+	u32 sum = nova_crc32c(~0, (__u8 *)entry, covered);
+
+	return entry->csum != cpu_to_le32(sum);
+}
+
+// Get the next journal entry.  Journal entries are stored in a circular
+// buffer; they live in a 1-page circular buffer, so the position wraps
+// back to the start of the page once another entry would reach its end.
+//
+// TODO: Add check to ensure that the journal doesn't grow too large.
+static inline u64 next_lite_journal(u64 curr_p)
+{
+	const size_t entry_size = sizeof(struct nova_lite_journal_entry);
+	u64 page_off = curr_p & (PAGE_SIZE - 1);
+
+	if (page_off + entry_size < PAGE_SIZE)
+		return curr_p + entry_size;
+
+	return curr_p & PAGE_MASK;
+}
+
+// Walk the journal for one CPU from head to tail and verify the checksum
+// on each entry.  Returns 0 if every entry passes; on the first failure the
+// entry is logged and the non-zero check result is returned.
+static int nova_check_journal_entries(struct super_block *sb,
+	struct journal_ptr_pair *pair)
+{
+	struct nova_lite_journal_entry *entry;
+	u64 pos;
+	int err;
+
+	for (pos = pair->journal_head; pos != pair->journal_tail;
+			pos = next_lite_journal(pos)) {
+		entry = (struct nova_lite_journal_entry *)nova_get_block(sb,
+							pos);
+		err = nova_check_entry_integrity(sb, entry);
+		if (!err)
+			continue;
+
+		nova_dbg("Entry %p checksum failure\n", entry);
+		nova_print_lite_transaction(entry);
+		return err;
+	}
+
+	return 0;
+}
-- 
2.7.4



[RFC v2 35/83] Journal: Lite journal helper routines.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

Signed-off-by: Andiry Xu 
---
 fs/nova/Makefile  |   2 +-
 fs/nova/journal.c | 108 ++
 2 files changed, 109 insertions(+), 1 deletion(-)
 create mode 100644 fs/nova/journal.c

diff --git a/fs/nova/Makefile b/fs/nova/Makefile
index b3638a4..4aeadea 100644
--- a/fs/nova/Makefile
+++ b/fs/nova/Makefile
@@ -4,4 +4,4 @@
 
 obj-$(CONFIG_NOVA_FS) += nova.o
 
-nova-y := balloc.o bbuild.o inode.o log.o rebuild.o stats.o super.o
+nova-y := balloc.o bbuild.o inode.o journal.o log.o rebuild.o stats.o super.o
diff --git a/fs/nova/journal.c b/fs/nova/journal.c
new file mode 100644
index 000..75d590f
--- /dev/null
+++ b/fs/nova/journal.c
@@ -0,0 +1,108 @@
+/*
+ * NOVA journaling facility.
+ *
+ * This file contains journaling code to guarantee the atomicity of directory
+ * operations that span multiple inodes (unlink, rename, etc).
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu 
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli 
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
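+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/vfs.h>
+#include <linux/uaccess.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>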
+#include "nova.h"
+#include "journal.h"
+
+/**************************** Lite journal ******************************/
+
+/* Dump the type, data words and checksum of one lite journal entry. */
+static inline void
+nova_print_lite_transaction(struct nova_lite_journal_entry *entry)
+{
+	/* One record per line: no newline in the middle of the message. */
+	nova_dbg("Entry %p: Type %llu, data1 0x%llx, data2 0x%llx, checksum %u\n",
+			entry, entry->type,
+			entry->data1, entry->data2, entry->csum);
+}
+
+/*
+ * Compute the CRC32C of a lite journal entry over all but its last
+ * sizeof(__le32) bytes, store it in entry->csum and flush the entry.
+ * Always returns 0.
+ */
+static inline int nova_update_journal_entry_csum(struct super_block *sb,
+	struct nova_lite_journal_entry *entry)
+{
+	const size_t covered = sizeof(*entry) - sizeof(__le32);
+	u32 sum = nova_crc32c(~0, (__u8 *)entry, covered);
+
+	entry->csum = cpu_to_le32(sum);
+	nova_flush_buffer(entry, sizeof(*entry), 0);
+	return 0;
+}
+
+/*
+ * Recompute the CRC32C of a lite journal entry (all but its last
+ * sizeof(__le32) bytes) and compare it with the stored entry->csum.
+ * Returns 0 on a match and 1 on a mismatch.
+ */
+static inline int nova_check_entry_integrity(struct super_block *sb,
+	struct nova_lite_journal_entry *entry)
+{
+	const size_t covered = sizeof(*entry) - sizeof(__le32);
+	u32 sum = nova_crc32c(~0, (__u8 *)entry, covered);
+
+	return entry->csum != cpu_to_le32(sum);
+}
+
+// Get the next journal entry.  Journal entries are stored in a circular
+// buffer; they live in a 1-page circular buffer, so the position wraps
+// back to the start of the page once another entry would reach its end.
+//
+// TODO: Add check to ensure that the journal doesn't grow too large.
+static inline u64 next_lite_journal(u64 curr_p)
+{
+	const size_t entry_size = sizeof(struct nova_lite_journal_entry);
+	u64 page_off = curr_p & (PAGE_SIZE - 1);
+
+	if (page_off + entry_size < PAGE_SIZE)
+		return curr_p + entry_size;
+
+	return curr_p & PAGE_MASK;
+}
+
+// Walk the journal for one CPU from head to tail and verify the checksum
+// on each entry.  Returns 0 if every entry passes; on the first failure the
+// entry is logged and the non-zero check result is returned.
+static int nova_check_journal_entries(struct super_block *sb,
+	struct journal_ptr_pair *pair)
+{
+	struct nova_lite_journal_entry *entry;
+	u64 pos;
+	int err;
+
+	for (pos = pair->journal_head; pos != pair->journal_tail;
+			pos = next_lite_journal(pos)) {
+		entry = (struct nova_lite_journal_entry *)nova_get_block(sb,
+							pos);
+		err = nova_check_entry_integrity(sb, entry);
+		if (!err)
+			continue;
+
+		nova_dbg("Entry %p checksum failure\n", entry);
+		nova_print_lite_transaction(entry);
+		return err;
+	}
+
+	return 0;
+}
-- 
2.7.4



[RFC v2 37/83] Journal: Lite journal create and commit.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

NOVA uses a lite journal to perform lightweight transactions.
Instead of journaling metadata/data changes directly,
NOVA first appends updates to each inode's log, and then
journals the log tail pointers to make sure all the logs
are updated atomically. For inode creation and deletion,
NOVA journals the inode's valid field.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/journal.c | 179 ++
 fs/nova/journal.h |  11 
 2 files changed, 190 insertions(+)

diff --git a/fs/nova/journal.c b/fs/nova/journal.c
index f31de97..0e203fa 100644
--- a/fs/nova/journal.c
+++ b/fs/nova/journal.c
@@ -161,3 +161,182 @@ static int nova_recover_lite_journal(struct super_block 
*sb,
 
return 0;
 }
+
+/**************************** Create/commit ******************************/
+
+/*
+ * Create and append an undo entry for a small update to PMEM.
+ *
+ * The entry written at journal offset curr_p covers the 8-byte-aligned
+ * word containing *field: data1 holds that word's offset as returned by
+ * nova_get_addr_off() and data2 its current contents.  Returns the offset
+ * of the next journal slot.
+ */
+static u64 nova_append_entry_journal(struct super_block *sb,
+	u64 curr_p, void *field)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	struct nova_lite_journal_entry *slot;
+	u64 *word;
+	u64 word_off;
+
+	slot = (struct nova_lite_journal_entry *)nova_get_block(sb, curr_p);
+	slot->type = cpu_to_le64(JOURNAL_ENTRY);
+	slot->padding = 0;
+
+	/* Round the field address down to an 8-byte boundary. */
+	word = (u64 *)((unsigned long)field & ~7UL);
+	/* Record the offset from the start of NOVA rather than a pointer. */
+	word_off = (u64)nova_get_addr_off(sbi, word);
+	slot->data1 = cpu_to_le64(word_off);
+	slot->data2 = cpu_to_le64(*word);
+	nova_update_journal_entry_csum(sb, slot);
+
+	return next_lite_journal(curr_p);
+}
+
+/* Journal the current pi->log_tail; returns the next journal slot. */
+static u64 nova_journal_inode_tail(struct super_block *sb,
+	u64 curr_p, struct nova_inode *pi)
+{
+	curr_p = nova_append_entry_journal(sb, curr_p, &pi->log_tail);
+
+	return curr_p;
+}
+
+/*
+ * Create and append undo log entries for one inode taking part in a
+ * transaction.
+ *
+ * For a directory (is_dir) only the log tail is journaled.  For a new
+ * inode only pi->valid is journaled.  Otherwise the log tail is journaled
+ * and, when invalidate is set, pi->valid and pi->delete_epoch_id as well.
+ * Returns the next journal slot; curr_p is returned unchanged if the
+ * nova_inode cannot be obtained.
+ */
+static u64 nova_append_inode_journal(struct super_block *sb,
+	u64 curr_p, struct inode *inode, int new_inode,
+	int invalidate, int is_dir)
+{
+	struct nova_inode *pi = nova_get_inode(sb, inode);
+
+	if (!pi) {
+		nova_err(sb, "%s: get inode failed\n", __func__);
+		return curr_p;
+	}
+
+	if (is_dir)
+		return nova_journal_inode_tail(sb, curr_p, pi);
+
+	if (new_inode) {
+		curr_p = nova_append_entry_journal(sb, curr_p,
+						&pi->valid);
+	} else {
+		curr_p = nova_journal_inode_tail(sb, curr_p, pi);
+		if (invalidate) {
+			curr_p = nova_append_entry_journal(sb, curr_p,
+						&pi->valid);
+			curr_p = nova_append_entry_journal(sb, curr_p,
+						&pi->delete_epoch_id);
+		}
+	}
+
+	return curr_p;
+}
+
+/* Journal a dentry's ino and csum fields; returns the next journal slot. */
+static u64 nova_append_dentry_journal(struct super_block *sb,
+	u64 curr_p, struct nova_dentry *dentry)
+{
+	curr_p = nova_append_entry_journal(sb, curr_p, &dentry->ino);
+	curr_p = nova_append_entry_journal(sb, curr_p, &dentry->csum);
+	return curr_p;
+}
+
+/*
+ * Journaled transactions for inode creation.
+ *
+ * Starting at the journal head for cpu, appends undo entries for inode and
+ * then for dir (treated as a directory), stores the resulting position as
+ * the journal tail and flushes the cache line starting at
+ * pair->journal_head.  Returns the new journal tail.
+ */
+u64 nova_create_inode_transaction(struct super_block *sb,
+	struct inode *inode, struct inode *dir, int cpu,
+	int new_inode, int invalidate)
+{
+	struct journal_ptr_pair *pair;
+	u64 temp;
+
+	pair = nova_get_journal_pointers(sb, cpu);
+
+	temp = pair->journal_head;
+
+	temp = nova_append_inode_journal(sb, temp, inode,
+					new_inode, invalidate, 0);
+
+	temp = nova_append_inode_journal(sb, temp, dir,
+					new_inode, invalidate, 1);
+
+	pair->journal_tail = temp;
+	nova_flush_buffer(&pair->journal_head, CACHELINE_SIZE, 1);
+
+	nova_dbgv("%s: head 0x%llx, tail 0x%llx\n",
+			__func__, pair->journal_head, pair->journal_tail);
+	return temp;
+}
+
+/* Journaled transactions for rename operations */
+u64 nova_create_rename_transaction(struct super_block *sb,
+   struct inode *old_inode, struct inode *old_dir, struct inode *new_inode,
+   struct inode *new_dir, struct nova_dentry *father_entry,
+   int invalidate_new_inode, int cpu)
+{
+   struct journal_ptr_pair *pair;
+   u64 temp;
+
+   pair = nova_get_journal_pointers(sb, cpu);
+
+   temp = pair->journal_head;
+
+   /* Journal tails for old inode */
+   temp = nova_append_inode_journal(sb, temp, old_inode, 0, 0, 0);
+
+   /* Journal tails for old dir */
+   temp = nova_append_inode_jo

[RFC v2 40/83] Log operation: file write entry append.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

NOVA performs writes by appending file write entries to the log.
A file write entry is the metadata of a write operation, and
contains pointers to the data blocks. A single write operation
may append multiple file write entries to the log, if the
allocator cannot provide enough contiguous data blocks.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/log.c | 51 +++
 fs/nova/log.h |  3 +++
 2 files changed, 54 insertions(+)

diff --git a/fs/nova/log.c b/fs/nova/log.c
index 13f9597..437db26 100644
--- a/fs/nova/log.c
+++ b/fs/nova/log.c
@@ -20,6 +20,18 @@
 #include "inode.h"
 #include "log.h"
 
+/*
+ * Refresh the epoch id, transaction id, mtime and size of an existing file
+ * write entry from entry_info, then persist the entry.  Always returns 0.
+ */
+static int nova_update_write_entry(struct super_block *sb,
+	struct nova_file_write_entry *entry,
+	struct nova_log_entry_info *entry_info)
+{
+	const struct nova_log_entry_info *info = entry_info;
+
+	entry->epoch_id	= cpu_to_le64(info->epoch_id);
+	entry->trans_id	= cpu_to_le64(info->trans_id);
+	entry->mtime	= cpu_to_le32(info->time);
+	entry->size	= cpu_to_le64(info->file_size);
+
+	nova_persist_entry(entry);
+	return 0;
+}
+
 static int nova_update_old_dentry(struct super_block *sb,
struct inode *dir, struct nova_dentry *dentry,
struct nova_log_entry_info *entry_info)
@@ -91,6 +103,11 @@ static int nova_update_log_entry(struct super_block *sb, 
struct inode *inode,
 
switch (type) {
case FILE_WRITE:
+   if (entry_info->inplace)
+   nova_update_write_entry(sb, entry, entry_info);
+   else
+   memcpy_to_pmem_nocache(entry, entry_info->data,
+   sizeof(struct nova_file_write_entry));
break;
case DIR_LOG:
if (entry_info->inplace)
@@ -149,6 +166,40 @@ static int nova_append_log_entry(struct super_block *sb,
return 0;
 }
 
+/*
+ * Append a nova_file_write_entry to the current nova_inode_log_page.
+ * blocknr and start_blk are pgoff.
+ * We cannot update pi->log_tail here because a transaction may contain
+ * multiple entries.
+ *
+ * The entry to append is item->entry.  Returns the result of
+ * nova_append_log_entry(); a non-zero result is also reported through
+ * nova_err().
+ */
+int nova_append_file_write_entry(struct super_block *sb, struct nova_inode *pi,
+	struct inode *inode, struct nova_file_write_item *item,
+	struct nova_inode_update *update)
+{
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode_info_header *sih = &si->header;
+	struct nova_file_write_entry *data = &item->entry;
+	struct nova_log_entry_info entry_info;
+	timing_t append_time;
+	int ret;
+
+	NOVA_START_TIMING(append_file_entry_t, append_time);
+
+	entry_info.type = FILE_WRITE;
+	entry_info.update = update;
+	entry_info.data = data;
+	entry_info.epoch_id = data->epoch_id;
+	entry_info.trans_id = data->trans_id;
+	/* Append path: the whole entry is copied rather than patched. */
+	entry_info.inplace = 0;
+
+	ret = nova_append_log_entry(sb, pi, inode, sih, &entry_info);
+	if (ret)
+		nova_err(sb, "%s failed\n", __func__);
+
+	NOVA_END_TIMING(append_file_entry_t, append_time);
+	return ret;
+}
+
 int nova_append_dentry(struct super_block *sb, struct nova_inode *pi,
struct inode *dir, struct dentry *dentry, u64 ino,
unsigned short de_len, struct nova_inode_update *update,
diff --git a/fs/nova/log.h b/fs/nova/log.h
index 305e69b..db7a72e 100644
--- a/fs/nova/log.h
+++ b/fs/nova/log.h
@@ -364,6 +364,9 @@ static inline int is_dir_init_entry(struct super_block *sb,
 }
 
 
+int nova_append_file_write_entry(struct super_block *sb, struct nova_inode *pi,
+   struct inode *inode, struct nova_file_write_item *item,
+   struct nova_inode_update *update);
 int nova_append_dentry(struct super_block *sb, struct nova_inode *pi,
struct inode *dir, struct dentry *dentry, u64 ino,
unsigned short de_len, struct nova_inode_update *update,
-- 
2.7.4



[RFC v2 37/83] Journal: Lite journal create and commit.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

NOVA uses a lite journal to perform lightweight transactions.
Instead of journaling metadata/data changes directly,
NOVA first appends updates to each inode's log, and then
journals the log tail pointers to make sure all the logs
are updated atomically. For inode creation and deletion,
NOVA journals the inode's valid field.

Signed-off-by: Andiry Xu 
---
 fs/nova/journal.c | 179 ++
 fs/nova/journal.h |  11 
 2 files changed, 190 insertions(+)

diff --git a/fs/nova/journal.c b/fs/nova/journal.c
index f31de97..0e203fa 100644
--- a/fs/nova/journal.c
+++ b/fs/nova/journal.c
@@ -161,3 +161,182 @@ static int nova_recover_lite_journal(struct super_block 
*sb,
 
return 0;
 }
+
+/**************************** Create/commit ******************************/
+
+/*
+ * Create and append an undo entry for a small update to PMEM.
+ *
+ * The entry written at journal offset curr_p covers the 8-byte-aligned
+ * word containing *field: data1 holds that word's offset as returned by
+ * nova_get_addr_off() and data2 its current contents.  Returns the offset
+ * of the next journal slot.
+ */
+static u64 nova_append_entry_journal(struct super_block *sb,
+	u64 curr_p, void *field)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	struct nova_lite_journal_entry *slot;
+	u64 *word;
+	u64 word_off;
+
+	slot = (struct nova_lite_journal_entry *)nova_get_block(sb, curr_p);
+	slot->type = cpu_to_le64(JOURNAL_ENTRY);
+	slot->padding = 0;
+
+	/* Round the field address down to an 8-byte boundary. */
+	word = (u64 *)((unsigned long)field & ~7UL);
+	/* Record the offset from the start of NOVA rather than a pointer. */
+	word_off = (u64)nova_get_addr_off(sbi, word);
+	slot->data1 = cpu_to_le64(word_off);
+	slot->data2 = cpu_to_le64(*word);
+	nova_update_journal_entry_csum(sb, slot);
+
+	return next_lite_journal(curr_p);
+}
+
+/* Journal the current pi->log_tail; returns the next journal slot. */
+static u64 nova_journal_inode_tail(struct super_block *sb,
+	u64 curr_p, struct nova_inode *pi)
+{
+	curr_p = nova_append_entry_journal(sb, curr_p, &pi->log_tail);
+
+	return curr_p;
+}
+
+/*
+ * Create and append undo log entries for one inode taking part in a
+ * transaction.
+ *
+ * For a directory (is_dir) only the log tail is journaled.  For a new
+ * inode only pi->valid is journaled.  Otherwise the log tail is journaled
+ * and, when invalidate is set, pi->valid and pi->delete_epoch_id as well.
+ * Returns the next journal slot; curr_p is returned unchanged if the
+ * nova_inode cannot be obtained.
+ */
+static u64 nova_append_inode_journal(struct super_block *sb,
+	u64 curr_p, struct inode *inode, int new_inode,
+	int invalidate, int is_dir)
+{
+	struct nova_inode *pi = nova_get_inode(sb, inode);
+
+	if (!pi) {
+		nova_err(sb, "%s: get inode failed\n", __func__);
+		return curr_p;
+	}
+
+	if (is_dir)
+		return nova_journal_inode_tail(sb, curr_p, pi);
+
+	if (new_inode) {
+		curr_p = nova_append_entry_journal(sb, curr_p,
+						&pi->valid);
+	} else {
+		curr_p = nova_journal_inode_tail(sb, curr_p, pi);
+		if (invalidate) {
+			curr_p = nova_append_entry_journal(sb, curr_p,
+						&pi->valid);
+			curr_p = nova_append_entry_journal(sb, curr_p,
+						&pi->delete_epoch_id);
+		}
+	}
+
+	return curr_p;
+}
+
+/* Journal a dentry's ino and csum fields; returns the next journal slot. */
+static u64 nova_append_dentry_journal(struct super_block *sb,
+	u64 curr_p, struct nova_dentry *dentry)
+{
+	curr_p = nova_append_entry_journal(sb, curr_p, &dentry->ino);
+	curr_p = nova_append_entry_journal(sb, curr_p, &dentry->csum);
+	return curr_p;
+}
+
+/*
+ * Journaled transactions for inode creation.
+ *
+ * Starting at the journal head for cpu, appends undo entries for inode and
+ * then for dir (treated as a directory), stores the resulting position as
+ * the journal tail and flushes the cache line starting at
+ * pair->journal_head.  Returns the new journal tail.
+ */
+u64 nova_create_inode_transaction(struct super_block *sb,
+	struct inode *inode, struct inode *dir, int cpu,
+	int new_inode, int invalidate)
+{
+	struct journal_ptr_pair *pair;
+	u64 temp;
+
+	pair = nova_get_journal_pointers(sb, cpu);
+
+	temp = pair->journal_head;
+
+	temp = nova_append_inode_journal(sb, temp, inode,
+					new_inode, invalidate, 0);
+
+	temp = nova_append_inode_journal(sb, temp, dir,
+					new_inode, invalidate, 1);
+
+	pair->journal_tail = temp;
+	nova_flush_buffer(&pair->journal_head, CACHELINE_SIZE, 1);
+
+	nova_dbgv("%s: head 0x%llx, tail 0x%llx\n",
+			__func__, pair->journal_head, pair->journal_tail);
+	return temp;
+}
+
+/* Journaled transactions for rename operations */
+u64 nova_create_rename_transaction(struct super_block *sb,
+   struct inode *old_inode, struct inode *old_dir, struct inode *new_inode,
+   struct inode *new_dir, struct nova_dentry *father_entry,
+   int invalidate_new_inode, int cpu)
+{
+   struct journal_ptr_pair *pair;
+   u64 temp;
+
+   pair = nova_get_journal_pointers(sb, cpu);
+
+   temp = pair->journal_head;
+
+   /* Journal tails for old inode */
+   temp = nova_append_inode_journal(sb, temp, old_inode, 0, 0, 0);
+
+   /* Journal tails for old dir */
+   temp = nova_append_inode_journal(sb, temp, old_dir, 0, 0, 1);

[RFC v2 40/83] Log operation: file write entry append.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

NOVA performs writes by appending file write entries to the log.
A file write entry is the metadata of a write operation, and
contains pointers to the data blocks. A single write operation
may append multiple file write entries to the log, if the
allocator cannot provide enough contiguous data blocks.

Signed-off-by: Andiry Xu 
---
 fs/nova/log.c | 51 +++
 fs/nova/log.h |  3 +++
 2 files changed, 54 insertions(+)

diff --git a/fs/nova/log.c b/fs/nova/log.c
index 13f9597..437db26 100644
--- a/fs/nova/log.c
+++ b/fs/nova/log.c
@@ -20,6 +20,18 @@
 #include "inode.h"
 #include "log.h"
 
+/*
+ * Refresh the epoch id, transaction id, mtime and size of an existing file
+ * write entry from entry_info, then persist the entry.  Always returns 0.
+ */
+static int nova_update_write_entry(struct super_block *sb,
+	struct nova_file_write_entry *entry,
+	struct nova_log_entry_info *entry_info)
+{
+	const struct nova_log_entry_info *info = entry_info;
+
+	entry->epoch_id	= cpu_to_le64(info->epoch_id);
+	entry->trans_id	= cpu_to_le64(info->trans_id);
+	entry->mtime	= cpu_to_le32(info->time);
+	entry->size	= cpu_to_le64(info->file_size);
+
+	nova_persist_entry(entry);
+	return 0;
+}
+
 static int nova_update_old_dentry(struct super_block *sb,
struct inode *dir, struct nova_dentry *dentry,
struct nova_log_entry_info *entry_info)
@@ -91,6 +103,11 @@ static int nova_update_log_entry(struct super_block *sb, 
struct inode *inode,
 
switch (type) {
case FILE_WRITE:
+   if (entry_info->inplace)
+   nova_update_write_entry(sb, entry, entry_info);
+   else
+   memcpy_to_pmem_nocache(entry, entry_info->data,
+   sizeof(struct nova_file_write_entry));
break;
case DIR_LOG:
if (entry_info->inplace)
@@ -149,6 +166,40 @@ static int nova_append_log_entry(struct super_block *sb,
return 0;
 }
 
+/*
+ * Append a nova_file_write_entry to the current nova_inode_log_page.
+ * blocknr and start_blk are pgoff.
+ * We cannot update pi->log_tail here because a transaction may contain
+ * multiple entries.
+ *
+ * The entry to append is item->entry.  Returns the result of
+ * nova_append_log_entry(); a non-zero result is also reported through
+ * nova_err().
+ */
+int nova_append_file_write_entry(struct super_block *sb, struct nova_inode *pi,
+	struct inode *inode, struct nova_file_write_item *item,
+	struct nova_inode_update *update)
+{
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode_info_header *sih = &si->header;
+	struct nova_file_write_entry *data = &item->entry;
+	struct nova_log_entry_info entry_info;
+	timing_t append_time;
+	int ret;
+
+	NOVA_START_TIMING(append_file_entry_t, append_time);
+
+	entry_info.type = FILE_WRITE;
+	entry_info.update = update;
+	entry_info.data = data;
+	entry_info.epoch_id = data->epoch_id;
+	entry_info.trans_id = data->trans_id;
+	/* Append path: the whole entry is copied rather than patched. */
+	entry_info.inplace = 0;
+
+	ret = nova_append_log_entry(sb, pi, inode, sih, &entry_info);
+	if (ret)
+		nova_err(sb, "%s failed\n", __func__);
+
+	NOVA_END_TIMING(append_file_entry_t, append_time);
+	return ret;
+}
+
 int nova_append_dentry(struct super_block *sb, struct nova_inode *pi,
struct inode *dir, struct dentry *dentry, u64 ino,
unsigned short de_len, struct nova_inode_update *update,
diff --git a/fs/nova/log.h b/fs/nova/log.h
index 305e69b..db7a72e 100644
--- a/fs/nova/log.h
+++ b/fs/nova/log.h
@@ -364,6 +364,9 @@ static inline int is_dir_init_entry(struct super_block *sb,
 }
 
 
+int nova_append_file_write_entry(struct super_block *sb, struct nova_inode *pi,
+   struct inode *inode, struct nova_file_write_item *item,
+   struct nova_inode_update *update);
 int nova_append_dentry(struct super_block *sb, struct nova_inode *pi,
struct inode *dir, struct dentry *dentry, u64 ino,
unsigned short de_len, struct nova_inode_update *update,
-- 
2.7.4



[RFC v2 41/83] Log operation: setattr entry append

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

NOVA appends a setattr entry to the log upon inode modification operations:
set size, chmod, etc.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/log.c | 64 +++
 1 file changed, 64 insertions(+)

diff --git a/fs/nova/log.c b/fs/nova/log.c
index 437db26..f85b63e 100644
--- a/fs/nova/log.c
+++ b/fs/nova/log.c
@@ -20,6 +20,37 @@
 #include "inode.h"
 #include "log.h"
 
+static void nova_update_setattr_entry(struct inode *inode,
+   struct nova_setattr_logentry *entry,
+   struct nova_log_entry_info *entry_info)
+{
+   struct iattr *attr = entry_info->attr;
+   unsigned int ia_valid = attr->ia_valid, attr_mask;
+
+   /* These flags are in the lowest byte */
+   attr_mask = ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_SIZE |
+   ATTR_ATIME | ATTR_MTIME | ATTR_CTIME;
+
+   entry->entry_type   = SET_ATTR;
+   entry->attr = ia_valid & attr_mask;
+   entry->mode = cpu_to_le16(inode->i_mode);
+   entry->uid  = cpu_to_le32(i_uid_read(inode));
+   entry->gid  = cpu_to_le32(i_gid_read(inode));
+   entry->atime= cpu_to_le32(inode->i_atime.tv_sec);
+   entry->ctime= cpu_to_le32(inode->i_ctime.tv_sec);
+   entry->mtime= cpu_to_le32(inode->i_mtime.tv_sec);
+   entry->epoch_id = cpu_to_le64(entry_info->epoch_id);
+   entry->trans_id = cpu_to_le64(entry_info->trans_id);
+   entry->invalid  = 0;
+
+   if (ia_valid & ATTR_SIZE)
+   entry->size = cpu_to_le64(attr->ia_size);
+   else
+   entry->size = cpu_to_le64(inode->i_size);
+
+   nova_persist_entry(entry);
+}
+
 static int nova_update_write_entry(struct super_block *sb,
struct nova_file_write_entry *entry,
struct nova_log_entry_info *entry_info)
@@ -116,6 +147,7 @@ static int nova_update_log_entry(struct super_block *sb, 
struct inode *inode,
nova_update_new_dentry(sb, inode, entry, entry_info);
break;
case SET_ATTR:
+   nova_update_setattr_entry(inode, entry, entry_info);
break;
case LINK_CHANGE:
break;
@@ -166,6 +198,38 @@ static int nova_append_log_entry(struct super_block *sb,
return 0;
 }
 
+/* Returns new tail after append */
+static int nova_append_setattr_entry(struct super_block *sb,
+   struct nova_inode *pi, struct inode *inode, struct iattr *attr,
+   struct nova_inode_update *update, u64 *last_setattr, u64 epoch_id)
+{
+   struct nova_inode_info *si = NOVA_I(inode);
+   struct nova_inode_info_header *sih = &si->header;
+   struct nova_log_entry_info entry_info;
+   timing_t append_time;
+   int ret;
+
+   NOVA_START_TIMING(append_setattr_t, append_time);
+   entry_info.type = SET_ATTR;
+   entry_info.attr = attr;
+   entry_info.update = update;
+   entry_info.epoch_id = epoch_id;
+   entry_info.trans_id = sih->trans_id;
+
+   ret = nova_append_log_entry(sb, pi, inode, sih, &entry_info);
+   if (ret) {
+   nova_err(sb, "%s failed\n", __func__);
+   goto out;
+   }
+
+   *last_setattr = sih->last_setattr;
+   sih->last_setattr = entry_info.curr_p;
+
+out:
+   NOVA_END_TIMING(append_setattr_t, append_time);
+   return ret;
+}
+
 /*
  * Append a nova_file_write_entry to the current nova_inode_log_page.
  * blocknr and start_blk are pgoff.
-- 
2.7.4



[RFC v2 41/83] Log operation: setattr entry append

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

NOVA appends a setattr entry to the log upon inode modification operations:
set size, chmod, etc.

Signed-off-by: Andiry Xu 
---
 fs/nova/log.c | 64 +++
 1 file changed, 64 insertions(+)

diff --git a/fs/nova/log.c b/fs/nova/log.c
index 437db26..f85b63e 100644
--- a/fs/nova/log.c
+++ b/fs/nova/log.c
@@ -20,6 +20,37 @@
 #include "inode.h"
 #include "log.h"
 
+static void nova_update_setattr_entry(struct inode *inode,
+   struct nova_setattr_logentry *entry,
+   struct nova_log_entry_info *entry_info)
+{
+   struct iattr *attr = entry_info->attr;
+   unsigned int ia_valid = attr->ia_valid, attr_mask;
+
+   /* These flags are in the lowest byte */
+   attr_mask = ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_SIZE |
+   ATTR_ATIME | ATTR_MTIME | ATTR_CTIME;
+
+   entry->entry_type   = SET_ATTR;
+   entry->attr = ia_valid & attr_mask;
+   entry->mode = cpu_to_le16(inode->i_mode);
+   entry->uid  = cpu_to_le32(i_uid_read(inode));
+   entry->gid  = cpu_to_le32(i_gid_read(inode));
+   entry->atime= cpu_to_le32(inode->i_atime.tv_sec);
+   entry->ctime= cpu_to_le32(inode->i_ctime.tv_sec);
+   entry->mtime= cpu_to_le32(inode->i_mtime.tv_sec);
+   entry->epoch_id = cpu_to_le64(entry_info->epoch_id);
+   entry->trans_id = cpu_to_le64(entry_info->trans_id);
+   entry->invalid  = 0;
+
+   if (ia_valid & ATTR_SIZE)
+   entry->size = cpu_to_le64(attr->ia_size);
+   else
+   entry->size = cpu_to_le64(inode->i_size);
+
+   nova_persist_entry(entry);
+}
+
 static int nova_update_write_entry(struct super_block *sb,
struct nova_file_write_entry *entry,
struct nova_log_entry_info *entry_info)
@@ -116,6 +147,7 @@ static int nova_update_log_entry(struct super_block *sb, 
struct inode *inode,
nova_update_new_dentry(sb, inode, entry, entry_info);
break;
case SET_ATTR:
+   nova_update_setattr_entry(inode, entry, entry_info);
break;
case LINK_CHANGE:
break;
@@ -166,6 +198,38 @@ static int nova_append_log_entry(struct super_block *sb,
return 0;
 }
 
+/* Returns new tail after append */
+static int nova_append_setattr_entry(struct super_block *sb,
+   struct nova_inode *pi, struct inode *inode, struct iattr *attr,
+   struct nova_inode_update *update, u64 *last_setattr, u64 epoch_id)
+{
+   struct nova_inode_info *si = NOVA_I(inode);
+   struct nova_inode_info_header *sih = &si->header;
+   struct nova_log_entry_info entry_info;
+   timing_t append_time;
+   int ret;
+
+   NOVA_START_TIMING(append_setattr_t, append_time);
+   entry_info.type = SET_ATTR;
+   entry_info.attr = attr;
+   entry_info.update = update;
+   entry_info.epoch_id = epoch_id;
+   entry_info.trans_id = sih->trans_id;
+
+   ret = nova_append_log_entry(sb, pi, inode, sih, &entry_info);
+   if (ret) {
+   nova_err(sb, "%s failed\n", __func__);
+   goto out;
+   }
+
+   *last_setattr = sih->last_setattr;
+   sih->last_setattr = entry_info.curr_p;
+
+out:
+   NOVA_END_TIMING(append_setattr_t, append_time);
+   return ret;
+}
+
 /*
  * Append a nova_file_write_entry to the current nova_inode_log_page.
  * blocknr and start_blk are pgoff.
-- 
2.7.4



[RFC v2 44/83] Log operation: invalidate log entries

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

After new log entries are appended to the log, old log entries
can be marked invalid to facilitate garbage collection.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/log.c  | 160 +
 fs/nova/log.h  |   4 ++
 fs/nova/nova.h |  12 +
 3 files changed, 176 insertions(+)

diff --git a/fs/nova/log.c b/fs/nova/log.c
index c8b7d2e..d150f2e 100644
--- a/fs/nova/log.c
+++ b/fs/nova/log.c
@@ -20,6 +20,88 @@
 #include "inode.h"
 #include "log.h"
 
+static int nova_execute_invalidate_reassign_logentry(struct super_block *sb,
+   void *entry, enum nova_entry_type type, int reassign,
+   unsigned int num_free)
+{
+   struct nova_file_write_entry *fw_entry;
+   int invalid = 0;
+
+   switch (type) {
+   case FILE_WRITE:
+   fw_entry = (struct nova_file_write_entry *)entry;
+   if (reassign)
+   fw_entry->reassigned = 1;
+   if (num_free)
+   fw_entry->invalid_pages += num_free;
+   if (fw_entry->invalid_pages == fw_entry->num_pages)
+   invalid = 1;
+   break;
+   case DIR_LOG:
+   if (reassign) {
+   ((struct nova_dentry *)entry)->reassigned = 1;
+   } else {
+   ((struct nova_dentry *)entry)->invalid = 1;
+   invalid = 1;
+   }
+   break;
+   case SET_ATTR:
+   ((struct nova_setattr_logentry *)entry)->invalid = 1;
+   invalid = 1;
+   break;
+   case LINK_CHANGE:
+   ((struct nova_link_change_entry *)entry)->invalid = 1;
+   invalid = 1;
+   break;
+   default:
+   break;
+   }
+
+   if (invalid) {
+   u64 addr = nova_get_addr_off(NOVA_SB(sb), entry);
+
+   nova_inc_page_invalid_entries(sb, addr);
+   }
+
+   nova_persist_entry(entry);
+   return 0;
+}
+
+static int nova_invalidate_reassign_logentry(struct super_block *sb,
+   void *entry, enum nova_entry_type type, int reassign,
+   unsigned int num_free)
+{
+   nova_execute_invalidate_reassign_logentry(sb, entry, type,
+   reassign, num_free);
+   return 0;
+}
+
+static int nova_invalidate_logentry(struct super_block *sb, void *entry,
+   enum nova_entry_type type, unsigned int num_free)
+{
+   return nova_invalidate_reassign_logentry(sb, entry, type, 0, num_free);
+}
+
+static int nova_reassign_logentry(struct super_block *sb, void *entry,
+   enum nova_entry_type type)
+{
+   return nova_invalidate_reassign_logentry(sb, entry, type, 1, 0);
+}
+
+static inline int nova_invalidate_write_entry(struct super_block *sb,
+   struct nova_file_write_entry *entry, int reassign,
+   unsigned int num_free)
+{
+   if (!entry)
+   return 0;
+
+   if (num_free == 0 && entry->reassigned == 1)
+   return 0;
+
+   return nova_invalidate_reassign_logentry(sb, entry, FILE_WRITE,
+   reassign, num_free);
+}
+
 static void nova_update_setattr_entry(struct inode *inode,
struct nova_setattr_logentry *entry,
struct nova_log_entry_info *entry_info)
@@ -279,6 +361,27 @@ static int nova_append_setattr_entry(struct super_block 
*sb,
return ret;
 }
 
+/* Invalidate old setattr entry */
+static int nova_invalidate_setattr_entry(struct super_block *sb,
+   u64 last_setattr)
+{
+   struct nova_setattr_logentry *old_entry;
+   void *addr;
+   int ret;
+
+   addr = (void *)nova_get_block(sb, last_setattr);
+   old_entry = (struct nova_setattr_logentry *)addr;
+
+   /* Do not invalidate setsize entries */
+   if (!old_entry_freeable(sb, old_entry->epoch_id) ||
+   (old_entry->attr & ATTR_SIZE))
+   return 0;
+
+   ret = nova_invalidate_logentry(sb, old_entry, SET_ATTR, 0);
+
+   return ret;
+}
+
 static int nova_can_inplace_update_setattr(struct super_block *sb,
struct nova_inode_info_header *sih, u64 epoch_id)
 {
@@ -358,9 +461,35 @@ int nova_handle_setattr_operation(struct super_block *sb, 
struct inode *inode,
nova_update_inode(sb, inode, pi, &update);
}
 
+   /* Invalidate old setattr entry */
+   if (last_setattr)
+   nova_invalidate_setattr_entry(sb, last_setattr);
+
return 0;
 }
 
+/* Invalidate old link change entry */
+int nova_invalidate_link_change_entry(struct super_block *sb,
+   u64 old_link_change)
+{
+   struct nova_link_change_entry *old_entry;
+   void *addr;
+   int ret;
+
+   if (old_link_change == 0)
+   return 0;
+
+   addr = (void *)nova_get_block(sb

[RFC v2 44/83] Log operation: invalidate log entries

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

After new log entries are appended to the log, old log entries
can be marked invalid to facilitate garbage collection.

Signed-off-by: Andiry Xu 
---
 fs/nova/log.c  | 160 +
 fs/nova/log.h  |   4 ++
 fs/nova/nova.h |  12 +
 3 files changed, 176 insertions(+)

diff --git a/fs/nova/log.c b/fs/nova/log.c
index c8b7d2e..d150f2e 100644
--- a/fs/nova/log.c
+++ b/fs/nova/log.c
@@ -20,6 +20,88 @@
 #include "inode.h"
 #include "log.h"
 
+static int nova_execute_invalidate_reassign_logentry(struct super_block *sb,
+   void *entry, enum nova_entry_type type, int reassign,
+   unsigned int num_free)
+{
+   struct nova_file_write_entry *fw_entry;
+   int invalid = 0;
+
+   switch (type) {
+   case FILE_WRITE:
+   fw_entry = (struct nova_file_write_entry *)entry;
+   if (reassign)
+   fw_entry->reassigned = 1;
+   if (num_free)
+   fw_entry->invalid_pages += num_free;
+   if (fw_entry->invalid_pages == fw_entry->num_pages)
+   invalid = 1;
+   break;
+   case DIR_LOG:
+   if (reassign) {
+   ((struct nova_dentry *)entry)->reassigned = 1;
+   } else {
+   ((struct nova_dentry *)entry)->invalid = 1;
+   invalid = 1;
+   }
+   break;
+   case SET_ATTR:
+   ((struct nova_setattr_logentry *)entry)->invalid = 1;
+   invalid = 1;
+   break;
+   case LINK_CHANGE:
+   ((struct nova_link_change_entry *)entry)->invalid = 1;
+   invalid = 1;
+   break;
+   default:
+   break;
+   }
+
+   if (invalid) {
+   u64 addr = nova_get_addr_off(NOVA_SB(sb), entry);
+
+   nova_inc_page_invalid_entries(sb, addr);
+   }
+
+   nova_persist_entry(entry);
+   return 0;
+}
+
+static int nova_invalidate_reassign_logentry(struct super_block *sb,
+   void *entry, enum nova_entry_type type, int reassign,
+   unsigned int num_free)
+{
+   nova_execute_invalidate_reassign_logentry(sb, entry, type,
+   reassign, num_free);
+   return 0;
+}
+
+static int nova_invalidate_logentry(struct super_block *sb, void *entry,
+   enum nova_entry_type type, unsigned int num_free)
+{
+   return nova_invalidate_reassign_logentry(sb, entry, type, 0, num_free);
+}
+
+static int nova_reassign_logentry(struct super_block *sb, void *entry,
+   enum nova_entry_type type)
+{
+   return nova_invalidate_reassign_logentry(sb, entry, type, 1, 0);
+}
+
+static inline int nova_invalidate_write_entry(struct super_block *sb,
+   struct nova_file_write_entry *entry, int reassign,
+   unsigned int num_free)
+{
+   if (!entry)
+   return 0;
+
+   if (num_free == 0 && entry->reassigned == 1)
+   return 0;
+
+   return nova_invalidate_reassign_logentry(sb, entry, FILE_WRITE,
+   reassign, num_free);
+}
+
 static void nova_update_setattr_entry(struct inode *inode,
struct nova_setattr_logentry *entry,
struct nova_log_entry_info *entry_info)
@@ -279,6 +361,27 @@ static int nova_append_setattr_entry(struct super_block 
*sb,
return ret;
 }
 
+/* Invalidate old setattr entry */
+static int nova_invalidate_setattr_entry(struct super_block *sb,
+   u64 last_setattr)
+{
+   struct nova_setattr_logentry *old_entry;
+   void *addr;
+   int ret;
+
+   addr = (void *)nova_get_block(sb, last_setattr);
+   old_entry = (struct nova_setattr_logentry *)addr;
+
+   /* Do not invalidate setsize entries */
+   if (!old_entry_freeable(sb, old_entry->epoch_id) ||
+   (old_entry->attr & ATTR_SIZE))
+   return 0;
+
+   ret = nova_invalidate_logentry(sb, old_entry, SET_ATTR, 0);
+
+   return ret;
+}
+
 static int nova_can_inplace_update_setattr(struct super_block *sb,
struct nova_inode_info_header *sih, u64 epoch_id)
 {
@@ -358,9 +461,35 @@ int nova_handle_setattr_operation(struct super_block *sb, 
struct inode *inode,
nova_update_inode(sb, inode, pi, &update);
}
 
+   /* Invalidate old setattr entry */
+   if (last_setattr)
+   nova_invalidate_setattr_entry(sb, last_setattr);
+
return 0;
 }
 
+/* Invalidate old link change entry */
+int nova_invalidate_link_change_entry(struct super_block *sb,
+   u64 old_link_change)
+{
+   struct nova_link_change_entry *old_entry;
+   void *addr;
+   int ret;
+
+   if (old_link_change == 0)
+   return 0;
+
+   addr = (void *)nova_get_block(sb, old_link_change);
+   old_entry = (struct no

[RFC v2 42/83] Log operation: link change append.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

NOVA appends link change entries to atomically update link count and ctime.
This occurs in link, unlink and rmdir.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/log.c | 52 
 fs/nova/log.h |  3 +++
 2 files changed, 55 insertions(+)

diff --git a/fs/nova/log.c b/fs/nova/log.c
index f85b63e..4638ccf 100644
--- a/fs/nova/log.c
+++ b/fs/nova/log.c
@@ -51,6 +51,25 @@ static void nova_update_setattr_entry(struct inode *inode,
nova_persist_entry(entry);
 }
 
+static void nova_update_link_change_entry(struct inode *inode,
+   struct nova_link_change_entry *entry,
+   struct nova_log_entry_info *entry_info)
+{
+   struct nova_inode_info *si = NOVA_I(inode);
+   struct nova_inode_info_header *sih = &si->header;
+
+   entry->entry_type   = LINK_CHANGE;
+   entry->epoch_id = cpu_to_le64(entry_info->epoch_id);
+   entry->trans_id = cpu_to_le64(entry_info->trans_id);
+   entry->invalid  = 0;
+   entry->links= cpu_to_le16(inode->i_nlink);
+   entry->ctime= cpu_to_le32(inode->i_ctime.tv_sec);
+   entry->flags= cpu_to_le32(sih->i_flags);
+   entry->generation   = cpu_to_le32(inode->i_generation);
+
+   nova_persist_entry(entry);
+}
+
 static int nova_update_write_entry(struct super_block *sb,
struct nova_file_write_entry *entry,
struct nova_log_entry_info *entry_info)
@@ -150,6 +169,7 @@ static int nova_update_log_entry(struct super_block *sb, 
struct inode *inode,
nova_update_setattr_entry(inode, entry, entry_info);
break;
case LINK_CHANGE:
+   nova_update_link_change_entry(inode, entry, entry_info);
break;
default:
break;
@@ -230,6 +250,38 @@ static int nova_append_setattr_entry(struct super_block 
*sb,
return ret;
 }
 
+/* Returns new tail after append */
+int nova_append_link_change_entry(struct super_block *sb,
+   struct nova_inode *pi, struct inode *inode,
+   struct nova_inode_update *update, u64 *old_linkc, u64 epoch_id)
+{
+   struct nova_inode_info *si = NOVA_I(inode);
+   struct nova_inode_info_header *sih = &si->header;
+   struct nova_log_entry_info entry_info;
+   int ret = 0;
+   timing_t append_time;
+
+   NOVA_START_TIMING(append_link_change_t, append_time);
+
+   entry_info.type = LINK_CHANGE;
+   entry_info.update = update;
+   entry_info.epoch_id = epoch_id;
+   entry_info.trans_id = sih->trans_id;
+
+   ret = nova_append_log_entry(sb, pi, inode, sih, &entry_info);
+   if (ret) {
+   nova_err(sb, "%s failed\n", __func__);
+   goto out;
+   }
+
+   *old_linkc = sih->last_link_change;
+   sih->last_link_change = entry_info.curr_p;
+   sih->trans_id++;
+out:
+   NOVA_END_TIMING(append_link_change_t, append_time);
+   return ret;
+}
+
 /*
  * Append a nova_file_write_entry to the current nova_inode_log_page.
  * blocknr and start_blk are pgoff.
diff --git a/fs/nova/log.h b/fs/nova/log.h
index db7a72e..f36f4a3 100644
--- a/fs/nova/log.h
+++ b/fs/nova/log.h
@@ -364,6 +364,9 @@ static inline int is_dir_init_entry(struct super_block *sb,
 }
 
 
+int nova_append_link_change_entry(struct super_block *sb,
+   struct nova_inode *pi, struct inode *inode,
+   struct nova_inode_update *update, u64 *old_linkc, u64 epoch_id);
 int nova_append_file_write_entry(struct super_block *sb, struct nova_inode *pi,
struct inode *inode, struct nova_file_write_item *item,
struct nova_inode_update *update);
-- 
2.7.4



[RFC v2 42/83] Log operation: link change append.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

NOVA appends link change entries to atomically update link count and ctime.
This occurs in link, unlink and rmdir.

Signed-off-by: Andiry Xu 
---
 fs/nova/log.c | 52 
 fs/nova/log.h |  3 +++
 2 files changed, 55 insertions(+)

diff --git a/fs/nova/log.c b/fs/nova/log.c
index f85b63e..4638ccf 100644
--- a/fs/nova/log.c
+++ b/fs/nova/log.c
@@ -51,6 +51,25 @@ static void nova_update_setattr_entry(struct inode *inode,
nova_persist_entry(entry);
 }
 
+static void nova_update_link_change_entry(struct inode *inode,
+   struct nova_link_change_entry *entry,
+   struct nova_log_entry_info *entry_info)
+{
+   struct nova_inode_info *si = NOVA_I(inode);
+   struct nova_inode_info_header *sih = &si->header;
+
+   entry->entry_type   = LINK_CHANGE;
+   entry->epoch_id = cpu_to_le64(entry_info->epoch_id);
+   entry->trans_id = cpu_to_le64(entry_info->trans_id);
+   entry->invalid  = 0;
+   entry->links= cpu_to_le16(inode->i_nlink);
+   entry->ctime= cpu_to_le32(inode->i_ctime.tv_sec);
+   entry->flags= cpu_to_le32(sih->i_flags);
+   entry->generation   = cpu_to_le32(inode->i_generation);
+
+   nova_persist_entry(entry);
+}
+
 static int nova_update_write_entry(struct super_block *sb,
struct nova_file_write_entry *entry,
struct nova_log_entry_info *entry_info)
@@ -150,6 +169,7 @@ static int nova_update_log_entry(struct super_block *sb, 
struct inode *inode,
nova_update_setattr_entry(inode, entry, entry_info);
break;
case LINK_CHANGE:
+   nova_update_link_change_entry(inode, entry, entry_info);
break;
default:
break;
@@ -230,6 +250,38 @@ static int nova_append_setattr_entry(struct super_block 
*sb,
return ret;
 }
 
+/* Returns new tail after append */
+int nova_append_link_change_entry(struct super_block *sb,
+   struct nova_inode *pi, struct inode *inode,
+   struct nova_inode_update *update, u64 *old_linkc, u64 epoch_id)
+{
+   struct nova_inode_info *si = NOVA_I(inode);
+   struct nova_inode_info_header *sih = &si->header;
+   struct nova_log_entry_info entry_info;
+   int ret = 0;
+   timing_t append_time;
+
+   NOVA_START_TIMING(append_link_change_t, append_time);
+
+   entry_info.type = LINK_CHANGE;
+   entry_info.update = update;
+   entry_info.epoch_id = epoch_id;
+   entry_info.trans_id = sih->trans_id;
+
+   ret = nova_append_log_entry(sb, pi, inode, sih, &entry_info);
+   if (ret) {
+   nova_err(sb, "%s failed\n", __func__);
+   goto out;
+   }
+
+   *old_linkc = sih->last_link_change;
+   sih->last_link_change = entry_info.curr_p;
+   sih->trans_id++;
+out:
+   NOVA_END_TIMING(append_link_change_t, append_time);
+   return ret;
+}
+
 /*
  * Append a nova_file_write_entry to the current nova_inode_log_page.
  * blocknr and start_blk are pgoff.
diff --git a/fs/nova/log.h b/fs/nova/log.h
index db7a72e..f36f4a3 100644
--- a/fs/nova/log.h
+++ b/fs/nova/log.h
@@ -364,6 +364,9 @@ static inline int is_dir_init_entry(struct super_block *sb,
 }
 
 
+int nova_append_link_change_entry(struct super_block *sb,
+   struct nova_inode *pi, struct inode *inode,
+   struct nova_inode_update *update, u64 *old_linkc, u64 epoch_id);
 int nova_append_file_write_entry(struct super_block *sb, struct nova_inode *pi,
struct inode *inode, struct nova_file_write_item *item,
struct nova_inode_update *update);
-- 
2.7.4



[RFC v2 47/83] Dir: Add initial dentries when initializing a directory inode log.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

For root directory and newly created directory via mkdir(),
we append . and .. dentries to the directory inode log.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/dir.c   | 82 +
 fs/nova/nova.h  |  2 ++
 fs/nova/super.c |  5 
 3 files changed, 89 insertions(+)

diff --git a/fs/nova/dir.c b/fs/nova/dir.c
index 5bea57a..377d2da 100644
--- a/fs/nova/dir.c
+++ b/fs/nova/dir.c
@@ -139,3 +139,85 @@ void nova_delete_dir_tree(struct super_block *sb,
 
NOVA_END_TIMING(delete_dir_tree_t, delete_time);
 }
+
+/* = Entry operations = */
+
+static unsigned int nova_init_dentry(struct super_block *sb,
+   struct nova_dentry *de_entry, u64 self_ino, u64 parent_ino,
+   u64 epoch_id)
+{
+   void *start = de_entry;
+   struct nova_inode_log_page *curr_page = start;
+   unsigned int length;
+   unsigned short de_len;
+
+   de_len = NOVA_DIR_LOG_REC_LEN(1);
+   memset(de_entry, 0, de_len);
+   de_entry->entry_type = DIR_LOG;
+   de_entry->epoch_id = epoch_id;
+   de_entry->trans_id = 0;
+   de_entry->ino = cpu_to_le64(self_ino);
+   de_entry->name_len = 1;
+   de_entry->de_len = cpu_to_le16(de_len);
+   de_entry->mtime = timespec_trunc(current_kernel_time(),
+sb->s_time_gran).tv_sec;
+
+   de_entry->links_count = 1;
+   strncpy(de_entry->name, ".\0", 2);
+   nova_persist_entry(de_entry);
+
+   length = de_len;
+
+   de_entry = (struct nova_dentry *)((char *)de_entry + length);
+   de_len = NOVA_DIR_LOG_REC_LEN(2);
+   memset(de_entry, 0, de_len);
+   de_entry->entry_type = DIR_LOG;
+   de_entry->epoch_id = epoch_id;
+   de_entry->trans_id = 0;
+   de_entry->ino = cpu_to_le64(parent_ino);
+   de_entry->name_len = 2;
+   de_entry->de_len = cpu_to_le16(de_len);
+   de_entry->mtime = timespec_trunc(current_kernel_time(),
+sb->s_time_gran).tv_sec;
+
+   de_entry->links_count = 2;
+   strncpy(de_entry->name, "..\0", 3);
+   nova_persist_entry(de_entry);
+   length += de_len;
+
+   nova_set_page_num_entries(sb, curr_page, 2, 1);
+
+   nova_flush_buffer(start, length, 0);
+   return length;
+}
+
+/* Append . and .. entries */
+int nova_append_dir_init_entries(struct super_block *sb,
+   struct nova_inode *pi, u64 self_ino, u64 parent_ino, u64 epoch_id)
+{
+   struct nova_inode_info_header sih;
+   int allocated;
+   u64 new_block;
+   unsigned int length;
+   struct nova_dentry *de_entry;
+
+   sih.ino = self_ino;
+   sih.i_blk_type = NOVA_DEFAULT_BLOCK_TYPE;
+
+   allocated = nova_allocate_inode_log_pages(sb, &sih, 1, &new_block,
+   ANY_CPU, 0);
+   if (allocated != 1) {
+   nova_err(sb, "ERROR: no inode log page available\n");
+   return -ENOMEM;
+   }
+
+   pi->log_tail = pi->log_head = new_block;
+
+   de_entry = (struct nova_dentry *)nova_get_block(sb, new_block);
+
+   length = nova_init_dentry(sb, de_entry, self_ino, parent_ino, epoch_id);
+
+   nova_update_tail(pi, new_block + length);
+
+   return 0;
+}
diff --git a/fs/nova/nova.h b/fs/nova/nova.h
index 3890479..a94f44d 100644
--- a/fs/nova/nova.h
+++ b/fs/nova/nova.h
@@ -458,6 +458,8 @@ void nova_delete_dir_tree(struct super_block *sb,
 struct nova_dentry *nova_find_dentry(struct super_block *sb,
struct nova_inode *pi, struct inode *inode, const char *name,
unsigned long name_len);
+int nova_append_dir_init_entries(struct super_block *sb,
+   struct nova_inode *pi, u64 self_ino, u64 parent_ino, u64 epoch_id);
 
 /* rebuild.c */
 int nova_rebuild_inode(struct super_block *sb, struct nova_inode_info *si,
diff --git a/fs/nova/super.c b/fs/nova/super.c
index 216d396..1e67062 100644
--- a/fs/nova/super.c
+++ b/fs/nova/super.c
@@ -349,6 +349,7 @@ static struct nova_inode *nova_init(struct super_block *sb,
struct nova_inode *root_i, *pi;
struct nova_super_block *super;
struct nova_sb_info *sbi = NOVA_SB(sb);
+   u64 epoch_id;
timing_t init_time;
 
NOVA_START_TIMING(new_init_t, init_time);
@@ -415,6 +416,10 @@ static struct nova_inode *nova_init(struct super_block *sb,
 
nova_flush_buffer(root_i, sizeof(*root_i), false);
 
+   epoch_id = nova_get_epoch_id(sb);
+   nova_append_dir_init_entries(sb, root_i, NOVA_ROOT_INO,
+   NOVA_ROOT_INO, epoch_id);
+
PERSISTENT_MARK();
PERSISTENT_BARRIER();
nova_info("NOVA initialization finish\n");
-- 
2.7.4



[RFC v2 47/83] Dir: Add initial dentries when initializing a directory inode log.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

For root directory and newly created directory via mkdir(),
we append . and .. dentries to the directory inode log.

Signed-off-by: Andiry Xu 
---
 fs/nova/dir.c   | 82 +
 fs/nova/nova.h  |  2 ++
 fs/nova/super.c |  5 
 3 files changed, 89 insertions(+)

diff --git a/fs/nova/dir.c b/fs/nova/dir.c
index 5bea57a..377d2da 100644
--- a/fs/nova/dir.c
+++ b/fs/nova/dir.c
@@ -139,3 +139,85 @@ void nova_delete_dir_tree(struct super_block *sb,
 
NOVA_END_TIMING(delete_dir_tree_t, delete_time);
 }
+
+/* = Entry operations = */
+
+static unsigned int nova_init_dentry(struct super_block *sb,
+   struct nova_dentry *de_entry, u64 self_ino, u64 parent_ino,
+   u64 epoch_id)
+{
+   void *start = de_entry;
+   struct nova_inode_log_page *curr_page = start;
+   unsigned int length;
+   unsigned short de_len;
+
+   de_len = NOVA_DIR_LOG_REC_LEN(1);
+   memset(de_entry, 0, de_len);
+   de_entry->entry_type = DIR_LOG;
+   de_entry->epoch_id = epoch_id;
+   de_entry->trans_id = 0;
+   de_entry->ino = cpu_to_le64(self_ino);
+   de_entry->name_len = 1;
+   de_entry->de_len = cpu_to_le16(de_len);
+   de_entry->mtime = timespec_trunc(current_kernel_time(),
+sb->s_time_gran).tv_sec;
+
+   de_entry->links_count = 1;
+   strncpy(de_entry->name, ".\0", 2);
+   nova_persist_entry(de_entry);
+
+   length = de_len;
+
+   de_entry = (struct nova_dentry *)((char *)de_entry + length);
+   de_len = NOVA_DIR_LOG_REC_LEN(2);
+   memset(de_entry, 0, de_len);
+   de_entry->entry_type = DIR_LOG;
+   de_entry->epoch_id = epoch_id;
+   de_entry->trans_id = 0;
+   de_entry->ino = cpu_to_le64(parent_ino);
+   de_entry->name_len = 2;
+   de_entry->de_len = cpu_to_le16(de_len);
+   de_entry->mtime = timespec_trunc(current_kernel_time(),
+sb->s_time_gran).tv_sec;
+
+   de_entry->links_count = 2;
+   strncpy(de_entry->name, "..\0", 3);
+   nova_persist_entry(de_entry);
+   length += de_len;
+
+   nova_set_page_num_entries(sb, curr_page, 2, 1);
+
+   nova_flush_buffer(start, length, 0);
+   return length;
+}
+
+/* Append . and .. entries */
+int nova_append_dir_init_entries(struct super_block *sb,
+   struct nova_inode *pi, u64 self_ino, u64 parent_ino, u64 epoch_id)
+{
+   struct nova_inode_info_header sih;
+   int allocated;
+   u64 new_block;
+   unsigned int length;
+   struct nova_dentry *de_entry;
+
+   sih.ino = self_ino;
+   sih.i_blk_type = NOVA_DEFAULT_BLOCK_TYPE;
+
+   allocated = nova_allocate_inode_log_pages(sb, &sih, 1, &new_block,
+   ANY_CPU, 0);
+   if (allocated != 1) {
+   nova_err(sb, "ERROR: no inode log page available\n");
+   return -ENOMEM;
+   }
+
+   pi->log_tail = pi->log_head = new_block;
+
+   de_entry = (struct nova_dentry *)nova_get_block(sb, new_block);
+
+   length = nova_init_dentry(sb, de_entry, self_ino, parent_ino, epoch_id);
+
+   nova_update_tail(pi, new_block + length);
+
+   return 0;
+}
diff --git a/fs/nova/nova.h b/fs/nova/nova.h
index 3890479..a94f44d 100644
--- a/fs/nova/nova.h
+++ b/fs/nova/nova.h
@@ -458,6 +458,8 @@ void nova_delete_dir_tree(struct super_block *sb,
 struct nova_dentry *nova_find_dentry(struct super_block *sb,
struct nova_inode *pi, struct inode *inode, const char *name,
unsigned long name_len);
+int nova_append_dir_init_entries(struct super_block *sb,
+   struct nova_inode *pi, u64 self_ino, u64 parent_ino, u64 epoch_id);
 
 /* rebuild.c */
 int nova_rebuild_inode(struct super_block *sb, struct nova_inode_info *si,
diff --git a/fs/nova/super.c b/fs/nova/super.c
index 216d396..1e67062 100644
--- a/fs/nova/super.c
+++ b/fs/nova/super.c
@@ -349,6 +349,7 @@ static struct nova_inode *nova_init(struct super_block *sb,
struct nova_inode *root_i, *pi;
struct nova_super_block *super;
struct nova_sb_info *sbi = NOVA_SB(sb);
+   u64 epoch_id;
timing_t init_time;
 
NOVA_START_TIMING(new_init_t, init_time);
@@ -415,6 +416,10 @@ static struct nova_inode *nova_init(struct super_block *sb,
 
nova_flush_buffer(root_i, sizeof(*root_i), false);
 
+   epoch_id = nova_get_epoch_id(sb);
+   nova_append_dir_init_entries(sb, root_i, NOVA_ROOT_INO,
+   NOVA_ROOT_INO, epoch_id);
+
PERSISTENT_MARK();
PERSISTENT_BARRIER();
nova_info("NOVA initialization finish\n");
-- 
2.7.4



[RFC v2 46/83] Dir: Add Directory radix tree insert/remove methods.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

NOVA uses Hash to quickly locate dentry in the directory inode log.
The key is the hash of the filename, the value is the dentry.

Currently hash collisions are ignored, and the radix tree may occupy
a large amount of memory for huge directories. Consider replacing
it in the future.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/Makefile |   2 +-
 fs/nova/dir.c| 141 +++
 fs/nova/nova.h   |  26 ++
 3 files changed, 168 insertions(+), 1 deletion(-)
 create mode 100644 fs/nova/dir.c

diff --git a/fs/nova/Makefile b/fs/nova/Makefile
index 4aeadea..3a3243c 100644
--- a/fs/nova/Makefile
+++ b/fs/nova/Makefile
@@ -4,4 +4,4 @@
 
 obj-$(CONFIG_NOVA_FS) += nova.o
 
-nova-y := balloc.o bbuild.o inode.o journal.o log.o rebuild.o stats.o super.o
+nova-y := balloc.o bbuild.o dir.o inode.o journal.o log.o rebuild.o stats.o 
super.o
diff --git a/fs/nova/dir.c b/fs/nova/dir.c
new file mode 100644
index 000..5bea57a
--- /dev/null
+++ b/fs/nova/dir.c
@@ -0,0 +1,141 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * File operations for directories.
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu <jix...@cs.ucsd.edu>
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.storne...@gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include 
+#include 
+#include "nova.h"
+#include "inode.h"
+
+#define DT2IF(dt) (((dt) << 12) & S_IFMT)
+#define IF2DT(sif) (((sif) & S_IFMT) >> 12)
+
+/*
+ * Look up the in-DRAM index entry for a filename in a directory.
+ *
+ * Hashes @name (@name_len bytes) with BKDRHash() and returns whatever the
+ * directory's radix tree stores under that hash, or NULL if the slot is
+ * empty.  The stored name is not compared against @name here, so a hash
+ * collision returns the colliding dentry.  @sb and @pi are unused.
+ */
+struct nova_dentry *nova_find_dentry(struct super_block *sb,
+	struct nova_inode *pi, struct inode *inode, const char *name,
+	unsigned long name_len)
+{
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode_info_header *sih = &si->header;
+	struct nova_dentry *direntry;
+	unsigned long hash;
+
+	hash = BKDRHash(name, name_len);
+	direntry = radix_tree_lookup(&sih->tree, hash);
+
+	return direntry;
+}
+
+/*
+ * Index a directory entry in the directory's in-DRAM radix tree.
+ *
+ * The key is BKDRHash() of @name (@namelen bytes); the value is @direntry.
+ * Returns 0 on success or the negative errno from radix_tree_insert()
+ * (an already occupied slot, i.e. a duplicate name or a hash collision,
+ * is reported as an error rather than resolved).  @sb is unused.
+ */
+int nova_insert_dir_radix_tree(struct super_block *sb,
+	struct nova_inode_info_header *sih, const char *name,
+	int namelen, struct nova_dentry *direntry)
+{
+	unsigned long hash;
+	int ret;
+
+	hash = BKDRHash(name, namelen);
+	nova_dbgv("%s: insert %s hash %lu\n", __func__, name, hash);
+
+	/* FIXME: hash collision ignored here */
+	ret = radix_tree_insert(&sih->tree, hash, direntry);
+	if (ret)
+		nova_dbg("%s ERROR %d: %s\n", __func__, ret, name);
+
+	return ret;
+}
+
+/*
+ * Compare a stored dentry's name with @name (@namelen bytes).
+ *
+ * Returns 0 when the lengths are equal and the first @namelen bytes match,
+ * -EINVAL when the lengths differ, otherwise the non-zero strncmp() result.
+ * @sb is unused.
+ */
+static int nova_check_dentry_match(struct super_block *sb,
+	struct nova_dentry *dentry, const char *name, int namelen)
+{
+	if (dentry->name_len == namelen)
+		return strncmp(dentry->name, name, namelen);
+
+	return -EINVAL;
+}
+
+/*
+ * Drop the radix-tree index entry for @name from a directory.
+ *
+ * The slot keyed by BKDRHash(@name) is deleted unconditionally.  When
+ * @replay is 0 the deleted entry is then validated: it must exist, have a
+ * non-zero inode number, not be marked invalid, and carry the same name.
+ * On success the entry is handed back through @create_dentry (if non-NULL).
+ * When @replay is non-zero no validation is done and @create_dentry is left
+ * untouched.
+ *
+ * Returns 0 on success, -EINVAL if validation fails.
+ *
+ * NOTE(review): validation runs after radix_tree_delete(), so on a mismatch
+ * (e.g. a hash collision) the other entry has already left the tree.
+ */
+int nova_remove_dir_radix_tree(struct super_block *sb,
+	struct nova_inode_info_header *sih, const char *name, int namelen,
+	int replay, struct nova_dentry **create_dentry)
+{
+	struct nova_dentry *entry;
+	unsigned long hash;
+
+	hash = BKDRHash(name, namelen);
+	entry = radix_tree_delete(&sih->tree, hash);
+
+	if (replay == 0) {
+		if (!entry) {
+			nova_dbg("%s ERROR: %s, length %d, hash %lu\n",
+					__func__, name, namelen, hash);
+			return -EINVAL;
+		}
+
+		if (entry->ino == 0 || entry->invalid ||
+		    nova_check_dentry_match(sb, entry, name, namelen)) {
+			nova_dbg("%s dentry not match: %s, length %d, hash %lu\n",
+				 __func__, name, namelen, hash);
+			/* for debug information, still allow access to nvmm */
+			nova_dbg("dentry: type %d, inode %llu, name %s, namelen %u, rec len %u\n",
+				 entry->entry_type, le64_to_cpu(entry->ino),
+				 entry->name, entry->name_len,
+				 le16_to_cpu(entry->de_len));
+			return -EINVAL;
+		}
+
+		if (create_dentry)
+			*create_dentry = entry;
+	}
+
+	return 0;
+}
+
+void nova_delete_dir_tree(struct super_block *sb,
+   struct nova_inode_info_header *sih)
+{
+   struct nova_dentry *direntry;
+   unsigned long pos = 0;
+   struct nova_dentry *entr

[RFC v2 46/83] Dir: Add Directory radix tree insert/remove methods.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

NOVA uses Hash to quickly locate dentry in the directory inode log.
The key is the hash of the filename, the value is the dentry.

Hash collisions are currently ignored, and the radix tree may occupy
a large amount of memory for huge directories. Consider replacing
it in the future.

Signed-off-by: Andiry Xu 
---
 fs/nova/Makefile |   2 +-
 fs/nova/dir.c| 141 +++
 fs/nova/nova.h   |  26 ++
 3 files changed, 168 insertions(+), 1 deletion(-)
 create mode 100644 fs/nova/dir.c

diff --git a/fs/nova/Makefile b/fs/nova/Makefile
index 4aeadea..3a3243c 100644
--- a/fs/nova/Makefile
+++ b/fs/nova/Makefile
@@ -4,4 +4,4 @@
 
 obj-$(CONFIG_NOVA_FS) += nova.o
 
-nova-y := balloc.o bbuild.o inode.o journal.o log.o rebuild.o stats.o super.o
+nova-y := balloc.o bbuild.o dir.o inode.o journal.o log.o rebuild.o stats.o super.o
diff --git a/fs/nova/dir.c b/fs/nova/dir.c
new file mode 100644
index 000..5bea57a
--- /dev/null
+++ b/fs/nova/dir.c
@@ -0,0 +1,141 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * File operations for directories.
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu 
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli 
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include 
+#include 
+#include "nova.h"
+#include "inode.h"
+
+#define DT2IF(dt) (((dt) << 12) & S_IFMT)
+#define IF2DT(sif) (((sif) & S_IFMT) >> 12)
+
+/*
+ * Look up the in-DRAM index entry for a filename in a directory.
+ *
+ * Hashes @name (@name_len bytes) with BKDRHash() and returns whatever the
+ * directory's radix tree stores under that hash, or NULL if the slot is
+ * empty.  The stored name is not compared against @name here, so a hash
+ * collision returns the colliding dentry.  @sb and @pi are unused.
+ */
+struct nova_dentry *nova_find_dentry(struct super_block *sb,
+	struct nova_inode *pi, struct inode *inode, const char *name,
+	unsigned long name_len)
+{
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode_info_header *sih = &si->header;
+	struct nova_dentry *direntry;
+	unsigned long hash;
+
+	hash = BKDRHash(name, name_len);
+	direntry = radix_tree_lookup(&sih->tree, hash);
+
+	return direntry;
+}
+
+/*
+ * Index a directory entry in the directory's in-DRAM radix tree.
+ *
+ * The key is BKDRHash() of @name (@namelen bytes); the value is @direntry.
+ * Returns 0 on success or the negative errno from radix_tree_insert()
+ * (an already occupied slot, i.e. a duplicate name or a hash collision,
+ * is reported as an error rather than resolved).  @sb is unused.
+ */
+int nova_insert_dir_radix_tree(struct super_block *sb,
+	struct nova_inode_info_header *sih, const char *name,
+	int namelen, struct nova_dentry *direntry)
+{
+	unsigned long hash;
+	int ret;
+
+	hash = BKDRHash(name, namelen);
+	nova_dbgv("%s: insert %s hash %lu\n", __func__, name, hash);
+
+	/* FIXME: hash collision ignored here */
+	ret = radix_tree_insert(&sih->tree, hash, direntry);
+	if (ret)
+		nova_dbg("%s ERROR %d: %s\n", __func__, ret, name);
+
+	return ret;
+}
+
+/*
+ * Compare a stored dentry's name with @name (@namelen bytes).
+ *
+ * Returns 0 when the lengths are equal and the first @namelen bytes match,
+ * -EINVAL when the lengths differ, otherwise the non-zero strncmp() result.
+ * @sb is unused.
+ */
+static int nova_check_dentry_match(struct super_block *sb,
+	struct nova_dentry *dentry, const char *name, int namelen)
+{
+	if (dentry->name_len == namelen)
+		return strncmp(dentry->name, name, namelen);
+
+	return -EINVAL;
+}
+
+/*
+ * Drop the radix-tree index entry for @name from a directory.
+ *
+ * The slot keyed by BKDRHash(@name) is deleted unconditionally.  When
+ * @replay is 0 the deleted entry is then validated: it must exist, have a
+ * non-zero inode number, not be marked invalid, and carry the same name.
+ * On success the entry is handed back through @create_dentry (if non-NULL).
+ * When @replay is non-zero no validation is done and @create_dentry is left
+ * untouched.
+ *
+ * Returns 0 on success, -EINVAL if validation fails.
+ *
+ * NOTE(review): validation runs after radix_tree_delete(), so on a mismatch
+ * (e.g. a hash collision) the other entry has already left the tree.
+ */
+int nova_remove_dir_radix_tree(struct super_block *sb,
+	struct nova_inode_info_header *sih, const char *name, int namelen,
+	int replay, struct nova_dentry **create_dentry)
+{
+	struct nova_dentry *entry;
+	unsigned long hash;
+
+	hash = BKDRHash(name, namelen);
+	entry = radix_tree_delete(&sih->tree, hash);
+
+	if (replay == 0) {
+		if (!entry) {
+			nova_dbg("%s ERROR: %s, length %d, hash %lu\n",
+					__func__, name, namelen, hash);
+			return -EINVAL;
+		}
+
+		if (entry->ino == 0 || entry->invalid ||
+		    nova_check_dentry_match(sb, entry, name, namelen)) {
+			nova_dbg("%s dentry not match: %s, length %d, hash %lu\n",
+				 __func__, name, namelen, hash);
+			/* for debug information, still allow access to nvmm */
+			nova_dbg("dentry: type %d, inode %llu, name %s, namelen %u, rec len %u\n",
+				 entry->entry_type, le64_to_cpu(entry->ino),
+				 entry->name, entry->name_len,
+				 le16_to_cpu(entry->de_len));
+			return -EINVAL;
+		}
+
+		if (create_dentry)
+			*create_dentry = entry;
+	}
+
+	return 0;
+}
+
+void nova_delete_dir_tree(struct super_block *sb,
+   struct nova_inode_info_header *sih)
+{
+   struct nova_dentry *direntry;
+   unsigned long pos = 0;
+   struct nova_dentry *entries[FREE_BATCH];
+   timing_t delete_time;
+   int nr_entries;
+   int i;
+   void *re

[RFC v2 50/83] Inode: Add nova_evict_inode.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

If the inode still has links, release the DRAM resources (radix tree, etc.).
Otherwise reclaim data pages and log pages.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/inode.c | 257 +++-
 fs/nova/inode.h |   5 ++
 fs/nova/log.h   |   7 ++
 fs/nova/super.c |   1 +
 4 files changed, 269 insertions(+), 1 deletion(-)

diff --git a/fs/nova/inode.c b/fs/nova/inode.c
index 41417e3..17addd3 100644
--- a/fs/nova/inode.c
+++ b/fs/nova/inode.c
@@ -457,7 +457,7 @@ static int nova_alloc_unused_inode(struct super_block *sb, 
int cpuid,
return 0;
 }
 
-int nova_free_inuse_inode(struct super_block *sb, unsigned long ino)
+static int nova_free_inuse_inode(struct super_block *sb, unsigned long ino)
 {
struct nova_sb_info *sbi = NOVA_SB(sb);
struct inode_map *inode_map;
@@ -532,6 +532,261 @@ int nova_free_inuse_inode(struct super_block *sb, 
unsigned long ino)
return ret;
 }
 
+/*
+ * Release an inode: free its log, clear the cached DRAM header fields and
+ * hand the inode number back via nova_free_inuse_inode().
+ *
+ * Returns the result of nova_free_inuse_inode().
+ */
+static int nova_free_inode(struct super_block *sb, struct nova_inode *pi,
+   struct nova_inode_info_header *sih)
+{
+   int err = 0;
+   timing_t free_time;
+
+   NOVA_START_TIMING(free_inode_t, free_time);
+
+   nova_free_inode_log(sb, pi, sih);
+
+   /* Reset the DRAM copy so it no longer describes a live inode. */
+   sih->log_pages = 0;
+   sih->i_mode = 0;
+   sih->pi_addr = 0;
+   sih->i_size = 0;
+   sih->i_blocks = 0;
+
+   /* The inode number is read from @pi, not from the header just cleared. */
+   err = nova_free_inuse_inode(sb, pi->nova_ino);
+
+   NOVA_END_TIMING(free_inode_t, free_time);
+   return err;
+}
+
+/*
+ * We do not really rely on this last blocknr
+ * because blocks can be allocated beyond file end
+ *
+ * Returns the index of the block containing the last byte of the file
+ * according to sih->i_size (0 for an empty file).  The block size shift
+ * comes from sih->i_blk_type; if the persistent inode at sih->pi_addr
+ * cannot be read through nova_get_reference(), block type 0 is used
+ * instead.
+ */
+static unsigned long nova_get_last_blocknr(struct super_block *sb,
+	struct nova_inode_info_header *sih)
+{
+	struct nova_inode *pi, fake_pi;
+	unsigned long last_blocknr;
+	unsigned int btype;
+	unsigned int data_bits;
+	int ret;
+
+	/* Only the success/failure of the read is used, not its contents. */
+	ret = nova_get_reference(sb, sih->pi_addr, &fake_pi,
+			(void **)&pi, sizeof(struct nova_inode));
+	if (ret) {
+		nova_dbg("%s: read pi @ 0x%lx failed\n",
+				__func__, sih->pi_addr);
+		btype = 0;
+	} else {
+		btype = sih->i_blk_type;
+	}
+
+	data_bits = blk_type_to_shift[btype];
+
+	if (sih->i_size == 0)
+		last_blocknr = 0;
+	else
+		last_blocknr = (sih->i_size - 1) >> data_bits;
+
+	return last_blocknr;
+}
+
+int nova_delete_file_tree(struct super_block *sb,
+   struct nova_inode_info_header *sih, unsigned long start_blocknr,
+   unsigned long last_blocknr, bool delete_nvmm, bool delete_dead,
+   u64 epoch_id)
+{
+   struct nova_file_write_entry *entry;
+   struct nova_file_write_entry *old_entry = NULL;
+   unsigned long pgoff = start_blocknr;
+   unsigned long old_pgoff = 0;
+   unsigned int num_free = 0;
+   int freed = 0;
+   void *ret;
+   timing_t delete_time;
+
+   NOVA_START_TIMING(delete_file_tree_t, delete_time);
+
+   /* Handle EOF blocks */
+   do {
+   entry = radix_tree_lookup(>tree, pgoff);
+   if (entry) {
+   ret = radix_tree_delete(>tree, pgoff);
+   WARN_ON(!ret || ret != entry);
+   if (entry != old_entry) {
+   if (old_entry && delete_nvmm) {
+   nova_free_old_entry(sb, sih,
+   old_entry, old_pgoff,
+   num_free, delete_dead,
+   epoch_id);
+   freed += num_free;
+   }
+
+   old_entry = entry;
+   old_pgoff = pgoff;
+   num_free = 1;
+   } else {
+   num_free++;
+   }
+   pgoff++;
+   } else {
+   /* We are finding a hole. Jump to the next entry. */
+   entry = nova_find_next_entry(sb, sih, pgoff);
+   if (!entry)
+   break;
+
+   pgoff++;
+   pgoff = pgoff > entry->pgoff ? pgoff : entry->pgoff;
+   }
+   } while (1);
+
+   if (old_entry && delete_nvmm) {
+   nova_free_old_entry(sb, sih, old_entry, old_pgoff,
+   num_free, delete_dead, epoch_id);
+   freed += num_free;
+   }
+
+   nova_dbgv("Inode %lu: delete file tree from pgoff %lu to %lu, %d blocks 
freed\n",
+   sih->ino, start_blocknr, last_blocknr, f

[RFC v2 50/83] Inode: Add nova_evict_inode.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

If the inode still has links, release the DRAM resources (radix tree, etc.).
Otherwise reclaim data pages and log pages.

Signed-off-by: Andiry Xu 
---
 fs/nova/inode.c | 257 +++-
 fs/nova/inode.h |   5 ++
 fs/nova/log.h   |   7 ++
 fs/nova/super.c |   1 +
 4 files changed, 269 insertions(+), 1 deletion(-)

diff --git a/fs/nova/inode.c b/fs/nova/inode.c
index 41417e3..17addd3 100644
--- a/fs/nova/inode.c
+++ b/fs/nova/inode.c
@@ -457,7 +457,7 @@ static int nova_alloc_unused_inode(struct super_block *sb, 
int cpuid,
return 0;
 }
 
-int nova_free_inuse_inode(struct super_block *sb, unsigned long ino)
+static int nova_free_inuse_inode(struct super_block *sb, unsigned long ino)
 {
struct nova_sb_info *sbi = NOVA_SB(sb);
struct inode_map *inode_map;
@@ -532,6 +532,261 @@ int nova_free_inuse_inode(struct super_block *sb, 
unsigned long ino)
return ret;
 }
 
+/*
+ * Release an inode: free its log, clear the cached DRAM header fields and
+ * hand the inode number back via nova_free_inuse_inode().
+ *
+ * Returns the result of nova_free_inuse_inode().
+ */
+static int nova_free_inode(struct super_block *sb, struct nova_inode *pi,
+   struct nova_inode_info_header *sih)
+{
+   int err = 0;
+   timing_t free_time;
+
+   NOVA_START_TIMING(free_inode_t, free_time);
+
+   nova_free_inode_log(sb, pi, sih);
+
+   /* Reset the DRAM copy so it no longer describes a live inode. */
+   sih->log_pages = 0;
+   sih->i_mode = 0;
+   sih->pi_addr = 0;
+   sih->i_size = 0;
+   sih->i_blocks = 0;
+
+   /* The inode number is read from @pi, not from the header just cleared. */
+   err = nova_free_inuse_inode(sb, pi->nova_ino);
+
+   NOVA_END_TIMING(free_inode_t, free_time);
+   return err;
+}
+
+/*
+ * We do not really rely on this last blocknr
+ * because blocks can be allocated beyond file end
+ *
+ * Returns the index of the block containing the last byte of the file
+ * according to sih->i_size (0 for an empty file).  The block size shift
+ * comes from sih->i_blk_type; if the persistent inode at sih->pi_addr
+ * cannot be read through nova_get_reference(), block type 0 is used
+ * instead.
+ */
+static unsigned long nova_get_last_blocknr(struct super_block *sb,
+	struct nova_inode_info_header *sih)
+{
+	struct nova_inode *pi, fake_pi;
+	unsigned long last_blocknr;
+	unsigned int btype;
+	unsigned int data_bits;
+	int ret;
+
+	/* Only the success/failure of the read is used, not its contents. */
+	ret = nova_get_reference(sb, sih->pi_addr, &fake_pi,
+			(void **)&pi, sizeof(struct nova_inode));
+	if (ret) {
+		nova_dbg("%s: read pi @ 0x%lx failed\n",
+				__func__, sih->pi_addr);
+		btype = 0;
+	} else {
+		btype = sih->i_blk_type;
+	}
+
+	data_bits = blk_type_to_shift[btype];
+
+	if (sih->i_size == 0)
+		last_blocknr = 0;
+	else
+		last_blocknr = (sih->i_size - 1) >> data_bits;
+
+	return last_blocknr;
+}
+
+int nova_delete_file_tree(struct super_block *sb,
+   struct nova_inode_info_header *sih, unsigned long start_blocknr,
+   unsigned long last_blocknr, bool delete_nvmm, bool delete_dead,
+   u64 epoch_id)
+{
+   struct nova_file_write_entry *entry;
+   struct nova_file_write_entry *old_entry = NULL;
+   unsigned long pgoff = start_blocknr;
+   unsigned long old_pgoff = 0;
+   unsigned int num_free = 0;
+   int freed = 0;
+   void *ret;
+   timing_t delete_time;
+
+   NOVA_START_TIMING(delete_file_tree_t, delete_time);
+
+   /* Handle EOF blocks */
+   do {
+   entry = radix_tree_lookup(>tree, pgoff);
+   if (entry) {
+   ret = radix_tree_delete(>tree, pgoff);
+   WARN_ON(!ret || ret != entry);
+   if (entry != old_entry) {
+   if (old_entry && delete_nvmm) {
+   nova_free_old_entry(sb, sih,
+   old_entry, old_pgoff,
+   num_free, delete_dead,
+   epoch_id);
+   freed += num_free;
+   }
+
+   old_entry = entry;
+   old_pgoff = pgoff;
+   num_free = 1;
+   } else {
+   num_free++;
+   }
+   pgoff++;
+   } else {
+   /* We are finding a hole. Jump to the next entry. */
+   entry = nova_find_next_entry(sb, sih, pgoff);
+   if (!entry)
+   break;
+
+   pgoff++;
+   pgoff = pgoff > entry->pgoff ? pgoff : entry->pgoff;
+   }
+   } while (1);
+
+   if (old_entry && delete_nvmm) {
+   nova_free_old_entry(sb, sih, old_entry, old_pgoff,
+   num_free, delete_dead, epoch_id);
+   freed += num_free;
+   }
+
+   nova_dbgv("Inode %lu: delete file tree from pgoff %lu to %lu, %d blocks 
freed\n",
+   sih->ino, start_blocknr, last_blocknr, freed);
+
+   NOVA_END_TIMING(delete_file_tree_t, delete

[RFC v2 45/83] Log operation: file inode log lookup and assign

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

After NOVA appends file write entry to commit new writes,
it updates the file offset radix tree, finds the old entries (if overwrite)
and reclaims the stale data blocks.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/log.c  | 108 +
 fs/nova/log.h  |   5 +++
 fs/nova/nova.h |  64 ++
 3 files changed, 177 insertions(+)

diff --git a/fs/nova/log.c b/fs/nova/log.c
index d150f2e..451be27 100644
--- a/fs/nova/log.c
+++ b/fs/nova/log.c
@@ -102,6 +102,50 @@ static inline int nova_invalidate_write_entry(struct 
super_block *sb,
reassign, num_free);
 }
 
+/*
+ * Release the data blocks that a superseded write entry covered.
+ *
+ * Frees @num_free blocks starting at the address get_nvmm() reports for
+ * @pgoff within @entry, and subtracts @num_free from sih->i_blocks.  Unless
+ * @delete_dead is set, the entry is also passed to
+ * nova_invalidate_write_entry() first.  @epoch_id is not used in this body.
+ *
+ * Returns @num_free, or 0 when @entry is NULL.
+ */
+unsigned int nova_free_old_entry(struct super_block *sb,
+   struct nova_inode_info_header *sih,
+   struct nova_file_write_entry *entry,
+   unsigned long pgoff, unsigned int num_free,
+   bool delete_dead, u64 epoch_id)
+{
+   unsigned long old_nvmm;
+   timing_t free_time;
+
+   if (!entry)
+   return 0;
+
+   NOVA_START_TIMING(free_old_t, free_time);
+
+   /* Fetch the block address before the entry is invalidated below. */
+   old_nvmm = get_nvmm(sb, sih, entry, pgoff);
+
+   if (!delete_dead)
+   nova_invalidate_write_entry(sb, entry, 1, num_free);
+
+   nova_dbgv("%s: pgoff %lu, free %u blocks\n",
+   __func__, pgoff, num_free);
+   nova_free_data_blocks(sb, sih, old_nvmm, num_free);
+
+   sih->i_blocks -= num_free;
+
+   NOVA_END_TIMING(free_old_t, free_time);
+   return num_free;
+}
+
+/*
+ * Return the first write entry indexed at or after @pgoff in the file's
+ * radix tree, or NULL if there is none.  @sb is unused.
+ */
+struct nova_file_write_entry *nova_find_next_entry(struct super_block *sb,
+	struct nova_inode_info_header *sih, pgoff_t pgoff)
+{
+	struct nova_file_write_entry *entry = NULL;
+	struct nova_file_write_entry *entries[1];
+	int nr_entries;
+
+	/* A gang lookup limited to one result yields the next populated slot. */
+	nr_entries = radix_tree_gang_lookup(&sih->tree,
+					(void **)entries, pgoff, 1);
+	if (nr_entries == 1)
+		entry = entries[0];
+
+	return entry;
+}
+
 static void nova_update_setattr_entry(struct inode *inode,
struct nova_setattr_logentry *entry,
struct nova_log_entry_info *entry_info)
@@ -568,6 +612,70 @@ int nova_append_link_change_entry(struct super_block *sb,
return ret;
 }
 
+/*
+ * Point the file's radix tree at a newly appended write entry.
+ *
+ * For each of the entry->num_pages page offsets starting at entry->pgoff,
+ * the tree slot is replaced with @entry, or inserted if it was empty.
+ * Consecutive offsets that previously mapped to the same old entry are
+ * batched; when @free is true each finished batch is released with
+ * nova_free_old_entry().
+ *
+ * Returns 0 on success or the error from radix_tree_insert().
+ *
+ * NOTE(review): nova_invalidate_write_entry() is called for every finished
+ * batch and once at the end regardless of @free, including when
+ * start_old_entry is still NULL - presumably it tolerates NULL; confirm.
+ * NOTE(review): on an insert failure the batch collected so far is neither
+ * freed nor invalidated.
+ */
+int nova_assign_write_entry(struct super_block *sb,
+	struct nova_inode_info_header *sih,
+	struct nova_file_write_entry *entry,
+	bool free)
+{
+	struct nova_file_write_entry *old_entry;
+	struct nova_file_write_entry *start_old_entry = NULL;
+	void **pentry;
+	unsigned long start_pgoff = entry->pgoff;
+	unsigned long start_old_pgoff = 0;
+	unsigned int num = entry->num_pages;
+	unsigned int num_free = 0;
+	unsigned long curr_pgoff;
+	unsigned int i;	/* unsigned to match num */
+	int ret = 0;
+	timing_t assign_time;
+
+	NOVA_START_TIMING(assign_t, assign_time);
+	for (i = 0; i < num; i++) {
+		curr_pgoff = start_pgoff + i;
+
+		pentry = radix_tree_lookup_slot(&sih->tree, curr_pgoff);
+		if (pentry) {
+			old_entry = radix_tree_deref_slot(pentry);
+			if (old_entry != start_old_entry) {
+				/* A different old entry starts: flush the batch. */
+				if (start_old_entry && free)
+					nova_free_old_entry(sb, sih,
+							start_old_entry,
+							start_old_pgoff,
+							num_free, false,
+							entry->epoch_id);
+				nova_invalidate_write_entry(sb,
+						start_old_entry, 1, 0);
+
+				start_old_entry = old_entry;
+				start_old_pgoff = curr_pgoff;
+				num_free = 1;
+			} else {
+				num_free++;
+			}
+
+			radix_tree_replace_slot(&sih->tree, pentry, entry);
+		} else {
+			ret = radix_tree_insert(&sih->tree, curr_pgoff, entry);
+			if (ret) {
+				nova_dbg("%s: ERROR %d\n", __func__, ret);
+				goto out;
+			}
+		}
+	}
+
+	/* Flush the final batch. */
+	if (start_old_entry && free)
+		nova_free_old_entry(sb, sih, start_old_entry,
+					start_old_pgoff, num_free, false,
+					entry->epoch_id);
+
+	nova_invalidate_write_entry(sb, start_old_entry, 1, 0);
+
+out:
+	NOVA_END_TIMING(assign_t, assign_time);
+
+	return ret;
+}
+
 int nova_inplace_update_write_entry(struct super_block *sb,
  

[RFC v2 45/83] Log operation: file inode log lookup and assign

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

After NOVA appends file write entry to commit new writes,
it updates the file offset radix tree, finds the old entries (if overwrite)
and reclaims the stale data blocks.

Signed-off-by: Andiry Xu 
---
 fs/nova/log.c  | 108 +
 fs/nova/log.h  |   5 +++
 fs/nova/nova.h |  64 ++
 3 files changed, 177 insertions(+)

diff --git a/fs/nova/log.c b/fs/nova/log.c
index d150f2e..451be27 100644
--- a/fs/nova/log.c
+++ b/fs/nova/log.c
@@ -102,6 +102,50 @@ static inline int nova_invalidate_write_entry(struct 
super_block *sb,
reassign, num_free);
 }
 
+/*
+ * Release the data blocks that a superseded write entry covered.
+ *
+ * Frees @num_free blocks starting at the address get_nvmm() reports for
+ * @pgoff within @entry, and subtracts @num_free from sih->i_blocks.  Unless
+ * @delete_dead is set, the entry is also passed to
+ * nova_invalidate_write_entry() first.  @epoch_id is not used in this body.
+ *
+ * Returns @num_free, or 0 when @entry is NULL.
+ */
+unsigned int nova_free_old_entry(struct super_block *sb,
+   struct nova_inode_info_header *sih,
+   struct nova_file_write_entry *entry,
+   unsigned long pgoff, unsigned int num_free,
+   bool delete_dead, u64 epoch_id)
+{
+   unsigned long old_nvmm;
+   timing_t free_time;
+
+   if (!entry)
+   return 0;
+
+   NOVA_START_TIMING(free_old_t, free_time);
+
+   /* Fetch the block address before the entry is invalidated below. */
+   old_nvmm = get_nvmm(sb, sih, entry, pgoff);
+
+   if (!delete_dead)
+   nova_invalidate_write_entry(sb, entry, 1, num_free);
+
+   nova_dbgv("%s: pgoff %lu, free %u blocks\n",
+   __func__, pgoff, num_free);
+   nova_free_data_blocks(sb, sih, old_nvmm, num_free);
+
+   sih->i_blocks -= num_free;
+
+   NOVA_END_TIMING(free_old_t, free_time);
+   return num_free;
+}
+
+/*
+ * Return the first write entry indexed at or after @pgoff in the file's
+ * radix tree, or NULL if there is none.  @sb is unused.
+ */
+struct nova_file_write_entry *nova_find_next_entry(struct super_block *sb,
+	struct nova_inode_info_header *sih, pgoff_t pgoff)
+{
+	struct nova_file_write_entry *entry = NULL;
+	struct nova_file_write_entry *entries[1];
+	int nr_entries;
+
+	/* A gang lookup limited to one result yields the next populated slot. */
+	nr_entries = radix_tree_gang_lookup(&sih->tree,
+					(void **)entries, pgoff, 1);
+	if (nr_entries == 1)
+		entry = entries[0];
+
+	return entry;
+}
+
 static void nova_update_setattr_entry(struct inode *inode,
struct nova_setattr_logentry *entry,
struct nova_log_entry_info *entry_info)
@@ -568,6 +612,70 @@ int nova_append_link_change_entry(struct super_block *sb,
return ret;
 }
 
+/*
+ * Point the file's radix tree at a newly appended write entry.
+ *
+ * For each of the entry->num_pages page offsets starting at entry->pgoff,
+ * the tree slot is replaced with @entry, or inserted if it was empty.
+ * Consecutive offsets that previously mapped to the same old entry are
+ * batched; when @free is true each finished batch is released with
+ * nova_free_old_entry().
+ *
+ * Returns 0 on success or the error from radix_tree_insert().
+ *
+ * NOTE(review): nova_invalidate_write_entry() is called for every finished
+ * batch and once at the end regardless of @free, including when
+ * start_old_entry is still NULL - presumably it tolerates NULL; confirm.
+ * NOTE(review): on an insert failure the batch collected so far is neither
+ * freed nor invalidated.
+ */
+int nova_assign_write_entry(struct super_block *sb,
+	struct nova_inode_info_header *sih,
+	struct nova_file_write_entry *entry,
+	bool free)
+{
+	struct nova_file_write_entry *old_entry;
+	struct nova_file_write_entry *start_old_entry = NULL;
+	void **pentry;
+	unsigned long start_pgoff = entry->pgoff;
+	unsigned long start_old_pgoff = 0;
+	unsigned int num = entry->num_pages;
+	unsigned int num_free = 0;
+	unsigned long curr_pgoff;
+	unsigned int i;	/* unsigned to match num */
+	int ret = 0;
+	timing_t assign_time;
+
+	NOVA_START_TIMING(assign_t, assign_time);
+	for (i = 0; i < num; i++) {
+		curr_pgoff = start_pgoff + i;
+
+		pentry = radix_tree_lookup_slot(&sih->tree, curr_pgoff);
+		if (pentry) {
+			old_entry = radix_tree_deref_slot(pentry);
+			if (old_entry != start_old_entry) {
+				/* A different old entry starts: flush the batch. */
+				if (start_old_entry && free)
+					nova_free_old_entry(sb, sih,
+							start_old_entry,
+							start_old_pgoff,
+							num_free, false,
+							entry->epoch_id);
+				nova_invalidate_write_entry(sb,
+						start_old_entry, 1, 0);
+
+				start_old_entry = old_entry;
+				start_old_pgoff = curr_pgoff;
+				num_free = 1;
+			} else {
+				num_free++;
+			}
+
+			radix_tree_replace_slot(&sih->tree, pentry, entry);
+		} else {
+			ret = radix_tree_insert(&sih->tree, curr_pgoff, entry);
+			if (ret) {
+				nova_dbg("%s: ERROR %d\n", __func__, ret);
+				goto out;
+			}
+		}
+	}
+
+	/* Flush the final batch. */
+	if (start_old_entry && free)
+		nova_free_old_entry(sb, sih, start_old_entry,
+					start_old_pgoff, num_free, false,
+					entry->epoch_id);
+
+	nova_invalidate_write_entry(sb, start_old_entry, 1, 0);
+
+out:
+	NOVA_END_TIMING(assign_t, assign_time);
+
+	return ret;
+}
+
 int nova_inplace_update_write_entry(struct super_block *sb,
struct inode *inode, struct nova_file_write_

[RFC v2 49/83] Dir: Append create/remove dentry.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

NOVA adds or removes a directory/file by appending a dentry
to the parent directory's log. Dentry contains filename and inode number.
A positive inode number indicates a create(valid) dentry, and
a dentry with inode number zero is a remove dentry.
NOVA can also inplace update a create dentry to invalidate it.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/dir.c  | 140 +
 fs/nova/nova.h |   4 ++
 2 files changed, 144 insertions(+)

diff --git a/fs/nova/dir.c b/fs/nova/dir.c
index 35a66f9..47ee9ad 100644
--- a/fs/nova/dir.c
+++ b/fs/nova/dir.c
@@ -222,6 +222,146 @@ int nova_append_dir_init_entries(struct super_block *sb,
return 0;
 }
 
+/* adds a directory entry pointing to the inode. assumes the inode has
+ * already been logged for consistency
+ *
+ * Appends a dentry for @dentry's name and @ino to the parent directory's
+ * log via nova_append_dentry(), records its log offset in sih->last_dentry,
+ * and indexes it in the directory's radix tree.  The parent's mtime/ctime
+ * are updated before the append is attempted.
+ *
+ * Returns 0 on success, -EINVAL for an empty name, or the error from
+ * nova_append_dentry() / nova_insert_dir_radix_tree().  sih->trans_id is
+ * bumped once the append has succeeded, even if the index insert fails.
+ */
+int nova_add_dentry(struct dentry *dentry, u64 ino, int inc_link,
+	struct nova_inode_update *update, u64 epoch_id)
+{
+	struct inode *dir = dentry->d_parent->d_inode;
+	struct super_block *sb = dir->i_sb;
+	struct nova_inode_info *si = NOVA_I(dir);
+	struct nova_inode_info_header *sih = &si->header;
+	struct nova_inode *pidir;
+	const char *name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+	struct nova_dentry *direntry;
+	unsigned short loglen;
+	int ret;
+	u64 curr_entry;
+	timing_t add_dentry_time;
+
+	nova_dbg_verbose("%s: dir %lu new inode %llu\n",
+				__func__, dir->i_ino, ino);
+	nova_dbg_verbose("%s: %s %d\n", __func__, name, namelen);
+	NOVA_START_TIMING(add_dentry_t, add_dentry_time);
+	if (namelen == 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	pidir = nova_get_inode(sb, dir);
+
+	/*
+	 * XXX shouldn't update any times until successful
+	 * completion of syscall, but too many callers depend
+	 * on this.
+	 */
+	dir->i_mtime = dir->i_ctime = current_time(dir);
+
+	loglen = NOVA_DIR_LOG_REC_LEN(namelen);
+	ret = nova_append_dentry(sb, pidir, dir, dentry,
+				ino, loglen, update,
+				inc_link, epoch_id);
+
+	if (ret) {
+		nova_dbg("%s: append dir entry failure\n", __func__);
+		goto out;
+	}
+
+	curr_entry = update->curr_entry;
+	direntry = (struct nova_dentry *)nova_get_block(sb, curr_entry);
+	sih->last_dentry = curr_entry;
+	ret = nova_insert_dir_radix_tree(sb, sih, name, namelen, direntry);
+
+	sih->trans_id++;
+out:
+	/* Single exit so every NOVA_START_TIMING is paired with an END. */
+	NOVA_END_TIMING(add_dentry_t, add_dentry_time);
+	return ret;
+}
+
+/*
+ * Tell whether @dentry may be updated in place: returns 1 when @dentry is
+ * non-NULL and its epoch_id equals @epoch_id, 0 otherwise.  @sb is unused.
+ */
+static int nova_can_inplace_update_dentry(struct super_block *sb,
+	struct nova_dentry *dentry, u64 epoch_id)
+{
+	if (!dentry)
+		return 0;
+
+	return dentry->epoch_id == epoch_id ? 1 : 0;
+}
+
+/* removes a directory entry pointing to the inode. assumes the inode has
+ * already been logged for consistency
+ */
+int nova_remove_dentry(struct dentry *dentry, int dec_link,
+   struct nova_inode_update *update, u64 epoch_id)
+{
+   struct inode *dir = dentry->d_parent->d_inode;
+   struct super_block *sb = dir->i_sb;
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   struct nova_inode_info *si = NOVA_I(dir);
+   struct nova_inode_info_header *sih = >header;
+   struct nova_inode *pidir;
+   struct qstr *entry = >d_name;
+   struct nova_dentry *old_dentry = NULL;
+   unsigned short loglen;
+   int ret;
+   u64 curr_entry;
+   timing_t remove_dentry_time;
+
+   NOVA_START_TIMING(remove_dentry_t, remove_dentry_time);
+
+   update->create_dentry = NULL;
+   update->delete_dentry = NULL;
+
+   if (!dentry->d_name.len) {
+   ret = -EINVAL;
+   goto out;
+   }
+
+   ret = nova_remove_dir_radix_tree(sb, sih, entry->name, entry->len, 0,
+   _dentry);
+
+   if (ret)
+   goto out;
+
+   pidir = nova_get_inode(sb, dir);
+
+   dir->i_mtime = dir->i_ctime = current_time(dir);
+
+   if (nova_can_inplace_update_dentry(sb, old_dentry, epoch_id)) {
+   nova_inplace_update_dentry(sb, dir, old_dentry,
+   dec_link, epoch_id);
+   curr_entry = nova_get_addr_off(sbi, old_dentry);
+
+   sih->last_dentry = curr_entry;
+   /* Leave create/delete_dentry to NULL
+* Do not change tail if used as input
+*/
+   if (update->tail == 0) {
+   update->tail = sih->log_tail;
+   }
+   sih->trans_id++;
+   goto out;
+   }
+
+   loglen = 

[RFC v2 51/83] Rebuild: directory inode.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

When vfs issues a read inode command, or when the inode is newly allocated,
walk through the inode log to rebuild inode information and the radix tree.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/inode.h   |  15 +++
 fs/nova/nova.h|  21 
 fs/nova/rebuild.c | 329 +-
 3 files changed, 364 insertions(+), 1 deletion(-)

diff --git a/fs/nova/inode.h b/fs/nova/inode.h
index 62c8bdc..42690e6 100644
--- a/fs/nova/inode.h
+++ b/fs/nova/inode.h
@@ -97,6 +97,21 @@ struct nova_inode_info_header {
u8  i_blk_type;
 };
 
+/*
+ * Scratch copy of persistent inode (pi) fields, held in DRAM while the
+ * inode log is replayed during rebuild.
+ */
+struct nova_inode_rebuild {
+   u64 i_size;
+   u32 i_flags;/* Inode flags */
+   u32 i_ctime;/* Inode change time (ctime) */
+   u32 i_mtime;/* Modification time (mtime) */
+   u32 i_atime;/* Access time */
+   u32 i_uid;  /* Owner Uid */
+   u32 i_gid;  /* Group Id */
+   u32 i_generation;   /* File version (for NFS) */
+   u16 i_links_count;  /* Links count */
+   u16 i_mode; /* File mode */
+   u64 trans_id;
+};
+
 /*
  * DRAM state for inodes
  */
diff --git a/fs/nova/nova.h b/fs/nova/nova.h
index 3a51dae..983c6b2 100644
--- a/fs/nova/nova.h
+++ b/fs/nova/nova.h
@@ -301,6 +301,24 @@ static inline u64 nova_get_epoch_id(struct super_block *sb)
 }
 
 #include "inode.h"
+
+/*
+ * Load the block type and log head/tail of an inode into its DRAM header.
+ *
+ * @pi is first copied into a stack buffer with memcpy_mcsafe(); a non-zero
+ * result from that copy is returned as-is and @sih is left unmodified.
+ * Otherwise i_blk_type, log_head and log_tail are taken from the copy.
+ * Returns 0 on success.  @sb is unused.
+ */
+static inline int nova_get_head_tail(struct super_block *sb,
+	struct nova_inode *pi, struct nova_inode_info_header *sih)
+{
+	struct nova_inode fake_pi;
+	int rc;
+
+	rc = memcpy_mcsafe(&fake_pi, pi, sizeof(struct nova_inode));
+	if (rc)
+		return rc;
+
+	sih->i_blk_type = fake_pi.i_blk_type;
+	sih->log_head = fake_pi.log_head;
+	sih->log_tail = fake_pi.log_tail;
+
+	return rc;
+}
+
 #include "log.h"
 
 struct nova_range_node_lowhigh {
@@ -467,6 +485,9 @@ int nova_remove_dentry(struct dentry *dentry, int dec_link,
struct nova_inode_update *update, u64 epoch_id);
 
 /* rebuild.c */
+int nova_rebuild_dir_inode_tree(struct super_block *sb,
+   struct nova_inode *pi, u64 pi_addr,
+   struct nova_inode_info_header *sih);
 int nova_rebuild_inode(struct super_block *sb, struct nova_inode_info *si,
u64 ino, u64 pi_addr, int rebuild_dir);
 
diff --git a/fs/nova/rebuild.c b/fs/nova/rebuild.c
index 0595851..9a1327d 100644
--- a/fs/nova/rebuild.c
+++ b/fs/nova/rebuild.c
@@ -18,6 +18,319 @@
 #include "nova.h"
 #include "inode.h"
 
+/* entry given to this function is a copy in dram
+ *
+ * Replays a setattr log entry into the rebuild state: mode, uid, gid and
+ * atime are copied from @entry into @reb.  For regular files the radix-tree
+ * entries from the first block at or past entry->size up to the block
+ * holding the last byte of reb->i_size are dropped through
+ * nova_delete_file_tree() with delete_nvmm and delete_dead both false.
+ * reb->i_size itself is not modified here.
+ */
+static void nova_apply_setattr_entry(struct super_block *sb,
+	struct nova_inode_rebuild *reb, struct nova_inode_info_header *sih,
+	struct nova_setattr_logentry *entry)
+{
+	unsigned int data_bits = blk_type_to_shift[sih->i_blk_type];
+	unsigned long first_blocknr, last_blocknr;
+	loff_t start, end;
+
+	reb->i_mode	= entry->mode;
+	reb->i_uid	= entry->uid;
+	reb->i_gid	= entry->gid;
+	reb->i_atime	= entry->atime;
+
+	if (S_ISREG(reb->i_mode)) {
+		start = entry->size;
+		end = reb->i_size;
+
+		/* Round the new size up to the next block boundary. */
+		first_blocknr = (start + (1UL << data_bits) - 1) >> data_bits;
+
+		if (end > 0)
+			last_blocknr = (end - 1) >> data_bits;
+		else
+			last_blocknr = 0;
+
+		/* The freed-block count is not needed during rebuild. */
+		nova_delete_file_tree(sb, sih, first_blocknr,
+					last_blocknr, false, false, 0);
+	}
+}
+
+/*
+ * Replay a link-change log entry into the rebuild state.  @entry is a DRAM
+ * copy; its links, ctime, flags and generation values overwrite the
+ * matching fields of @reb.  Nothing is flushed here.  @sb is unused.
+ */
+static void nova_apply_link_change_entry(struct super_block *sb,
+	struct nova_inode_rebuild *reb, struct nova_link_change_entry *entry)
+{
+	reb->i_flags = entry->flags;
+	reb->i_generation = entry->generation;
+	reb->i_ctime = entry->ctime;
+	reb->i_links_count = entry->links;
+
+	/* Do not flush now */
+}
+
+static void nova_update_inode_with_rebuild(struct super_block *sb,
+   struct nova_inode_rebuild *reb, struct nova_inode *pi)
+{
+   pi->i_size = cpu_to_le64(reb->i_size);
+   pi->i_flags = cpu_to_le32(reb->i_flags);
+   pi->i_uid = cpu_to_le32(reb->i_uid);
+   pi->i_gid = cpu_to_le32(reb->i_gid);
+   pi->i_atime = cpu_to_le32(reb->i_atime);
+   pi->i_ctime = cpu_to_le32(reb->i_ctime);
+   pi->i_mtime = cpu_to_le32(reb->i_mtime);
+   pi->i_generation = cpu_to_le32(reb->i_generation);
+   pi->i_links_count = cpu_to_le16(reb->i_links_count);
+   pi->i_mode = cpu_to_le16(reb->i

[RFC v2 49/83] Dir: Append create/remove dentry.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

NOVA adds or removes a directory/file by appending a dentry
to the parent directory's log. Dentry contains filename and inode number.
A positive inode number indicates a create(valid) dentry, and
a dentry with inode number zero is a remove dentry.
NOVA can also inplace update a create dentry to invalidate it.

Signed-off-by: Andiry Xu 
---
 fs/nova/dir.c  | 140 +
 fs/nova/nova.h |   4 ++
 2 files changed, 144 insertions(+)

diff --git a/fs/nova/dir.c b/fs/nova/dir.c
index 35a66f9..47ee9ad 100644
--- a/fs/nova/dir.c
+++ b/fs/nova/dir.c
@@ -222,6 +222,146 @@ int nova_append_dir_init_entries(struct super_block *sb,
return 0;
 }
 
+/* adds a directory entry pointing to the inode. assumes the inode has
+ * already been logged for consistency
+ */
+int nova_add_dentry(struct dentry *dentry, u64 ino, int inc_link,
+   struct nova_inode_update *update, u64 epoch_id)
+{
+   struct inode *dir = dentry->d_parent->d_inode;
+   struct super_block *sb = dir->i_sb;
+   struct nova_inode_info *si = NOVA_I(dir);
+   struct nova_inode_info_header *sih = &si->header;
+   struct nova_inode *pidir;
+   const char *name = dentry->d_name.name;
+   int namelen = dentry->d_name.len;
+   struct nova_dentry *direntry;
+   unsigned short loglen;
+   int ret;
+   u64 curr_entry;
+   timing_t add_dentry_time;
+
+   nova_dbg_verbose("%s: dir %lu new inode %llu\n",
+   __func__, dir->i_ino, ino);
+   nova_dbg_verbose("%s: %s %d\n", __func__, name, namelen);
+   NOVA_START_TIMING(add_dentry_t, add_dentry_time);
+   if (namelen == 0)
+   return -EINVAL;
+
+   pidir = nova_get_inode(sb, dir);
+
+   /*
+* XXX shouldn't update any times until successful
+* completion of syscall, but too many callers depend
+* on this.
+*/
+   dir->i_mtime = dir->i_ctime = current_time(dir);
+
+   loglen = NOVA_DIR_LOG_REC_LEN(namelen);
+   ret = nova_append_dentry(sb, pidir, dir, dentry,
+   ino, loglen, update,
+   inc_link, epoch_id);
+
+   if (ret) {
+   nova_dbg("%s: append dir entry failure\n", __func__);
+   return ret;
+   }
+
+   curr_entry = update->curr_entry;
+   direntry = (struct nova_dentry *)nova_get_block(sb, curr_entry);
+   sih->last_dentry = curr_entry;
+   ret = nova_insert_dir_radix_tree(sb, sih, name, namelen, direntry);
+
+   sih->trans_id++;
+   NOVA_END_TIMING(add_dentry_t, add_dentry_time);
+   return ret;
+}
+
+static int nova_can_inplace_update_dentry(struct super_block *sb,
+   struct nova_dentry *dentry, u64 epoch_id)
+{
+   if (dentry && dentry->epoch_id == epoch_id)
+   return 1;
+
+   return 0;
+}
+
+/* removes a directory entry pointing to the inode. assumes the inode has
+ * already been logged for consistency
+ */
+int nova_remove_dentry(struct dentry *dentry, int dec_link,
+   struct nova_inode_update *update, u64 epoch_id)
+{
+   struct inode *dir = dentry->d_parent->d_inode;
+   struct super_block *sb = dir->i_sb;
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   struct nova_inode_info *si = NOVA_I(dir);
+   struct nova_inode_info_header *sih = &si->header;
+   struct nova_inode *pidir;
+   struct qstr *entry = &dentry->d_name;
+   struct nova_dentry *old_dentry = NULL;
+   unsigned short loglen;
+   int ret;
+   u64 curr_entry;
+   timing_t remove_dentry_time;
+
+   NOVA_START_TIMING(remove_dentry_t, remove_dentry_time);
+
+   update->create_dentry = NULL;
+   update->delete_dentry = NULL;
+
+   if (!dentry->d_name.len) {
+   ret = -EINVAL;
+   goto out;
+   }
+
+   ret = nova_remove_dir_radix_tree(sb, sih, entry->name, entry->len, 0,
+   &old_dentry);
+
+   if (ret)
+   goto out;
+
+   pidir = nova_get_inode(sb, dir);
+
+   dir->i_mtime = dir->i_ctime = current_time(dir);
+
+   if (nova_can_inplace_update_dentry(sb, old_dentry, epoch_id)) {
+   nova_inplace_update_dentry(sb, dir, old_dentry,
+   dec_link, epoch_id);
+   curr_entry = nova_get_addr_off(sbi, old_dentry);
+
+   sih->last_dentry = curr_entry;
+   /* Leave create/delete_dentry to NULL
+* Do not change tail if used as input
+*/
+   if (update->tail == 0) {
+   update->tail = sih->log_tail;
+   }
+   sih->trans_id++;
+   goto out;
+   }
+
+   loglen = NOVA_DIR_LOG_REC_LEN(entry->len);
+   ret = nova_append_d

[RFC v2 51/83] Rebuild: directory inode.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

When vfs issues a read inode command, or when the inode is newly allocated,
walk through the inode log to rebuild inode information and the radix tree.

Signed-off-by: Andiry Xu 
---
 fs/nova/inode.h   |  15 +++
 fs/nova/nova.h|  21 
 fs/nova/rebuild.c | 329 +-
 3 files changed, 364 insertions(+), 1 deletion(-)

diff --git a/fs/nova/inode.h b/fs/nova/inode.h
index 62c8bdc..42690e6 100644
--- a/fs/nova/inode.h
+++ b/fs/nova/inode.h
@@ -97,6 +97,21 @@ struct nova_inode_info_header {
u8  i_blk_type;
 };
 
+/* For rebuild purposes, temporarily store pi information */
+struct nova_inode_rebuild {
+   u64 i_size;
+   u32 i_flags;/* Inode flags */
+   u32 i_ctime;/* Inode modification time */
+   u32 i_mtime;/* Inode b-tree Modification time */
+   u32 i_atime;/* Access time */
+   u32 i_uid;  /* Owner Uid */
+   u32 i_gid;  /* Group Id */
+   u32 i_generation;   /* File version (for NFS) */
+   u16 i_links_count;  /* Links count */
+   u16 i_mode; /* File mode */
+   u64 trans_id;
+};
+
 /*
  * DRAM state for inodes
  */
diff --git a/fs/nova/nova.h b/fs/nova/nova.h
index 3a51dae..983c6b2 100644
--- a/fs/nova/nova.h
+++ b/fs/nova/nova.h
@@ -301,6 +301,24 @@ static inline u64 nova_get_epoch_id(struct super_block *sb)
 }
 
 #include "inode.h"
+
+static inline int nova_get_head_tail(struct super_block *sb,
+   struct nova_inode *pi, struct nova_inode_info_header *sih)
+{
+   struct nova_inode fake_pi;
+   int rc;
+
+   rc = memcpy_mcsafe(&fake_pi, pi, sizeof(struct nova_inode));
+   if (rc)
+   return rc;
+
+   sih->i_blk_type = fake_pi.i_blk_type;
+   sih->log_head = fake_pi.log_head;
+   sih->log_tail = fake_pi.log_tail;
+
+   return rc;
+}
+
 #include "log.h"
 
 struct nova_range_node_lowhigh {
@@ -467,6 +485,9 @@ int nova_remove_dentry(struct dentry *dentry, int dec_link,
struct nova_inode_update *update, u64 epoch_id);
 
 /* rebuild.c */
+int nova_rebuild_dir_inode_tree(struct super_block *sb,
+   struct nova_inode *pi, u64 pi_addr,
+   struct nova_inode_info_header *sih);
 int nova_rebuild_inode(struct super_block *sb, struct nova_inode_info *si,
u64 ino, u64 pi_addr, int rebuild_dir);
 
diff --git a/fs/nova/rebuild.c b/fs/nova/rebuild.c
index 0595851..9a1327d 100644
--- a/fs/nova/rebuild.c
+++ b/fs/nova/rebuild.c
@@ -18,6 +18,319 @@
 #include "nova.h"
 #include "inode.h"
 
+/* entry given to this function is a copy in dram */
+static void nova_apply_setattr_entry(struct super_block *sb,
+   struct nova_inode_rebuild *reb, struct nova_inode_info_header *sih,
+   struct nova_setattr_logentry *entry)
+{
+   unsigned int data_bits = blk_type_to_shift[sih->i_blk_type];
+   unsigned long first_blocknr, last_blocknr;
+   loff_t start, end;
+   int freed = 0;
+
+   reb->i_mode = entry->mode;
+   reb->i_uid  = entry->uid;
+   reb->i_gid  = entry->gid;
+   reb->i_atime= entry->atime;
+
+   if (S_ISREG(reb->i_mode)) {
+   start = entry->size;
+   end = reb->i_size;
+
+   first_blocknr = (start + (1UL << data_bits) - 1) >> data_bits;
+
+   if (end > 0)
+   last_blocknr = (end - 1) >> data_bits;
+   else
+   last_blocknr = 0;
+
+   freed = nova_delete_file_tree(sb, sih, first_blocknr,
+   last_blocknr, false, false, 0);
+   }
+}
+
+/* entry given to this function is a copy in dram */
+static void nova_apply_link_change_entry(struct super_block *sb,
+   struct nova_inode_rebuild *reb, struct nova_link_change_entry *entry)
+{
+   reb->i_links_count  = entry->links;
+   reb->i_ctime= entry->ctime;
+   reb->i_flags= entry->flags;
+   reb->i_generation   = entry->generation;
+
+   /* Do not flush now */
+}
+
+static void nova_update_inode_with_rebuild(struct super_block *sb,
+   struct nova_inode_rebuild *reb, struct nova_inode *pi)
+{
+   pi->i_size = cpu_to_le64(reb->i_size);
+   pi->i_flags = cpu_to_le32(reb->i_flags);
+   pi->i_uid = cpu_to_le32(reb->i_uid);
+   pi->i_gid = cpu_to_le32(reb->i_gid);
+   pi->i_atime = cpu_to_le32(reb->i_atime);
+   pi->i_ctime = cpu_to_le32(reb->i_ctime);
+   pi->i_mtime = cpu_to_le32(reb->i_mtime);
+   pi->i_generation = cpu_to_le32(reb->i_generation);
+   pi->i_links_count = cpu_to_le16(reb->i_links_count);
+   pi->i_mode = cpu_to_le16(reb->i_mode);
+}
+
+static int nova_init_inode_rebuild(struct super_

[RFC v2 52/83] Rebuild: file inode.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

Rebuild file inode metadata and radix tree on read_inode.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/log.h |   4 ++
 fs/nova/rebuild.c | 124 ++
 2 files changed, 128 insertions(+)

diff --git a/fs/nova/log.h b/fs/nova/log.h
index 87ce5f9..bdb85eb 100644
--- a/fs/nova/log.h
+++ b/fs/nova/log.h
@@ -385,6 +385,10 @@ int nova_inplace_update_write_entry(struct super_block *sb,
 int nova_append_file_write_entry(struct super_block *sb, struct nova_inode *pi,
struct inode *inode, struct nova_file_write_item *item,
struct nova_inode_update *update);
+int nova_assign_write_entry(struct super_block *sb,
+   struct nova_inode_info_header *sih,
+   struct nova_file_write_entry *entry,
+   bool free);
 int nova_invalidate_dentries(struct super_block *sb,
struct nova_inode_update *update);
 int nova_inplace_update_dentry(struct super_block *sb,
diff --git a/fs/nova/rebuild.c b/fs/nova/rebuild.c
index 9a1327d..07cf6e3 100644
--- a/fs/nova/rebuild.c
+++ b/fs/nova/rebuild.c
@@ -156,6 +156,126 @@ static int nova_rebuild_inode_finish(struct super_block 
*sb,
return 0;
 }
 
+static void nova_rebuild_handle_write_entry(struct super_block *sb,
+   struct nova_inode_info_header *sih, struct nova_inode_rebuild *reb,
+   struct nova_file_write_entry *entry)
+{
+   if (entry->num_pages != entry->invalid_pages) {
+   /*
+* The overlapped blocks are already freed.
+* Don't double free them, just re-assign the pointers.
+*/
+   nova_assign_write_entry(sb, sih, entry, false);
+   }
+
+   if (entry->trans_id >= sih->trans_id) {
+   nova_rebuild_file_time_and_size(sb, reb,
+   entry->mtime, entry->mtime,
+   entry->size);
+   reb->trans_id = entry->trans_id;
+   }
+
+   /* Update sih->i_size for setattr apply operations */
+   sih->i_size = le64_to_cpu(reb->i_size);
+}
+
+static int nova_rebuild_file_inode_tree(struct super_block *sb,
+   struct nova_inode *pi, u64 pi_addr,
+   struct nova_inode_info_header *sih)
+{
+   struct nova_file_write_entry *entry = NULL;
+   struct nova_setattr_logentry *attr_entry = NULL;
+   struct nova_link_change_entry *link_change_entry = NULL;
+   struct nova_inode_rebuild rebuild, *reb;
+   unsigned int data_bits = blk_type_to_shift[sih->i_blk_type];
+   u64 ino = pi->nova_ino;
+   timing_t rebuild_time;
+   void *addr, *entryc = NULL;
+   u64 curr_p;
+   u8 type;
+   int ret;
+
+   NOVA_START_TIMING(rebuild_file_t, rebuild_time);
+   nova_dbg_verbose("Rebuild file inode %llu tree\n", ino);
+
+   reb = &rebuild;
+   ret = nova_rebuild_inode_start(sb, pi, sih, reb, pi_addr);
+   if (ret)
+   goto out;
+
+   curr_p = sih->log_head;
+   if (curr_p == 0 && sih->log_tail == 0)
+   goto out;
+
+// nova_print_nova_log(sb, sih);
+
+   while (curr_p != sih->log_tail) {
+   if (goto_next_page(sb, curr_p)) {
+   sih->log_pages++;
+   curr_p = next_log_page(sb, curr_p);
+   }
+
+   if (curr_p == 0) {
+   nova_err(sb, "File inode %llu log is NULL!\n", ino);
+   ret = -EIO;
+   goto out;
+   }
+
+   addr = (void *)nova_get_block(sb, curr_p);
+
+   entryc = addr;
+
+   type = nova_get_entry_type(entryc);
+
+   switch (type) {
+   case SET_ATTR:
+   attr_entry = (struct nova_setattr_logentry *)entryc;
+   nova_apply_setattr_entry(sb, reb, sih, attr_entry);
+   sih->last_setattr = curr_p;
+   if (attr_entry->trans_id >= reb->trans_id) {
+   nova_rebuild_file_time_and_size(sb, reb,
+   attr_entry->mtime,
+   attr_entry->ctime,
+   attr_entry->size);
+   reb->trans_id = attr_entry->trans_id;
+   }
+
+   /* Update sih->i_size for setattr operation */
+   sih->i_size = le64_to_cpu(reb->i_size);
+   curr_p += sizeof(struct nova_setattr_logentry);
+   break;
+   case LINK_CHANGE:
+   link_change_entry =
+   (struct nova_link_change_entry *)entryc;
+   nova_apply_link_change_entr

[RFC v2 52/83] Rebuild: file inode.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

Rebuild file inode metadata and radix tree on read_inode.

Signed-off-by: Andiry Xu 
---
 fs/nova/log.h |   4 ++
 fs/nova/rebuild.c | 124 ++
 2 files changed, 128 insertions(+)

diff --git a/fs/nova/log.h b/fs/nova/log.h
index 87ce5f9..bdb85eb 100644
--- a/fs/nova/log.h
+++ b/fs/nova/log.h
@@ -385,6 +385,10 @@ int nova_inplace_update_write_entry(struct super_block *sb,
 int nova_append_file_write_entry(struct super_block *sb, struct nova_inode *pi,
struct inode *inode, struct nova_file_write_item *item,
struct nova_inode_update *update);
+int nova_assign_write_entry(struct super_block *sb,
+   struct nova_inode_info_header *sih,
+   struct nova_file_write_entry *entry,
+   bool free);
 int nova_invalidate_dentries(struct super_block *sb,
struct nova_inode_update *update);
 int nova_inplace_update_dentry(struct super_block *sb,
diff --git a/fs/nova/rebuild.c b/fs/nova/rebuild.c
index 9a1327d..07cf6e3 100644
--- a/fs/nova/rebuild.c
+++ b/fs/nova/rebuild.c
@@ -156,6 +156,126 @@ static int nova_rebuild_inode_finish(struct super_block 
*sb,
return 0;
 }
 
+static void nova_rebuild_handle_write_entry(struct super_block *sb,
+   struct nova_inode_info_header *sih, struct nova_inode_rebuild *reb,
+   struct nova_file_write_entry *entry)
+{
+   if (entry->num_pages != entry->invalid_pages) {
+   /*
+* The overlapped blocks are already freed.
+* Don't double free them, just re-assign the pointers.
+*/
+   nova_assign_write_entry(sb, sih, entry, false);
+   }
+
+   if (entry->trans_id >= sih->trans_id) {
+   nova_rebuild_file_time_and_size(sb, reb,
+   entry->mtime, entry->mtime,
+   entry->size);
+   reb->trans_id = entry->trans_id;
+   }
+
+   /* Update sih->i_size for setattr apply operations */
+   sih->i_size = le64_to_cpu(reb->i_size);
+}
+
+static int nova_rebuild_file_inode_tree(struct super_block *sb,
+   struct nova_inode *pi, u64 pi_addr,
+   struct nova_inode_info_header *sih)
+{
+   struct nova_file_write_entry *entry = NULL;
+   struct nova_setattr_logentry *attr_entry = NULL;
+   struct nova_link_change_entry *link_change_entry = NULL;
+   struct nova_inode_rebuild rebuild, *reb;
+   unsigned int data_bits = blk_type_to_shift[sih->i_blk_type];
+   u64 ino = pi->nova_ino;
+   timing_t rebuild_time;
+   void *addr, *entryc = NULL;
+   u64 curr_p;
+   u8 type;
+   int ret;
+
+   NOVA_START_TIMING(rebuild_file_t, rebuild_time);
+   nova_dbg_verbose("Rebuild file inode %llu tree\n", ino);
+
+   reb = &rebuild;
+   ret = nova_rebuild_inode_start(sb, pi, sih, reb, pi_addr);
+   if (ret)
+   goto out;
+
+   curr_p = sih->log_head;
+   if (curr_p == 0 && sih->log_tail == 0)
+   goto out;
+
+// nova_print_nova_log(sb, sih);
+
+   while (curr_p != sih->log_tail) {
+   if (goto_next_page(sb, curr_p)) {
+   sih->log_pages++;
+   curr_p = next_log_page(sb, curr_p);
+   }
+
+   if (curr_p == 0) {
+   nova_err(sb, "File inode %llu log is NULL!\n", ino);
+   ret = -EIO;
+   goto out;
+   }
+
+   addr = (void *)nova_get_block(sb, curr_p);
+
+   entryc = addr;
+
+   type = nova_get_entry_type(entryc);
+
+   switch (type) {
+   case SET_ATTR:
+   attr_entry = (struct nova_setattr_logentry *)entryc;
+   nova_apply_setattr_entry(sb, reb, sih, attr_entry);
+   sih->last_setattr = curr_p;
+   if (attr_entry->trans_id >= reb->trans_id) {
+   nova_rebuild_file_time_and_size(sb, reb,
+   attr_entry->mtime,
+   attr_entry->ctime,
+   attr_entry->size);
+   reb->trans_id = attr_entry->trans_id;
+   }
+
+   /* Update sih->i_size for setattr operation */
+   sih->i_size = le64_to_cpu(reb->i_size);
+   curr_p += sizeof(struct nova_setattr_logentry);
+   break;
+   case LINK_CHANGE:
+   link_change_entry =
+   (struct nova_link_change_entry *)entryc;
+   nova_apply_link_change_entry(sb, reb,
+  

[RFC v2 55/83] Namei: mkdir

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

NOVA mkdir is similar to create. The difference is NOVA will
allocate log page for the newly created directory, and append
init dentries.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/namei.c | 74 +
 1 file changed, 74 insertions(+)

diff --git a/fs/nova/namei.c b/fs/nova/namei.c
index a07cc4f..a95b2fe 100644
--- a/fs/nova/namei.c
+++ b/fs/nova/namei.c
@@ -207,6 +207,79 @@ static int nova_mknod(struct inode *dir, struct dentry 
*dentry, umode_t mode,
return err;
 }
 
+static int nova_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+   struct super_block *sb = dir->i_sb;
+   struct inode *inode;
+   struct nova_inode *pidir, *pi;
+   struct nova_inode_info *si, *sidir;
+   struct nova_inode_info_header *sih = NULL;
+   struct nova_inode_update update;
+   u64 pi_addr = 0;
+   u64 ino;
+   u64 epoch_id;
+   int err = -EMLINK;
+   timing_t mkdir_time;
+
+   NOVA_START_TIMING(mkdir_t, mkdir_time);
+   if (dir->i_nlink >= NOVA_LINK_MAX)
+   goto out;
+
+   ino = nova_new_nova_inode(sb, &pi_addr);
+   if (ino == 0)
+   goto out_err;
+
+   epoch_id = nova_get_epoch_id(sb);
+   nova_dbgv("%s: name %s\n", __func__, dentry->d_name.name);
+   nova_dbgv("%s: inode %llu, dir %lu, link %d\n", __func__,
+   ino, dir->i_ino, dir->i_nlink);
+
+   update.tail = 0;
+   err = nova_add_dentry(dentry, ino, 1, &update, epoch_id);
+   if (err) {
+   nova_dbg("failed to add dir entry\n");
+   goto out_err;
+   }
+
+   inode = nova_new_vfs_inode(TYPE_MKDIR, dir, pi_addr, ino,
+   S_IFDIR | mode, sb->s_blocksize,
+   0, &dentry->d_name, epoch_id);
+   if (IS_ERR(inode)) {
+   err = PTR_ERR(inode);
+   goto out_err;
+   }
+
+   pi = nova_get_inode(sb, inode);
+   err = nova_append_dir_init_entries(sb, pi, inode->i_ino, dir->i_ino,
+   epoch_id);
+   if (err < 0)
+   goto out_err;
+
+   /* Build the dir tree */
+   si = NOVA_I(inode);
+   sih = &si->header;
+   nova_rebuild_dir_inode_tree(sb, pi, pi_addr, sih);
+
+   pidir = nova_get_inode(sb, dir);
+   sidir = NOVA_I(dir);
+   sih = &sidir->header;
+   dir->i_blocks = sih->i_blocks;
+   inc_nlink(dir);
+   d_instantiate(dentry, inode);
+   unlock_new_inode(inode);
+
+   nova_lite_transaction_for_new_inode(sb, pi, pidir, inode, dir,
+   &update);
+out:
+   NOVA_END_TIMING(mkdir_t, mkdir_time);
+   return err;
+
+out_err:
+// clear_nlink(inode);
+   nova_err(sb, "%s return %d\n", __func__, err);
+   goto out;
+}
+
 struct dentry *nova_get_parent(struct dentry *child)
 {
struct inode *inode;
@@ -234,5 +307,6 @@ struct dentry *nova_get_parent(struct dentry *child)
 const struct inode_operations nova_dir_inode_operations = {
.create = nova_create,
.lookup = nova_lookup,
+   .mkdir  = nova_mkdir,
.mknod  = nova_mknod,
 };
-- 
2.7.4



[RFC v2 58/83] Namei: rename

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

Rename is the most complex namei operation. The target dir may be
different from the source dir, and the target inode may exist.
Rename involves up to four inodes, and NOVA uses a rename transaction
to atomically update all the affected inodes.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/namei.c | 195 
 1 file changed, 195 insertions(+)

diff --git a/fs/nova/namei.c b/fs/nova/namei.c
index 4bf6396..bb50c0a 100644
--- a/fs/nova/namei.c
+++ b/fs/nova/namei.c
@@ -541,6 +541,200 @@ static int nova_rmdir(struct inode *dir, struct dentry 
*dentry)
return err;
 }
 
+static int nova_rename(struct inode *old_dir,
+   struct dentry *old_dentry,
+   struct inode *new_dir, struct dentry *new_dentry,
+   unsigned int flags)
+{
+   struct inode *old_inode = old_dentry->d_inode;
+   struct inode *new_inode = new_dentry->d_inode;
+   struct super_block *sb = old_inode->i_sb;
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   struct nova_inode *old_pi = NULL, *new_pi = NULL;
+   struct nova_inode *new_pidir = NULL, *old_pidir = NULL;
+   struct nova_dentry *father_entry = NULL;
+   char *head_addr = NULL;
+   int invalidate_new_inode = 0;
+   struct nova_inode_update update_dir_new;
+   struct nova_inode_update update_dir_old;
+   struct nova_inode_update update_new;
+   struct nova_inode_update update_old;
+   u64 old_linkc1 = 0, old_linkc2 = 0;
+   int err = -ENOENT;
+   int inc_link = 0, dec_link = 0;
+   int cpu;
+   int change_parent = 0;
+   u64 journal_tail;
+   u64 epoch_id;
+   timing_t rename_time;
+
+   nova_dbgv("%s: rename %s to %s,\n", __func__,
+   old_dentry->d_name.name, new_dentry->d_name.name);
+   nova_dbgv("%s: %s inode %lu, old dir %lu, new dir %lu, new inode %lu\n",
+   __func__, S_ISDIR(old_inode->i_mode) ? "dir" : "normal",
+   old_inode->i_ino, old_dir->i_ino, new_dir->i_ino,
+   new_inode ? new_inode->i_ino : 0);
+
+   if (flags & ~RENAME_NOREPLACE)
+   return -EINVAL;
+
+   NOVA_START_TIMING(rename_t, rename_time);
+
+   if (new_inode) {
+   err = -ENOTEMPTY;
+   if (S_ISDIR(old_inode->i_mode) && !nova_empty_dir(new_inode))
+   goto out;
+   } else {
+   if (S_ISDIR(old_inode->i_mode)) {
+   err = -EMLINK;
+   if (new_dir->i_nlink >= NOVA_LINK_MAX)
+   goto out;
+   }
+   }
+
+   if (S_ISDIR(old_inode->i_mode)) {
+   dec_link = -1;
+   if (!new_inode)
+   inc_link = 1;
+   /*
+* Tricky for in-place update:
+* New dentry is always after renamed dentry, so we have to
+* make sure new dentry has the correct links count
+* to workaround the rebuild nlink issue.
+*/
+   if (old_dir == new_dir) {
+   inc_link--;
+   if (inc_link == 0)
+   dec_link = 0;
+   }
+   }
+
+   epoch_id = nova_get_epoch_id(sb);
+   new_pidir = nova_get_inode(sb, new_dir);
+   old_pidir = nova_get_inode(sb, old_dir);
+
+   old_pi = nova_get_inode(sb, old_inode);
+   old_inode->i_ctime = current_time(old_inode);
+   update_old.tail = 0;
+   err = nova_append_link_change_entry(sb, old_pi, old_inode,
+   &update_old, &old_linkc1, epoch_id);
+   if (err)
+   goto out;
+
+   if (S_ISDIR(old_inode->i_mode) && old_dir != new_dir) {
+   /* My father is changed. Update .. entry */
+   /* For simplicity, we use in-place update and journal it */
+   change_parent = 1;
+   head_addr = (char *)nova_get_block(sb, old_pi->log_head);
+   father_entry = (struct nova_dentry *)(head_addr +
+   NOVA_DIR_LOG_REC_LEN(1));
+
+   if (le64_to_cpu(father_entry->ino) != old_dir->i_ino)
+   nova_err(sb, "%s: dir %lu parent should be %lu, but 
actually %lu\n",
+   __func__,
+   old_inode->i_ino, old_dir->i_ino,
+   le64_to_cpu(father_entry->ino));
+   }
+
+   update_dir_new.tail = 0;
+   if (new_inode) {
+   /* First remove the old entry in the new directory */
+   err = nova_remove_dentry(new_dentry, 0, &update_dir_new,
+   epo

[RFC v2 55/83] Namei: mkdir

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

NOVA mkdir is similar to create. The difference is NOVA will
allocate log page for the newly created directory, and append
init dentries.

Signed-off-by: Andiry Xu 
---
 fs/nova/namei.c | 74 +
 1 file changed, 74 insertions(+)

diff --git a/fs/nova/namei.c b/fs/nova/namei.c
index a07cc4f..a95b2fe 100644
--- a/fs/nova/namei.c
+++ b/fs/nova/namei.c
@@ -207,6 +207,79 @@ static int nova_mknod(struct inode *dir, struct dentry 
*dentry, umode_t mode,
return err;
 }
 
+static int nova_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+   struct super_block *sb = dir->i_sb;
+   struct inode *inode;
+   struct nova_inode *pidir, *pi;
+   struct nova_inode_info *si, *sidir;
+   struct nova_inode_info_header *sih = NULL;
+   struct nova_inode_update update;
+   u64 pi_addr = 0;
+   u64 ino;
+   u64 epoch_id;
+   int err = -EMLINK;
+   timing_t mkdir_time;
+
+   NOVA_START_TIMING(mkdir_t, mkdir_time);
+   if (dir->i_nlink >= NOVA_LINK_MAX)
+   goto out;
+
+   ino = nova_new_nova_inode(sb, &pi_addr);
+   if (ino == 0)
+   goto out_err;
+
+   epoch_id = nova_get_epoch_id(sb);
+   nova_dbgv("%s: name %s\n", __func__, dentry->d_name.name);
+   nova_dbgv("%s: inode %llu, dir %lu, link %d\n", __func__,
+   ino, dir->i_ino, dir->i_nlink);
+
+   update.tail = 0;
+   err = nova_add_dentry(dentry, ino, 1, &update, epoch_id);
+   if (err) {
+   nova_dbg("failed to add dir entry\n");
+   goto out_err;
+   }
+
+   inode = nova_new_vfs_inode(TYPE_MKDIR, dir, pi_addr, ino,
+   S_IFDIR | mode, sb->s_blocksize,
+   0, &dentry->d_name, epoch_id);
+   if (IS_ERR(inode)) {
+   err = PTR_ERR(inode);
+   goto out_err;
+   }
+
+   pi = nova_get_inode(sb, inode);
+   err = nova_append_dir_init_entries(sb, pi, inode->i_ino, dir->i_ino,
+   epoch_id);
+   if (err < 0)
+   goto out_err;
+
+   /* Build the dir tree */
+   si = NOVA_I(inode);
+   sih = &si->header;
+   nova_rebuild_dir_inode_tree(sb, pi, pi_addr, sih);
+
+   pidir = nova_get_inode(sb, dir);
+   sidir = NOVA_I(dir);
+   sih = &sidir->header;
+   dir->i_blocks = sih->i_blocks;
+   inc_nlink(dir);
+   d_instantiate(dentry, inode);
+   unlock_new_inode(inode);
+
+   nova_lite_transaction_for_new_inode(sb, pi, pidir, inode, dir,
+   &update);
+out:
+   NOVA_END_TIMING(mkdir_t, mkdir_time);
+   return err;
+
+out_err:
+// clear_nlink(inode);
+   nova_err(sb, "%s return %d\n", __func__, err);
+   goto out;
+}
+
 struct dentry *nova_get_parent(struct dentry *child)
 {
struct inode *inode;
@@ -234,5 +307,6 @@ struct dentry *nova_get_parent(struct dentry *child)
 const struct inode_operations nova_dir_inode_operations = {
.create = nova_create,
.lookup = nova_lookup,
+   .mkdir  = nova_mkdir,
.mknod  = nova_mknod,
 };
-- 
2.7.4



[RFC v2 58/83] Namei: rename

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

Rename is the most complex namei operation. The target dir may be
different from the source dir, and the target inode may exist.
Rename involves up to four inodes, and NOVA uses a rename transaction
to atomically update all the affected inodes.

Signed-off-by: Andiry Xu 
---
 fs/nova/namei.c | 195 
 1 file changed, 195 insertions(+)

diff --git a/fs/nova/namei.c b/fs/nova/namei.c
index 4bf6396..bb50c0a 100644
--- a/fs/nova/namei.c
+++ b/fs/nova/namei.c
@@ -541,6 +541,200 @@ static int nova_rmdir(struct inode *dir, struct dentry 
*dentry)
return err;
 }
 
+static int nova_rename(struct inode *old_dir,
+   struct dentry *old_dentry,
+   struct inode *new_dir, struct dentry *new_dentry,
+   unsigned int flags)
+{
+   struct inode *old_inode = old_dentry->d_inode;
+   struct inode *new_inode = new_dentry->d_inode;
+   struct super_block *sb = old_inode->i_sb;
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   struct nova_inode *old_pi = NULL, *new_pi = NULL;
+   struct nova_inode *new_pidir = NULL, *old_pidir = NULL;
+   struct nova_dentry *father_entry = NULL;
+   char *head_addr = NULL;
+   int invalidate_new_inode = 0;
+   struct nova_inode_update update_dir_new;
+   struct nova_inode_update update_dir_old;
+   struct nova_inode_update update_new;
+   struct nova_inode_update update_old;
+   u64 old_linkc1 = 0, old_linkc2 = 0;
+   int err = -ENOENT;
+   int inc_link = 0, dec_link = 0;
+   int cpu;
+   int change_parent = 0;
+   u64 journal_tail;
+   u64 epoch_id;
+   timing_t rename_time;
+
+   nova_dbgv("%s: rename %s to %s,\n", __func__,
+   old_dentry->d_name.name, new_dentry->d_name.name);
+   nova_dbgv("%s: %s inode %lu, old dir %lu, new dir %lu, new inode %lu\n",
+   __func__, S_ISDIR(old_inode->i_mode) ? "dir" : "normal",
+   old_inode->i_ino, old_dir->i_ino, new_dir->i_ino,
+   new_inode ? new_inode->i_ino : 0);
+
+   if (flags & ~RENAME_NOREPLACE)
+   return -EINVAL;
+
+   NOVA_START_TIMING(rename_t, rename_time);
+
+   if (new_inode) {
+   err = -ENOTEMPTY;
+   if (S_ISDIR(old_inode->i_mode) && !nova_empty_dir(new_inode))
+   goto out;
+   } else {
+   if (S_ISDIR(old_inode->i_mode)) {
+   err = -EMLINK;
+   if (new_dir->i_nlink >= NOVA_LINK_MAX)
+   goto out;
+   }
+   }
+
+   if (S_ISDIR(old_inode->i_mode)) {
+   dec_link = -1;
+   if (!new_inode)
+   inc_link = 1;
+   /*
+* Tricky for in-place update:
+* New dentry is always after renamed dentry, so we have to
+* make sure new dentry has the correct links count
+* to workaround the rebuild nlink issue.
+*/
+   if (old_dir == new_dir) {
+   inc_link--;
+   if (inc_link == 0)
+   dec_link = 0;
+   }
+   }
+
+   epoch_id = nova_get_epoch_id(sb);
+   new_pidir = nova_get_inode(sb, new_dir);
+   old_pidir = nova_get_inode(sb, old_dir);
+
+   old_pi = nova_get_inode(sb, old_inode);
+   old_inode->i_ctime = current_time(old_inode);
+   update_old.tail = 0;
+   err = nova_append_link_change_entry(sb, old_pi, old_inode,
+   &update_old, &old_linkc1, epoch_id);
+   if (err)
+   goto out;
+
+   if (S_ISDIR(old_inode->i_mode) && old_dir != new_dir) {
+   /* My father is changed. Update .. entry */
+   /* For simplicity, we use in-place update and journal it */
+   change_parent = 1;
+   head_addr = (char *)nova_get_block(sb, old_pi->log_head);
+   father_entry = (struct nova_dentry *)(head_addr +
+   NOVA_DIR_LOG_REC_LEN(1));
+
+   if (le64_to_cpu(father_entry->ino) != old_dir->i_ino)
+   nova_err(sb, "%s: dir %lu parent should be %lu, but 
actually %lu\n",
+   __func__,
+   old_inode->i_ino, old_dir->i_ino,
+   le64_to_cpu(father_entry->ino));
+   }
+
+   update_dir_new.tail = 0;
+   if (new_inode) {
+   /* First remove the old entry in the new directory */
+   err = nova_remove_dentry(new_dentry, 0, &update_dir_new,
+   epoch_id);
+   if (err)
+   

[RFC v2 57/83] Namei: rmdir

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

Similar to unlink.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/namei.c | 105 
 1 file changed, 105 insertions(+)

diff --git a/fs/nova/namei.c b/fs/nova/namei.c
index 360d716..4bf6396 100644
--- a/fs/nova/namei.c
+++ b/fs/nova/namei.c
@@ -437,6 +437,110 @@ static int nova_mkdir(struct inode *dir, struct dentry 
*dentry, umode_t mode)
goto out;
 }
 
+/*
+ * routine to check that the specified directory is empty (for rmdir)
+ */
+static int nova_empty_dir(struct inode *inode)
+{
+   struct super_block *sb;
+   struct nova_inode_info *si = NOVA_I(inode);
+   struct nova_inode_info_header *sih = &si->header;
+   struct nova_dentry *entry;
+   unsigned long pos = 0;
+   struct nova_dentry *entries[4];
+   int nr_entries;
+   int i;
+
+   sb = inode->i_sb;
+   nr_entries = radix_tree_gang_lookup(&sih->tree,
+   (void **)entries, pos, 4);
+   if (nr_entries > 2)
+   return 0;
+
+   for (i = 0; i < nr_entries; i++) {
+   entry = entries[i];
+
+   if (!is_dir_init_entry(sb, entry))
+   return 0;
+   }
+
+   return 1;
+}
+
+static int nova_rmdir(struct inode *dir, struct dentry *dentry)
+{
+   struct inode *inode = dentry->d_inode;
+   struct nova_dentry *de;
+   struct super_block *sb = inode->i_sb;
+   struct nova_inode *pi = nova_get_inode(sb, inode), *pidir;
+   struct nova_inode_update update_dir;
+   struct nova_inode_update update;
+   u64 old_linkc = 0;
+   struct nova_inode_info *si = NOVA_I(inode);
+   struct nova_inode_info_header *sih = &si->header;
+   int err = -ENOTEMPTY;
+   u64 epoch_id;
+   timing_t rmdir_time;
+
+   NOVA_START_TIMING(rmdir_t, rmdir_time);
+   if (!inode)
+   return -ENOENT;
+
+   nova_dbgv("%s: name %s\n", __func__, dentry->d_name.name);
+   pidir = nova_get_inode(sb, dir);
+   if (!pidir)
+   return -EINVAL;
+
+   if (nova_inode_by_name(dir, &dentry->d_name, &de) == 0)
+   return -ENOENT;
+
+   if (!nova_empty_dir(inode))
+   return err;
+
+   nova_dbgv("%s: inode %lu, dir %lu, link %d\n", __func__,
+   inode->i_ino, dir->i_ino, dir->i_nlink);
+
+   if (inode->i_nlink != 2)
+   nova_dbg("empty directory %lu has nlink!=2 (%d), dir %lu",
+   inode->i_ino, inode->i_nlink, dir->i_ino);
+
+   epoch_id = nova_get_epoch_id(sb);
+
+   update_dir.tail = 0;
+   err = nova_remove_dentry(dentry, -1, &update_dir, epoch_id);
+   if (err)
+   goto end_rmdir;
+
+   /*inode->i_version++; */
+   clear_nlink(inode);
+   inode->i_ctime = dir->i_ctime;
+
+   if (dir->i_nlink)
+   drop_nlink(dir);
+
+   nova_delete_dir_tree(sb, sih);
+
+   update.tail = 0;
+   err = nova_append_link_change_entry(sb, pi, inode, &update,
+   &old_linkc, epoch_id);
+   if (err)
+   goto end_rmdir;
+
+   nova_lite_transaction_for_time_and_link(sb, pi, pidir, inode, dir,
+   &update, &update_dir, 1, epoch_id);
+
+   nova_invalidate_link_change_entry(sb, old_linkc);
+   nova_invalidate_dentries(sb, &update_dir);
+
+   NOVA_END_TIMING(rmdir_t, rmdir_time);
+   return err;
+
+end_rmdir:
+   nova_err(sb, "%s return %d\n", __func__, err);
+   NOVA_END_TIMING(rmdir_t, rmdir_time);
+   return err;
+}
+
 struct dentry *nova_get_parent(struct dentry *child)
 {
struct inode *inode;
@@ -467,5 +571,6 @@ const struct inode_operations nova_dir_inode_operations = {
.link   = nova_link,
.unlink = nova_unlink,
.mkdir  = nova_mkdir,
+   .rmdir  = nova_rmdir,
.mknod  = nova_mknod,
 };
-- 
2.7.4



[RFC v2 57/83] Namei: rmdir

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

Similar to unlink.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/namei.c | 105 
 1 file changed, 105 insertions(+)

diff --git a/fs/nova/namei.c b/fs/nova/namei.c
index 360d716..4bf6396 100644
--- a/fs/nova/namei.c
+++ b/fs/nova/namei.c
@@ -437,6 +437,110 @@ static int nova_mkdir(struct inode *dir, struct dentry 
*dentry, umode_t mode)
goto out;
 }
 
+/*
+ * Check that the specified directory is empty (for rmdir).
+ *
+ * Looks up at most four entries from the start of the directory's radix
+ * tree.  The directory counts as empty when no more than two entries are
+ * found and each of them passes is_dir_init_entry().
+ *
+ * Returns 1 if the directory is empty, 0 otherwise.
+ */
+static int nova_empty_dir(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode_info_header *sih = &si->header;
+	struct nova_dentry *entries[4];
+	int nr_entries;
+	int i;
+
+	nr_entries = radix_tree_gang_lookup(&sih->tree,
+					(void **)entries, 0, 4);
+	if (nr_entries > 2)
+		return 0;
+
+	for (i = 0; i < nr_entries; i++) {
+		if (!is_dir_init_entry(sb, entries[i]))
+			return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * nova_rmdir - remove the empty directory named by @dentry from @dir.
+ *
+ * Appends a dentry-removal entry for the parent and a link-change entry
+ * for the victim, then commits both inode updates in one lite transaction
+ * with invalidate set to 1.
+ *
+ * Returns 0 on success, -ENOENT if the dentry has no inode or
+ * nova_inode_by_name() returns 0, -EINVAL if the parent's NOVA inode
+ * cannot be obtained, -ENOTEMPTY if nova_empty_dir() says the directory
+ * is not empty, or the error from nova_remove_dentry() /
+ * nova_append_link_change_entry().
+ */
+static int nova_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	struct nova_dentry *de;
+	/* Use dir for the superblock: inode has not been NULL-checked yet. */
+	struct super_block *sb = dir->i_sb;
+	struct nova_inode *pi, *pidir;
+	struct nova_inode_update update_dir;
+	struct nova_inode_update update;
+	u64 old_linkc = 0;
+	struct nova_inode_info *si;
+	struct nova_inode_info_header *sih;
+	int err;
+	u64 epoch_id;
+	timing_t rmdir_time;
+
+	NOVA_START_TIMING(rmdir_t, rmdir_time);
+	if (!inode) {
+		err = -ENOENT;
+		goto out;
+	}
+
+	/* Only touch inode-derived state once inode is known to be valid. */
+	pi = nova_get_inode(sb, inode);
+	si = NOVA_I(inode);
+	sih = &si->header;
+
+	nova_dbgv("%s: name %s\n", __func__, dentry->d_name.name);
+	pidir = nova_get_inode(sb, dir);
+	if (!pidir) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (nova_inode_by_name(dir, &dentry->d_name, &de) == 0) {
+		err = -ENOENT;
+		goto out;
+	}
+
+	if (!nova_empty_dir(inode)) {
+		err = -ENOTEMPTY;
+		goto out;
+	}
+
+	nova_dbgv("%s: inode %lu, dir %lu, link %d\n", __func__,
+				inode->i_ino, dir->i_ino, dir->i_nlink);
+
+	if (inode->i_nlink != 2)
+		nova_dbg("empty directory %lu has nlink!=2 (%d), dir %lu",
+				inode->i_ino, inode->i_nlink, dir->i_ino);
+
+	epoch_id = nova_get_epoch_id(sb);
+
+	update_dir.tail = 0;
+	err = nova_remove_dentry(dentry, -1, &update_dir, epoch_id);
+	if (err)
+		goto end_rmdir;
+
+	/*inode->i_version++; */
+	clear_nlink(inode);
+	inode->i_ctime = dir->i_ctime;
+
+	if (dir->i_nlink)
+		drop_nlink(dir);
+
+	nova_delete_dir_tree(sb, sih);
+
+	update.tail = 0;
+	err = nova_append_link_change_entry(sb, pi, inode, &update,
+						&old_linkc, epoch_id);
+	if (err)
+		goto end_rmdir;
+
+	nova_lite_transaction_for_time_and_link(sb, pi, pidir, inode, dir,
+					&update, &update_dir, 1, epoch_id);
+
+	nova_invalidate_link_change_entry(sb, old_linkc);
+	nova_invalidate_dentries(sb, &update_dir);
+	goto out;
+
+end_rmdir:
+	nova_err(sb, "%s return %d\n", __func__, err);
+out:
+	/* Every exit ends the timing section started above. */
+	NOVA_END_TIMING(rmdir_t, rmdir_time);
+	return err;
+}
+
 struct dentry *nova_get_parent(struct dentry *child)
 {
struct inode *inode;
@@ -467,5 +571,6 @@ const struct inode_operations nova_dir_inode_operations = {
.link   = nova_link,
.unlink = nova_unlink,
.mkdir  = nova_mkdir,
+   .rmdir  = nova_rmdir,
.mknod  = nova_mknod,
 };
-- 
2.7.4



[RFC v2 56/83] Namei: link and unlink.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

For link change operations, NOVA appends a link change entry
to the affected inode's log, and uses lite transaction to
atomically commit changes to multiple logs.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/namei.c | 159 
 1 file changed, 159 insertions(+)

diff --git a/fs/nova/namei.c b/fs/nova/namei.c
index a95b2fe..360d716 100644
--- a/fs/nova/namei.c
+++ b/fs/nova/namei.c
@@ -207,6 +207,163 @@ static int nova_mknod(struct inode *dir, struct dentry 
*dentry, umode_t mode,
return err;
 }
 
+/*
+ * Commit a link/time change on @inode together with the matching update
+ * of its parent @dir inside one lite transaction.
+ *
+ * @pi, @pidir:          persistent inodes of @inode and @dir
+ * @update, @update_dir: pending updates applied via nova_update_inode()
+ * @invalidate:          non-zero to also clear pi->valid and record
+ *                       @epoch_id as pi->delete_epoch_id
+ *
+ * The per-CPU journal lock is held from transaction creation to commit.
+ */
+static void nova_lite_transaction_for_time_and_link(struct super_block *sb,
+	struct nova_inode *pi, struct nova_inode *pidir, struct inode *inode,
+	struct inode *dir, struct nova_inode_update *update,
+	struct nova_inode_update *update_dir, int invalidate, u64 epoch_id)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	u64 journal_tail;
+	int cpu;
+	timing_t trans_time;
+
+	NOVA_START_TIMING(link_trans_t, trans_time);
+
+	cpu = smp_processor_id();
+	spin_lock(&sbi->journal_locks[cpu]);
+
+	/*
+	 * If you change what is required to create a new inode, you need to
+	 * update this function so the changes will be rolled back on failure.
+	 */
+	journal_tail = nova_create_inode_transaction(sb, inode, dir, cpu,
+						0, invalidate);
+
+	if (invalidate) {
+		pi->valid = 0;
+		pi->delete_epoch_id = epoch_id;
+	}
+	nova_update_inode(sb, inode, pi, update);
+
+	nova_update_inode(sb, dir, pidir, update_dir);
+
+	PERSISTENT_BARRIER();
+
+	nova_commit_lite_transaction(sb, journal_tail, cpu);
+	spin_unlock(&sbi->journal_locks[cpu]);
+
+	NOVA_END_TIMING(link_trans_t, trans_time);
+}
+
+/*
+ * nova_link - create the hard link @dentry in @dir to the inode behind
+ * @dest_dentry.
+ *
+ * Adds a dentry for the parent, appends a link-change entry for the
+ * target inode and commits both updates in one lite transaction.
+ *
+ * Returns 0 on success, -EMLINK when the inode already has NOVA_LINK_MAX
+ * links, -EINVAL when the parent's NOVA inode cannot be obtained, or the
+ * error from nova_add_dentry() / nova_append_link_change_entry().
+ */
+static int nova_link(struct dentry *dest_dentry, struct inode *dir,
+		      struct dentry *dentry)
+{
+	struct super_block *sb = dir->i_sb;
+	struct inode *inode = dest_dentry->d_inode;
+	struct nova_inode *pi = nova_get_inode(sb, inode);
+	struct nova_inode *pidir;
+	struct nova_inode_update update_dir;
+	struct nova_inode_update update;
+	u64 old_linkc = 0;
+	u64 epoch_id;
+	int err;
+	timing_t link_time;
+
+	NOVA_START_TIMING(link_t, link_time);
+	if (inode->i_nlink >= NOVA_LINK_MAX) {
+		err = -EMLINK;
+		goto out;
+	}
+
+	pidir = nova_get_inode(sb, dir);
+	if (!pidir) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	/* Reference handed to the dentry by d_instantiate() on success. */
+	ihold(inode);
+	epoch_id = nova_get_epoch_id(sb);
+
+	nova_dbgv("%s: name %s, dest %s\n", __func__,
+			dentry->d_name.name, dest_dentry->d_name.name);
+	nova_dbgv("%s: inode %lu, dir %lu\n", __func__,
+			inode->i_ino, dir->i_ino);
+
+	update_dir.tail = 0;
+	err = nova_add_dentry(dentry, inode->i_ino, 0, &update_dir, epoch_id);
+	if (err)
+		goto out_iput;
+
+	inode->i_ctime = current_time(inode);
+	inc_nlink(inode);
+
+	update.tail = 0;
+	err = nova_append_link_change_entry(sb, pi, inode, &update,
+						&old_linkc, epoch_id);
+	if (err) {
+		/* The new link was never logged: undo the in-memory count. */
+		drop_nlink(inode);
+		goto out_iput;
+	}
+
+	d_instantiate(dentry, inode);
+	nova_lite_transaction_for_time_and_link(sb, pi, pidir, inode, dir,
+					&update, &update_dir, 0, epoch_id);
+
+	nova_invalidate_link_change_entry(sb, old_linkc);
+	goto out;
+
+out_iput:
+	/* Drop the reference taken by ihold() above. */
+	iput(inode);
+out:
+	NOVA_END_TIMING(link_t, link_time);
+	return err;
+}
+
+static int nova_unlink(struct inode *dir, struct dentry *dentry)
+{
+   struct inode *inode = dentry->d_inode;
+   struct super_block *sb = dir->i_sb;
+   int retval = -ENOMEM;
+   struct nova_inode *pi = nova_get_inode(sb, inode);
+   struct nova_inode *pidir;
+   struct nova_inode_update update_dir;
+   struct nova_inode_update update;
+   u64 old_linkc = 0;
+   u64 epoch_id;
+   int invalidate = 0;
+   timing_t unlink_time;
+
+   NOVA_START_TIMING(unlink_t, unlink_time);
+
+   pidir = nova_get_inode(sb, dir);
+   if (!pidir)
+   goto out;
+
+   epoch_id = nova_get_epoch_id(sb);
+   nova_dbgv("%s: %s\n", __func__, dentry->d_name.name);
+   nova_dbgv("%s: inode %lu, dir %lu\n", __func__,
+   inode->i_ino, dir->i_ino);
+
+   update_dir.tail = 0;
+   retval = nova_remove_dentry(dentry, 0, _dir, epoch_id);
+   if (retval)
+   goto out;
+
+   inode->i_ctime = dir->i_ctime;
+
+   if (inode->i_nlink == 1)
+   inva

[RFC v2 56/83] Namei: link and unlink.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

For link change operations, NOVA appends a link change entry
to the affected inode's log, and uses lite transaction to
atomically commit changes to multiple logs.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/namei.c | 159 
 1 file changed, 159 insertions(+)

diff --git a/fs/nova/namei.c b/fs/nova/namei.c
index a95b2fe..360d716 100644
--- a/fs/nova/namei.c
+++ b/fs/nova/namei.c
@@ -207,6 +207,163 @@ static int nova_mknod(struct inode *dir, struct dentry 
*dentry, umode_t mode,
return err;
 }
 
+/*
+ * Commit a link/time change on @inode together with the matching update
+ * of its parent @dir inside one lite transaction.
+ *
+ * @pi, @pidir:          persistent inodes of @inode and @dir
+ * @update, @update_dir: pending updates applied via nova_update_inode()
+ * @invalidate:          non-zero to also clear pi->valid and record
+ *                       @epoch_id as pi->delete_epoch_id
+ *
+ * The per-CPU journal lock is held from transaction creation to commit.
+ */
+static void nova_lite_transaction_for_time_and_link(struct super_block *sb,
+	struct nova_inode *pi, struct nova_inode *pidir, struct inode *inode,
+	struct inode *dir, struct nova_inode_update *update,
+	struct nova_inode_update *update_dir, int invalidate, u64 epoch_id)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	u64 journal_tail;
+	int cpu;
+	timing_t trans_time;
+
+	NOVA_START_TIMING(link_trans_t, trans_time);
+
+	cpu = smp_processor_id();
+	spin_lock(&sbi->journal_locks[cpu]);
+
+	/*
+	 * If you change what is required to create a new inode, you need to
+	 * update this function so the changes will be rolled back on failure.
+	 */
+	journal_tail = nova_create_inode_transaction(sb, inode, dir, cpu,
+						0, invalidate);
+
+	if (invalidate) {
+		pi->valid = 0;
+		pi->delete_epoch_id = epoch_id;
+	}
+	nova_update_inode(sb, inode, pi, update);
+
+	nova_update_inode(sb, dir, pidir, update_dir);
+
+	PERSISTENT_BARRIER();
+
+	nova_commit_lite_transaction(sb, journal_tail, cpu);
+	spin_unlock(&sbi->journal_locks[cpu]);
+
+	NOVA_END_TIMING(link_trans_t, trans_time);
+}
+
+/*
+ * nova_link - create the hard link @dentry in @dir to the inode behind
+ * @dest_dentry.
+ *
+ * Adds a dentry for the parent, appends a link-change entry for the
+ * target inode and commits both updates in one lite transaction.
+ *
+ * Returns 0 on success, -EMLINK when the inode already has NOVA_LINK_MAX
+ * links, -EINVAL when the parent's NOVA inode cannot be obtained, or the
+ * error from nova_add_dentry() / nova_append_link_change_entry().
+ */
+static int nova_link(struct dentry *dest_dentry, struct inode *dir,
+		      struct dentry *dentry)
+{
+	struct super_block *sb = dir->i_sb;
+	struct inode *inode = dest_dentry->d_inode;
+	struct nova_inode *pi = nova_get_inode(sb, inode);
+	struct nova_inode *pidir;
+	struct nova_inode_update update_dir;
+	struct nova_inode_update update;
+	u64 old_linkc = 0;
+	u64 epoch_id;
+	int err;
+	timing_t link_time;
+
+	NOVA_START_TIMING(link_t, link_time);
+	if (inode->i_nlink >= NOVA_LINK_MAX) {
+		err = -EMLINK;
+		goto out;
+	}
+
+	pidir = nova_get_inode(sb, dir);
+	if (!pidir) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	/* Reference handed to the dentry by d_instantiate() on success. */
+	ihold(inode);
+	epoch_id = nova_get_epoch_id(sb);
+
+	nova_dbgv("%s: name %s, dest %s\n", __func__,
+			dentry->d_name.name, dest_dentry->d_name.name);
+	nova_dbgv("%s: inode %lu, dir %lu\n", __func__,
+			inode->i_ino, dir->i_ino);
+
+	update_dir.tail = 0;
+	err = nova_add_dentry(dentry, inode->i_ino, 0, &update_dir, epoch_id);
+	if (err)
+		goto out_iput;
+
+	inode->i_ctime = current_time(inode);
+	inc_nlink(inode);
+
+	update.tail = 0;
+	err = nova_append_link_change_entry(sb, pi, inode, &update,
+						&old_linkc, epoch_id);
+	if (err) {
+		/* The new link was never logged: undo the in-memory count. */
+		drop_nlink(inode);
+		goto out_iput;
+	}
+
+	d_instantiate(dentry, inode);
+	nova_lite_transaction_for_time_and_link(sb, pi, pidir, inode, dir,
+					&update, &update_dir, 0, epoch_id);
+
+	nova_invalidate_link_change_entry(sb, old_linkc);
+	goto out;
+
+out_iput:
+	/* Drop the reference taken by ihold() above. */
+	iput(inode);
+out:
+	NOVA_END_TIMING(link_t, link_time);
+	return err;
+}
+
+static int nova_unlink(struct inode *dir, struct dentry *dentry)
+{
+   struct inode *inode = dentry->d_inode;
+   struct super_block *sb = dir->i_sb;
+   int retval = -ENOMEM;
+   struct nova_inode *pi = nova_get_inode(sb, inode);
+   struct nova_inode *pidir;
+   struct nova_inode_update update_dir;
+   struct nova_inode_update update;
+   u64 old_linkc = 0;
+   u64 epoch_id;
+   int invalidate = 0;
+   timing_t unlink_time;
+
+   NOVA_START_TIMING(unlink_t, unlink_time);
+
+   pidir = nova_get_inode(sb, dir);
+   if (!pidir)
+   goto out;
+
+   epoch_id = nova_get_epoch_id(sb);
+   nova_dbgv("%s: %s\n", __func__, dentry->d_name.name);
+   nova_dbgv("%s: inode %lu, dir %lu\n", __func__,
+   inode->i_ino, dir->i_ino);
+
+   update_dir.tail = 0;
+   retval = nova_remove_dentry(dentry, 0, _dir, epoch_id);
+   if (retval)
+   goto out;
+
+   inode->i_ctime = dir->i_ctime;
+
+   if (inode->i_nlink == 1)
+   invalidate = 1;
+
+   if (inode->i_nlink)
+

[RFC v2 59/83] Namei: setattr

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

Add notify_change for setattr operations. Truncate the file blocks
if the file is shrunk.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/inode.c | 180 
 fs/nova/inode.h |   1 +
 fs/nova/namei.c |   2 +
 3 files changed, 183 insertions(+)

diff --git a/fs/nova/inode.c b/fs/nova/inode.c
index 2d3f7a3..2092a55 100644
--- a/fs/nova/inode.c
+++ b/fs/nova/inode.c
@@ -141,6 +141,58 @@ void nova_set_inode_flags(struct inode *inode, struct 
nova_inode *pi,
inode->i_flags |= S_DAX;
 }
 
+/*
+ * Clear NOVA_EOFBLOCKS_FL on @pi and persist the inode once i_size plus
+ * one block exceeds the byte range covered by sih->i_blocks.
+ */
+static inline void check_eof_blocks(struct super_block *sb,
+	struct nova_inode *pi, struct inode *inode,
+	struct nova_inode_info_header *sih)
+{
+	/* Nothing to do unless the flag is currently set. */
+	if (!(pi->i_flags & cpu_to_le32(NOVA_EOFBLOCKS_FL)))
+		return;
+
+	if ((inode->i_size + sb->s_blocksize) <=
+			(sih->i_blocks << sb->s_blocksize_bits))
+		return;
+
+	pi->i_flags &= cpu_to_le32(~NOVA_EOFBLOCKS_FL);
+	nova_persist_inode(pi);
+}
+
+/*
+ * Free data blocks of @inode in the range start <=> end.
+ *
+ * The freed range runs from the first block that begins at or after
+ * @start through the block containing byte @end - 1; the block size comes
+ * from sih->i_blk_type.  mtime and ctime are updated even when the range
+ * turns out to be empty and nothing is freed.
+ */
+static void nova_truncate_file_blocks(struct inode *inode, loff_t start,
+				    loff_t end, u64 epoch_id)
+{
+	struct super_block *sb = inode->i_sb;
+	struct nova_inode *pi = nova_get_inode(sb, inode);
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode_info_header *sih = &si->header;
+	unsigned int data_bits = blk_type_to_shift[sih->i_blk_type];
+	unsigned long first_blocknr, last_blocknr;
+	int freed;
+
+	inode->i_mtime = inode->i_ctime = current_time(inode);
+
+	nova_dbg_verbose("truncate: pi %p iblocks %lx %llx %llx %llx\n", pi,
+			 sih->i_blocks, start, end, pi->i_size);
+
+	/* Round start up to the next block boundary. */
+	first_blocknr = (start + (1UL << data_bits) - 1) >> data_bits;
+
+	if (end == 0)
+		return;
+	last_blocknr = (end - 1) >> data_bits;
+
+	if (first_blocknr > last_blocknr)
+		return;
+
+	freed = nova_delete_file_tree(sb, sih, first_blocknr,
+				last_blocknr, true, false, epoch_id);
+
+	/* Scale the freed count by the data-block/fs-block size ratio. */
+	inode->i_blocks -= (freed * (1 << (data_bits -
+				sb->s_blocksize_bits)));
+
+	sih->i_blocks = inode->i_blocks;
+	/* Check for the flag EOFBLOCKS is still valid after the set size */
+	check_eof_blocks(sb, pi, inode, sih);
+}
+
 /* copy persistent state to struct inode */
 static int nova_read_inode(struct super_block *sb, struct inode *inode,
u64 pi_addr)
@@ -963,6 +1015,134 @@ void nova_dirty_inode(struct inode *inode, int flags)
nova_flush_buffer(>i_atime, sizeof(pi->i_atime), 0);
 }
 
+/*
+ * Zero the tail of the last block.  Used on a resize request so that
+ * stale data is not kept around in case the file grows again.
+ *
+ * Nothing is done when @newsize is block aligned, when @newsize is larger
+ * than the current i_size, or when nova_find_nvmm_block() returns 0 for
+ * the block at that offset.
+ */
+static void nova_clear_last_page_tail(struct super_block *sb,
+	struct inode *inode, loff_t newsize)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode_info_header *sih = &si->header;
+	unsigned long offset = newsize & (sb->s_blocksize - 1);
+	unsigned long pgoff, length;
+	u64 nvmm;
+	char *nvmm_addr;
+
+	if (offset == 0 || newsize > inode->i_size)
+		return;
+
+	/* Bytes from the new end of file to the end of its block. */
+	length = sb->s_blocksize - offset;
+	pgoff = newsize >> sb->s_blocksize_bits;
+
+	nvmm = nova_find_nvmm_block(sb, sih, NULL, pgoff);
+	if (nvmm == 0)
+		return;
+
+	nvmm_addr = (char *)nova_get_block(sb, nvmm);
+	memcpy_to_pmem_nocache(nvmm_addr + offset, sbi->zeroed_page, length);
+}
+
+static void nova_setsize(struct inode *inode, loff_t oldsize, loff_t newsize,
+   u64 epoch_id)
+{
+   struct super_block *sb = inode->i_sb;
+   struct nova_inode_info *si = NOVA_I(inode);
+   struct nova_inode_info_header *sih = >header;
+   timing_t setsize_time;
+
+   /* We only support truncate regular file */
+   if (!(S_ISREG(inode->i_mode))) {
+   nova_err(inode->i_sb, "%s:wrong file mode %x\n", inode->i_mode);
+   return;
+   }
+
+   NOVA_START_TIMING(setsize_t, setsize_time);
+
+   inode_dio_wait(inode);
+
+   nova_dbgv("%s: inode %lu, old size %llu, new size %llu\n",
+   __func__, inode->i_ino, oldsize, newsize);
+
+   sih_lock(sih);
+   if (newsize != oldsize) {
+   nova_clear_last_page_tail(sb, inode, newsize);
+   i_size_write(inode, newsize);
+   sih->i_size = newsize;
+   }
+
+   /* FIXME: we should make sure that there is nobody reading the inode
+* before truncating it. Also we need to munmap the truncate

[RFC v2 60/83] Add special inode operations.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/inode.c | 2 ++
 fs/nova/namei.c | 5 +
 fs/nova/nova.h  | 1 +
 3 files changed, 8 insertions(+)

diff --git a/fs/nova/inode.c b/fs/nova/inode.c
index 2092a55..0e9ab4b 100644
--- a/fs/nova/inode.c
+++ b/fs/nova/inode.c
@@ -239,6 +239,7 @@ static int nova_read_inode(struct super_block *sb, struct 
inode *inode,
case S_IFLNK:
break;
default:
+   inode->i_op = _special_inode_operations;
init_special_inode(inode, inode->i_mode,
   le32_to_cpu(pi->dev.rdev));
break;
@@ -929,6 +930,7 @@ struct inode *nova_new_vfs_inode(enum nova_new_inode_type 
type,
break;
case TYPE_MKNOD:
init_special_inode(inode, mode, rdev);
+   inode->i_op = _special_inode_operations;
break;
case TYPE_SYMLINK:
inode->i_mapping->a_ops = _aops_dax;
diff --git a/fs/nova/namei.c b/fs/nova/namei.c
index 1966bff..7a81672 100644
--- a/fs/nova/namei.c
+++ b/fs/nova/namei.c
@@ -771,3 +771,8 @@ const struct inode_operations nova_dir_inode_operations = {
.setattr= nova_notify_change,
.get_acl= NULL,
 };
+
+/* Inode operations for inodes set up through init_special_inode(). */
+const struct inode_operations nova_special_inode_operations = {
+	.get_acl	= NULL,
+	.setattr	= nova_notify_change,
+};
diff --git a/fs/nova/nova.h b/fs/nova/nova.h
index 03ea0bd..85292d3 100644
--- a/fs/nova/nova.h
+++ b/fs/nova/nova.h
@@ -486,6 +486,7 @@ int nova_remove_dentry(struct dentry *dentry, int dec_link,
 
 /* namei.c */
 extern const struct inode_operations nova_dir_inode_operations;
+extern const struct inode_operations nova_special_inode_operations;
 extern struct dentry *nova_get_parent(struct dentry *child);
 
 /* rebuild.c */
-- 
2.7.4



[RFC v2 60/83] Add special inode operations.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/inode.c | 2 ++
 fs/nova/namei.c | 5 +
 fs/nova/nova.h  | 1 +
 3 files changed, 8 insertions(+)

diff --git a/fs/nova/inode.c b/fs/nova/inode.c
index 2092a55..0e9ab4b 100644
--- a/fs/nova/inode.c
+++ b/fs/nova/inode.c
@@ -239,6 +239,7 @@ static int nova_read_inode(struct super_block *sb, struct 
inode *inode,
case S_IFLNK:
break;
default:
+   inode->i_op = _special_inode_operations;
init_special_inode(inode, inode->i_mode,
   le32_to_cpu(pi->dev.rdev));
break;
@@ -929,6 +930,7 @@ struct inode *nova_new_vfs_inode(enum nova_new_inode_type 
type,
break;
case TYPE_MKNOD:
init_special_inode(inode, mode, rdev);
+   inode->i_op = _special_inode_operations;
break;
case TYPE_SYMLINK:
inode->i_mapping->a_ops = _aops_dax;
diff --git a/fs/nova/namei.c b/fs/nova/namei.c
index 1966bff..7a81672 100644
--- a/fs/nova/namei.c
+++ b/fs/nova/namei.c
@@ -771,3 +771,8 @@ const struct inode_operations nova_dir_inode_operations = {
.setattr= nova_notify_change,
.get_acl= NULL,
 };
+
+/* Inode operations for inodes set up through init_special_inode(). */
+const struct inode_operations nova_special_inode_operations = {
+	.get_acl	= NULL,
+	.setattr	= nova_notify_change,
+};
diff --git a/fs/nova/nova.h b/fs/nova/nova.h
index 03ea0bd..85292d3 100644
--- a/fs/nova/nova.h
+++ b/fs/nova/nova.h
@@ -486,6 +486,7 @@ int nova_remove_dentry(struct dentry *dentry, int dec_link,
 
 /* namei.c */
 extern const struct inode_operations nova_dir_inode_operations;
+extern const struct inode_operations nova_special_inode_operations;
 extern struct dentry *nova_get_parent(struct dentry *child);
 
 /* rebuild.c */
-- 
2.7.4



[RFC v2 59/83] Namei: setattr

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

Add notify_change for setattr operations. Truncate the file blocks
if the file is shrunk.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/inode.c | 180 
 fs/nova/inode.h |   1 +
 fs/nova/namei.c |   2 +
 3 files changed, 183 insertions(+)

diff --git a/fs/nova/inode.c b/fs/nova/inode.c
index 2d3f7a3..2092a55 100644
--- a/fs/nova/inode.c
+++ b/fs/nova/inode.c
@@ -141,6 +141,58 @@ void nova_set_inode_flags(struct inode *inode, struct 
nova_inode *pi,
inode->i_flags |= S_DAX;
 }
 
+/*
+ * Clear NOVA_EOFBLOCKS_FL on @pi and persist the inode once i_size plus
+ * one block exceeds the byte range covered by sih->i_blocks.
+ */
+static inline void check_eof_blocks(struct super_block *sb,
+	struct nova_inode *pi, struct inode *inode,
+	struct nova_inode_info_header *sih)
+{
+	/* Nothing to do unless the flag is currently set. */
+	if (!(pi->i_flags & cpu_to_le32(NOVA_EOFBLOCKS_FL)))
+		return;
+
+	if ((inode->i_size + sb->s_blocksize) <=
+			(sih->i_blocks << sb->s_blocksize_bits))
+		return;
+
+	pi->i_flags &= cpu_to_le32(~NOVA_EOFBLOCKS_FL);
+	nova_persist_inode(pi);
+}
+
+/*
+ * Free data blocks of @inode in the range start <=> end.
+ *
+ * The freed range runs from the first block that begins at or after
+ * @start through the block containing byte @end - 1; the block size comes
+ * from sih->i_blk_type.  mtime and ctime are updated even when the range
+ * turns out to be empty and nothing is freed.
+ */
+static void nova_truncate_file_blocks(struct inode *inode, loff_t start,
+				    loff_t end, u64 epoch_id)
+{
+	struct super_block *sb = inode->i_sb;
+	struct nova_inode *pi = nova_get_inode(sb, inode);
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode_info_header *sih = &si->header;
+	unsigned int data_bits = blk_type_to_shift[sih->i_blk_type];
+	unsigned long first_blocknr, last_blocknr;
+	int freed;
+
+	inode->i_mtime = inode->i_ctime = current_time(inode);
+
+	nova_dbg_verbose("truncate: pi %p iblocks %lx %llx %llx %llx\n", pi,
+			 sih->i_blocks, start, end, pi->i_size);
+
+	/* Round start up to the next block boundary. */
+	first_blocknr = (start + (1UL << data_bits) - 1) >> data_bits;
+
+	if (end == 0)
+		return;
+	last_blocknr = (end - 1) >> data_bits;
+
+	if (first_blocknr > last_blocknr)
+		return;
+
+	freed = nova_delete_file_tree(sb, sih, first_blocknr,
+				last_blocknr, true, false, epoch_id);
+
+	/* Scale the freed count by the data-block/fs-block size ratio. */
+	inode->i_blocks -= (freed * (1 << (data_bits -
+				sb->s_blocksize_bits)));
+
+	sih->i_blocks = inode->i_blocks;
+	/* Check for the flag EOFBLOCKS is still valid after the set size */
+	check_eof_blocks(sb, pi, inode, sih);
+}
+
 /* copy persistent state to struct inode */
 static int nova_read_inode(struct super_block *sb, struct inode *inode,
u64 pi_addr)
@@ -963,6 +1015,134 @@ void nova_dirty_inode(struct inode *inode, int flags)
nova_flush_buffer(>i_atime, sizeof(pi->i_atime), 0);
 }
 
+/*
+ * Zero the tail of the last block.  Used on a resize request so that
+ * stale data is not kept around in case the file grows again.
+ *
+ * Nothing is done when @newsize is block aligned, when @newsize is larger
+ * than the current i_size, or when nova_find_nvmm_block() returns 0 for
+ * the block at that offset.
+ */
+static void nova_clear_last_page_tail(struct super_block *sb,
+	struct inode *inode, loff_t newsize)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode_info_header *sih = &si->header;
+	unsigned long offset = newsize & (sb->s_blocksize - 1);
+	unsigned long pgoff, length;
+	u64 nvmm;
+	char *nvmm_addr;
+
+	if (offset == 0 || newsize > inode->i_size)
+		return;
+
+	/* Bytes from the new end of file to the end of its block. */
+	length = sb->s_blocksize - offset;
+	pgoff = newsize >> sb->s_blocksize_bits;
+
+	nvmm = nova_find_nvmm_block(sb, sih, NULL, pgoff);
+	if (nvmm == 0)
+		return;
+
+	nvmm_addr = (char *)nova_get_block(sb, nvmm);
+	memcpy_to_pmem_nocache(nvmm_addr + offset, sbi->zeroed_page, length);
+}
+
+static void nova_setsize(struct inode *inode, loff_t oldsize, loff_t newsize,
+   u64 epoch_id)
+{
+   struct super_block *sb = inode->i_sb;
+   struct nova_inode_info *si = NOVA_I(inode);
+   struct nova_inode_info_header *sih = >header;
+   timing_t setsize_time;
+
+   /* We only support truncate regular file */
+   if (!(S_ISREG(inode->i_mode))) {
+   nova_err(inode->i_sb, "%s:wrong file mode %x\n", inode->i_mode);
+   return;
+   }
+
+   NOVA_START_TIMING(setsize_t, setsize_time);
+
+   inode_dio_wait(inode);
+
+   nova_dbgv("%s: inode %lu, old size %llu, new size %llu\n",
+   __func__, inode->i_ino, oldsize, newsize);
+
+   sih_lock(sih);
+   if (newsize != oldsize) {
+   nova_clear_last_page_tail(sb, inode, newsize);
+   i_size_write(inode, newsize);
+   sih->i_size = newsize;
+   }
+
+   /* FIXME: we should make sure that there is nobody reading the inode
+* before truncating it. Also we need to munmap the truncated range
+* from applicat

[RFC v2 61/83] Super: Add nova_export_ops.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/super.c | 48 
 1 file changed, 48 insertions(+)

diff --git a/fs/nova/super.c b/fs/nova/super.c
index daf3270..0847e57 100644
--- a/fs/nova/super.c
+++ b/fs/nova/super.c
@@ -51,6 +51,7 @@ module_param(nova_dbgmask, int, 0444);
 MODULE_PARM_DESC(nova_dbgmask, "Control debugging output");
 
 static struct super_operations nova_sops;
+static const struct export_operations nova_export_ops;
 
 static struct kmem_cache *nova_inode_cachep;
 static struct kmem_cache *nova_range_node_cachep;
@@ -631,6 +632,7 @@ static int nova_fill_super(struct super_block *sb, void 
*data, int silent)
sb->s_op = _sops;
sb->s_maxbytes = nova_max_size(sb->s_blocksize_bits);
sb->s_time_gran = 10; // 1 second.
+   sb->s_export_op = _export_ops;
sb->s_xattr = NULL;
sb->s_flags |= MS_NOSEC;
 
@@ -904,6 +906,52 @@ static struct file_system_type nova_fs_type = {
.kill_sb= kill_block_super,
 };
 
+/*
+ * Resolve an (inode number, generation) pair from a file handle.
+ *
+ * Returns the inode from nova_iget(), its error pointer on failure, or
+ * ERR_PTR(-ESTALE) when @ino lies outside [NOVA_ROOT_INO, LONG_MAX] or a
+ * non-zero @generation does not match the inode's i_generation.
+ */
+static struct inode *nova_nfs_get_inode(struct super_block *sb,
+					 u64 ino, u32 generation)
+{
+	struct inode *found;
+
+	if (ino < NOVA_ROOT_INO || ino > LONG_MAX)
+		return ERR_PTR(-ESTALE);
+
+	found = nova_iget(sb, ino);
+	if (IS_ERR(found))
+		return ERR_CAST(found);
+
+	/* A zero generation skips the check. */
+	if (!generation || found->i_generation == generation)
+		return found;
+
+	/* we didn't find the right inode.. */
+	iput(found);
+	return ERR_PTR(-ESTALE);
+}
+
+/* Decode @fid into a dentry through generic_fh_to_dentry(). */
+static struct dentry *nova_fh_to_dentry(struct super_block *sb,
+					 struct fid *fid, int fh_len,
+					 int fh_type)
+{
+	struct dentry *result;
+
+	result = generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+					nova_nfs_get_inode);
+	return result;
+}
+
+/* Decode the parent part of @fid through generic_fh_to_parent(). */
+static struct dentry *nova_fh_to_parent(struct super_block *sb,
+					 struct fid *fid, int fh_len,
+					 int fh_type)
+{
+	struct dentry *result;
+
+	result = generic_fh_to_parent(sb, fid, fh_len, fh_type,
+					nova_nfs_get_inode);
+	return result;
+}
+
+/* Export operations installed as sb->s_export_op. */
+static const struct export_operations nova_export_ops = {
+	.get_parent	= nova_get_parent,
+	.fh_to_dentry	= nova_fh_to_dentry,
+	.fh_to_parent	= nova_fh_to_parent,
+};
+
 static int __init init_nova_fs(void)
 {
int rc = 0;
-- 
2.7.4



[RFC v2 54/83] Namei: create and mknod.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

NOVA allocates and initializes a new inode, and appends a dentry
to the directory's log. Then NOVA creates a transaction to
commit both changes atomically: update the directory log tail
pointer and validate the new inode.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/namei.c | 141 
 1 file changed, 141 insertions(+)

diff --git a/fs/nova/namei.c b/fs/nova/namei.c
index 8076f5b..a07cc4f 100644
--- a/fs/nova/namei.c
+++ b/fs/nova/namei.c
@@ -68,6 +68,145 @@ static struct dentry *nova_lookup(struct inode *dir, struct 
dentry *dentry,
return d_splice_alias(inode, dentry);
 }
 
+/*
+ * Commit the creation of @inode in one lite transaction: apply @update to
+ * the parent (@dir / @pidir) and mark the new persistent inode @pi valid.
+ *
+ * The per-CPU journal lock is held from transaction creation to commit.
+ */
+static void nova_lite_transaction_for_new_inode(struct super_block *sb,
+	struct nova_inode *pi, struct nova_inode *pidir, struct inode *inode,
+	struct inode *dir, struct nova_inode_update *update)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	int cpu;
+	u64 journal_tail;
+	timing_t trans_time;
+
+	NOVA_START_TIMING(create_trans_t, trans_time);
+
+	cpu = smp_processor_id();
+	spin_lock(&sbi->journal_locks[cpu]);
+
+	/*
+	 * If you change what is required to create a new inode, you need to
+	 * update this function so the changes will be rolled back on failure.
+	 */
+	journal_tail = nova_create_inode_transaction(sb, inode, dir, cpu, 1, 0);
+
+	nova_update_inode(sb, dir, pidir, update);
+
+	pi->valid = 1;
+	nova_persist_inode(pi);
+	PERSISTENT_BARRIER();
+
+	nova_commit_lite_transaction(sb, journal_tail, cpu);
+	spin_unlock(&sbi->journal_locks[cpu]);
+
+	NOVA_END_TIMING(create_trans_t, trans_time);
+}
+
+/*
+ * By the time this is called, we already have created
+ * the directory cache entry for the new file, but it
+ * is so far negative - it has no inode.
+ *
+ * If the create succeeds, we fill in the inode information
+ * with d_instantiate().
+ *
+ * Returns 0 on success or a negative errno on failure.
+ */
+static int nova_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+			bool excl)
+{
+	struct inode *inode;
+	int err;
+	struct super_block *sb = dir->i_sb;
+	struct nova_inode *pidir, *pi;
+	struct nova_inode_update update;
+	u64 pi_addr = 0;
+	u64 ino, epoch_id;
+	timing_t create_time;
+
+	NOVA_START_TIMING(create_t, create_time);
+
+	pidir = nova_get_inode(sb, dir);
+	if (!pidir) {
+		err = -EINVAL;
+		goto out_err;
+	}
+
+	epoch_id = nova_get_epoch_id(sb);
+	ino = nova_new_nova_inode(sb, &pi_addr);
+	if (ino == 0) {
+		/*
+		 * NOTE(review): the cause of the failure is not visible
+		 * here; -ENOSPC is assumed - confirm against
+		 * nova_new_nova_inode().
+		 */
+		err = -ENOSPC;
+		goto out_err;
+	}
+
+	update.tail = 0;
+	err = nova_add_dentry(dentry, ino, 0, &update, epoch_id);
+	if (err)
+		goto out_err;
+
+	nova_dbgv("%s: %s\n", __func__, dentry->d_name.name);
+	nova_dbgv("%s: inode %llu, dir %lu\n", __func__, ino, dir->i_ino);
+	inode = nova_new_vfs_inode(TYPE_CREATE, dir, pi_addr, ino, mode,
+					0, 0, &dentry->d_name, epoch_id);
+	if (IS_ERR(inode)) {
+		/* Report the real error instead of the 0 left in err. */
+		err = PTR_ERR(inode);
+		goto out_err;
+	}
+
+	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
+
+	pi = nova_get_block(sb, pi_addr);
+	nova_lite_transaction_for_new_inode(sb, pi, pidir, inode, dir,
+						&update);
+	NOVA_END_TIMING(create_t, create_time);
+	return err;
+out_err:
+	nova_err(sb, "%s return %d\n", __func__, err);
+	NOVA_END_TIMING(create_t, create_time);
+	return err;
+}
+
+static int nova_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
+  dev_t rdev)
+{
+   struct inode *inode = NULL;
+   int err = PTR_ERR(inode);
+   struct super_block *sb = dir->i_sb;
+   u64 pi_addr = 0;
+   struct nova_inode *pidir, *pi;
+   struct nova_inode_update update;
+   u64 ino;
+   u64 epoch_id;
+   timing_t mknod_time;
+
+   NOVA_START_TIMING(mknod_t, mknod_time);
+
+   pidir = nova_get_inode(sb, dir);
+   if (!pidir)
+   goto out_err;
+
+   epoch_id = nova_get_epoch_id(sb);
+   ino = nova_new_nova_inode(sb, _addr);
+   if (ino == 0)
+   goto out_err;
+
+   nova_dbgv("%s: %s\n", __func__, dentry->d_name.name);
+   nova_dbgv("%s: inode %llu, dir %lu\n", __func__, ino, dir->i_ino);
+
+   update.tail = 0;
+   err = nova_add_dentry(dentry, ino, 0, , epoch_id);
+   if (err)
+   goto out_err;
+
+   inode = nova_new_vfs_inode(TYPE_MKNOD, dir, pi_addr, ino, mode,
+   0, rdev, >d_name, epoch_id);
+   if (IS_ERR(inode))
+   goto out_err;
+
+   d_instantiate(dentry, inode);
+   unlock_new_inode(inode);
+
+   pi = nov

[RFC v2 62/83] File: getattr and file inode operations

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/Makefile |  2 +-
 fs/nova/file.c   | 31 +++
 fs/nova/inode.c  | 25 +
 fs/nova/inode.h  |  2 ++
 fs/nova/nova.h   |  3 +++
 5 files changed, 62 insertions(+), 1 deletion(-)
 create mode 100644 fs/nova/file.c

diff --git a/fs/nova/Makefile b/fs/nova/Makefile
index eb97e46..468ed6f 100644
--- a/fs/nova/Makefile
+++ b/fs/nova/Makefile
@@ -4,5 +4,5 @@
 
 obj-$(CONFIG_NOVA_FS) += nova.o
 
-nova-y := balloc.o bbuild.o dir.o inode.o journal.o log.o namei.o\
+nova-y := balloc.o bbuild.o dir.o file.o inode.o journal.o log.o namei.o\
  rebuild.o stats.o super.o
diff --git a/fs/nova/file.c b/fs/nova/file.c
new file mode 100644
index 000..b46d4bd
--- /dev/null
+++ b/fs/nova/file.c
@@ -0,0 +1,31 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * File operations for files.
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu <jix...@cs.ucsd.edu>
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.storne...@gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "nova.h"
+#include "inode.h"
+
+
+/* Inode operations for regular files. */
+const struct inode_operations nova_file_inode_operations = {
+	.getattr	= nova_getattr,
+	.setattr	= nova_notify_change,
+	.get_acl	= NULL,
+};
diff --git a/fs/nova/inode.c b/fs/nova/inode.c
index 0e9ab4b..6fcc5e7 100644
--- a/fs/nova/inode.c
+++ b/fs/nova/inode.c
@@ -231,6 +231,7 @@ static int nova_read_inode(struct super_block *sb, struct 
inode *inode,
 
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
+   inode->i_op = &nova_file_inode_operations;
break;
case S_IFDIR:
inode->i_op = _dir_inode_operations;
@@ -926,6 +927,7 @@ struct inode *nova_new_vfs_inode(enum nova_new_inode_type 
type,
 
switch (type) {
case TYPE_CREATE:
+   inode->i_op = &nova_file_inode_operations;
inode->i_mapping->a_ops = _aops_dax;
break;
case TYPE_MKNOD:
@@ -1089,6 +1091,29 @@ static void nova_setsize(struct inode *inode, loff_t 
oldsize, loff_t newsize,
NOVA_END_TIMING(setsize_t, setsize_time);
 }
 
+int nova_getattr(const struct path *path, struct kstat *stat,
+u32 request_mask, unsigned int query_flags)
+{
+   struct inode *inode = d_inode(path->dentry);
+   struct nova_inode_info *si = NOVA_I(inode);
+   struct nova_inode_info_header *sih = &si->header;
+   unsigned int flags = sih->i_flags;
+
+   if (flags & FS_APPEND_FL)
+   stat->attributes |= STATX_ATTR_APPEND;
+   if (flags & FS_COMPR_FL)
+   stat->attributes |= STATX_ATTR_COMPRESSED;
+   if (flags & FS_IMMUTABLE_FL)
+   stat->attributes |= STATX_ATTR_IMMUTABLE;
+   if (flags & FS_NODUMP_FL)
+   stat->attributes |= STATX_ATTR_NODUMP;
+
+   generic_fillattr(inode, stat);
+   /* stat->blocks should be the number of 512B blocks */
+   stat->blocks = (inode->i_blocks << inode->i_sb->s_blocksize_bits) >> 9;
+   return 0;
+}
+
 int nova_notify_change(struct dentry *dentry, struct iattr *attr)
 {
struct inode *inode = dentry->d_inode;
diff --git a/fs/nova/inode.h b/fs/nova/inode.h
index 4ddf8c2..48403cf 100644
--- a/fs/nova/inode.h
+++ b/fs/nova/inode.h
@@ -267,6 +267,8 @@ int nova_delete_file_tree(struct super_block *sb,
 extern void nova_evict_inode(struct inode *inode);
 extern int nova_write_inode(struct inode *inode, struct writeback_control 
*wbc);
 extern void nova_dirty_inode(struct inode *inode, int flags);
+extern int nova_getattr(const struct path *path, struct kstat *stat,
+u32 request_mask, unsigned int query_flags);
 extern int nova_notify_change(struct dentry *dentry, struct iattr *attr);
 
 #endif
diff --git a/fs/nova/nova.h b/fs/nova/nova.h
index 85292d3..601e082 100644
--- a/fs/nova/nova.h
+++ b/fs/nova/nova.h
@@ -484,6 +484,9 @@ int nova_add_dentry(struct dentry *dentry, u64 ino, int 
inc_link,
 int nova_remove_dentry(struct dentry *dentry, int dec_link,
struct nova_inode_update *update, u64 epoch_id);
 
+/* file.c */
+extern const struct inode_operations nova_file_inode_operations;
+
 /* namei.c */
 extern const struct inode_operations nova_dir_inode_operations;
 extern const struct inode_operations nova_special_inode_operations;
-- 
2.7.4



[RFC v2 54/83] Namei: create and mknod.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

NOVA allocates and initializes a new inode, and appends a dentry
to the directory's log. Then NOVA creates a transaction to
commit both changes atomically: update the directory log tail
pointer and validate the new inode.

Signed-off-by: Andiry Xu 
---
 fs/nova/namei.c | 141 
 1 file changed, 141 insertions(+)

diff --git a/fs/nova/namei.c b/fs/nova/namei.c
index 8076f5b..a07cc4f 100644
--- a/fs/nova/namei.c
+++ b/fs/nova/namei.c
@@ -68,6 +68,145 @@ static struct dentry *nova_lookup(struct inode *dir, struct 
dentry *dentry,
return d_splice_alias(inode, dentry);
 }
 
+static void nova_lite_transaction_for_new_inode(struct super_block *sb,
+   struct nova_inode *pi, struct nova_inode *pidir, struct inode *inode,
+   struct inode *dir, struct nova_inode_update *update)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   int cpu;
+   u64 journal_tail;
+   timing_t trans_time;
+
+   NOVA_START_TIMING(create_trans_t, trans_time);
+
+   cpu = smp_processor_id();
+   spin_lock(&sbi->journal_locks[cpu]);
+
+   // If you change what's required to create a new inode, you need to
+   // update this function so the changes will be rolled back on failure.
+   journal_tail = nova_create_inode_transaction(sb, inode, dir, cpu, 1, 0);
+
+   nova_update_inode(sb, dir, pidir, update);
+
+   pi->valid = 1;
+   nova_persist_inode(pi);
+   PERSISTENT_BARRIER();
+
+   nova_commit_lite_transaction(sb, journal_tail, cpu);
+   spin_unlock(&sbi->journal_locks[cpu]);
+
+   NOVA_END_TIMING(create_trans_t, trans_time);
+}
+
+/* Returns new tail after append */
+/*
+ * By the time this is called, we already have created
+ * the directory cache entry for the new file, but it
+ * is so far negative - it has no inode.
+ *
+ * If the create succeeds, we fill in the inode information
+ * with d_instantiate().
+ */
+static int nova_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+   bool excl)
+{
+   struct inode *inode = NULL;
+   int err = PTR_ERR(inode);
+   struct super_block *sb = dir->i_sb;
+   struct nova_inode *pidir, *pi;
+   struct nova_inode_update update;
+   u64 pi_addr = 0;
+   u64 ino, epoch_id;
+   timing_t create_time;
+
+   NOVA_START_TIMING(create_t, create_time);
+
+   pidir = nova_get_inode(sb, dir);
+   if (!pidir)
+   goto out_err;
+
+   epoch_id = nova_get_epoch_id(sb);
+   ino = nova_new_nova_inode(sb, &pi_addr);
+   if (ino == 0)
+   goto out_err;
+
+   update.tail = 0;
+   err = nova_add_dentry(dentry, ino, 0, &update, epoch_id);
+   if (err)
+   goto out_err;
+
+   nova_dbgv("%s: %s\n", __func__, dentry->d_name.name);
+   nova_dbgv("%s: inode %llu, dir %lu\n", __func__, ino, dir->i_ino);
+   inode = nova_new_vfs_inode(TYPE_CREATE, dir, pi_addr, ino, mode,
+   0, 0, &dentry->d_name, epoch_id);
+   if (IS_ERR(inode))
+   goto out_err;
+
+   d_instantiate(dentry, inode);
+   unlock_new_inode(inode);
+
+   pi = nova_get_block(sb, pi_addr);
+   nova_lite_transaction_for_new_inode(sb, pi, pidir, inode, dir,
+   &update);
+   NOVA_END_TIMING(create_t, create_time);
+   return err;
+out_err:
+   nova_err(sb, "%s return %d\n", __func__, err);
+   NOVA_END_TIMING(create_t, create_time);
+   return err;
+}
+
+static int nova_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
+  dev_t rdev)
+{
+   struct inode *inode = NULL;
+   int err = PTR_ERR(inode);
+   struct super_block *sb = dir->i_sb;
+   u64 pi_addr = 0;
+   struct nova_inode *pidir, *pi;
+   struct nova_inode_update update;
+   u64 ino;
+   u64 epoch_id;
+   timing_t mknod_time;
+
+   NOVA_START_TIMING(mknod_t, mknod_time);
+
+   pidir = nova_get_inode(sb, dir);
+   if (!pidir)
+   goto out_err;
+
+   epoch_id = nova_get_epoch_id(sb);
+   ino = nova_new_nova_inode(sb, &pi_addr);
+   if (ino == 0)
+   goto out_err;
+
+   nova_dbgv("%s: %s\n", __func__, dentry->d_name.name);
+   nova_dbgv("%s: inode %llu, dir %lu\n", __func__, ino, dir->i_ino);
+
+   update.tail = 0;
+   err = nova_add_dentry(dentry, ino, 0, &update, epoch_id);
+   if (err)
+   goto out_err;
+
+   inode = nova_new_vfs_inode(TYPE_MKNOD, dir, pi_addr, ino, mode,
+   0, rdev, &dentry->d_name, epoch_id);
+   if (IS_ERR(inode))
+   goto out_err;
+
+   d_instantiate(dentry, inode);
+   unlock_new_inode(inode);
+
+   pi = nova_get_block(sb, pi_addr);
+   nova_lite_transaction_for_new_inode(sb, pi, pidir, inode, dir,
+  

[RFC v2 62/83] File: getattr and file inode operations

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

Signed-off-by: Andiry Xu 
---
 fs/nova/Makefile |  2 +-
 fs/nova/file.c   | 31 +++
 fs/nova/inode.c  | 25 +
 fs/nova/inode.h  |  2 ++
 fs/nova/nova.h   |  3 +++
 5 files changed, 62 insertions(+), 1 deletion(-)
 create mode 100644 fs/nova/file.c

diff --git a/fs/nova/Makefile b/fs/nova/Makefile
index eb97e46..468ed6f 100644
--- a/fs/nova/Makefile
+++ b/fs/nova/Makefile
@@ -4,5 +4,5 @@
 
 obj-$(CONFIG_NOVA_FS) += nova.o
 
-nova-y := balloc.o bbuild.o dir.o inode.o journal.o log.o namei.o\
+nova-y := balloc.o bbuild.o dir.o file.o inode.o journal.o log.o namei.o\
  rebuild.o stats.o super.o
diff --git a/fs/nova/file.c b/fs/nova/file.c
new file mode 100644
index 000..b46d4bd
--- /dev/null
+++ b/fs/nova/file.c
@@ -0,0 +1,31 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * File operations for files.
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu 
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli 
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "nova.h"
+#include "inode.h"
+
+
+const struct inode_operations nova_file_inode_operations = {
+   .setattr= nova_notify_change,
+   .getattr= nova_getattr,
+   .get_acl= NULL,
+};
diff --git a/fs/nova/inode.c b/fs/nova/inode.c
index 0e9ab4b..6fcc5e7 100644
--- a/fs/nova/inode.c
+++ b/fs/nova/inode.c
@@ -231,6 +231,7 @@ static int nova_read_inode(struct super_block *sb, struct 
inode *inode,
 
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
+   inode->i_op = &nova_file_inode_operations;
break;
case S_IFDIR:
inode->i_op = _dir_inode_operations;
@@ -926,6 +927,7 @@ struct inode *nova_new_vfs_inode(enum nova_new_inode_type 
type,
 
switch (type) {
case TYPE_CREATE:
+   inode->i_op = &nova_file_inode_operations;
inode->i_mapping->a_ops = _aops_dax;
break;
case TYPE_MKNOD:
@@ -1089,6 +1091,29 @@ static void nova_setsize(struct inode *inode, loff_t 
oldsize, loff_t newsize,
NOVA_END_TIMING(setsize_t, setsize_time);
 }
 
+int nova_getattr(const struct path *path, struct kstat *stat,
+u32 request_mask, unsigned int query_flags)
+{
+   struct inode *inode = d_inode(path->dentry);
+   struct nova_inode_info *si = NOVA_I(inode);
+   struct nova_inode_info_header *sih = &si->header;
+   unsigned int flags = sih->i_flags;
+
+   if (flags & FS_APPEND_FL)
+   stat->attributes |= STATX_ATTR_APPEND;
+   if (flags & FS_COMPR_FL)
+   stat->attributes |= STATX_ATTR_COMPRESSED;
+   if (flags & FS_IMMUTABLE_FL)
+   stat->attributes |= STATX_ATTR_IMMUTABLE;
+   if (flags & FS_NODUMP_FL)
+   stat->attributes |= STATX_ATTR_NODUMP;
+
+   generic_fillattr(inode, stat);
+   /* stat->blocks should be the number of 512B blocks */
+   stat->blocks = (inode->i_blocks << inode->i_sb->s_blocksize_bits) >> 9;
+   return 0;
+}
+
 int nova_notify_change(struct dentry *dentry, struct iattr *attr)
 {
struct inode *inode = dentry->d_inode;
diff --git a/fs/nova/inode.h b/fs/nova/inode.h
index 4ddf8c2..48403cf 100644
--- a/fs/nova/inode.h
+++ b/fs/nova/inode.h
@@ -267,6 +267,8 @@ int nova_delete_file_tree(struct super_block *sb,
 extern void nova_evict_inode(struct inode *inode);
 extern int nova_write_inode(struct inode *inode, struct writeback_control 
*wbc);
 extern void nova_dirty_inode(struct inode *inode, int flags);
+extern int nova_getattr(const struct path *path, struct kstat *stat,
+u32 request_mask, unsigned int query_flags);
 extern int nova_notify_change(struct dentry *dentry, struct iattr *attr);
 
 #endif
diff --git a/fs/nova/nova.h b/fs/nova/nova.h
index 85292d3..601e082 100644
--- a/fs/nova/nova.h
+++ b/fs/nova/nova.h
@@ -484,6 +484,9 @@ int nova_add_dentry(struct dentry *dentry, u64 ino, int 
inc_link,
 int nova_remove_dentry(struct dentry *dentry, int dec_link,
struct nova_inode_update *update, u64 epoch_id);
 
+/* file.c */
+extern const struct inode_operations nova_file_inode_operations;
+
 /* namei.c */
 extern const struct inode_operations nova_dir_inode_operations;
 extern const struct inode_operations nova_special_inode_operations;
-- 
2.7.4



[RFC v2 61/83] Super: Add nova_export_ops.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

Signed-off-by: Andiry Xu 
---
 fs/nova/super.c | 48 
 1 file changed, 48 insertions(+)

diff --git a/fs/nova/super.c b/fs/nova/super.c
index daf3270..0847e57 100644
--- a/fs/nova/super.c
+++ b/fs/nova/super.c
@@ -51,6 +51,7 @@ module_param(nova_dbgmask, int, 0444);
 MODULE_PARM_DESC(nova_dbgmask, "Control debugging output");
 
 static struct super_operations nova_sops;
+static const struct export_operations nova_export_ops;
 
 static struct kmem_cache *nova_inode_cachep;
 static struct kmem_cache *nova_range_node_cachep;
@@ -631,6 +632,7 @@ static int nova_fill_super(struct super_block *sb, void 
*data, int silent)
sb->s_op = _sops;
sb->s_maxbytes = nova_max_size(sb->s_blocksize_bits);
sb->s_time_gran = 1000000000; // 1 second.
+   sb->s_export_op = &nova_export_ops;
sb->s_xattr = NULL;
sb->s_flags |= MS_NOSEC;
 
@@ -904,6 +906,52 @@ static struct file_system_type nova_fs_type = {
.kill_sb= kill_block_super,
 };
 
+static struct inode *nova_nfs_get_inode(struct super_block *sb,
+u64 ino, u32 generation)
+{
+   struct inode *inode;
+
+   if (ino < NOVA_ROOT_INO)
+   return ERR_PTR(-ESTALE);
+
+   if (ino > LONG_MAX)
+   return ERR_PTR(-ESTALE);
+
+   inode = nova_iget(sb, ino);
+   if (IS_ERR(inode))
+   return ERR_CAST(inode);
+
+   if (generation && inode->i_generation != generation) {
+   /* we didn't find the right inode.. */
+   iput(inode);
+   return ERR_PTR(-ESTALE);
+   }
+
+   return inode;
+}
+
+static struct dentry *nova_fh_to_dentry(struct super_block *sb,
+struct fid *fid, int fh_len,
+int fh_type)
+{
+   return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+   nova_nfs_get_inode);
+}
+
+static struct dentry *nova_fh_to_parent(struct super_block *sb,
+struct fid *fid, int fh_len,
+int fh_type)
+{
+   return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+   nova_nfs_get_inode);
+}
+
+static const struct export_operations nova_export_ops = {
+   .fh_to_dentry   = nova_fh_to_dentry,
+   .fh_to_parent   = nova_fh_to_parent,
+   .get_parent = nova_get_parent,
+};
+
 static int __init init_nova_fs(void)
 {
int rc = 0;
-- 
2.7.4



[RFC v2 53/83] Namei: lookup.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

NOVA looks up the inode number by searching the radix tree with
the filename hash value and locating the corresponding dentry in the log.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/Makefile |  3 +-
 fs/nova/inode.c  |  2 ++
 fs/nova/namei.c  | 97 
 fs/nova/nova.h   |  4 +++
 4 files changed, 105 insertions(+), 1 deletion(-)
 create mode 100644 fs/nova/namei.c

diff --git a/fs/nova/Makefile b/fs/nova/Makefile
index 3a3243c..eb97e46 100644
--- a/fs/nova/Makefile
+++ b/fs/nova/Makefile
@@ -4,4 +4,5 @@
 
 obj-$(CONFIG_NOVA_FS) += nova.o
 
-nova-y := balloc.o bbuild.o dir.o inode.o journal.o log.o rebuild.o stats.o 
super.o
+nova-y := balloc.o bbuild.o dir.o inode.o journal.o log.o namei.o\
+ rebuild.o stats.o super.o
diff --git a/fs/nova/inode.c b/fs/nova/inode.c
index 17addd3..2d3f7a3 100644
--- a/fs/nova/inode.c
+++ b/fs/nova/inode.c
@@ -181,6 +181,7 @@ static int nova_read_inode(struct super_block *sb, struct 
inode *inode,
case S_IFREG:
break;
case S_IFDIR:
+   inode->i_op = &nova_dir_inode_operations;
inode->i_fop = _dir_operations;
break;
case S_IFLNK:
@@ -881,6 +882,7 @@ struct inode *nova_new_vfs_inode(enum nova_new_inode_type 
type,
inode->i_mapping->a_ops = _aops_dax;
break;
case TYPE_MKDIR:
+   inode->i_op = &nova_dir_inode_operations;
inode->i_fop = _dir_operations;
inode->i_mapping->a_ops = _aops_dax;
set_nlink(inode, 2);
diff --git a/fs/nova/namei.c b/fs/nova/namei.c
new file mode 100644
index 000..8076f5b
--- /dev/null
+++ b/fs/nova/namei.c
@@ -0,0 +1,97 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Inode operations for directories.
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu <jix...@cs.ucsd.edu>
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.storne...@gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+#include 
+#include 
+#include "nova.h"
+#include "journal.h"
+#include "inode.h"
+
+static ino_t nova_inode_by_name(struct inode *dir, struct qstr *entry,
+struct nova_dentry **res_entry)
+{
+   struct super_block *sb = dir->i_sb;
+   struct nova_dentry *direntry;
+
+   direntry = nova_find_dentry(sb, NULL, dir,
+   entry->name, entry->len);
+   if (direntry == NULL)
+   return 0;
+
+   *res_entry = direntry;
+   return direntry->ino;
+}
+
+static struct dentry *nova_lookup(struct inode *dir, struct dentry *dentry,
+  unsigned int flags)
+{
+   struct inode *inode = NULL;
+   struct nova_dentry *de;
+   ino_t ino;
+   timing_t lookup_time;
+
+   NOVA_START_TIMING(lookup_t, lookup_time);
+   if (dentry->d_name.len > NOVA_NAME_LEN) {
+   nova_dbg("%s: namelen %u exceeds limit\n",
+   __func__, dentry->d_name.len);
+   return ERR_PTR(-ENAMETOOLONG);
+   }
+
+   nova_dbg_verbose("%s: %s\n", __func__, dentry->d_name.name);
+   ino = nova_inode_by_name(dir, &dentry->d_name, &de);
+   nova_dbg_verbose("%s: ino %lu\n", __func__, ino);
+   if (ino) {
+   inode = nova_iget(dir->i_sb, ino);
+   if (inode == ERR_PTR(-ESTALE) || inode == ERR_PTR(-ENOMEM)
+   || inode == ERR_PTR(-EACCES)) {
+   nova_err(dir->i_sb,
+ "%s: get inode failed: %lu\n",
+ __func__, (unsigned long)ino);
+   return ERR_PTR(-EIO);
+   }
+   }
+
+   NOVA_END_TIMING(lookup_t, lookup_time);
+   return d_splice_alias(inode, dentry);
+}
+
+struct dentry *nova_get_parent(struct dentry *child)
+{
+   struct inode *inode;
+   struct qstr dotdot = QSTR_INIT("..", 2);
+   struct nova_dentry *de = NULL;
+   ino_t ino;
+
+   nova_inode_by_name(child->d_inode, &dotdot, &de);
+   if (!de)
+   return ERR_PTR(-ENOENT);
+
+   /* FIXME: can de->ino be avoided by using the return value of
+* nova_inode_by_name()?
+*/
+   ino = le64_to_cpu(de->ino);
+
+   if (ino)
+   inode = nova_iget(child->d_inode->i

[RFC v2 63/83] File operation: llseek.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

Search the file radix tree to find a hole or data.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/file.c  |  47 +++
 fs/nova/inode.c | 113 
 fs/nova/inode.h |   1 +
 fs/nova/nova.h  |   1 +
 4 files changed, 162 insertions(+)

diff --git a/fs/nova/file.c b/fs/nova/file.c
index b46d4bd..ecaf20a 100644
--- a/fs/nova/file.c
+++ b/fs/nova/file.c
@@ -23,6 +23,53 @@
 #include "nova.h"
 #include "inode.h"
 
+static loff_t nova_llseek(struct file *file, loff_t offset, int origin)
+{
+   struct inode *inode = file->f_path.dentry->d_inode;
+   struct nova_inode_info *si = NOVA_I(inode);
+   struct nova_inode_info_header *sih = &si->header;
+   int retval;
+
+   if (origin != SEEK_DATA && origin != SEEK_HOLE)
+   return generic_file_llseek(file, offset, origin);
+
+   sih_lock_shared(sih);
+   switch (origin) {
+   case SEEK_DATA:
+   retval = nova_find_region(inode, &offset, 0);
+   if (retval) {
+   sih_unlock_shared(sih);
+   return retval;
+   }
+   break;
+   case SEEK_HOLE:
+   retval = nova_find_region(inode, &offset, 1);
+   if (retval) {
+   sih_unlock_shared(sih);
+   return retval;
+   }
+   break;
+   }
+
+   if ((offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) ||
+   offset > inode->i_sb->s_maxbytes) {
+   sih_unlock_shared(sih);
+   return -ENXIO;
+   }
+
+   if (offset != file->f_pos) {
+   file->f_pos = offset;
+   file->f_version = 0;
+   }
+
+   sih_unlock_shared(sih);
+   return offset;
+}
+
+
+const struct file_operations nova_dax_file_operations = {
+   .llseek = nova_llseek,
+};
 
 const struct inode_operations nova_file_inode_operations = {
.setattr= nova_notify_change,
diff --git a/fs/nova/inode.c b/fs/nova/inode.c
index 6fcc5e7..a6d74cb 100644
--- a/fs/nova/inode.c
+++ b/fs/nova/inode.c
@@ -193,6 +193,52 @@ static void nova_truncate_file_blocks(struct inode *inode, 
loff_t start,
 
 }
 
+/* search the radix tree to find hole or data
+ * in the specified range
+ * Input:
+ * first_blocknr: first block in the specified range
+ * last_blocknr: last_blocknr in the specified range
+ * @data_found: indicates whether data blocks were found
+ * @hole_found: indicates whether a hole was found
+ * hole: whether we are looking for a hole or data
+ */
+static int nova_lookup_hole_in_range(struct super_block *sb,
+   struct nova_inode_info_header *sih,
+   unsigned long first_blocknr, unsigned long last_blocknr,
+   int *data_found, int *hole_found, int hole)
+{
+   struct nova_file_write_entry *entry;
+   unsigned long blocks = 0;
+   unsigned long pgoff, old_pgoff;
+
+   pgoff = first_blocknr;
+   while (pgoff <= last_blocknr) {
+   old_pgoff = pgoff;
+   entry = radix_tree_lookup(&sih->tree, pgoff);
+   if (entry) {
+   *data_found = 1;
+   if (!hole)
+   goto done;
+   pgoff++;
+   } else {
+   *hole_found = 1;
+   entry = nova_find_next_entry(sb, sih, pgoff);
+   pgoff++;
+   if (entry) {
+   pgoff = pgoff > entry->pgoff ?
+   pgoff : entry->pgoff;
+   if (pgoff > last_blocknr)
+   pgoff = last_blocknr + 1;
+   }
+   }
+
+   if (!*hole_found || !hole)
+   blocks += pgoff - old_pgoff;
+   }
+done:
+   return blocks;
+}
+
 /* copy persistent state to struct inode */
 static int nova_read_inode(struct super_block *sb, struct inode *inode,
u64 pi_addr)
@@ -232,6 +278,7 @@ static int nova_read_inode(struct super_block *sb, struct 
inode *inode,
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_op = _file_inode_operations;
+   inode->i_fop = &nova_dax_file_operations;
break;
case S_IFDIR:
inode->i_op = _dir_inode_operations;
@@ -929,6 +976,7 @@ struct inode *nova_new_vfs_inode(enum nova_new_inode_type 
type,
case TYPE_CREATE:
inode->i_op = _file_inode_operations;
inode->i_mapping->a_ops = _aops_dax;
+   inode->i_fop = &nova_dax_file_operations;
break;
case TYPE_MKNOD:
init_special_inode(inode, mode, rdev);
@@ -1170,6 +

[RFC v2 53/83] Namei: lookup.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

NOVA looks up the inode number by searching the radix tree with
the filename hash value and locating the corresponding dentry in the log.

Signed-off-by: Andiry Xu 
---
 fs/nova/Makefile |  3 +-
 fs/nova/inode.c  |  2 ++
 fs/nova/namei.c  | 97 
 fs/nova/nova.h   |  4 +++
 4 files changed, 105 insertions(+), 1 deletion(-)
 create mode 100644 fs/nova/namei.c

diff --git a/fs/nova/Makefile b/fs/nova/Makefile
index 3a3243c..eb97e46 100644
--- a/fs/nova/Makefile
+++ b/fs/nova/Makefile
@@ -4,4 +4,5 @@
 
 obj-$(CONFIG_NOVA_FS) += nova.o
 
-nova-y := balloc.o bbuild.o dir.o inode.o journal.o log.o rebuild.o stats.o 
super.o
+nova-y := balloc.o bbuild.o dir.o inode.o journal.o log.o namei.o\
+ rebuild.o stats.o super.o
diff --git a/fs/nova/inode.c b/fs/nova/inode.c
index 17addd3..2d3f7a3 100644
--- a/fs/nova/inode.c
+++ b/fs/nova/inode.c
@@ -181,6 +181,7 @@ static int nova_read_inode(struct super_block *sb, struct 
inode *inode,
case S_IFREG:
break;
case S_IFDIR:
+   inode->i_op = &nova_dir_inode_operations;
inode->i_fop = _dir_operations;
break;
case S_IFLNK:
@@ -881,6 +882,7 @@ struct inode *nova_new_vfs_inode(enum nova_new_inode_type 
type,
inode->i_mapping->a_ops = _aops_dax;
break;
case TYPE_MKDIR:
+   inode->i_op = &nova_dir_inode_operations;
inode->i_fop = _dir_operations;
inode->i_mapping->a_ops = _aops_dax;
set_nlink(inode, 2);
diff --git a/fs/nova/namei.c b/fs/nova/namei.c
new file mode 100644
index 000..8076f5b
--- /dev/null
+++ b/fs/nova/namei.c
@@ -0,0 +1,97 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Inode operations for directories.
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu 
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli 
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+#include 
+#include 
+#include "nova.h"
+#include "journal.h"
+#include "inode.h"
+
+static ino_t nova_inode_by_name(struct inode *dir, struct qstr *entry,
+struct nova_dentry **res_entry)
+{
+   struct super_block *sb = dir->i_sb;
+   struct nova_dentry *direntry;
+
+   direntry = nova_find_dentry(sb, NULL, dir,
+   entry->name, entry->len);
+   if (direntry == NULL)
+   return 0;
+
+   *res_entry = direntry;
+   return direntry->ino;
+}
+
+static struct dentry *nova_lookup(struct inode *dir, struct dentry *dentry,
+  unsigned int flags)
+{
+   struct inode *inode = NULL;
+   struct nova_dentry *de;
+   ino_t ino;
+   timing_t lookup_time;
+
+   NOVA_START_TIMING(lookup_t, lookup_time);
+   if (dentry->d_name.len > NOVA_NAME_LEN) {
+   nova_dbg("%s: namelen %u exceeds limit\n",
+   __func__, dentry->d_name.len);
+   return ERR_PTR(-ENAMETOOLONG);
+   }
+
+   nova_dbg_verbose("%s: %s\n", __func__, dentry->d_name.name);
+   ino = nova_inode_by_name(dir, &dentry->d_name, &de);
+   nova_dbg_verbose("%s: ino %lu\n", __func__, ino);
+   if (ino) {
+   inode = nova_iget(dir->i_sb, ino);
+   if (inode == ERR_PTR(-ESTALE) || inode == ERR_PTR(-ENOMEM)
+   || inode == ERR_PTR(-EACCES)) {
+   nova_err(dir->i_sb,
+ "%s: get inode failed: %lu\n",
+ __func__, (unsigned long)ino);
+   return ERR_PTR(-EIO);
+   }
+   }
+
+   NOVA_END_TIMING(lookup_t, lookup_time);
+   return d_splice_alias(inode, dentry);
+}
+
+struct dentry *nova_get_parent(struct dentry *child)
+{
+   struct inode *inode;
+   struct qstr dotdot = QSTR_INIT("..", 2);
+   struct nova_dentry *de = NULL;
+   ino_t ino;
+
+   nova_inode_by_name(child->d_inode, &dotdot, &de);
+   if (!de)
+   return ERR_PTR(-ENOENT);
+
+   /* FIXME: can de->ino be avoided by using the return value of
+* nova_inode_by_name()?
+*/
+   ino = le64_to_cpu(de->ino);
+
+   if (ino)
+   inode = nova_iget(child->d_inode->i_sb, ino);
+   else
+   return ERR_PTR(-ENOENT);
+
+   return d_obtain_alias(inode);
+}
+
+const struct in

[RFC v2 63/83] File operation: llseek.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

Search the file radix tree to find a hole or data.

Signed-off-by: Andiry Xu 
---
 fs/nova/file.c  |  47 +++
 fs/nova/inode.c | 113 
 fs/nova/inode.h |   1 +
 fs/nova/nova.h  |   1 +
 4 files changed, 162 insertions(+)

diff --git a/fs/nova/file.c b/fs/nova/file.c
index b46d4bd..ecaf20a 100644
--- a/fs/nova/file.c
+++ b/fs/nova/file.c
@@ -23,6 +23,53 @@
 #include "nova.h"
 #include "inode.h"
 
+static loff_t nova_llseek(struct file *file, loff_t offset, int origin)
+{
+   struct inode *inode = file->f_path.dentry->d_inode;
+   struct nova_inode_info *si = NOVA_I(inode);
+   struct nova_inode_info_header *sih = &si->header;
+   int retval;
+
+   if (origin != SEEK_DATA && origin != SEEK_HOLE)
+   return generic_file_llseek(file, offset, origin);
+
+   sih_lock_shared(sih);
+   switch (origin) {
+   case SEEK_DATA:
+   retval = nova_find_region(inode, &offset, 0);
+   if (retval) {
+   sih_unlock_shared(sih);
+   return retval;
+   }
+   break;
+   case SEEK_HOLE:
+   retval = nova_find_region(inode, &offset, 1);
+   if (retval) {
+   sih_unlock_shared(sih);
+   return retval;
+   }
+   break;
+   }
+
+   if ((offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) ||
+   offset > inode->i_sb->s_maxbytes) {
+   sih_unlock_shared(sih);
+   return -ENXIO;
+   }
+
+   if (offset != file->f_pos) {
+   file->f_pos = offset;
+   file->f_version = 0;
+   }
+
+   sih_unlock_shared(sih);
+   return offset;
+}
+
+
+const struct file_operations nova_dax_file_operations = {
+   .llseek = nova_llseek,
+};
 
 const struct inode_operations nova_file_inode_operations = {
.setattr= nova_notify_change,
diff --git a/fs/nova/inode.c b/fs/nova/inode.c
index 6fcc5e7..a6d74cb 100644
--- a/fs/nova/inode.c
+++ b/fs/nova/inode.c
@@ -193,6 +193,52 @@ static void nova_truncate_file_blocks(struct inode *inode, 
loff_t start,
 
 }
 
+/* search the radix tree to find hole or data
+ * in the specified range
+ * Input:
+ * first_blocknr: first block in the specified range
+ * last_blocknr: last_blocknr in the specified range
+ * @data_found: indicates whether data blocks were found
+ * @hole_found: indicates whether a hole was found
+ * hole: whether we are looking for a hole or data
+ */
+static int nova_lookup_hole_in_range(struct super_block *sb,
+   struct nova_inode_info_header *sih,
+   unsigned long first_blocknr, unsigned long last_blocknr,
+   int *data_found, int *hole_found, int hole)
+{
+   struct nova_file_write_entry *entry;
+   unsigned long blocks = 0;
+   unsigned long pgoff, old_pgoff;
+
+   pgoff = first_blocknr;
+   while (pgoff <= last_blocknr) {
+   old_pgoff = pgoff;
+   entry = radix_tree_lookup(&sih->tree, pgoff);
+   if (entry) {
+   *data_found = 1;
+   if (!hole)
+   goto done;
+   pgoff++;
+   } else {
+   *hole_found = 1;
+   entry = nova_find_next_entry(sb, sih, pgoff);
+   pgoff++;
+   if (entry) {
+   pgoff = pgoff > entry->pgoff ?
+   pgoff : entry->pgoff;
+   if (pgoff > last_blocknr)
+   pgoff = last_blocknr + 1;
+   }
+   }
+
+   if (!*hole_found || !hole)
+   blocks += pgoff - old_pgoff;
+   }
+done:
+   return blocks;
+}
+
 /* copy persistent state to struct inode */
 static int nova_read_inode(struct super_block *sb, struct inode *inode,
u64 pi_addr)
@@ -232,6 +278,7 @@ static int nova_read_inode(struct super_block *sb, struct 
inode *inode,
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_op = _file_inode_operations;
+   inode->i_fop = &nova_dax_file_operations;
break;
case S_IFDIR:
inode->i_op = _dir_inode_operations;
@@ -929,6 +976,7 @@ struct inode *nova_new_vfs_inode(enum nova_new_inode_type 
type,
case TYPE_CREATE:
inode->i_op = _file_inode_operations;
inode->i_mapping->a_ops = _aops_dax;
+   inode->i_fop = &nova_dax_file_operations;
break;
case TYPE_MKNOD:
init_special_inode(inode, mode, rdev);
@@ -1170,6 +1218,71 @@ int nova_notify_change(struct dentry *dentry,

[RFC v2 66/83] Super: Add file write item cache.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

nova_file_write_item combines a file write item with a list head.
NOVA uses a linked list of file write items to describe a write operation.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/super.c | 43 ++-
 fs/nova/super.h |  3 +++
 2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/fs/nova/super.c b/fs/nova/super.c
index 0847e57..9710be8 100644
--- a/fs/nova/super.c
+++ b/fs/nova/super.c
@@ -55,6 +55,7 @@ static const struct export_operations nova_export_ops;
 
 static struct kmem_cache *nova_inode_cachep;
 static struct kmem_cache *nova_range_node_cachep;
+static struct kmem_cache *nova_file_write_item_cachep;
 
 
 /* FIXME: should the following variable be one per NOVA instance? */
@@ -791,6 +792,21 @@ inline void nova_free_inode_node(struct super_block *sb,
nova_free_range_node(node);
 }
 
+inline void nova_free_file_write_item(struct nova_file_write_item *item)
+{
+   kmem_cache_free(nova_file_write_item_cachep, item);
+}
+
+inline struct nova_file_write_item *
+nova_alloc_file_write_item(struct super_block *sb)
+{
+   struct nova_file_write_item *p;
+
+   p = (struct nova_file_write_item *)
+   kmem_cache_alloc(nova_file_write_item_cachep, GFP_NOFS);
+   return p;
+}
+
 inline struct nova_range_node *nova_alloc_range_node(struct super_block *sb)
 {
struct nova_range_node *p;
@@ -849,6 +865,18 @@ static int __init init_rangenode_cache(void)
return 0;
 }
 
+static int __init init_file_write_item_cache(void)
+{
+   nova_file_write_item_cachep = kmem_cache_create(
+   "nova_file_write_item_cache",
+   sizeof(struct nova_file_write_item),
+   0, (SLAB_RECLAIM_ACCOUNT |
+   SLAB_MEM_SPREAD), NULL);
+   if (nova_file_write_item_cachep == NULL)
+   return -ENOMEM;
+   return 0;
+}
+
 static int __init init_inodecache(void)
 {
nova_inode_cachep = kmem_cache_create("nova_inode_cache",
@@ -875,6 +903,11 @@ static void destroy_rangenode_cache(void)
kmem_cache_destroy(nova_range_node_cachep);
 }
 
+static void destroy_file_write_item_cache(void)
+{
+   kmem_cache_destroy(nova_file_write_item_cachep);
+}
+
 
 /*
  * the super block writes are all done "on the fly", so the
@@ -974,14 +1007,21 @@ static int __init init_nova_fs(void)
if (rc)
goto out1;
 
-   rc = register_filesystem(_fs_type);
+   rc = init_file_write_item_cache();
if (rc)
goto out2;
 
+   rc = register_filesystem(_fs_type);
+   if (rc)
+   goto out3;
+
 out:
NOVA_END_TIMING(init_t, init_time);
return rc;
 
+out3:
+   destroy_file_write_item_cache();
+
 out2:
destroy_inodecache();
 
@@ -993,6 +1033,7 @@ static int __init init_nova_fs(void)
 static void __exit exit_nova_fs(void)
 {
+	/* The filesystem is unregistered before any slab cache is destroyed. */
 	unregister_filesystem(&nova_fs_type);
+	destroy_file_write_item_cache();
 	destroy_inodecache();
 	destroy_rangenode_cache();
 }
diff --git a/fs/nova/super.h b/fs/nova/super.h
index 56a840e..bcf9548 100644
--- a/fs/nova/super.h
+++ b/fs/nova/super.h
@@ -160,8 +160,11 @@ static inline struct nova_super_block 
*nova_get_super(struct super_block *sb)
 extern void nova_error_mng(struct super_block *sb, const char *fmt, ...);
 extern struct nova_range_node *nova_alloc_range_node(struct super_block *sb);
 extern inline struct nova_range_node *nova_alloc_inode_node(struct super_block 
*sb);
+extern struct nova_file_write_item *
+nova_alloc_file_write_item(struct super_block *sb);
 extern void nova_free_range_node(struct nova_range_node *node);
 extern inline void nova_free_inode_node(struct super_block *sb,
struct nova_range_node *node);
+void nova_free_file_write_item(struct nova_file_write_item *item);
 
 #endif
-- 
2.7.4



[RFC v2 66/83] Super: Add file write item cache.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

nova_file_write_item combines a file write item with a list head.
NOVA uses a linked list of file write items to describe a write operation.

Signed-off-by: Andiry Xu 
---
 fs/nova/super.c | 43 ++-
 fs/nova/super.h |  3 +++
 2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/fs/nova/super.c b/fs/nova/super.c
index 0847e57..9710be8 100644
--- a/fs/nova/super.c
+++ b/fs/nova/super.c
@@ -55,6 +55,7 @@ static const struct export_operations nova_export_ops;
 
 static struct kmem_cache *nova_inode_cachep;
 static struct kmem_cache *nova_range_node_cachep;
+static struct kmem_cache *nova_file_write_item_cachep;
 
 
 /* FIXME: should the following variable be one per NOVA instance? */
@@ -791,6 +792,21 @@ inline void nova_free_inode_node(struct super_block *sb,
nova_free_range_node(node);
 }
 
+/* Return @item to the nova_file_write_item slab cache. */
+inline void nova_free_file_write_item(struct nova_file_write_item *item)
+{
+   kmem_cache_free(nova_file_write_item_cachep, item);
+}
+
+/*
+ * Allocate one file write item from the nova_file_write_item slab cache
+ * with GFP_NOFS. Hands back the kmem_cache_alloc() result unchanged, so the
+ * caller must check for NULL. @sb is not used.
+ */
+inline struct nova_file_write_item *
+nova_alloc_file_write_item(struct super_block *sb)
+{
+	/* void * converts implicitly; no cast or temporary is needed. */
+	return kmem_cache_alloc(nova_file_write_item_cachep, GFP_NOFS);
+}
+
 inline struct nova_range_node *nova_alloc_range_node(struct super_block *sb)
 {
struct nova_range_node *p;
@@ -849,6 +865,18 @@ static int __init init_rangenode_cache(void)
return 0;
 }
 
+/*
+ * Create the slab cache that backs struct nova_file_write_item.
+ * Returns 0 on success or -ENOMEM if kmem_cache_create() fails.
+ */
+static int __init init_file_write_item_cache(void)
+{
+	nova_file_write_item_cachep =
+		kmem_cache_create("nova_file_write_item_cache",
+				  sizeof(struct nova_file_write_item), 0,
+				  (SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD),
+				  NULL);
+
+	return nova_file_write_item_cachep ? 0 : -ENOMEM;
+}
+
 static int __init init_inodecache(void)
 {
nova_inode_cachep = kmem_cache_create("nova_inode_cache",
@@ -875,6 +903,11 @@ static void destroy_rangenode_cache(void)
kmem_cache_destroy(nova_range_node_cachep);
 }
 
+/* Destroy the nova_file_write_item slab cache. */
+static void destroy_file_write_item_cache(void)
+{
+   kmem_cache_destroy(nova_file_write_item_cachep);
+}
+
 
 /*
  * the super block writes are all done "on the fly", so the
@@ -974,14 +1007,21 @@ static int __init init_nova_fs(void)
if (rc)
goto out1;
 
-   rc = register_filesystem(_fs_type);
+   rc = init_file_write_item_cache();
if (rc)
goto out2;
 
+   rc = register_filesystem(_fs_type);
+   if (rc)
+   goto out3;
+
 out:
NOVA_END_TIMING(init_t, init_time);
return rc;
 
+out3:
+   destroy_file_write_item_cache();
+
 out2:
destroy_inodecache();
 
@@ -993,6 +1033,7 @@ static int __init init_nova_fs(void)
 static void __exit exit_nova_fs(void)
 {
+	/* The filesystem is unregistered before any slab cache is destroyed. */
 	unregister_filesystem(&nova_fs_type);
+	destroy_file_write_item_cache();
 	destroy_inodecache();
 	destroy_rangenode_cache();
 }
diff --git a/fs/nova/super.h b/fs/nova/super.h
index 56a840e..bcf9548 100644
--- a/fs/nova/super.h
+++ b/fs/nova/super.h
@@ -160,8 +160,11 @@ static inline struct nova_super_block 
*nova_get_super(struct super_block *sb)
 extern void nova_error_mng(struct super_block *sb, const char *fmt, ...);
 extern struct nova_range_node *nova_alloc_range_node(struct super_block *sb);
 extern inline struct nova_range_node *nova_alloc_inode_node(struct super_block 
*sb);
+extern struct nova_file_write_item *
+nova_alloc_file_write_item(struct super_block *sb);
 extern void nova_free_range_node(struct nova_range_node *node);
 extern inline void nova_free_inode_node(struct super_block *sb,
struct nova_range_node *node);
+void nova_free_file_write_item(struct nova_file_write_item *item);
 
 #endif
-- 
2.7.4



[RFC v2 64/83] File operation: open, fsync, flush.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

NOVA persists file metadata and data before returning to the user space.
Hence, fsync is a no-op if the file is not mmaped.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/file.c | 50 ++
 1 file changed, 50 insertions(+)

diff --git a/fs/nova/file.c b/fs/nova/file.c
index ecaf20a..f60fdf3 100644
--- a/fs/nova/file.c
+++ b/fs/nova/file.c
@@ -66,9 +66,59 @@ static loff_t nova_llseek(struct file *file, loff_t offset, 
int origin)
return offset;
 }
 
+/*
+ * This function is called by both msync() and fsync(); it is installed as
+ * .fsync in nova_dax_file_operations.
+ *
+ * When the file has no user mappings only a persistent barrier is issued and
+ * 0 is returned. Otherwise generic_file_fsync() runs on [start, end] first
+ * and its result is returned after the barrier.
+ *
+ * TODO: Check if we can avoid calling nova_flush_buffer() for fsync. We use
+ * movnti to write data to files, so we may want to avoid doing unnecessary
+ * nova_flush_buffer() on fsync()
+ */
+static int nova_fsync(struct file *file, loff_t start, loff_t end,
+	int datasync)
+{
+	struct address_space *mapping = file->f_mapping;
+	unsigned long start_pgoff, end_pgoff;
+	int ret = 0;
+	timing_t fsync_time;
+
+	NOVA_START_TIMING(fsync_t, fsync_time);
+
+	if (datasync)
+		NOVA_STATS_ADD(fdatasync, 1);
+
+	/* No need to flush if the file is not mmaped */
+	if (!mapping_mapped(mapping))
+		goto persist;
+
+	/* The page offsets are computed only for the debug message below. */
+	start_pgoff = start >> PAGE_SHIFT;
+	end_pgoff = (end + 1) >> PAGE_SHIFT;
+	nova_dbgv("%s: msync pgoff range %lu to %lu\n",
+			__func__, start_pgoff, end_pgoff);
+
+	ret = generic_file_fsync(file, start, end, datasync);
+
+persist:
+	PERSISTENT_BARRIER();
+	NOVA_END_TIMING(fsync_t, fsync_time);
+
+	return ret;
+}
+
+/*
+ * This callback is called when a file is closed.
+ * It issues a persistent barrier and always returns 0; @file and @id are
+ * not used.
+ */
+static int nova_flush(struct file *file, fl_owner_t id)
+{
+   PERSISTENT_BARRIER();
+   return 0;
+}
+
+/* Open handler: delegates entirely to generic_file_open(). */
+static int nova_open(struct inode *inode, struct file *filp)
+{
+   return generic_file_open(inode, filp);
+}
+
 
 const struct file_operations nova_dax_file_operations = {
.llseek = nova_llseek,
+   .open   = nova_open,
+   .fsync  = nova_fsync,
+   .flush  = nova_flush,
 };
 
 const struct inode_operations nova_file_inode_operations = {
-- 
2.7.4



[RFC v2 64/83] File operation: open, fsync, flush.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

NOVA persists file metadata and data before returning to the user space.
Hence, fsync is a no-op if the file is not mmaped.

Signed-off-by: Andiry Xu 
---
 fs/nova/file.c | 50 ++
 1 file changed, 50 insertions(+)

diff --git a/fs/nova/file.c b/fs/nova/file.c
index ecaf20a..f60fdf3 100644
--- a/fs/nova/file.c
+++ b/fs/nova/file.c
@@ -66,9 +66,59 @@ static loff_t nova_llseek(struct file *file, loff_t offset, 
int origin)
return offset;
 }
 
+/*
+ * This function is called by both msync() and fsync(); it is installed as
+ * .fsync in nova_dax_file_operations.
+ *
+ * When the file has no user mappings only a persistent barrier is issued and
+ * 0 is returned. Otherwise generic_file_fsync() runs on [start, end] first
+ * and its result is returned after the barrier.
+ *
+ * TODO: Check if we can avoid calling nova_flush_buffer() for fsync. We use
+ * movnti to write data to files, so we may want to avoid doing unnecessary
+ * nova_flush_buffer() on fsync()
+ */
+static int nova_fsync(struct file *file, loff_t start, loff_t end,
+	int datasync)
+{
+	struct address_space *mapping = file->f_mapping;
+	unsigned long start_pgoff, end_pgoff;
+	int ret = 0;
+	timing_t fsync_time;
+
+	NOVA_START_TIMING(fsync_t, fsync_time);
+
+	if (datasync)
+		NOVA_STATS_ADD(fdatasync, 1);
+
+	/* No need to flush if the file is not mmaped */
+	if (!mapping_mapped(mapping))
+		goto persist;
+
+	/* The page offsets are computed only for the debug message below. */
+	start_pgoff = start >> PAGE_SHIFT;
+	end_pgoff = (end + 1) >> PAGE_SHIFT;
+	nova_dbgv("%s: msync pgoff range %lu to %lu\n",
+			__func__, start_pgoff, end_pgoff);
+
+	ret = generic_file_fsync(file, start, end, datasync);
+
+persist:
+	PERSISTENT_BARRIER();
+	NOVA_END_TIMING(fsync_t, fsync_time);
+
+	return ret;
+}
+
+/*
+ * This callback is called when a file is closed.
+ * It issues a persistent barrier and always returns 0; @file and @id are
+ * not used.
+ */
+static int nova_flush(struct file *file, fl_owner_t id)
+{
+   PERSISTENT_BARRIER();
+   return 0;
+}
+
+/* Open handler: delegates entirely to generic_file_open(). */
+static int nova_open(struct inode *inode, struct file *filp)
+{
+   return generic_file_open(inode, filp);
+}
+
 
 const struct file_operations nova_dax_file_operations = {
.llseek = nova_llseek,
+   .open   = nova_open,
+   .fsync  = nova_fsync,
+   .flush  = nova_flush,
 };
 
 const struct inode_operations nova_file_inode_operations = {
-- 
2.7.4



[RFC v2 67/83] Dax: commit list of file write items to log.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

Given a list of file write items, NOVA commits them by appending
each file write entry to the log, and then updates the radix tree
to point to these new entries, and updates log tail pointer to
commit all the writes atomically.
If the items are allocated on heap, free them on success.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/Makefile |   2 +-
 fs/nova/dax.c| 112 +++
 fs/nova/nova.h   |   5 +++
 3 files changed, 118 insertions(+), 1 deletion(-)
 create mode 100644 fs/nova/dax.c

diff --git a/fs/nova/Makefile b/fs/nova/Makefile
index 468ed6f..7f851f2 100644
--- a/fs/nova/Makefile
+++ b/fs/nova/Makefile
@@ -4,5 +4,5 @@
 
 obj-$(CONFIG_NOVA_FS) += nova.o
 
-nova-y := balloc.o bbuild.o dir.o file.o inode.o journal.o log.o namei.o\
+nova-y := balloc.o bbuild.o dax.o dir.o file.o inode.o journal.o log.o namei.o\
  rebuild.o stats.o super.o
diff --git a/fs/nova/dax.c b/fs/nova/dax.c
new file mode 100644
index 000..1669dc0
--- /dev/null
+++ b/fs/nova/dax.c
@@ -0,0 +1,112 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * DAX file operations.
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu <jix...@cs.ucsd.edu>
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.storne...@gmail.com>
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "nova.h"
+#include "inode.h"
+
+
+/*
+ * Walk the log from @begin_tail up to, but not including, @end_tail and pass
+ * every FILE_WRITE entry to nova_assign_write_entry(). Entries of any other
+ * type are reported through nova_dbg() and skipped.
+ *
+ * Returns 0 when the walk completes, or -EINVAL if the position becomes 0
+ * after moving to the next log page.
+ */
+static int nova_reassign_file_tree(struct super_block *sb,
+	struct nova_inode_info_header *sih, u64 begin_tail, u64 end_tail)
+{
+	const size_t step = sizeof(struct nova_file_write_entry);
+	struct nova_file_write_entry *we;
+	u64 pos;
+
+	for (pos = begin_tail; pos && pos != end_tail; pos += step) {
+		/* When is_last_entry() fires, continue from next_log_page(). */
+		if (is_last_entry(pos, step))
+			pos = next_log_page(sb, pos);
+
+		if (!pos) {
+			nova_err(sb, "%s: File inode %lu log is NULL!\n",
+					__func__, sih->ino);
+			return -EINVAL;
+		}
+
+		we = (struct nova_file_write_entry *)nova_get_block(sb, pos);
+
+		if (nova_get_entry_type(we) == FILE_WRITE)
+			nova_assign_write_entry(sb, sih, we, true);
+		else
+			nova_dbg("%s: entry type is not write? %d\n",
+					__func__, nova_get_entry_type(we));
+	}
+
+	return 0;
+}
+
+/*
+ * Commit a list of file write items to the inode log.
+ *
+ * Every item on @head is appended with nova_append_file_write_entry(); the
+ * file tree is then updated for the appended range, the block counts grow by
+ * @new_blocks (scaled by the inode's block type), and nova_update_inode()
+ * publishes the new log tail. If @free is non-zero the items are released
+ * with nova_free_file_write_item() once the commit has succeeded.
+ *
+ * Returns 0 on success or when @head is empty, -ENOSPC if an append fails,
+ * or the error from nova_reassign_file_tree(). Items are not freed on the
+ * error paths.
+ */
+int nova_commit_writes_to_log(struct super_block *sb, struct nova_inode *pi,
+	struct inode *inode, struct list_head *head, unsigned long new_blocks,
+	int free)
+{
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode_info_header *sih = &si->header;
+	struct nova_file_write_item *entry_item, *temp;
+	struct nova_inode_update update;
+	unsigned int data_bits;
+	u64 begin_tail = 0;
+	int ret = 0;
+
+	if (list_empty(head))
+		return 0;
+
+	update.tail = 0;
+
+	list_for_each_entry(entry_item, head, list) {
+		ret = nova_append_file_write_entry(sb, pi, inode,
+						entry_item, &update);
+		if (ret) {
+			nova_dbg("%s: append inode entry failed\n", __func__);
+			return -ENOSPC;
+		}
+
+		/* Remember where the first appended entry landed. */
+		if (begin_tail == 0)
+			begin_tail = update.curr_entry;
+	}
+
+	/* Update file tree */
+	ret = nova_reassign_file_tree(sb, sih, begin_tail, update.tail);
+	if (ret < 0) {
+		/* FIXME: Need to rebuild the tree */
+		return ret;
+	}
+
+	data_bits = blk_type_to_shift[sih->i_blk_type];
+	sih->i_blocks += (new_blocks << (data_bits - sb->s_blocksize_bits));
+
+	inode->i_blocks = sih->i_blocks;
+
+	nova_update_inode(sb, inode, pi, &update);
+	NOVA_STATS_ADD(inplace_new_blocks, 1);
+
+	sih->trans_id++;
+
+	if (free) {
+		list_for_each_entry_safe(entry_item, temp, head, list)
+			nova_free_file_write_item(entry_item);
+	}
+
+	return ret;
+}
diff --git a/fs/nova/nova.h b/fs/nova/nova.h
index b2831f6..dcda02a 100644
--- a/fs/nova/nova.h
+++ b/fs/nova/nova.h
@@ -464,6 

[RFC v2 67/83] Dax: commit list of file write items to log.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

Given a list of file write items, NOVA commits them by appending
each file write entry to the log, and then updates the radix tree
to point to these new entries, and updates log tail pointer to
commit all the writes atomically.
If the items are allocated on heap, free them on success.

Signed-off-by: Andiry Xu 
---
 fs/nova/Makefile |   2 +-
 fs/nova/dax.c| 112 +++
 fs/nova/nova.h   |   5 +++
 3 files changed, 118 insertions(+), 1 deletion(-)
 create mode 100644 fs/nova/dax.c

diff --git a/fs/nova/Makefile b/fs/nova/Makefile
index 468ed6f..7f851f2 100644
--- a/fs/nova/Makefile
+++ b/fs/nova/Makefile
@@ -4,5 +4,5 @@
 
 obj-$(CONFIG_NOVA_FS) += nova.o
 
-nova-y := balloc.o bbuild.o dir.o file.o inode.o journal.o log.o namei.o\
+nova-y := balloc.o bbuild.o dax.o dir.o file.o inode.o journal.o log.o namei.o\
  rebuild.o stats.o super.o
diff --git a/fs/nova/dax.c b/fs/nova/dax.c
new file mode 100644
index 000..1669dc0
--- /dev/null
+++ b/fs/nova/dax.c
@@ -0,0 +1,112 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * DAX file operations.
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu 
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli 
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "nova.h"
+#include "inode.h"
+
+
+/*
+ * Walk the log from @begin_tail up to, but not including, @end_tail and pass
+ * every FILE_WRITE entry to nova_assign_write_entry(). Entries of any other
+ * type are reported through nova_dbg() and skipped.
+ *
+ * Returns 0 when the walk completes, or -EINVAL if the position becomes 0
+ * after moving to the next log page.
+ */
+static int nova_reassign_file_tree(struct super_block *sb,
+	struct nova_inode_info_header *sih, u64 begin_tail, u64 end_tail)
+{
+	const size_t step = sizeof(struct nova_file_write_entry);
+	struct nova_file_write_entry *we;
+	u64 pos;
+
+	for (pos = begin_tail; pos && pos != end_tail; pos += step) {
+		/* When is_last_entry() fires, continue from next_log_page(). */
+		if (is_last_entry(pos, step))
+			pos = next_log_page(sb, pos);
+
+		if (!pos) {
+			nova_err(sb, "%s: File inode %lu log is NULL!\n",
+					__func__, sih->ino);
+			return -EINVAL;
+		}
+
+		we = (struct nova_file_write_entry *)nova_get_block(sb, pos);
+
+		if (nova_get_entry_type(we) == FILE_WRITE)
+			nova_assign_write_entry(sb, sih, we, true);
+		else
+			nova_dbg("%s: entry type is not write? %d\n",
+					__func__, nova_get_entry_type(we));
+	}
+
+	return 0;
+}
+
+/*
+ * Commit a list of file write items to the inode log.
+ *
+ * Every item on @head is appended with nova_append_file_write_entry(); the
+ * file tree is then updated for the appended range, the block counts grow by
+ * @new_blocks (scaled by the inode's block type), and nova_update_inode()
+ * publishes the new log tail. If @free is non-zero the items are released
+ * with nova_free_file_write_item() once the commit has succeeded.
+ *
+ * Returns 0 on success or when @head is empty, -ENOSPC if an append fails,
+ * or the error from nova_reassign_file_tree(). Items are not freed on the
+ * error paths.
+ */
+int nova_commit_writes_to_log(struct super_block *sb, struct nova_inode *pi,
+	struct inode *inode, struct list_head *head, unsigned long new_blocks,
+	int free)
+{
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode_info_header *sih = &si->header;
+	struct nova_file_write_item *entry_item, *temp;
+	struct nova_inode_update update;
+	unsigned int data_bits;
+	u64 begin_tail = 0;
+	int ret = 0;
+
+	if (list_empty(head))
+		return 0;
+
+	update.tail = 0;
+
+	list_for_each_entry(entry_item, head, list) {
+		ret = nova_append_file_write_entry(sb, pi, inode,
+						entry_item, &update);
+		if (ret) {
+			nova_dbg("%s: append inode entry failed\n", __func__);
+			return -ENOSPC;
+		}
+
+		/* Remember where the first appended entry landed. */
+		if (begin_tail == 0)
+			begin_tail = update.curr_entry;
+	}
+
+	/* Update file tree */
+	ret = nova_reassign_file_tree(sb, sih, begin_tail, update.tail);
+	if (ret < 0) {
+		/* FIXME: Need to rebuild the tree */
+		return ret;
+	}
+
+	data_bits = blk_type_to_shift[sih->i_blk_type];
+	sih->i_blocks += (new_blocks << (data_bits - sb->s_blocksize_bits));
+
+	inode->i_blocks = sih->i_blocks;
+
+	nova_update_inode(sb, inode, pi, &update);
+	NOVA_STATS_ADD(inplace_new_blocks, 1);
+
+	sih->trans_id++;
+
+	if (free) {
+		list_for_each_entry_safe(entry_item, temp, head, list)
+			nova_free_file_write_item(entry_item);
+	}
+
+	return ret;
+}
diff --git a/fs/nova/nova.h b/fs/nova/nova.h
index b2831f6..dcda02a 100644
--- a/fs/nova/nova.h
+++ b/fs/nova/nova.h
@@ -464,6 +464,11 @@ nova_get_blocknr(struct super_block *sb, u64 block, 
unsigned short btype)
 /* ==  Function prototypes  = */
 /* =

[RFC v2 65/83] File operation: read.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

NOVA is a DAX file system and does not use page cache.
For read, NOVA looks up the file write entry by searching the radix tree,
and copies data from pmem pages to user buffer directly.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/file.c | 144 +
 1 file changed, 144 insertions(+)

diff --git a/fs/nova/file.c b/fs/nova/file.c
index f60fdf3..842da45 100644
--- a/fs/nova/file.c
+++ b/fs/nova/file.c
@@ -113,9 +113,153 @@ static int nova_open(struct inode *inode, struct file 
*filp)
return generic_file_open(inode, filp);
 }
 
+/*
+ * Copy up to @len bytes starting at *@ppos from the file into the user
+ * buffer @buf, looking up the file write entry for each page with
+ * nova_get_write_entry(). Pages without an entry are returned as zeroes.
+ * The read is clamped to the inode size read with i_size_read().
+ *
+ * *@ppos is advanced by the number of bytes copied. Returns that number, or
+ * a negative errno when nothing was copied: -EFAULT if @buf is not writable
+ * or the user copy faults. -EINVAL is returned immediately, without
+ * updating *@ppos, if the entry found does not cover the requested page.
+ */
+static ssize_t
+do_dax_mapping_read(struct file *filp, char __user *buf,
+	size_t len, loff_t *ppos)
+{
+	struct inode *inode = filp->f_mapping->host;
+	struct super_block *sb = inode->i_sb;
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode_info_header *sih = &si->header;
+	struct nova_file_write_entry *entry;
+	pgoff_t index, end_index;
+	unsigned long offset;
+	loff_t isize, pos;
+	size_t copied = 0;
+	/* Signed: holds a negative errno that is returned as ssize_t. */
+	ssize_t error = 0;
+	timing_t memcpy_time;
+
+	pos = *ppos;
+	index = pos >> PAGE_SHIFT;
+	offset = pos & ~PAGE_MASK;
+
+	if (!access_ok(VERIFY_WRITE, buf, len)) {
+		error = -EFAULT;
+		goto out;
+	}
+
+	isize = i_size_read(inode);
+	if (!isize)
+		goto out;
+
+	nova_dbgv("%s: inode %lu, offset %lld, count %lu, size %lld\n",
+		__func__, inode->i_ino, pos, len, isize);
+
+	if (len > isize - pos)
+		len = isize - pos;
+
+	/* len is unsigned, so this is the only "nothing to read" case. */
+	if (len == 0)
+		goto out;
+
+	end_index = (isize - 1) >> PAGE_SHIFT;
+	do {
+		unsigned long nr, left;
+		unsigned long nvmm;
+		void *dax_mem = NULL;
+		int zero = 0;
+
+		/* nr is the maximum number of bytes to copy from this page */
+		if (index >= end_index) {
+			if (index > end_index)
+				goto out;
+			nr = ((isize - 1) & ~PAGE_MASK) + 1;
+			if (nr <= offset)
+				goto out;
+		}
+
+		entry = nova_get_write_entry(sb, sih, index);
+		if (unlikely(entry == NULL)) {
+			nova_dbgv("Required extent not found: pgoff %lu, inode size %lld\n",
+				index, isize);
+			/* No entry for this page: hand back zeroes. */
+			nr = PAGE_SIZE;
+			zero = 1;
+			goto memcpy;
+		}
+
+		/* Find contiguous blocks */
+		if (index < entry->pgoff ||
+			index - entry->pgoff >= entry->num_pages) {
+			nova_err(sb, "%s ERROR: %lu, entry pgoff %llu, num %u, blocknr %llu\n",
+				__func__, index, entry->pgoff,
+				entry->num_pages, entry->block >> PAGE_SHIFT);
+			return -EINVAL;
+		}
+		if (entry->reassigned == 0) {
+			/* Copy through to the end of this entry in one go. */
+			nr = (entry->num_pages - (index - entry->pgoff))
+				* PAGE_SIZE;
+		} else {
+			nr = PAGE_SIZE;
+		}
+
+		nvmm = get_nvmm(sb, sih, entry, index);
+		dax_mem = nova_get_block(sb, (nvmm << PAGE_SHIFT));
+
+memcpy:
+		nr = nr - offset;
+		if (nr > len - copied)
+			nr = len - copied;
+
+		NOVA_START_TIMING(memcpy_r_nvmm_t, memcpy_time);
+
+		if (!zero)
+			left = __copy_to_user(buf + copied,
+						dax_mem + offset, nr);
+		else
+			left = __clear_user(buf + copied, nr);
+
+		NOVA_END_TIMING(memcpy_r_nvmm_t, memcpy_time);
+
+		if (left) {
+			nova_dbg("%s ERROR!: bytes %lu, left %lu\n",
+				__func__, nr, left);
+			error = -EFAULT;
+			goto out;
+		}
+
+		copied += (nr - left);
+		offset += (nr - left);
+		index += offset >> PAGE_SHIFT;
+		offset &= ~PAGE_MASK;
+	} while (copied < len);
+
+out:
+	*ppos = pos + copied;
+	if (filp)
+		file_accessed(filp);
+
+	NOVA_STATS_ADD(read_bytes, copied);
+
+	nova_dbgv("%s returned %zu\n", __func__, copied);
+	return copied ? (ssize_t)copied : error;
+}
+
+/*
+ * Wrappers. We need to use the read lock to avoid
+ * concurrent truncate operation. No problem for write because we held
+ * l

[RFC v2 65/83] File operation: read.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

NOVA is a DAX file system and does not use page cache.
For read, NOVA looks up the file write entry by searching the radix tree,
and copies data from pmem pages to user buffer directly.

Signed-off-by: Andiry Xu 
---
 fs/nova/file.c | 144 +
 1 file changed, 144 insertions(+)

diff --git a/fs/nova/file.c b/fs/nova/file.c
index f60fdf3..842da45 100644
--- a/fs/nova/file.c
+++ b/fs/nova/file.c
@@ -113,9 +113,153 @@ static int nova_open(struct inode *inode, struct file 
*filp)
return generic_file_open(inode, filp);
 }
 
+/*
+ * Copy up to @len bytes starting at *@ppos from the file into the user
+ * buffer @buf, looking up the file write entry for each page with
+ * nova_get_write_entry(). Pages without an entry are returned as zeroes.
+ * The read is clamped to the inode size read with i_size_read().
+ *
+ * *@ppos is advanced by the number of bytes copied. Returns that number, or
+ * a negative errno when nothing was copied: -EFAULT if @buf is not writable
+ * or the user copy faults. -EINVAL is returned immediately, without
+ * updating *@ppos, if the entry found does not cover the requested page.
+ */
+static ssize_t
+do_dax_mapping_read(struct file *filp, char __user *buf,
+	size_t len, loff_t *ppos)
+{
+	struct inode *inode = filp->f_mapping->host;
+	struct super_block *sb = inode->i_sb;
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode_info_header *sih = &si->header;
+	struct nova_file_write_entry *entry;
+	pgoff_t index, end_index;
+	unsigned long offset;
+	loff_t isize, pos;
+	size_t copied = 0;
+	/* Signed: holds a negative errno that is returned as ssize_t. */
+	ssize_t error = 0;
+	timing_t memcpy_time;
+
+	pos = *ppos;
+	index = pos >> PAGE_SHIFT;
+	offset = pos & ~PAGE_MASK;
+
+	if (!access_ok(VERIFY_WRITE, buf, len)) {
+		error = -EFAULT;
+		goto out;
+	}
+
+	isize = i_size_read(inode);
+	if (!isize)
+		goto out;
+
+	nova_dbgv("%s: inode %lu, offset %lld, count %lu, size %lld\n",
+		__func__, inode->i_ino, pos, len, isize);
+
+	if (len > isize - pos)
+		len = isize - pos;
+
+	/* len is unsigned, so this is the only "nothing to read" case. */
+	if (len == 0)
+		goto out;
+
+	end_index = (isize - 1) >> PAGE_SHIFT;
+	do {
+		unsigned long nr, left;
+		unsigned long nvmm;
+		void *dax_mem = NULL;
+		int zero = 0;
+
+		/* nr is the maximum number of bytes to copy from this page */
+		if (index >= end_index) {
+			if (index > end_index)
+				goto out;
+			nr = ((isize - 1) & ~PAGE_MASK) + 1;
+			if (nr <= offset)
+				goto out;
+		}
+
+		entry = nova_get_write_entry(sb, sih, index);
+		if (unlikely(entry == NULL)) {
+			nova_dbgv("Required extent not found: pgoff %lu, inode size %lld\n",
+				index, isize);
+			/* No entry for this page: hand back zeroes. */
+			nr = PAGE_SIZE;
+			zero = 1;
+			goto memcpy;
+		}
+
+		/* Find contiguous blocks */
+		if (index < entry->pgoff ||
+			index - entry->pgoff >= entry->num_pages) {
+			nova_err(sb, "%s ERROR: %lu, entry pgoff %llu, num %u, blocknr %llu\n",
+				__func__, index, entry->pgoff,
+				entry->num_pages, entry->block >> PAGE_SHIFT);
+			return -EINVAL;
+		}
+		if (entry->reassigned == 0) {
+			/* Copy through to the end of this entry in one go. */
+			nr = (entry->num_pages - (index - entry->pgoff))
+				* PAGE_SIZE;
+		} else {
+			nr = PAGE_SIZE;
+		}
+
+		nvmm = get_nvmm(sb, sih, entry, index);
+		dax_mem = nova_get_block(sb, (nvmm << PAGE_SHIFT));
+
+memcpy:
+		nr = nr - offset;
+		if (nr > len - copied)
+			nr = len - copied;
+
+		NOVA_START_TIMING(memcpy_r_nvmm_t, memcpy_time);
+
+		if (!zero)
+			left = __copy_to_user(buf + copied,
+						dax_mem + offset, nr);
+		else
+			left = __clear_user(buf + copied, nr);
+
+		NOVA_END_TIMING(memcpy_r_nvmm_t, memcpy_time);
+
+		if (left) {
+			nova_dbg("%s ERROR!: bytes %lu, left %lu\n",
+				__func__, nr, left);
+			error = -EFAULT;
+			goto out;
+		}
+
+		copied += (nr - left);
+		offset += (nr - left);
+		index += offset >> PAGE_SHIFT;
+		offset &= ~PAGE_MASK;
+	} while (copied < len);
+
+out:
+	*ppos = pos + copied;
+	if (filp)
+		file_accessed(filp);
+
+	NOVA_STATS_ADD(read_bytes, copied);
+
+	nova_dbgv("%s returned %zu\n", __func__, copied);
+	return copied ? (ssize_t)copied : error;
+}
+
+/*
+ * Wrappers. We need to use the read lock to avoid
+ * concurrent truncate operation. No problem for write because we held
+ * lock.
+ */
+static ssize_t nova_dax_file_read(

[RFC v2 68/83] File operation: copy-on-write write.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

If the file is not mmaped, NOVA performs copy-on-write.
The CoW consists of the following steps:

1. Allocate contiguous data pages.
2. Copy data from user buffer to the data pages.
   If the write is not aligned to page size, also copy data from existing
   pmem pages.
3. Allocate and initialize a file write item, add it to a linked list.
4. Repeat 1 - 3 until the whole user data is copied to pmem pages.
5. Commit the list of file write items to the log and update the radix tree.
6. Update log tail pointer once all the items are committed.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/dax.c  | 149 +
 fs/nova/file.c | 208 +
 fs/nova/nova.h |   8 +++
 3 files changed, 365 insertions(+)

diff --git a/fs/nova/dax.c b/fs/nova/dax.c
index 1669dc0..9561d8e 100644
--- a/fs/nova/dax.c
+++ b/fs/nova/dax.c
@@ -22,6 +22,113 @@
 #include "inode.h"
 
 
+/*
+ * Copy bytes [offset, offset + length) from the block that get_nvmm()
+ * resolves for @entry/@index into the same offsets of @kmem.
+ * Nothing is copied when nova_get_block() returns NULL.
+ *
+ * Returns the memcpy_mcsafe() result on the support_clwb path, 0 otherwise.
+ */
+static inline int nova_copy_partial_block(struct super_block *sb,
+	struct nova_inode_info_header *sih,
+	struct nova_file_write_entry *entry, unsigned long index,
+	size_t offset, size_t length, void *kmem)
+{
+	unsigned long blocknr = get_nvmm(sb, sih, entry, index);
+	void *src = nova_get_block(sb, (blocknr << PAGE_SHIFT));
+
+	if (src == NULL)
+		return 0;
+
+	if (!support_clwb) {
+		memcpy_to_pmem_nocache(kmem + offset, src + offset, length);
+		return 0;
+	}
+
+	/* TODO: If the result is < 0, go to MCE data recovery. */
+	return memcpy_mcsafe(kmem + offset, src + offset, length);
+}
+
+/*
+ * Fill bytes [offset, offset + length) of @kmem.
+ * With no @entry the range is zero-filled; otherwise it is filled by
+ * nova_copy_partial_block(). On the support_clwb path the range is then
+ * passed to nova_flush_buffer().
+ * Always returns 0.
+ */
+static inline int nova_handle_partial_block(struct super_block *sb,
+   struct nova_inode_info_header *sih,
+   struct nova_file_write_entry *entry, unsigned long index,
+   size_t offset, size_t length, void *kmem)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+
+   if (entry == NULL) {
+   /* Fill zero */
+   if (support_clwb)
+   memset(kmem + offset, 0, length);
+   else
+   memcpy_to_pmem_nocache(kmem + offset,
+   sbi->zeroed_page, length);
+   } else {
+   /* NOTE(review): the return value is discarded here; confirm this is intended */
+   nova_copy_partial_block(sb, sih, entry, index,
+   offset, length, kmem);
+
+   }
+   if (support_clwb)
+   nova_flush_buffer(kmem + offset, length, 0);
+   return 0;
+}
+
+/*
+ * Fill the new start/end block from original blocks.
+ * Do nothing if fully covered; copy if original blocks present;
+ * Fill zero otherwise.
+ */
+int nova_handle_head_tail_blocks(struct super_block *sb,
+   struct inode *inode, loff_t pos, size_t count, void *kmem)
+{
+   struct nova_inode_info *si = NOVA_I(inode);
+   struct nova_inode_info_header *sih = >header;
+   size_t offset, eblk_offset;
+   unsigned long start_blk, end_blk, num_blocks;
+   struct nova_file_write_entry *entry;
+   timing_t partial_time;
+   int ret = 0;
+
+   NOVA_START_TIMING(partial_block_t, partial_time);
+   offset = pos & (sb->s_blocksize - 1);
+   num_blocks = ((count + offset - 1) >> sb->s_blocksize_bits) + 1;
+   /* offset in the actual block size block */
+   offset = pos & (nova_inode_blk_size(sih) - 1);
+   start_blk = pos >> sb->s_blocksize_bits;
+   end_blk = start_blk + num_blocks - 1;
+
+   nova_dbg_verbose("%s: %lu blocks\n", __func__, num_blocks);
+   /* We avoid zeroing the alloc'd range, which is going to be overwritten
+* by this system call anyway
+*/
+   nova_dbg_verbose("%s: start offset %lu start blk %lu %p\n", __func__,
+   offset, start_blk, kmem);
+   if (offset != 0) {
+   entry = nova_get_write_entry(sb, sih, start_blk);
+   ret = nova_handle_partial_block(sb, sih, entry,
+   start_blk, 0, offset, kmem);
+   if (ret < 0)
+   return ret;
+   }
+
+   kmem = (void *)((char *)kmem +
+   ((num_blocks - 1) << sb->s_blocksize_bits));
+   eblk_offset = (pos + count) & (nova_inode_blk_size(sih) - 1);
+   nova_dbg_verbose("%s: end offset %lu, end blk %lu %p\n", __func__,
+   eblk_offset, end_blk, kmem);
+   if (eblk_offset != 0) {
+   entry = nova_get_write_entry(sb, sih, end_blk);
+
+   ret = nova_handle_partial_block(sb, sih, entry, end_blk,
+   eblk_offset,
+   s

[RFC v2 68/83] File operation: copy-on-write write.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

If the file is not mmaped, NOVA performs copy-on-write.
The CoW consists of the following steps:

1. Allocate contiguous data pages.
2. Copy data from user buffer to the data pages.
   If the write is not aligned to page size, also copy data from existing
   pmem pages.
3. Allocate and initialize a file write item, add it to a linked list.
4. Repeat 1 - 3 until the whole user data is copied to pmem pages.
5. Commit the list of file write items to the log and update the radix tree.
6. Update log tail pointer once all the items are committed.

Signed-off-by: Andiry Xu 
---
 fs/nova/dax.c  | 149 +
 fs/nova/file.c | 208 +
 fs/nova/nova.h |   8 +++
 3 files changed, 365 insertions(+)

diff --git a/fs/nova/dax.c b/fs/nova/dax.c
index 1669dc0..9561d8e 100644
--- a/fs/nova/dax.c
+++ b/fs/nova/dax.c
@@ -22,6 +22,113 @@
 #include "inode.h"
 
 
+/*
+ * Copy bytes [offset, offset + length) from the block that get_nvmm()
+ * resolves for @entry/@index into the same offsets of @kmem.
+ * Nothing is copied when nova_get_block() returns NULL.
+ *
+ * Returns the memcpy_mcsafe() result on the support_clwb path, 0 otherwise.
+ */
+static inline int nova_copy_partial_block(struct super_block *sb,
+	struct nova_inode_info_header *sih,
+	struct nova_file_write_entry *entry, unsigned long index,
+	size_t offset, size_t length, void *kmem)
+{
+	unsigned long blocknr = get_nvmm(sb, sih, entry, index);
+	void *src = nova_get_block(sb, (blocknr << PAGE_SHIFT));
+
+	if (src == NULL)
+		return 0;
+
+	if (!support_clwb) {
+		memcpy_to_pmem_nocache(kmem + offset, src + offset, length);
+		return 0;
+	}
+
+	/* TODO: If the result is < 0, go to MCE data recovery. */
+	return memcpy_mcsafe(kmem + offset, src + offset, length);
+}
+
+/*
+ * Fill bytes [offset, offset + length) of @kmem.
+ * With no @entry the range is zero-filled; otherwise it is filled by
+ * nova_copy_partial_block(). On the support_clwb path the range is then
+ * passed to nova_flush_buffer().
+ * Always returns 0.
+ */
+static inline int nova_handle_partial_block(struct super_block *sb,
+   struct nova_inode_info_header *sih,
+   struct nova_file_write_entry *entry, unsigned long index,
+   size_t offset, size_t length, void *kmem)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+
+   if (entry == NULL) {
+   /* Fill zero */
+   if (support_clwb)
+   memset(kmem + offset, 0, length);
+   else
+   memcpy_to_pmem_nocache(kmem + offset,
+   sbi->zeroed_page, length);
+   } else {
+   /* NOTE(review): the return value is discarded here; confirm this is intended */
+   nova_copy_partial_block(sb, sih, entry, index,
+   offset, length, kmem);
+
+   }
+   if (support_clwb)
+   nova_flush_buffer(kmem + offset, length, 0);
+   return 0;
+}
+
+/*
+ * Fill the new start/end block from original blocks.
+ * Do nothing if fully covered; copy if original blocks present;
+ * Fill zero otherwise.
+ */
+int nova_handle_head_tail_blocks(struct super_block *sb,
+   struct inode *inode, loff_t pos, size_t count, void *kmem)
+{
+   struct nova_inode_info *si = NOVA_I(inode);
+   struct nova_inode_info_header *sih = >header;
+   size_t offset, eblk_offset;
+   unsigned long start_blk, end_blk, num_blocks;
+   struct nova_file_write_entry *entry;
+   timing_t partial_time;
+   int ret = 0;
+
+   NOVA_START_TIMING(partial_block_t, partial_time);
+   offset = pos & (sb->s_blocksize - 1);
+   num_blocks = ((count + offset - 1) >> sb->s_blocksize_bits) + 1;
+   /* offset in the actual block size block */
+   offset = pos & (nova_inode_blk_size(sih) - 1);
+   start_blk = pos >> sb->s_blocksize_bits;
+   end_blk = start_blk + num_blocks - 1;
+
+   nova_dbg_verbose("%s: %lu blocks\n", __func__, num_blocks);
+   /* We avoid zeroing the alloc'd range, which is going to be overwritten
+* by this system call anyway
+*/
+   nova_dbg_verbose("%s: start offset %lu start blk %lu %p\n", __func__,
+   offset, start_blk, kmem);
+   if (offset != 0) {
+   entry = nova_get_write_entry(sb, sih, start_blk);
+   ret = nova_handle_partial_block(sb, sih, entry,
+   start_blk, 0, offset, kmem);
+   if (ret < 0)
+   return ret;
+   }
+
+   kmem = (void *)((char *)kmem +
+   ((num_blocks - 1) << sb->s_blocksize_bits));
+   eblk_offset = (pos + count) & (nova_inode_blk_size(sih) - 1);
+   nova_dbg_verbose("%s: end offset %lu, end blk %lu %p\n", __func__,
+   eblk_offset, end_blk, kmem);
+   if (eblk_offset != 0) {
+   entry = nova_get_write_entry(sb, sih, end_blk);
+
+   ret = nova_handle_partial_block(sb, sih, entry, end_blk,
+   eblk_offset,
+   sb->s_blocksize - eblk_offset,
+  

[RFC v2 70/83] File operation: Inplace write.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

If the user specifies inplace updates, or the file is mmaped,
NOVA performs inplace writes.

The tricky part is that a DAX page fault can occur concurrently with an
inplace write and allocate new blocks. The inplace write's memcpy may also
trigger a page fault itself (xfstests 248).
Since page fault may take the write lock to modify the tree, write routine
cannot take tree lock during the memcpy.
As a result we perform inplace write in the following way:

1. Take the tree read lock, check existing entries or holes.
2. Release the read lock. Allocate new data pages if needed;
   allocate and initialize file write item, add to the list and perform memcpy.
3. With the list of file write items, take the tree write lock and perform 
commit:
   Due to concurrent page fault, the hole returned in step 1 may be filled by
   page fault handlers. In this case, NOVA copies the data from the file write 
item
   to the pages allocated by page fault handler, and free the data blocks 
allocated
   in step 2. This guarantees application can see the write via mmaped region.

Step 3 actually builds a new list of write items and reuses the CoW commit
routine to commit them.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/dax.c  | 472 +
 fs/nova/file.c |  10 +-
 fs/nova/nova.h |   4 +
 3 files changed, 484 insertions(+), 2 deletions(-)

diff --git a/fs/nova/dax.c b/fs/nova/dax.c
index 9561d8e..8624ce4 100644
--- a/fs/nova/dax.c
+++ b/fs/nova/dax.c
@@ -259,3 +259,475 @@ void nova_init_file_write_item(struct super_block *sb,
 
entry->size = file_size;
 }
+
+/*
+ * Check if there is an existing entry or hole for target page offset.
+ * Used for inplace write, DAX-mmap and fallocate.
+ *
+ * @sb:         super block, passed through to the entry lookup helpers
+ * @inode:      inode whose write entries are searched
+ * @num_blocks: upper bound on the returned block count
+ * @start_blk:  page offset at which the range of interest starts
+ * @ret_entry:  set to the entry covering @start_blk, or NULL when none
+ *              does (it stays NULL in the hole and file-grow cases)
+ * @check_next: when non-zero and no entry covers @start_blk, measure the
+ *              hole up to the next entry instead of returning 0
+ * @epoch_id:   epoch compared against the covering entry's epoch_id
+ * @inplace:    set to 1 when the covering entry carries @epoch_id, else 0
+ *
+ * Returns a block count clamped to @num_blocks:
+ *  - covering entry, not reassigned: blocks left in the entry from
+ *    @start_blk onwards;
+ *  - covering entry, reassigned: 1;
+ *  - hole with a following entry (@check_next set): distance to that entry;
+ *  - no following entry (@check_next set): @num_blocks;
+ *  - no covering entry and @check_next clear: 0.
+ */
+unsigned long nova_check_existing_entry(struct super_block *sb,
+	struct inode *inode, unsigned long num_blocks, unsigned long start_blk,
+	struct nova_file_write_entry **ret_entry,
+	int check_next, u64 epoch_id,
+	int *inplace)
+{
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode_info_header *sih = &si->header;
+	struct nova_file_write_entry *entry;
+	unsigned long next_pgoff;
+	unsigned long ent_blks = 0;
+	timing_t check_time;
+
+	NOVA_START_TIMING(check_entry_t, check_time);
+
+	*ret_entry = NULL;
+	*inplace = 0;
+	entry = nova_get_write_entry(sb, sih, start_blk);
+
+	if (entry) {
+		*ret_entry = entry;
+
+		/* We can do inplace write. Find contiguous blocks */
+		if (entry->reassigned == 0)
+			ent_blks = entry->num_pages -
+					(start_blk - entry->pgoff);
+		else
+			ent_blks = 1;
+
+		if (ent_blks > num_blocks)
+			ent_blks = num_blocks;
+
+		if (entry->epoch_id == epoch_id)
+			*inplace = 1;
+
+	} else if (check_next) {
+		/* Possible Hole */
+		entry = nova_find_next_entry(sb, sih, start_blk);
+		if (entry) {
+			next_pgoff = entry->pgoff;
+			/*
+			 * The "next" entry must start strictly after start_blk;
+			 * anything else is reported and the whole request is
+			 * handed back as the block count.
+			 */
+			if (next_pgoff <= start_blk) {
+				nova_err(sb, "iblock %lu, entry pgoff %lu, num pages %lu\n",
+				       start_blk, next_pgoff, entry->num_pages);
+				nova_print_inode_log(sb, inode);
+				dump_stack();
+				ent_blks = num_blocks;
+				goto out;
+			}
+			ent_blks = next_pgoff - start_blk;
+			if (ent_blks > num_blocks)
+				ent_blks = num_blocks;
+		} else {
+			/* File grow */
+			ent_blks = num_blocks;
+		}
+	}
+
+	/* An entry was found but yielded no blocks: leave a debug trace. */
+	if (entry && ent_blks == 0) {
+		nova_dbg("%s: %d\n", __func__, check_next);
+		dump_stack();
+	}
+
+out:
+	NOVA_END_TIMING(check_entry_t, check_time);
+	return ent_blks;
+}
+
+/* Memcpy from newly allocated data blocks to existing data blocks */
+static int nova_inplace_memcpy(struct super_block *sb, struct inode *inode,
+   struct nova_file_write_entry *from, struct nova_file_write_entry *to,
+   unsigned long num_blocks, loff_t pos, size_t len)
+{
+   struct nova_inode_info *si = NOVA_I(inode);
+   struct nova_inode_info_header *sih = &si->header;
+   struct nova_log_entry_info entry_info;
+   unsigned long pgoff;
+   unsigned long from_nvmm, to_nvmm;
+   void *from_addr, *to_addr = NULL;
+   loff_t base, start, end, offset;
+
+   pgoff = le64_to_cpu(from->pgoff);
+   base = start = pgoff << PAGE_SHIFT;
+   end =

[RFC v2 70/83] File operation: Inplace write.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

If the user specifies inplace updates, or the file is mmaped,
NOVA performs inplace writes.

The tricky part is that a DAX page fault can occur concurrently with an inplace
write and allocate new blocks. Also, the inplace write memcpy may itself
trigger a page fault (xfstests 248).
Since page fault may take the write lock to modify the tree, write routine
cannot take tree lock during the memcpy.
As a result we perform inplace write in the following way:

1. Take the tree read lock, check existing entries or holes.
2. Release the read lock. Allocate new data pages if needed;
   allocate and initialize file write item, add to the list and perform memcpy.
3. With the list of file write items, take the tree write lock and perform
   the commit:
   Due to concurrent page faults, the hole returned in step 1 may have been
   filled by page fault handlers. In this case, NOVA copies the data from the
   file write item to the pages allocated by the page fault handler, and frees
   the data blocks allocated in step 2. This guarantees that the application
   can see the write via the mmapped region.

Step 3 actually formats a new list of write items, and reuses the CoW commit
routine to commit the items.

Signed-off-by: Andiry Xu 
---
 fs/nova/dax.c  | 472 +
 fs/nova/file.c |  10 +-
 fs/nova/nova.h |   4 +
 3 files changed, 484 insertions(+), 2 deletions(-)

diff --git a/fs/nova/dax.c b/fs/nova/dax.c
index 9561d8e..8624ce4 100644
--- a/fs/nova/dax.c
+++ b/fs/nova/dax.c
@@ -259,3 +259,475 @@ void nova_init_file_write_item(struct super_block *sb,
 
entry->size = file_size;
 }
+
+/*
+ * Check if there is an existing entry or hole for target page offset.
+ * Used for inplace write, DAX-mmap and fallocate.
+ *
+ * @sb:         super block, passed through to the entry lookup helpers
+ * @inode:      inode whose write entries are searched
+ * @num_blocks: upper bound on the returned block count
+ * @start_blk:  page offset at which the range of interest starts
+ * @ret_entry:  set to the entry covering @start_blk, or NULL when none
+ *              does (it stays NULL in the hole and file-grow cases)
+ * @check_next: when non-zero and no entry covers @start_blk, measure the
+ *              hole up to the next entry instead of returning 0
+ * @epoch_id:   epoch compared against the covering entry's epoch_id
+ * @inplace:    set to 1 when the covering entry carries @epoch_id, else 0
+ *
+ * Returns a block count clamped to @num_blocks:
+ *  - covering entry, not reassigned: blocks left in the entry from
+ *    @start_blk onwards;
+ *  - covering entry, reassigned: 1;
+ *  - hole with a following entry (@check_next set): distance to that entry;
+ *  - no following entry (@check_next set): @num_blocks;
+ *  - no covering entry and @check_next clear: 0.
+ */
+unsigned long nova_check_existing_entry(struct super_block *sb,
+	struct inode *inode, unsigned long num_blocks, unsigned long start_blk,
+	struct nova_file_write_entry **ret_entry,
+	int check_next, u64 epoch_id,
+	int *inplace)
+{
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode_info_header *sih = &si->header;
+	struct nova_file_write_entry *entry;
+	unsigned long next_pgoff;
+	unsigned long ent_blks = 0;
+	timing_t check_time;
+
+	NOVA_START_TIMING(check_entry_t, check_time);
+
+	*ret_entry = NULL;
+	*inplace = 0;
+	entry = nova_get_write_entry(sb, sih, start_blk);
+
+	if (entry) {
+		*ret_entry = entry;
+
+		/* We can do inplace write. Find contiguous blocks */
+		if (entry->reassigned == 0)
+			ent_blks = entry->num_pages -
+					(start_blk - entry->pgoff);
+		else
+			ent_blks = 1;
+
+		if (ent_blks > num_blocks)
+			ent_blks = num_blocks;
+
+		if (entry->epoch_id == epoch_id)
+			*inplace = 1;
+
+	} else if (check_next) {
+		/* Possible Hole */
+		entry = nova_find_next_entry(sb, sih, start_blk);
+		if (entry) {
+			next_pgoff = entry->pgoff;
+			/*
+			 * The "next" entry must start strictly after start_blk;
+			 * anything else is reported and the whole request is
+			 * handed back as the block count.
+			 */
+			if (next_pgoff <= start_blk) {
+				nova_err(sb, "iblock %lu, entry pgoff %lu, num pages %lu\n",
+				       start_blk, next_pgoff, entry->num_pages);
+				nova_print_inode_log(sb, inode);
+				dump_stack();
+				ent_blks = num_blocks;
+				goto out;
+			}
+			ent_blks = next_pgoff - start_blk;
+			if (ent_blks > num_blocks)
+				ent_blks = num_blocks;
+		} else {
+			/* File grow */
+			ent_blks = num_blocks;
+		}
+	}
+
+	/* An entry was found but yielded no blocks: leave a debug trace. */
+	if (entry && ent_blks == 0) {
+		nova_dbg("%s: %d\n", __func__, check_next);
+		dump_stack();
+	}
+
+out:
+	NOVA_END_TIMING(check_entry_t, check_time);
+	return ent_blks;
+}
+
+/* Memcpy from newly allocated data blocks to existing data blocks */
+static int nova_inplace_memcpy(struct super_block *sb, struct inode *inode,
+   struct nova_file_write_entry *from, struct nova_file_write_entry *to,
+   unsigned long num_blocks, loff_t pos, size_t len)
+{
+   struct nova_inode_info *si = NOVA_I(inode);
+   struct nova_inode_info_header *sih = &si->header;
+   struct nova_log_entry_info entry_info;
+   unsigned long pgoff;
+   unsigned long from_nvmm, to_nvmm;
+   void *from_addr, *to_addr = NULL;
+   loff_t base, start, end, offset;
+
+   pgoff = le64_to_cpu(from->pgoff);
+   base = start = pgoff << PAGE_SHIFT;
+   end = (pgoff + num_blocks) << PAGE_SHIFT;
+
+   

[RFC v2 69/83] Super: Add module param inplace_data_updates.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

Provide inplace data updates option if people prefer inplace
updates to copy-on-write.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/nova.h  | 1 +
 fs/nova/super.c | 7 ++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/fs/nova/nova.h b/fs/nova/nova.h
index 1c2205e..6c94a9b 100644
--- a/fs/nova/nova.h
+++ b/fs/nova/nova.h
@@ -138,6 +138,7 @@ extern unsigned int nova_dbgmask;
 
 
 extern int measure_timing;
+extern int inplace_data_updates;
 
 
 extern unsigned int blk_type_to_shift[NOVA_BLOCK_TYPE_MAX];
diff --git a/fs/nova/super.c b/fs/nova/super.c
index 9710be8..980b1d7 100644
--- a/fs/nova/super.c
+++ b/fs/nova/super.c
@@ -43,10 +43,14 @@
 
 int measure_timing;
 int support_clwb;
+int inplace_data_updates;
 
 module_param(measure_timing, int, 0444);
 MODULE_PARM_DESC(measure_timing, "Timing measurement");
 
+module_param(inplace_data_updates, int, 0444);
+MODULE_PARM_DESC(inplace_data_updates, "Perform data updates in-place (i.e., not atomically)");
+
 module_param(nova_dbgmask, int, 0444);
 MODULE_PARM_DESC(nova_dbgmask, "Control debugging output");
 
@@ -541,7 +545,8 @@ static int nova_fill_super(struct super_block *sb, void 
*data, int silent)
goto out;
}
 
-   nova_dbg("measure timing %d\n", measure_timing);
+   nova_dbg("measure timing %d, inplace data update %d\n",
+   measure_timing, inplace_data_updates);
 
get_random_bytes(&random, sizeof(u32));
atomic_set(&sbi->next_generation, random);
-- 
2.7.4



[RFC v2 69/83] Super: Add module param inplace_data_updates.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

Provide inplace data updates option if people prefer inplace
updates to copy-on-write.

Signed-off-by: Andiry Xu 
---
 fs/nova/nova.h  | 1 +
 fs/nova/super.c | 7 ++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/fs/nova/nova.h b/fs/nova/nova.h
index 1c2205e..6c94a9b 100644
--- a/fs/nova/nova.h
+++ b/fs/nova/nova.h
@@ -138,6 +138,7 @@ extern unsigned int nova_dbgmask;
 
 
 extern int measure_timing;
+extern int inplace_data_updates;
 
 
 extern unsigned int blk_type_to_shift[NOVA_BLOCK_TYPE_MAX];
diff --git a/fs/nova/super.c b/fs/nova/super.c
index 9710be8..980b1d7 100644
--- a/fs/nova/super.c
+++ b/fs/nova/super.c
@@ -43,10 +43,14 @@
 
 int measure_timing;
 int support_clwb;
+int inplace_data_updates;
 
 module_param(measure_timing, int, 0444);
 MODULE_PARM_DESC(measure_timing, "Timing measurement");
 
+module_param(inplace_data_updates, int, 0444);
+MODULE_PARM_DESC(inplace_data_updates, "Perform data updates in-place (i.e., not atomically)");
+
 module_param(nova_dbgmask, int, 0444);
 MODULE_PARM_DESC(nova_dbgmask, "Control debugging output");
 
@@ -541,7 +545,8 @@ static int nova_fill_super(struct super_block *sb, void 
*data, int silent)
goto out;
}
 
-   nova_dbg("measure timing %d\n", measure_timing);
+   nova_dbg("measure timing %d, inplace data update %d\n",
+   measure_timing, inplace_data_updates);
 
get_random_bytes(&random, sizeof(u32));
atomic_set(&sbi->next_generation, random);
-- 
2.7.4



[RFC v2 73/83] Dax: Add iomap operations.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

The key part of the iomap support is nova_dax_get_blocks(). It first takes the
read lock and looks up the block; if the block is missing, it takes the write
lock, checks again and allocates a new block if needed.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/dax.c  | 184 +
 fs/nova/nova.h |   3 +
 2 files changed, 187 insertions(+)

diff --git a/fs/nova/dax.c b/fs/nova/dax.c
index 8624ce4..e639b23 100644
--- a/fs/nova/dax.c
+++ b/fs/nova/dax.c
@@ -731,3 +731,187 @@ ssize_t nova_inplace_file_write(struct file *filp,
 
return ret;
 }
+
+/*
+ * Map up to @max_blocks blocks starting at page offset @iblock, allocating
+ * zeroed blocks when @create is set and no usable entry exists.
+ *
+ * The lookup starts under sih_lock_shared(). If blocks must be allocated,
+ * the shared lock is dropped, sih_lock() is taken and the lookup is repeated
+ * (this time also sizing the hole) before anything is allocated.
+ *
+ * @bno:      receives the block number of the first mapped block.
+ *            NOTE(review): the value is an unsigned long narrowed to u32.
+ * @new:      set to true only when blocks were allocated here; it is never
+ *            cleared, so the caller has to initialise it.
+ * @boundary: not used by this function.
+ * @create:   zero for a plain lookup, non-zero to allocate missing blocks.
+ *
+ * return > 0, # of blocks mapped or allocated.
+ * return = 0, if plain lookup failed.
+ * return < 0, error case.
+ */
+static int nova_dax_get_blocks(struct inode *inode, sector_t iblock,
+	unsigned long max_blocks, u32 *bno, bool *new, bool *boundary,
+	int create)
+{
+	struct super_block *sb = inode->i_sb;
+	struct nova_inode *pi;
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode_info_header *sih = &si->header;
+	struct nova_file_write_entry *entry = NULL;
+	struct nova_file_write_item entry_item;
+	struct list_head item_head;
+	struct nova_inode_update update;
+	u32 time;
+	unsigned long nvmm = 0;
+	unsigned long blocknr = 0;
+	u64 epoch_id;
+	int num_blocks = 0;
+	int inplace = 0;
+	int allocated = 0;
+	int locked = 0;
+	int check_next;
+	int ret = 0;
+	timing_t get_block_time;
+
+	if (max_blocks == 0)
+		return 0;
+
+	NOVA_START_TIMING(dax_get_block_t, get_block_time);
+	INIT_LIST_HEAD(&item_head);
+
+	nova_dbgv("%s: pgoff %lu, num %lu, create %d\n",
+			__func__, iblock, max_blocks, create);
+
+	epoch_id = nova_get_epoch_id(sb);
+
+	check_next = 0;
+	sih_lock_shared(sih);
+
+again:
+	num_blocks = nova_check_existing_entry(sb, inode, max_blocks,
+					iblock, &entry, check_next,
+					epoch_id, &inplace);
+
+	if (entry) {
+		/*
+		 * An existing entry is returned as-is for lookups, and for
+		 * create requests when it belongs to the current epoch.
+		 */
+		if (create == 0 || inplace) {
+			nvmm = get_nvmm(sb, sih, entry, iblock);
+			nova_dbgv("%s: found pgoff %lu, block %lu\n",
+					__func__, iblock, nvmm);
+			goto out;
+		}
+	}
+
+	if (create == 0) {
+		num_blocks = 0;
+		goto out1;
+	}
+
+	if (locked == 0) {
+		sih_unlock_shared(sih);
+		sih_lock(sih);
+		locked = 1;
+		/* Check again incase someone has done it for us */
+		check_next = 1;
+		goto again;
+	}
+
+	pi = nova_get_inode(sb, inode);
+	inode->i_ctime = inode->i_mtime = current_time(inode);
+	time = current_time(inode).tv_sec;
+	update.tail = sih->log_tail;
+
+	/* Return initialized blocks to the user */
+	allocated = nova_new_data_blocks(sb, sih, &blocknr, iblock,
+				 num_blocks, ALLOC_INIT_ZERO, ANY_CPU,
+				 ALLOC_FROM_HEAD);
+	if (allocated <= 0) {
+		nova_dbgv("%s alloc blocks failed %d\n", __func__,
+							allocated);
+		/*
+		 * Getting zero blocks is a failure as well. Without an error
+		 * code here the function would store block 0 in *bno and
+		 * return the requested count as if it had been mapped.
+		 */
+		ret = allocated < 0 ? allocated : -ENOSPC;
+		goto out;
+	}
+
+	num_blocks = allocated;
+	/* FIXME: how to handle file size? */
+	nova_init_file_write_item(sb, sih, &entry_item,
+					epoch_id, iblock, num_blocks,
+					blocknr, time, inode->i_size);
+
+	list_add_tail(&entry_item.list, &item_head);
+
+	nvmm = blocknr;
+
+	ret = nova_commit_writes_to_log(sb, pi, inode,
+					&item_head, num_blocks, 0);
+	if (ret < 0) {
+		nova_err(sb, "commit to log failed\n");
+		goto out;
+	}
+
+	NOVA_STATS_ADD(dax_new_blocks, 1);
+
+	*new = true;
+out:
+	if (ret < 0) {
+		nova_cleanup_incomplete_write(sb, sih, &item_head, 0);
+		num_blocks = ret;
+		goto out1;
+	}
+
+	*bno = nvmm;
+
+out1:
+	/* Release whichever flavour of the lock is currently held. */
+	if (locked)
+		sih_unlock(sih);
+	else
+		sih_unlock_shared(sih);
+
+	NOVA_END_TIMING(dax_get_block_t, get_block_time);
+	return num_blocks;
+}
+
+static int nova_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+   unsigned int flags, struct iomap *iomap)
+{
+   struct nova_sb_info *sbi = NOVA_SB(inode->i_sb);
+   unsigned int blkbits = inode->i_blkbits;
+ 

[RFC v2 71/83] Symlink support.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

NOVA allocates two blocks for a symlink inode: one for the inode log,
and one data block that stores the symname.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/Makefile  |   2 +-
 fs/nova/inode.c   |   2 +
 fs/nova/namei.c   |  70 
 fs/nova/nova.h|   5 ++
 fs/nova/symlink.c | 133 ++
 5 files changed, 211 insertions(+), 1 deletion(-)
 create mode 100644 fs/nova/symlink.c

diff --git a/fs/nova/Makefile b/fs/nova/Makefile
index 7f851f2..7bf6403 100644
--- a/fs/nova/Makefile
+++ b/fs/nova/Makefile
@@ -5,4 +5,4 @@
 obj-$(CONFIG_NOVA_FS) += nova.o
 
 nova-y := balloc.o bbuild.o dax.o dir.o file.o inode.o journal.o log.o namei.o\
- rebuild.o stats.o super.o
+ rebuild.o stats.o super.o symlink.o
diff --git a/fs/nova/inode.c b/fs/nova/inode.c
index a6d74cb..21be31a 100644
--- a/fs/nova/inode.c
+++ b/fs/nova/inode.c
@@ -285,6 +285,7 @@ static int nova_read_inode(struct super_block *sb, struct 
inode *inode,
inode->i_fop = &nova_dir_operations;
break;
case S_IFLNK:
+   inode->i_op = &nova_symlink_inode_operations;
break;
default:
inode->i_op = &nova_special_inode_operations;
@@ -983,6 +984,7 @@ struct inode *nova_new_vfs_inode(enum nova_new_inode_type 
type,
inode->i_op = &nova_special_inode_operations;
break;
case TYPE_SYMLINK:
+   inode->i_op = &nova_symlink_inode_operations;
inode->i_mapping->a_ops = &nova_aops_dax;
break;
case TYPE_MKDIR:
diff --git a/fs/nova/namei.c b/fs/nova/namei.c
index 7a81672..58f6a72 100644
--- a/fs/nova/namei.c
+++ b/fs/nova/namei.c
@@ -207,6 +207,75 @@ static int nova_mknod(struct inode *dir, struct dentry 
*dentry, umode_t mode,
return err;
 }
 
+/*
+ * Create the symbolic link @dentry in directory @dir with target @symname.
+ *
+ * strlen(@symname) + 1 bytes must fit in one block (sb->s_blocksize);
+ * longer targets fail with -ENAMETOOLONG. On success the dentry is
+ * instantiated, the new inode is unlocked and the creation is committed
+ * through nova_lite_transaction_for_new_inode().
+ *
+ * Returns 0 on success or a negative errno.
+ *
+ * NOTE(review): when nova_block_symlink() fails, the inode obtained from
+ * nova_new_vfs_inode() is neither unlocked nor released here - confirm
+ * whether cleanup is expected elsewhere.
+ */
+static int nova_symlink(struct inode *dir, struct dentry *dentry,
+	const char *symname)
+{
+	struct super_block *sb = dir->i_sb;
+	int err = -ENAMETOOLONG;
+	unsigned int len = strlen(symname);
+	struct inode *inode;
+	u64 pi_addr = 0;
+	struct nova_inode *pidir, *pi;
+	struct nova_inode_update update;
+	u64 ino;
+	u64 epoch_id;
+	timing_t symlink_time;
+
+	NOVA_START_TIMING(symlink_t, symlink_time);
+	if (len + 1 > sb->s_blocksize)
+		goto out;
+
+	pidir = nova_get_inode(sb, dir);
+	if (!pidir) {
+		/* Do not report a missing directory inode as ENAMETOOLONG. */
+		err = -EACCES;
+		goto out_fail;
+	}
+
+	epoch_id = nova_get_epoch_id(sb);
+	ino = nova_new_nova_inode(sb, &pi_addr);
+	if (ino == 0) {
+		/*
+		 * NOTE(review): the helper only signals failure with 0;
+		 * -ENOSPC is assumed to be the closest match - confirm.
+		 */
+		err = -ENOSPC;
+		goto out_fail;
+	}
+
+	nova_dbgv("%s: name %s, symname %s\n", __func__,
+				dentry->d_name.name, symname);
+	nova_dbgv("%s: inode %llu, dir %lu\n", __func__, ino, dir->i_ino);
+
+	update.tail = 0;
+	err = nova_add_dentry(dentry, ino, 0, &update, epoch_id);
+	if (err)
+		goto out_fail;
+
+	inode = nova_new_vfs_inode(TYPE_SYMLINK, dir, pi_addr, ino,
+					S_IFLNK|0777, len, 0,
+					&dentry->d_name, epoch_id);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out_fail;
+	}
+
+	pi = nova_get_inode(sb, inode);
+
+	err = nova_block_symlink(sb, pi, inode, symname, len, epoch_id);
+	if (err)
+		goto out_fail;
+
+	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
+
+	nova_lite_transaction_for_new_inode(sb, pi, pidir, inode, dir,
+					&update);
+out:
+	NOVA_END_TIMING(symlink_t, symlink_time);
+	return err;
+
+out_fail:
+	nova_err(sb, "%s return %d\n", __func__, err);
+	goto out;
+}
+
 static void nova_lite_transaction_for_time_and_link(struct super_block *sb,
struct nova_inode *pi, struct nova_inode *pidir, struct inode *inode,
struct inode *dir, struct nova_inode_update *update,
@@ -764,6 +833,7 @@ const struct inode_operations nova_dir_inode_operations = {
.lookup = nova_lookup,
.link   = nova_link,
.unlink = nova_unlink,
+   .symlink= nova_symlink,
.mkdir  = nova_mkdir,
.rmdir  = nova_rmdir,
.mknod  = nova_mknod,
diff --git a/fs/nova/nova.h b/fs/nova/nova.h
index 40c70da..6392bb3 100644
--- a/fs/nova/nova.h
+++ b/fs/nova/nova.h
@@ -518,6 +518,11 @@ int nova_rebuild_dir_inode_tree(struct super_block *sb,
 int nova_rebuild_inode(struct super_block *sb, struct nova_inode_info *si,
u64 ino, u64 pi_addr, int rebuild_dir);
 
+/* symlink.c */
+int nova_block_symlink(struct super_blo

[RFC v2 71/83] Symlink support.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

NOVA allocates two blocks for a symlink inode: one for the inode log,
and one data block that stores the symname.

Signed-off-by: Andiry Xu 
---
 fs/nova/Makefile  |   2 +-
 fs/nova/inode.c   |   2 +
 fs/nova/namei.c   |  70 
 fs/nova/nova.h|   5 ++
 fs/nova/symlink.c | 133 ++
 5 files changed, 211 insertions(+), 1 deletion(-)
 create mode 100644 fs/nova/symlink.c

diff --git a/fs/nova/Makefile b/fs/nova/Makefile
index 7f851f2..7bf6403 100644
--- a/fs/nova/Makefile
+++ b/fs/nova/Makefile
@@ -5,4 +5,4 @@
 obj-$(CONFIG_NOVA_FS) += nova.o
 
 nova-y := balloc.o bbuild.o dax.o dir.o file.o inode.o journal.o log.o namei.o\
- rebuild.o stats.o super.o
+ rebuild.o stats.o super.o symlink.o
diff --git a/fs/nova/inode.c b/fs/nova/inode.c
index a6d74cb..21be31a 100644
--- a/fs/nova/inode.c
+++ b/fs/nova/inode.c
@@ -285,6 +285,7 @@ static int nova_read_inode(struct super_block *sb, struct 
inode *inode,
inode->i_fop = &nova_dir_operations;
break;
case S_IFLNK:
+   inode->i_op = &nova_symlink_inode_operations;
break;
default:
inode->i_op = &nova_special_inode_operations;
@@ -983,6 +984,7 @@ struct inode *nova_new_vfs_inode(enum nova_new_inode_type 
type,
inode->i_op = &nova_special_inode_operations;
break;
case TYPE_SYMLINK:
+   inode->i_op = &nova_symlink_inode_operations;
inode->i_mapping->a_ops = &nova_aops_dax;
break;
case TYPE_MKDIR:
diff --git a/fs/nova/namei.c b/fs/nova/namei.c
index 7a81672..58f6a72 100644
--- a/fs/nova/namei.c
+++ b/fs/nova/namei.c
@@ -207,6 +207,75 @@ static int nova_mknod(struct inode *dir, struct dentry 
*dentry, umode_t mode,
return err;
 }
 
+/*
+ * Create the symbolic link @dentry in directory @dir with target @symname.
+ *
+ * strlen(@symname) + 1 bytes must fit in one block (sb->s_blocksize);
+ * longer targets fail with -ENAMETOOLONG. On success the dentry is
+ * instantiated, the new inode is unlocked and the creation is committed
+ * through nova_lite_transaction_for_new_inode().
+ *
+ * Returns 0 on success or a negative errno.
+ *
+ * NOTE(review): when nova_block_symlink() fails, the inode obtained from
+ * nova_new_vfs_inode() is neither unlocked nor released here - confirm
+ * whether cleanup is expected elsewhere.
+ */
+static int nova_symlink(struct inode *dir, struct dentry *dentry,
+	const char *symname)
+{
+	struct super_block *sb = dir->i_sb;
+	int err = -ENAMETOOLONG;
+	unsigned int len = strlen(symname);
+	struct inode *inode;
+	u64 pi_addr = 0;
+	struct nova_inode *pidir, *pi;
+	struct nova_inode_update update;
+	u64 ino;
+	u64 epoch_id;
+	timing_t symlink_time;
+
+	NOVA_START_TIMING(symlink_t, symlink_time);
+	if (len + 1 > sb->s_blocksize)
+		goto out;
+
+	pidir = nova_get_inode(sb, dir);
+	if (!pidir) {
+		/* Do not report a missing directory inode as ENAMETOOLONG. */
+		err = -EACCES;
+		goto out_fail;
+	}
+
+	epoch_id = nova_get_epoch_id(sb);
+	ino = nova_new_nova_inode(sb, &pi_addr);
+	if (ino == 0) {
+		/*
+		 * NOTE(review): the helper only signals failure with 0;
+		 * -ENOSPC is assumed to be the closest match - confirm.
+		 */
+		err = -ENOSPC;
+		goto out_fail;
+	}
+
+	nova_dbgv("%s: name %s, symname %s\n", __func__,
+				dentry->d_name.name, symname);
+	nova_dbgv("%s: inode %llu, dir %lu\n", __func__, ino, dir->i_ino);
+
+	update.tail = 0;
+	err = nova_add_dentry(dentry, ino, 0, &update, epoch_id);
+	if (err)
+		goto out_fail;
+
+	inode = nova_new_vfs_inode(TYPE_SYMLINK, dir, pi_addr, ino,
+					S_IFLNK|0777, len, 0,
+					&dentry->d_name, epoch_id);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out_fail;
+	}
+
+	pi = nova_get_inode(sb, inode);
+
+	err = nova_block_symlink(sb, pi, inode, symname, len, epoch_id);
+	if (err)
+		goto out_fail;
+
+	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
+
+	nova_lite_transaction_for_new_inode(sb, pi, pidir, inode, dir,
+					&update);
+out:
+	NOVA_END_TIMING(symlink_t, symlink_time);
+	return err;
+
+out_fail:
+	nova_err(sb, "%s return %d\n", __func__, err);
+	goto out;
+}
+
 static void nova_lite_transaction_for_time_and_link(struct super_block *sb,
struct nova_inode *pi, struct nova_inode *pidir, struct inode *inode,
struct inode *dir, struct nova_inode_update *update,
@@ -764,6 +833,7 @@ const struct inode_operations nova_dir_inode_operations = {
.lookup = nova_lookup,
.link   = nova_link,
.unlink = nova_unlink,
+   .symlink= nova_symlink,
.mkdir  = nova_mkdir,
.rmdir  = nova_rmdir,
.mknod  = nova_mknod,
diff --git a/fs/nova/nova.h b/fs/nova/nova.h
index 40c70da..6392bb3 100644
--- a/fs/nova/nova.h
+++ b/fs/nova/nova.h
@@ -518,6 +518,11 @@ int nova_rebuild_dir_inode_tree(struct super_block *sb,
 int nova_rebuild_inode(struct super_block *sb, struct nova_inode_info *si,
u64 ino, u64 pi_addr, int rebuild_dir);
 
+/* symlink.c */
+int nova_block_symlink(struct super_block *sb, struct nova_inode *pi,
+   struct 

[RFC v2 73/83] Dax: Add iomap operations.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

The key part of the iomap support is nova_dax_get_blocks(). It first takes the
read lock and looks up the block; if the block is missing, it takes the write
lock, checks again and allocates a new block if needed.

Signed-off-by: Andiry Xu 
---
 fs/nova/dax.c  | 184 +
 fs/nova/nova.h |   3 +
 2 files changed, 187 insertions(+)

diff --git a/fs/nova/dax.c b/fs/nova/dax.c
index 8624ce4..e639b23 100644
--- a/fs/nova/dax.c
+++ b/fs/nova/dax.c
@@ -731,3 +731,187 @@ ssize_t nova_inplace_file_write(struct file *filp,
 
return ret;
 }
+
+/*
+ * Map up to @max_blocks blocks starting at page offset @iblock, allocating
+ * zeroed blocks when @create is set and no usable entry exists.
+ *
+ * The lookup starts under sih_lock_shared(). If blocks must be allocated,
+ * the shared lock is dropped, sih_lock() is taken and the lookup is repeated
+ * (this time also sizing the hole) before anything is allocated.
+ *
+ * @bno:      receives the block number of the first mapped block.
+ *            NOTE(review): the value is an unsigned long narrowed to u32.
+ * @new:      set to true only when blocks were allocated here; it is never
+ *            cleared, so the caller has to initialise it.
+ * @boundary: not used by this function.
+ * @create:   zero for a plain lookup, non-zero to allocate missing blocks.
+ *
+ * return > 0, # of blocks mapped or allocated.
+ * return = 0, if plain lookup failed.
+ * return < 0, error case.
+ */
+static int nova_dax_get_blocks(struct inode *inode, sector_t iblock,
+	unsigned long max_blocks, u32 *bno, bool *new, bool *boundary,
+	int create)
+{
+	struct super_block *sb = inode->i_sb;
+	struct nova_inode *pi;
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode_info_header *sih = &si->header;
+	struct nova_file_write_entry *entry = NULL;
+	struct nova_file_write_item entry_item;
+	struct list_head item_head;
+	struct nova_inode_update update;
+	u32 time;
+	unsigned long nvmm = 0;
+	unsigned long blocknr = 0;
+	u64 epoch_id;
+	int num_blocks = 0;
+	int inplace = 0;
+	int allocated = 0;
+	int locked = 0;
+	int check_next;
+	int ret = 0;
+	timing_t get_block_time;
+
+	if (max_blocks == 0)
+		return 0;
+
+	NOVA_START_TIMING(dax_get_block_t, get_block_time);
+	INIT_LIST_HEAD(&item_head);
+
+	nova_dbgv("%s: pgoff %lu, num %lu, create %d\n",
+			__func__, iblock, max_blocks, create);
+
+	epoch_id = nova_get_epoch_id(sb);
+
+	check_next = 0;
+	sih_lock_shared(sih);
+
+again:
+	num_blocks = nova_check_existing_entry(sb, inode, max_blocks,
+					iblock, &entry, check_next,
+					epoch_id, &inplace);
+
+	if (entry) {
+		/*
+		 * An existing entry is returned as-is for lookups, and for
+		 * create requests when it belongs to the current epoch.
+		 */
+		if (create == 0 || inplace) {
+			nvmm = get_nvmm(sb, sih, entry, iblock);
+			nova_dbgv("%s: found pgoff %lu, block %lu\n",
+					__func__, iblock, nvmm);
+			goto out;
+		}
+	}
+
+	if (create == 0) {
+		num_blocks = 0;
+		goto out1;
+	}
+
+	if (locked == 0) {
+		sih_unlock_shared(sih);
+		sih_lock(sih);
+		locked = 1;
+		/* Check again incase someone has done it for us */
+		check_next = 1;
+		goto again;
+	}
+
+	pi = nova_get_inode(sb, inode);
+	inode->i_ctime = inode->i_mtime = current_time(inode);
+	time = current_time(inode).tv_sec;
+	update.tail = sih->log_tail;
+
+	/* Return initialized blocks to the user */
+	allocated = nova_new_data_blocks(sb, sih, &blocknr, iblock,
+				 num_blocks, ALLOC_INIT_ZERO, ANY_CPU,
+				 ALLOC_FROM_HEAD);
+	if (allocated <= 0) {
+		nova_dbgv("%s alloc blocks failed %d\n", __func__,
+							allocated);
+		/*
+		 * Getting zero blocks is a failure as well. Without an error
+		 * code here the function would store block 0 in *bno and
+		 * return the requested count as if it had been mapped.
+		 */
+		ret = allocated < 0 ? allocated : -ENOSPC;
+		goto out;
+	}
+
+	num_blocks = allocated;
+	/* FIXME: how to handle file size? */
+	nova_init_file_write_item(sb, sih, &entry_item,
+					epoch_id, iblock, num_blocks,
+					blocknr, time, inode->i_size);
+
+	list_add_tail(&entry_item.list, &item_head);
+
+	nvmm = blocknr;
+
+	ret = nova_commit_writes_to_log(sb, pi, inode,
+					&item_head, num_blocks, 0);
+	if (ret < 0) {
+		nova_err(sb, "commit to log failed\n");
+		goto out;
+	}
+
+	NOVA_STATS_ADD(dax_new_blocks, 1);
+
+	*new = true;
+out:
+	if (ret < 0) {
+		nova_cleanup_incomplete_write(sb, sih, &item_head, 0);
+		num_blocks = ret;
+		goto out1;
+	}
+
+	*bno = nvmm;
+
+out1:
+	/* Release whichever flavour of the lock is currently held. */
+	if (locked)
+		sih_unlock(sih);
+	else
+		sih_unlock_shared(sih);
+
+	NOVA_END_TIMING(dax_get_block_t, get_block_time);
+	return num_blocks;
+}
+
+static int nova_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+   unsigned int flags, struct iomap *iomap)
+{
+   struct nova_sb_info *sbi = NOVA_SB(inode->i_sb);
+   unsigned int blkbits = inode->i_blkbits;
+   unsigned long first_block = offset >> blkbits;

[RFC v2 72/83] File operation: fallocate.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

Fallocate works similarly to writes, allocating zeroed blocks
for the holes in the requested region.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/file.c | 148 +
 fs/nova/nova.h |   5 ++
 2 files changed, 153 insertions(+)

diff --git a/fs/nova/file.c b/fs/nova/file.c
index b94a9a3..a6b5bd3 100644
--- a/fs/nova/file.c
+++ b/fs/nova/file.c
@@ -113,6 +113,153 @@ static int nova_open(struct inode *inode, struct file 
*filp)
return generic_file_open(inode, filp);
 }
 
+static long nova_fallocate(struct file *file, int mode, loff_t offset,
+   loff_t len)
+{
+   struct inode *inode = file->f_path.dentry->d_inode;
+   struct super_block *sb = inode->i_sb;
+   struct nova_inode_info *si = NOVA_I(inode);
+   struct nova_inode_info_header *sih = &si->header;
+   struct nova_inode *pi;
+   struct nova_file_write_entry *entry;
+   struct nova_file_write_item *entry_item;
+   struct list_head item_head;
+   struct nova_inode_update update;
+   unsigned long start_blk, num_blocks, ent_blks = 0;
+   unsigned long total_blocks = 0;
+   unsigned long blocknr = 0;
+   unsigned long blockoff;
+   loff_t new_size;
+   long ret = 0;
+   int inplace = 0;
+   int blocksize_mask;
+   int allocated = 0;
+   timing_t fallocate_time;
+   u64 epoch_id;
+   u32 time;
+
+   /*
+* Fallocate does not make much sense for CoW,
+* but we still support it for DAX-mmap purpose.
+*/
+
+   /* We only support the FALLOC_FL_KEEP_SIZE mode */
+   if (mode & ~FALLOC_FL_KEEP_SIZE)
+   return -EOPNOTSUPP;
+
+   if (S_ISDIR(inode->i_mode))
+   return -ENODEV;
+
+   INIT_LIST_HEAD(&item_head);
+   new_size = len + offset;
+   if (!(mode & FALLOC_FL_KEEP_SIZE) && new_size > inode->i_size) {
+   ret = inode_newsize_ok(inode, new_size);
+   if (ret)
+   return ret;
+   } else {
+   new_size = inode->i_size;
+   }
+
+   nova_dbgv("%s: inode %lu, offset %lld, count %lld, mode 0x%x\n",
+   __func__, inode->i_ino, offset, len, mode);
+
+   NOVA_START_TIMING(fallocate_t, fallocate_time);
+   inode_lock(inode);
+   sih_lock(sih);
+
+   pi = nova_get_inode(sb, inode);
+   if (!pi) {
+   ret = -EACCES;
+   goto out;
+   }
+
+   inode->i_mtime = inode->i_ctime = current_time(inode);
+   time = current_time(inode).tv_sec;
+
+   blocksize_mask = sb->s_blocksize - 1;
+   start_blk = offset >> sb->s_blocksize_bits;
+   blockoff = offset & blocksize_mask;
+   num_blocks = (blockoff + len + blocksize_mask) >> sb->s_blocksize_bits;
+
+   epoch_id = nova_get_epoch_id(sb);
+   update.tail = sih->log_tail;
+   while (num_blocks > 0) {
+   ent_blks = nova_check_existing_entry(sb, inode, num_blocks,
+   start_blk, &entry,
+   1, epoch_id, &inplace);
+
+   if (entry && inplace) {
+   if (entry->size < new_size) {
+   /* Update existing entry */
+   entry->size = new_size;
+   nova_persist_entry(entry);
+   }
+   allocated = ent_blks;
+   goto next;
+   }
+
+   /* Allocate zeroed blocks to fill hole */
+   allocated = nova_new_data_blocks(sb, sih, &blocknr, start_blk,
+ent_blks, ALLOC_INIT_ZERO, ANY_CPU,
+ALLOC_FROM_HEAD);
+   nova_dbgv("%s: alloc %d blocks @ %lu\n", __func__,
+   allocated, blocknr);
+
+   if (allocated <= 0) {
+   nova_dbg("%s alloc %lu blocks failed!, %d\n",
+   __func__, ent_blks, allocated);
+   ret = allocated;
+   goto out;
+   }
+
+   entry_item = nova_alloc_file_write_item(sb);
+   if (!entry_item) {
+   ret = -ENOMEM;
+   goto out;
+   }
+
+   /* Handle hole fill write */
+   nova_init_file_write_item(sb, sih, entry_item, epoch_id,
+   start_blk, allocated, blocknr,
+   time, new_size);
+
+   list_add_tail(&entry_item->list, &item_head);
+
+   total_blocks += allocated;
+next:
+   num_blocks -= allocated;
+   start_blk += allocated;
+ 

[RFC v2 75/83] File operation: read/write iter.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

They use the iomap framework to do read/write. Due to software overheads
they are slower than dax read/write.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/file.c | 65 ++
 1 file changed, 65 insertions(+)

diff --git a/fs/nova/file.c b/fs/nova/file.c
index 0ae0333..7e90415 100644
--- a/fs/nova/file.c
+++ b/fs/nova/file.c
@@ -260,6 +260,69 @@ static long nova_fallocate(struct file *file, int mode, 
loff_t offset,
return ret;
 }
 
+/*
+ * read_iter file operation: read through the iomap DAX path.
+ *
+ * An empty request returns 0 immediately, before any timing is started.
+ * Otherwise dax_iomap_rw() runs with the inode lock held shared, and the
+ * file is marked accessed afterwards.
+ *
+ * Returns whatever dax_iomap_rw() returned.
+ */
+static ssize_t nova_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct inode *inode = iocb->ki_filp->f_mapping->host;
+	ssize_t ret;
+	timing_t read_iter_time;
+
+	if (!iov_iter_count(to))
+		return 0;
+
+	NOVA_START_TIMING(read_iter_t, read_iter_time);
+
+	inode_lock_shared(inode);
+	ret = dax_iomap_rw(iocb, to, &nova_iomap_ops);
+	inode_unlock_shared(inode);
+
+	file_accessed(iocb->ki_filp);
+	NOVA_END_TIMING(read_iter_t, read_iter_time);
+	return ret;
+}
+
+/*
+ * write_iter file operation: write through the iomap DAX path.
+ *
+ * With the inode lock held, the request goes through generic_write_checks(),
+ * file_remove_privs() and file_update_time() before dax_iomap_rw() performs
+ * the write. If the write moved iocb->ki_pos past the current size, both the
+ * VFS inode size and sih->i_size are updated and the inode is marked dirty.
+ * generic_write_sync() is called after the lock is dropped, and only when
+ * something was written.
+ *
+ * Returns the result of the first step that did not succeed, otherwise the
+ * value of generic_write_sync().
+ */
+static ssize_t nova_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode_info_header *sih = &si->header;
+	ssize_t ret;
+	timing_t write_iter_time;
+
+	NOVA_START_TIMING(write_iter_t, write_iter_time);
+	inode_lock(inode);
+	ret = generic_write_checks(iocb, from);
+	if (ret <= 0)
+		goto out_unlock;
+
+	ret = file_remove_privs(file);
+	if (ret)
+		goto out_unlock;
+
+	ret = file_update_time(file);
+	if (ret)
+		goto out_unlock;
+
+	ret = dax_iomap_rw(iocb, from, &nova_iomap_ops);
+	/* Only a write that ended beyond the old EOF grows the file. */
+	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
+		i_size_write(inode, iocb->ki_pos);
+		sih->i_size = iocb->ki_pos;
+		mark_inode_dirty(inode);
+	}
+
+out_unlock:
+	inode_unlock(inode);
+	if (ret > 0)
+		ret = generic_write_sync(iocb, ret);
+	NOVA_END_TIMING(write_iter_t, write_iter_time);
+	return ret;
+}
+
 static ssize_t
 do_dax_mapping_read(struct file *filp, char __user *buf,
size_t len, loff_t *ppos)
@@ -645,6 +708,8 @@ const struct file_operations nova_dax_file_operations = {
.llseek = nova_llseek,
.read   = nova_dax_file_read,
.write  = nova_dax_file_write,
+   .read_iter  = nova_dax_read_iter,
+   .write_iter = nova_dax_write_iter,
.mmap   = nova_dax_file_mmap,
.open   = nova_open,
.fsync  = nova_fsync,
-- 
2.7.4



[RFC v2 72/83] File operation: fallocate.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

Fallocate works similarly to writes, allocating zeroed blocks
for the holes in the requested region.

Signed-off-by: Andiry Xu 
---
 fs/nova/file.c | 148 +
 fs/nova/nova.h |   5 ++
 2 files changed, 153 insertions(+)

diff --git a/fs/nova/file.c b/fs/nova/file.c
index b94a9a3..a6b5bd3 100644
--- a/fs/nova/file.c
+++ b/fs/nova/file.c
@@ -113,6 +113,153 @@ static int nova_open(struct inode *inode, struct file 
*filp)
return generic_file_open(inode, filp);
 }
 
+static long nova_fallocate(struct file *file, int mode, loff_t offset,
+   loff_t len)
+{
+   struct inode *inode = file->f_path.dentry->d_inode;
+   struct super_block *sb = inode->i_sb;
+   struct nova_inode_info *si = NOVA_I(inode);
+   struct nova_inode_info_header *sih = &si->header;
+   struct nova_inode *pi;
+   struct nova_file_write_entry *entry;
+   struct nova_file_write_item *entry_item;
+   struct list_head item_head;
+   struct nova_inode_update update;
+   unsigned long start_blk, num_blocks, ent_blks = 0;
+   unsigned long total_blocks = 0;
+   unsigned long blocknr = 0;
+   unsigned long blockoff;
+   loff_t new_size;
+   long ret = 0;
+   int inplace = 0;
+   int blocksize_mask;
+   int allocated = 0;
+   timing_t fallocate_time;
+   u64 epoch_id;
+   u32 time;
+
+   /*
+* Fallocate does not make much sense for CoW,
+* but we still support it for DAX-mmap purpose.
+*/
+
+   /* We only support the FALLOC_FL_KEEP_SIZE mode */
+   if (mode & ~FALLOC_FL_KEEP_SIZE)
+   return -EOPNOTSUPP;
+
+   if (S_ISDIR(inode->i_mode))
+   return -ENODEV;
+
+   INIT_LIST_HEAD(&item_head);
+   new_size = len + offset;
+   if (!(mode & FALLOC_FL_KEEP_SIZE) && new_size > inode->i_size) {
+   ret = inode_newsize_ok(inode, new_size);
+   if (ret)
+   return ret;
+   } else {
+   new_size = inode->i_size;
+   }
+
+   nova_dbgv("%s: inode %lu, offset %lld, count %lld, mode 0x%x\n",
+   __func__, inode->i_ino, offset, len, mode);
+
+   NOVA_START_TIMING(fallocate_t, fallocate_time);
+   inode_lock(inode);
+   sih_lock(sih);
+
+   pi = nova_get_inode(sb, inode);
+   if (!pi) {
+   ret = -EACCES;
+   goto out;
+   }
+
+   inode->i_mtime = inode->i_ctime = current_time(inode);
+   time = current_time(inode).tv_sec;
+
+   blocksize_mask = sb->s_blocksize - 1;
+   start_blk = offset >> sb->s_blocksize_bits;
+   blockoff = offset & blocksize_mask;
+   num_blocks = (blockoff + len + blocksize_mask) >> sb->s_blocksize_bits;
+
+   epoch_id = nova_get_epoch_id(sb);
+   update.tail = sih->log_tail;
+   while (num_blocks > 0) {
+   ent_blks = nova_check_existing_entry(sb, inode, num_blocks,
+   start_blk, &entry,
+   1, epoch_id, &inplace);
+
+   if (entry && inplace) {
+   if (entry->size < new_size) {
+   /* Update existing entry */
+   entry->size = new_size;
+   nova_persist_entry(entry);
+   }
+   allocated = ent_blks;
+   goto next;
+   }
+
+   /* Allocate zeroed blocks to fill hole */
+   allocated = nova_new_data_blocks(sb, sih, &blocknr, start_blk,
+ent_blks, ALLOC_INIT_ZERO, ANY_CPU,
+ALLOC_FROM_HEAD);
+   nova_dbgv("%s: alloc %d blocks @ %lu\n", __func__,
+   allocated, blocknr);
+
+   if (allocated <= 0) {
+   nova_dbg("%s alloc %lu blocks failed!, %d\n",
+   __func__, ent_blks, allocated);
+   ret = allocated;
+   goto out;
+   }
+
+   entry_item = nova_alloc_file_write_item(sb);
+   if (!entry_item) {
+   ret = -ENOMEM;
+   goto out;
+   }
+
+   /* Handle hole fill write */
+   nova_init_file_write_item(sb, sih, entry_item, epoch_id,
+   start_blk, allocated, blocknr,
+   time, new_size);
+
+   list_add_tail(&entry_item->list, &item_head);
+
+   total_blocks += allocated;
+next:
+   num_blocks -= allocated;
+   start_blk += allocated;
+   }
+
+   ret = nova_commit_writes_to_log(sb, pi

[RFC v2 75/83] File operation: read/write iter.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

They use the iomap framework to do read/write. Due to software overheads
they are slower than dax read/write.

Signed-off-by: Andiry Xu 
---
 fs/nova/file.c | 65 ++
 1 file changed, 65 insertions(+)

diff --git a/fs/nova/file.c b/fs/nova/file.c
index 0ae0333..7e90415 100644
--- a/fs/nova/file.c
+++ b/fs/nova/file.c
@@ -260,6 +260,69 @@ static long nova_fallocate(struct file *file, int mode, 
loff_t offset,
return ret;
 }
 
+static ssize_t nova_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+   struct inode *inode = iocb->ki_filp->f_mapping->host;
+   ssize_t ret;
+   timing_t read_iter_time;
+
+   if (!iov_iter_count(to))
+   return 0;
+
+   NOVA_START_TIMING(read_iter_t, read_iter_time);
+
+   inode_lock_shared(inode);
+   ret = dax_iomap_rw(iocb, to, &nova_iomap_ops);
+   inode_unlock_shared(inode);
+
+   file_accessed(iocb->ki_filp);
+   NOVA_END_TIMING(read_iter_t, read_iter_time);
+   return ret;
+}
+
+static ssize_t nova_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+   struct file *file = iocb->ki_filp;
+   struct inode *inode = file->f_mapping->host;
+   struct nova_inode_info *si = NOVA_I(inode);
+   struct nova_inode_info_header *sih = &si->header;
+   loff_t offset;
+   size_t count;
+   ssize_t ret;
+   timing_t write_iter_time;
+
+   NOVA_START_TIMING(write_iter_t, write_iter_time);
+   inode_lock(inode);
+   ret = generic_write_checks(iocb, from);
+   if (ret <= 0)
+   goto out_unlock;
+
+   ret = file_remove_privs(file);
+   if (ret)
+   goto out_unlock;
+
+   ret = file_update_time(file);
+   if (ret)
+   goto out_unlock;
+
+   count = iov_iter_count(from);
+   offset = iocb->ki_pos;
+
+   ret = dax_iomap_rw(iocb, from, &nova_iomap_ops);
+   if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
+   i_size_write(inode, iocb->ki_pos);
+   sih->i_size = iocb->ki_pos;
+   mark_inode_dirty(inode);
+   }
+
+out_unlock:
+   inode_unlock(inode);
+   if (ret > 0)
+   ret = generic_write_sync(iocb, ret);
+   NOVA_END_TIMING(write_iter_t, write_iter_time);
+   return ret;
+}
+
 static ssize_t
 do_dax_mapping_read(struct file *filp, char __user *buf,
size_t len, loff_t *ppos)
@@ -645,6 +708,8 @@ const struct file_operations nova_dax_file_operations = {
.llseek = nova_llseek,
.read   = nova_dax_file_read,
.write  = nova_dax_file_write,
+   .read_iter  = nova_dax_read_iter,
+   .write_iter = nova_dax_write_iter,
.mmap   = nova_dax_file_mmap,
.open   = nova_open,
.fsync  = nova_fsync,
-- 
2.7.4



[RFC v2 74/83] File operation: Mmap.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

NOVA uses the iomap framework to support mmap operation.
Currently it does not support huge page mmap.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/dax.c  | 53 +
 fs/nova/file.c | 25 +
 fs/nova/nova.h |  1 +
 3 files changed, 79 insertions(+)

diff --git a/fs/nova/dax.c b/fs/nova/dax.c
index e639b23..fa424b1 100644
--- a/fs/nova/dax.c
+++ b/fs/nova/dax.c
@@ -915,3 +915,56 @@ const struct iomap_ops nova_iomap_ops = {
.iomap_begin= nova_iomap_begin,
.iomap_end  = nova_iomap_end,
 };
+
+
+/* TODO: Hugemap mmap */
+static int nova_dax_huge_fault(struct vm_fault *vmf,
+   enum page_entry_size pe_size)
+{
+   int ret = 0;
+   timing_t fault_time;
+   struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+   struct inode *inode = mapping->host;
+
+   NOVA_START_TIMING(pmd_fault_t, fault_time);
+
+   nova_dbgv("%s: inode %lu, pgoff %lu\n",
+ __func__, inode->i_ino, vmf->pgoff);
+
+   if (vmf->flags & FAULT_FLAG_WRITE)
+   file_update_time(vmf->vma->vm_file);
+
+   ret = dax_iomap_fault(vmf, pe_size, NULL, NULL, &nova_iomap_ops);
+
+   NOVA_END_TIMING(pmd_fault_t, fault_time);
+   return ret;
+}
+
+static int nova_dax_fault(struct vm_fault *vmf)
+{
+   struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+   struct inode *inode = mapping->host;
+
+   nova_dbgv("%s: inode %lu, pgoff %lu, flags 0x%x\n",
+ __func__, inode->i_ino, vmf->pgoff, vmf->flags);
+
+   return nova_dax_huge_fault(vmf, PE_SIZE_PTE);
+}
+
+static int nova_dax_pfn_mkwrite(struct vm_fault *vmf)
+{
+   struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+   struct inode *inode = mapping->host;
+
+   nova_dbgv("%s: inode %lu, pgoff %lu, flags 0x%x\n",
+ __func__, inode->i_ino, vmf->pgoff, vmf->flags);
+
+   return nova_dax_huge_fault(vmf, PE_SIZE_PTE);
+}
+
+const struct vm_operations_struct nova_dax_vm_ops = {
+   .fault  = nova_dax_fault,
+   .huge_fault = nova_dax_huge_fault,
+   .page_mkwrite = nova_dax_fault,
+   .pfn_mkwrite = nova_dax_pfn_mkwrite,
+};
diff --git a/fs/nova/file.c b/fs/nova/file.c
index a6b5bd3..0ae0333 100644
--- a/fs/nova/file.c
+++ b/fs/nova/file.c
@@ -617,10 +617,35 @@ static ssize_t nova_dax_file_write(struct file *filp, 
const char __user *buf,
 }
 
 
+static int nova_dax_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+   struct inode *inode = file->f_mapping->host;
+
+   file_accessed(file);
+
+   vma->vm_flags |= VM_MIXEDMAP;
+
+   vma->vm_ops = &nova_dax_vm_ops;
+
+   nova_dbg_mmap4k("[%s:%d] inode %lu, MMAP 4KPAGE vm_start(0x%lx), "
+   "vm_end(0x%lx), vm pgoff %lu, %lu blocks, "
+   "vm_flags(0x%lx), vm_page_prot(0x%lx)\n",
+   __func__, __LINE__,
+   inode->i_ino, vma->vm_start, vma->vm_end,
+   vma->vm_pgoff,
+   (vma->vm_end - vma->vm_start) >> PAGE_SHIFT,
+   vma->vm_flags,
+   pgprot_val(vma->vm_page_prot));
+
+   return 0;
+}
+
+
 const struct file_operations nova_dax_file_operations = {
.llseek = nova_llseek,
.read   = nova_dax_file_read,
.write  = nova_dax_file_write,
+   .mmap   = nova_dax_file_mmap,
.open   = nova_open,
.fsync  = nova_fsync,
.flush  = nova_flush,
diff --git a/fs/nova/nova.h b/fs/nova/nova.h
index 0d62c47..d209cfc 100644
--- a/fs/nova/nova.h
+++ b/fs/nova/nova.h
@@ -488,6 +488,7 @@ ssize_t do_nova_inplace_file_write(struct file *filp, const 
char __user *buf,
size_t len, loff_t *ppos);
 
 extern const struct iomap_ops nova_iomap_ops;
+extern const struct vm_operations_struct nova_dax_vm_ops;
 
 
 /* dir.c */
-- 
2.7.4



[RFC v2 74/83] File operation: Mmap.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

NOVA uses the iomap framework to support mmap operation.
Currently it does not support huge page mmap.

Signed-off-by: Andiry Xu 
---
 fs/nova/dax.c  | 53 +
 fs/nova/file.c | 25 +
 fs/nova/nova.h |  1 +
 3 files changed, 79 insertions(+)

diff --git a/fs/nova/dax.c b/fs/nova/dax.c
index e639b23..fa424b1 100644
--- a/fs/nova/dax.c
+++ b/fs/nova/dax.c
@@ -915,3 +915,56 @@ const struct iomap_ops nova_iomap_ops = {
.iomap_begin= nova_iomap_begin,
.iomap_end  = nova_iomap_end,
 };
+
+
+/* TODO: Hugemap mmap */
+static int nova_dax_huge_fault(struct vm_fault *vmf,
+   enum page_entry_size pe_size)
+{
+   int ret = 0;
+   timing_t fault_time;
+   struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+   struct inode *inode = mapping->host;
+
+   NOVA_START_TIMING(pmd_fault_t, fault_time);
+
+   nova_dbgv("%s: inode %lu, pgoff %lu\n",
+ __func__, inode->i_ino, vmf->pgoff);
+
+   if (vmf->flags & FAULT_FLAG_WRITE)
+   file_update_time(vmf->vma->vm_file);
+
+   ret = dax_iomap_fault(vmf, pe_size, NULL, NULL, &nova_iomap_ops);
+
+   NOVA_END_TIMING(pmd_fault_t, fault_time);
+   return ret;
+}
+
+static int nova_dax_fault(struct vm_fault *vmf)
+{
+   struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+   struct inode *inode = mapping->host;
+
+   nova_dbgv("%s: inode %lu, pgoff %lu, flags 0x%x\n",
+ __func__, inode->i_ino, vmf->pgoff, vmf->flags);
+
+   return nova_dax_huge_fault(vmf, PE_SIZE_PTE);
+}
+
+static int nova_dax_pfn_mkwrite(struct vm_fault *vmf)
+{
+   struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+   struct inode *inode = mapping->host;
+
+   nova_dbgv("%s: inode %lu, pgoff %lu, flags 0x%x\n",
+ __func__, inode->i_ino, vmf->pgoff, vmf->flags);
+
+   return nova_dax_huge_fault(vmf, PE_SIZE_PTE);
+}
+
+const struct vm_operations_struct nova_dax_vm_ops = {
+   .fault  = nova_dax_fault,
+   .huge_fault = nova_dax_huge_fault,
+   .page_mkwrite = nova_dax_fault,
+   .pfn_mkwrite = nova_dax_pfn_mkwrite,
+};
diff --git a/fs/nova/file.c b/fs/nova/file.c
index a6b5bd3..0ae0333 100644
--- a/fs/nova/file.c
+++ b/fs/nova/file.c
@@ -617,10 +617,35 @@ static ssize_t nova_dax_file_write(struct file *filp, 
const char __user *buf,
 }
 
 
+static int nova_dax_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+   struct inode *inode = file->f_mapping->host;
+
+   file_accessed(file);
+
+   vma->vm_flags |= VM_MIXEDMAP;
+
+   vma->vm_ops = &nova_dax_vm_ops;
+
+   nova_dbg_mmap4k("[%s:%d] inode %lu, MMAP 4KPAGE vm_start(0x%lx), "
+   "vm_end(0x%lx), vm pgoff %lu, %lu blocks, "
+   "vm_flags(0x%lx), vm_page_prot(0x%lx)\n",
+   __func__, __LINE__,
+   inode->i_ino, vma->vm_start, vma->vm_end,
+   vma->vm_pgoff,
+   (vma->vm_end - vma->vm_start) >> PAGE_SHIFT,
+   vma->vm_flags,
+   pgprot_val(vma->vm_page_prot));
+
+   return 0;
+}
+
+
 const struct file_operations nova_dax_file_operations = {
.llseek = nova_llseek,
.read   = nova_dax_file_read,
.write  = nova_dax_file_write,
+   .mmap   = nova_dax_file_mmap,
.open   = nova_open,
.fsync  = nova_fsync,
.flush  = nova_flush,
diff --git a/fs/nova/nova.h b/fs/nova/nova.h
index 0d62c47..d209cfc 100644
--- a/fs/nova/nova.h
+++ b/fs/nova/nova.h
@@ -488,6 +488,7 @@ ssize_t do_nova_inplace_file_write(struct file *filp, const 
char __user *buf,
size_t len, loff_t *ppos);
 
 extern const struct iomap_ops nova_iomap_ops;
+extern const struct vm_operations_struct nova_dax_vm_ops;
 
 
 /* dir.c */
-- 
2.7.4



[RFC v2 76/83] Ioctl support.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

NOVA appends link change entry to the inode log to implement
SETFLAGS and SETVERSION.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/Makefile |   4 +-
 fs/nova/dir.c|   4 ++
 fs/nova/file.c   |   4 ++
 fs/nova/inode.h  |   2 +
 fs/nova/ioctl.c  | 184 +++
 fs/nova/nova.h   |   7 +++
 6 files changed, 203 insertions(+), 2 deletions(-)
 create mode 100644 fs/nova/ioctl.c

diff --git a/fs/nova/Makefile b/fs/nova/Makefile
index 7bf6403..87e56c6 100644
--- a/fs/nova/Makefile
+++ b/fs/nova/Makefile
@@ -4,5 +4,5 @@
 
 obj-$(CONFIG_NOVA_FS) += nova.o
 
-nova-y := balloc.o bbuild.o dax.o dir.o file.o inode.o journal.o log.o namei.o\
- rebuild.o stats.o super.o symlink.o
+nova-y := balloc.o bbuild.o dax.o dir.o file.o inode.o ioctl.o journal.o\
+ log.o namei.o rebuild.o stats.o super.o symlink.o
diff --git a/fs/nova/dir.c b/fs/nova/dir.c
index 47ee9ad..3694d9d 100644
--- a/fs/nova/dir.c
+++ b/fs/nova/dir.c
@@ -513,4 +513,8 @@ const struct file_operations nova_dir_operations = {
.read   = generic_read_dir,
.iterate= nova_readdir,
.fsync  = noop_fsync,
+   .unlocked_ioctl = nova_ioctl,
+#ifdef CONFIG_COMPAT
+   .compat_ioctl   = nova_compat_ioctl,
+#endif
 };
diff --git a/fs/nova/file.c b/fs/nova/file.c
index 7e90415..2b70b9d 100644
--- a/fs/nova/file.c
+++ b/fs/nova/file.c
@@ -714,7 +714,11 @@ const struct file_operations nova_dax_file_operations = {
.open   = nova_open,
.fsync  = nova_fsync,
.flush  = nova_flush,
+   .unlocked_ioctl = nova_ioctl,
.fallocate  = nova_fallocate,
+#ifdef CONFIG_COMPAT
+   .compat_ioctl   = nova_compat_ioctl,
+#endif
 };
 
 const struct inode_operations nova_file_inode_operations = {
diff --git a/fs/nova/inode.h b/fs/nova/inode.h
index 693aa90..086a7cb 100644
--- a/fs/nova/inode.h
+++ b/fs/nova/inode.h
@@ -264,6 +264,8 @@ int nova_delete_file_tree(struct super_block *sb,
struct nova_inode_info_header *sih, unsigned long start_blocknr,
unsigned long last_blocknr, bool delete_nvmm, bool delete_dead,
u64 epoch_id);
+extern void nova_set_inode_flags(struct inode *inode, struct nova_inode *pi,
+   unsigned int flags);
 unsigned long nova_find_region(struct inode *inode, loff_t *offset, int hole);
 extern void nova_evict_inode(struct inode *inode);
 extern int nova_write_inode(struct inode *inode, struct writeback_control 
*wbc);
diff --git a/fs/nova/ioctl.c b/fs/nova/ioctl.c
new file mode 100644
index 000..2509371
--- /dev/null
+++ b/fs/nova/ioctl.c
@@ -0,0 +1,184 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Ioctl operations.
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu <jix...@cs.ucsd.edu>
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2010-2011 Marco Stornelli <marco.storne...@gmail.com>
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "nova.h"
+#include "inode.h"
+
+long nova_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+   struct address_space *mapping = filp->f_mapping;
+   struct inode*inode = mapping->host;
+   struct nova_inode_info *si = NOVA_I(inode);
+   struct nova_inode_info_header *sih = &si->header;
+   struct nova_inode *pi;
+   struct super_block *sb = inode->i_sb;
+   struct nova_inode_update update;
+   unsigned int flags;
+   int ret;
+
+   pi = nova_get_inode(sb, inode);
+   if (!pi)
+   return -EACCES;
+
+   switch (cmd) {
+   case FS_IOC_GETFLAGS:
+   flags = (sih->i_flags) & NOVA_FL_USER_VISIBLE;
+   return put_user(flags, (int __user *)arg);
+   case FS_IOC_SETFLAGS: {
+   unsigned int oldflags;
+   u64 old_linkc = 0;
+   u64 epoch_id;
+
+   ret = mnt_want_write_file(filp);
+   if (ret)
+   return ret;
+
+   if (!inode_owner_or_capable(inode)) {
+   ret = -EPERM;
+   goto flags_out;
+   }
+
+   if (get_user(flags, (int __user *)arg)) {
+   ret = -EFAULT;
+   goto flags_out;
+   }
+
+   inode_lock(inode);
+   sih_lock(sih);
+   oldflags = le32_to_cpu(pi->i_flags);
+
+   if ((flags ^ oldflags) &
+   (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
+   if (!capable(CAP_LINUX_IMMUTABLE)) {
+   inode_unlock(inode);
+ 

[RFC v2 76/83] Ioctl support.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

NOVA appends link change entry to the inode log to implement
SETFLAGS and SETVERSION.

Signed-off-by: Andiry Xu 
---
 fs/nova/Makefile |   4 +-
 fs/nova/dir.c|   4 ++
 fs/nova/file.c   |   4 ++
 fs/nova/inode.h  |   2 +
 fs/nova/ioctl.c  | 184 +++
 fs/nova/nova.h   |   7 +++
 6 files changed, 203 insertions(+), 2 deletions(-)
 create mode 100644 fs/nova/ioctl.c

diff --git a/fs/nova/Makefile b/fs/nova/Makefile
index 7bf6403..87e56c6 100644
--- a/fs/nova/Makefile
+++ b/fs/nova/Makefile
@@ -4,5 +4,5 @@
 
 obj-$(CONFIG_NOVA_FS) += nova.o
 
-nova-y := balloc.o bbuild.o dax.o dir.o file.o inode.o journal.o log.o namei.o\
- rebuild.o stats.o super.o symlink.o
+nova-y := balloc.o bbuild.o dax.o dir.o file.o inode.o ioctl.o journal.o\
+ log.o namei.o rebuild.o stats.o super.o symlink.o
diff --git a/fs/nova/dir.c b/fs/nova/dir.c
index 47ee9ad..3694d9d 100644
--- a/fs/nova/dir.c
+++ b/fs/nova/dir.c
@@ -513,4 +513,8 @@ const struct file_operations nova_dir_operations = {
.read   = generic_read_dir,
.iterate= nova_readdir,
.fsync  = noop_fsync,
+   .unlocked_ioctl = nova_ioctl,
+#ifdef CONFIG_COMPAT
+   .compat_ioctl   = nova_compat_ioctl,
+#endif
 };
diff --git a/fs/nova/file.c b/fs/nova/file.c
index 7e90415..2b70b9d 100644
--- a/fs/nova/file.c
+++ b/fs/nova/file.c
@@ -714,7 +714,11 @@ const struct file_operations nova_dax_file_operations = {
.open   = nova_open,
.fsync  = nova_fsync,
.flush  = nova_flush,
+   .unlocked_ioctl = nova_ioctl,
.fallocate  = nova_fallocate,
+#ifdef CONFIG_COMPAT
+   .compat_ioctl   = nova_compat_ioctl,
+#endif
 };
 
 const struct inode_operations nova_file_inode_operations = {
diff --git a/fs/nova/inode.h b/fs/nova/inode.h
index 693aa90..086a7cb 100644
--- a/fs/nova/inode.h
+++ b/fs/nova/inode.h
@@ -264,6 +264,8 @@ int nova_delete_file_tree(struct super_block *sb,
struct nova_inode_info_header *sih, unsigned long start_blocknr,
unsigned long last_blocknr, bool delete_nvmm, bool delete_dead,
u64 epoch_id);
+extern void nova_set_inode_flags(struct inode *inode, struct nova_inode *pi,
+   unsigned int flags);
 unsigned long nova_find_region(struct inode *inode, loff_t *offset, int hole);
 extern void nova_evict_inode(struct inode *inode);
 extern int nova_write_inode(struct inode *inode, struct writeback_control 
*wbc);
diff --git a/fs/nova/ioctl.c b/fs/nova/ioctl.c
new file mode 100644
index 000..2509371
--- /dev/null
+++ b/fs/nova/ioctl.c
@@ -0,0 +1,184 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Ioctl operations.
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu 
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2010-2011 Marco Stornelli 
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "nova.h"
+#include "inode.h"
+
+long nova_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+   struct address_space *mapping = filp->f_mapping;
+   struct inode*inode = mapping->host;
+   struct nova_inode_info *si = NOVA_I(inode);
+   struct nova_inode_info_header *sih = &si->header;
+   struct nova_inode *pi;
+   struct super_block *sb = inode->i_sb;
+   struct nova_inode_update update;
+   unsigned int flags;
+   int ret;
+
+   pi = nova_get_inode(sb, inode);
+   if (!pi)
+   return -EACCES;
+
+   switch (cmd) {
+   case FS_IOC_GETFLAGS:
+   flags = (sih->i_flags) & NOVA_FL_USER_VISIBLE;
+   return put_user(flags, (int __user *)arg);
+   case FS_IOC_SETFLAGS: {
+   unsigned int oldflags;
+   u64 old_linkc = 0;
+   u64 epoch_id;
+
+   ret = mnt_want_write_file(filp);
+   if (ret)
+   return ret;
+
+   if (!inode_owner_or_capable(inode)) {
+   ret = -EPERM;
+   goto flags_out;
+   }
+
+   if (get_user(flags, (int __user *)arg)) {
+   ret = -EFAULT;
+   goto flags_out;
+   }
+
+   inode_lock(inode);
+   sih_lock(sih);
+   oldflags = le32_to_cpu(pi->i_flags);
+
+   if ((flags ^ oldflags) &
+   (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
+   if (!capable(CAP_LINUX_IMMUTABLE)) {
+   inode_unlock(inode);
+   ret = -EPERM;
+   goto flags_out_unlock;
+ 

[RFC v2 80/83] Failure recovery: bitmap operations.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

Upon system failure, NOVA needs to scan all the inode logs
to rebuild the allocator. During the scanning, NOVA stores allocated
log/data pages in a bitmap, and uses the bitmap to rebuild the allocator
once scan finishes.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/bbuild.c | 252 +++
 fs/nova/bbuild.h |  18 
 2 files changed, 270 insertions(+)

diff --git a/fs/nova/bbuild.c b/fs/nova/bbuild.c
index ca51dca..35c661a 100644
--- a/fs/nova/bbuild.c
+++ b/fs/nova/bbuild.c
@@ -414,6 +414,258 @@ void nova_save_blocknode_mappings_to_log(struct 
super_block *sb)
  pi->log_head, pi->log_tail);
 }
 
+/************************** Bitmap operations ****************************/
+
+static inline void set_scan_bm(unsigned long bit,
+   struct single_scan_bm *scan_bm)
+{
+   set_bit(bit, scan_bm->bitmap);
+}
+
+inline void set_bm(unsigned long bit, struct scan_bitmap *bm,
+   enum bm_type type)
+{
+   switch (type) {
+   case BM_4K:
+   set_scan_bm(bit, &bm->scan_bm_4K);
+   break;
+   case BM_2M:
+   set_scan_bm(bit, &bm->scan_bm_2M);
+   break;
+   case BM_1G:
+   set_scan_bm(bit, &bm->scan_bm_1G);
+   break;
+   default:
+   break;
+   }
+}
+
+static int nova_insert_blocknode_map(struct super_block *sb,
+   int cpuid, unsigned long low, unsigned long high)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   struct free_list *free_list;
+   struct rb_root *tree;
+   struct nova_range_node *blknode = NULL;
+   unsigned long num_blocks = 0;
+   int ret;
+
+   num_blocks = high - low + 1;
+   nova_dbgv("%s: cpu %d, low %lu, high %lu, num %lu\n",
+   __func__, cpuid, low, high, num_blocks);
+   free_list = nova_get_free_list(sb, cpuid);
+   tree = &(free_list->block_free_tree);
+
+   blknode = nova_alloc_blocknode(sb);
+   if (blknode == NULL)
+   return -ENOMEM;
+   blknode->range_low = low;
+   blknode->range_high = high;
+   ret = nova_insert_blocktree(sbi, tree, blknode);
+   if (ret) {
+   nova_err(sb, "%s failed\n", __func__);
+   nova_free_blocknode(sb, blknode);
+   goto out;
+   }
+   if (!free_list->first_node)
+   free_list->first_node = blknode;
+   free_list->last_node = blknode;
+   free_list->num_blocknode++;
+   free_list->num_free_blocks += num_blocks;
+out:
+   return ret;
+}
+
+static int __nova_build_blocknode_map(struct super_block *sb,
+   unsigned long *bitmap, unsigned long bsize, unsigned long scale)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   struct free_list *free_list;
+   unsigned long next = 0;
+   unsigned long low = 0;
+   unsigned long start, end;
+   int cpuid = 0;
+
+   free_list = nova_get_free_list(sb, cpuid);
+   start = free_list->block_start;
+   end = free_list->block_end + 1;
+   while (1) {
+   next = find_next_zero_bit(bitmap, end, start);
+   if (next == bsize)
+   break;
+   if (next == end) {
+   if (cpuid == sbi->cpus - 1)
+   break;
+
+   cpuid++;
+   free_list = nova_get_free_list(sb, cpuid);
+   start = free_list->block_start;
+   end = free_list->block_end + 1;
+   continue;
+   }
+
+   low = next;
+   next = find_next_bit(bitmap, end, next);
+   if (nova_insert_blocknode_map(sb, cpuid,
+   low << scale, (next << scale) - 1)) {
+   nova_dbg("Error: could not insert %lu - %lu\n",
+   low << scale, ((next << scale) - 1));
+   }
+   start = next;
+   if (next == bsize)
+   break;
+   if (next == end) {
+   if (cpuid == sbi->cpus - 1)
+   break;
+
+   cpuid++;
+   free_list = nova_get_free_list(sb, cpuid);
+   start = free_list->block_start;
+   end = free_list->block_end + 1;
+   }
+   }
+   return 0;
+}
+
+static void nova_update_4K_map(struct super_block *sb,
+   struct scan_bitmap *bm, unsigned long *bitmap,
+   unsigned long bsize, unsigned long scale)
+{
+   unsigned long next = 0;
+   unsigned long low = 0;
+   int i;
+
+   while (1) {
+   next = find_next_bit(bitmap, bsize, next);
+   if (next == bsize)
+

[RFC v2 80/83] Failure recovery: bitmap operations.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

Upon system failure, NOVA needs to scan all the inode logs
to rebuild the allocator. During the scanning, NOVA stores allocated
log/data pages in a bitmap, and uses the bitmap to rebuild the allocator
once scan finishes.

Signed-off-by: Andiry Xu 
---
 fs/nova/bbuild.c | 252 +++
 fs/nova/bbuild.h |  18 
 2 files changed, 270 insertions(+)

diff --git a/fs/nova/bbuild.c b/fs/nova/bbuild.c
index ca51dca..35c661a 100644
--- a/fs/nova/bbuild.c
+++ b/fs/nova/bbuild.c
@@ -414,6 +414,258 @@ void nova_save_blocknode_mappings_to_log(struct 
super_block *sb)
  pi->log_head, pi->log_tail);
 }
 
+/************************** Bitmap operations ****************************/
+
+static inline void set_scan_bm(unsigned long bit,
+   struct single_scan_bm *scan_bm)
+{
+   set_bit(bit, scan_bm->bitmap);
+}
+
+inline void set_bm(unsigned long bit, struct scan_bitmap *bm,
+   enum bm_type type)
+{
+   switch (type) {
+   case BM_4K:
+   set_scan_bm(bit, &bm->scan_bm_4K);
+   break;
+   case BM_2M:
+   set_scan_bm(bit, &bm->scan_bm_2M);
+   break;
+   case BM_1G:
+   set_scan_bm(bit, &bm->scan_bm_1G);
+   break;
+   default:
+   break;
+   }
+}
+
+static int nova_insert_blocknode_map(struct super_block *sb,
+   int cpuid, unsigned long low, unsigned long high)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   struct free_list *free_list;
+   struct rb_root *tree;
+   struct nova_range_node *blknode = NULL;
+   unsigned long num_blocks = 0;
+   int ret;
+
+   num_blocks = high - low + 1;
+   nova_dbgv("%s: cpu %d, low %lu, high %lu, num %lu\n",
+   __func__, cpuid, low, high, num_blocks);
+   free_list = nova_get_free_list(sb, cpuid);
+   tree = &(free_list->block_free_tree);
+
+   blknode = nova_alloc_blocknode(sb);
+   if (blknode == NULL)
+   return -ENOMEM;
+   blknode->range_low = low;
+   blknode->range_high = high;
+   ret = nova_insert_blocktree(sbi, tree, blknode);
+   if (ret) {
+   nova_err(sb, "%s failed\n", __func__);
+   nova_free_blocknode(sb, blknode);
+   goto out;
+   }
+   if (!free_list->first_node)
+   free_list->first_node = blknode;
+   free_list->last_node = blknode;
+   free_list->num_blocknode++;
+   free_list->num_free_blocks += num_blocks;
+out:
+   return ret;
+}
+
+static int __nova_build_blocknode_map(struct super_block *sb,
+   unsigned long *bitmap, unsigned long bsize, unsigned long scale)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   struct free_list *free_list;
+   unsigned long next = 0;
+   unsigned long low = 0;
+   unsigned long start, end;
+   int cpuid = 0;
+
+   free_list = nova_get_free_list(sb, cpuid);
+   start = free_list->block_start;
+   end = free_list->block_end + 1;
+   while (1) {
+   next = find_next_zero_bit(bitmap, end, start);
+   if (next == bsize)
+   break;
+   if (next == end) {
+   if (cpuid == sbi->cpus - 1)
+   break;
+
+   cpuid++;
+   free_list = nova_get_free_list(sb, cpuid);
+   start = free_list->block_start;
+   end = free_list->block_end + 1;
+   continue;
+   }
+
+   low = next;
+   next = find_next_bit(bitmap, end, next);
+   if (nova_insert_blocknode_map(sb, cpuid,
+   low << scale, (next << scale) - 1)) {
+   nova_dbg("Error: could not insert %lu - %lu\n",
+   low << scale, ((next << scale) - 1));
+   }
+   start = next;
+   if (next == bsize)
+   break;
+   if (next == end) {
+   if (cpuid == sbi->cpus - 1)
+   break;
+
+   cpuid++;
+   free_list = nova_get_free_list(sb, cpuid);
+   start = free_list->block_start;
+   end = free_list->block_end + 1;
+   }
+   }
+   return 0;
+}
+
+static void nova_update_4K_map(struct super_block *sb,
+   struct scan_bitmap *bm, unsigned long *bitmap,
+   unsigned long bsize, unsigned long scale)
+{
+   unsigned long next = 0;
+   unsigned long low = 0;
+   int i;
+
+   while (1) {
+   next = find_next_bit(bitmap, bsize, next);
+   if (next == bsize)
+   break;
+   low = next;
+  

[RFC v2 79/83] Normal recovery.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

Upon umount, NOVA stores the allocator information and the inuse
inode list in reserved inodes. During remount, NOVA reads this
information and rebuilds the allocator and inuse inode list DRAM
data structures.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/bbuild.c | 266 +++
 fs/nova/bbuild.h |   1 +
 fs/nova/super.c  |   3 +
 3 files changed, 270 insertions(+)

diff --git a/fs/nova/bbuild.c b/fs/nova/bbuild.c
index af1b352..ca51dca 100644
--- a/fs/nova/bbuild.c
+++ b/fs/nova/bbuild.c
@@ -52,6 +52,206 @@ void nova_init_header(struct super_block *sb,
init_rwsem(>i_sem);
 }
 
+/*
+ * Return the index of the free list a block number maps to: the block
+ * number divided by sbi->per_list_blocks.
+ */
+static inline int get_cpuid(struct nova_sb_info *sbi, unsigned long blocknr)
+{
+	unsigned long list_idx = blocknr / sbi->per_list_blocks;
+
+	return list_idx;
+}
+
+/*
+ * Empty a range-node rbtree: unlink every node from @tree and free it with
+ * nova_free_range_node().  @sb is not used.
+ */
+static void nova_destroy_range_node_tree(struct super_block *sb,
+		struct rb_root *tree)
+{
+	struct nova_range_node *curr;
+	struct rb_node *temp;
+
+	temp = rb_first(tree);
+	while (temp) {
+		curr = container_of(temp, struct nova_range_node, node);
+		/* Fetch the successor before the current node is erased. */
+		temp = rb_next(temp);
+		rb_erase(&curr->node, tree);
+		nova_free_range_node(curr);
+	}
+}
+
+/* Free every range node in the block_free_tree of free list @cpu. */
+static void nova_destroy_blocknode_tree(struct super_block *sb, int cpu)
+{
+	struct free_list *free_list;
+
+	free_list = nova_get_free_list(sb, cpu);
+	nova_destroy_range_node_tree(sb, &free_list->block_free_tree);
+}
+
+/* Destroy the free-block tree of each of the sbi->cpus free lists. */
+static void nova_destroy_blocknode_trees(struct super_block *sb)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	int cpu = 0;
+
+	while (cpu < sbi->cpus) {
+		nova_destroy_blocknode_tree(sb, cpu);
+		cpu++;
+	}
+}
+
+/*
+ * Rebuild the in-DRAM free-block trees from the range entries stored in the
+ * log of the reserved NOVA_BLOCKNODE_INO inode, then free that inode's log.
+ *
+ * Every log entry is a (range_low, range_high) pair.  It is copied into a
+ * newly allocated range node, inserted into the free list selected by
+ * get_cpuid(range_low), and that list's node count, first/last node and
+ * free-block count are updated.
+ *
+ * Returns 0 on success or a negative errno.  If a node cannot be allocated
+ * or inserted, all free-block trees are destroyed before returning.
+ *
+ * NOTE(review): the free lists' counters and first/last pointers are not
+ * reset when the trees are destroyed on error -- confirm that the caller
+ * reinitializes them before rebuilding.
+ */
+static int nova_init_blockmap_from_inode(struct super_block *sb)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	struct nova_inode *pi = nova_get_inode_by_ino(sb, NOVA_BLOCKNODE_INO);
+	struct nova_inode_info_header sih;
+	struct free_list *free_list;
+	struct nova_range_node_lowhigh *entry;
+	struct nova_range_node *blknode;
+	size_t size = sizeof(struct nova_range_node_lowhigh);
+	u64 curr_p;
+	u64 cpuid;
+	int ret = 0;
+
+	/* FIXME: Backup inode for BLOCKNODE */
+	ret = nova_get_head_tail(sb, pi, &sih);
+	if (ret) {
+		/*
+		 * sih may not have been filled in, so it must not be handed
+		 * to nova_free_inode_log().
+		 */
+		return ret;
+	}
+
+	curr_p = sih.log_head;
+	if (curr_p == 0) {
+		nova_dbg("%s: pi head is 0!\n", __func__);
+		return -EINVAL;
+	}
+
+	while (curr_p != sih.log_tail) {
+		/* No room for another entry in this page: follow the link. */
+		if (is_last_entry(curr_p, size))
+			curr_p = next_log_page(sb, curr_p);
+
+		if (curr_p == 0) {
+			nova_dbg("%s: curr_p is NULL!\n", __func__);
+			NOVA_ASSERT(0);
+			ret = -EINVAL;
+			break;
+		}
+
+		entry = (struct nova_range_node_lowhigh *)nova_get_block(sb,
+				curr_p);
+		blknode = nova_alloc_blocknode(sb);
+		if (blknode == NULL) {
+			/* Bail out instead of dereferencing a NULL node. */
+			nova_err(sb, "%s: blocknode allocation failed\n",
+					__func__);
+			NOVA_ASSERT(0);
+			ret = -ENOMEM;
+			nova_destroy_blocknode_trees(sb);
+			goto out;
+		}
+		blknode->range_low = le64_to_cpu(entry->range_low);
+		blknode->range_high = le64_to_cpu(entry->range_high);
+		cpuid = get_cpuid(sbi, blknode->range_low);
+
+		/* FIXME: Assume NR_CPUS not change */
+		free_list = nova_get_free_list(sb, cpuid);
+		ret = nova_insert_blocktree(sbi,
+				&free_list->block_free_tree, blknode);
+		if (ret) {
+			nova_err(sb, "%s failed\n", __func__);
+			nova_free_blocknode(sb, blknode);
+			NOVA_ASSERT(0);
+			nova_destroy_blocknode_trees(sb);
+			goto out;
+		}
+		free_list->num_blocknode++;
+		if (free_list->num_blocknode == 1)
+			free_list->first_node = blknode;
+		free_list->last_node = blknode;
+		free_list->num_free_blocks +=
+			blknode->range_high - blknode->range_low + 1;
+		curr_p += sizeof(struct nova_range_node_lowhigh);
+	}
+out:
+	nova_free_inode_log(sb, pi, &sih);
+	return ret;
+}
+
+/* Free every range node in the inode_inuse_tree of each inode map. */
+static void nova_destroy_inode_trees(struct super_block *sb)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	struct inode_map *inode_map;
+	int i;
+
+	for (i = 0; i < sbi->cpus; i++) {
+		inode_map = &sbi->inode_maps[i];
+		nova_destroy_range_node_tree(sb,
+				&inode_map->inode_inuse_tree);
+	}
+}
+
+#define CPUID_MASK 0xff00
+
+static int nova_init_inode_list_from_inode(struct super_block *sb)
+{
+   struct nova_sb_info *s

[RFC v2 79/83] Normal recovery.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

Upon umount, NOVA stores the allocator information and the in-use
inode list in reserved inodes. During remount, NOVA reads this
information and rebuilds the allocator and in-use inode list DRAM
data structures.

Signed-off-by: Andiry Xu 
---
 fs/nova/bbuild.c | 266 +++
 fs/nova/bbuild.h |   1 +
 fs/nova/super.c  |   3 +
 3 files changed, 270 insertions(+)

diff --git a/fs/nova/bbuild.c b/fs/nova/bbuild.c
index af1b352..ca51dca 100644
--- a/fs/nova/bbuild.c
+++ b/fs/nova/bbuild.c
@@ -52,6 +52,206 @@ void nova_init_header(struct super_block *sb,
init_rwsem(>i_sem);
 }
 
+/*
+ * Return the index of the free list a block number maps to: the block
+ * number divided by sbi->per_list_blocks.
+ */
+static inline int get_cpuid(struct nova_sb_info *sbi, unsigned long blocknr)
+{
+	unsigned long list_idx = blocknr / sbi->per_list_blocks;
+
+	return list_idx;
+}
+
+/*
+ * Empty a range-node rbtree: unlink every node from @tree and free it with
+ * nova_free_range_node().  @sb is not used.
+ */
+static void nova_destroy_range_node_tree(struct super_block *sb,
+		struct rb_root *tree)
+{
+	struct nova_range_node *curr;
+	struct rb_node *temp;
+
+	temp = rb_first(tree);
+	while (temp) {
+		curr = container_of(temp, struct nova_range_node, node);
+		/* Fetch the successor before the current node is erased. */
+		temp = rb_next(temp);
+		rb_erase(&curr->node, tree);
+		nova_free_range_node(curr);
+	}
+}
+
+/* Free every range node in the block_free_tree of free list @cpu. */
+static void nova_destroy_blocknode_tree(struct super_block *sb, int cpu)
+{
+	struct free_list *free_list;
+
+	free_list = nova_get_free_list(sb, cpu);
+	nova_destroy_range_node_tree(sb, &free_list->block_free_tree);
+}
+
+/* Destroy the free-block tree of each of the sbi->cpus free lists. */
+static void nova_destroy_blocknode_trees(struct super_block *sb)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	int cpu = 0;
+
+	while (cpu < sbi->cpus) {
+		nova_destroy_blocknode_tree(sb, cpu);
+		cpu++;
+	}
+}
+
+/*
+ * Rebuild the in-DRAM free-block trees from the range entries stored in the
+ * log of the reserved NOVA_BLOCKNODE_INO inode, then free that inode's log.
+ *
+ * Every log entry is a (range_low, range_high) pair.  It is copied into a
+ * newly allocated range node, inserted into the free list selected by
+ * get_cpuid(range_low), and that list's node count, first/last node and
+ * free-block count are updated.
+ *
+ * Returns 0 on success or a negative errno.  If a node cannot be allocated
+ * or inserted, all free-block trees are destroyed before returning.
+ *
+ * NOTE(review): the free lists' counters and first/last pointers are not
+ * reset when the trees are destroyed on error -- confirm that the caller
+ * reinitializes them before rebuilding.
+ */
+static int nova_init_blockmap_from_inode(struct super_block *sb)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	struct nova_inode *pi = nova_get_inode_by_ino(sb, NOVA_BLOCKNODE_INO);
+	struct nova_inode_info_header sih;
+	struct free_list *free_list;
+	struct nova_range_node_lowhigh *entry;
+	struct nova_range_node *blknode;
+	size_t size = sizeof(struct nova_range_node_lowhigh);
+	u64 curr_p;
+	u64 cpuid;
+	int ret = 0;
+
+	/* FIXME: Backup inode for BLOCKNODE */
+	ret = nova_get_head_tail(sb, pi, &sih);
+	if (ret) {
+		/*
+		 * sih may not have been filled in, so it must not be handed
+		 * to nova_free_inode_log().
+		 */
+		return ret;
+	}
+
+	curr_p = sih.log_head;
+	if (curr_p == 0) {
+		nova_dbg("%s: pi head is 0!\n", __func__);
+		return -EINVAL;
+	}
+
+	while (curr_p != sih.log_tail) {
+		/* No room for another entry in this page: follow the link. */
+		if (is_last_entry(curr_p, size))
+			curr_p = next_log_page(sb, curr_p);
+
+		if (curr_p == 0) {
+			nova_dbg("%s: curr_p is NULL!\n", __func__);
+			NOVA_ASSERT(0);
+			ret = -EINVAL;
+			break;
+		}
+
+		entry = (struct nova_range_node_lowhigh *)nova_get_block(sb,
+				curr_p);
+		blknode = nova_alloc_blocknode(sb);
+		if (blknode == NULL) {
+			/* Bail out instead of dereferencing a NULL node. */
+			nova_err(sb, "%s: blocknode allocation failed\n",
+					__func__);
+			NOVA_ASSERT(0);
+			ret = -ENOMEM;
+			nova_destroy_blocknode_trees(sb);
+			goto out;
+		}
+		blknode->range_low = le64_to_cpu(entry->range_low);
+		blknode->range_high = le64_to_cpu(entry->range_high);
+		cpuid = get_cpuid(sbi, blknode->range_low);
+
+		/* FIXME: Assume NR_CPUS not change */
+		free_list = nova_get_free_list(sb, cpuid);
+		ret = nova_insert_blocktree(sbi,
+				&free_list->block_free_tree, blknode);
+		if (ret) {
+			nova_err(sb, "%s failed\n", __func__);
+			nova_free_blocknode(sb, blknode);
+			NOVA_ASSERT(0);
+			nova_destroy_blocknode_trees(sb);
+			goto out;
+		}
+		free_list->num_blocknode++;
+		if (free_list->num_blocknode == 1)
+			free_list->first_node = blknode;
+		free_list->last_node = blknode;
+		free_list->num_free_blocks +=
+			blknode->range_high - blknode->range_low + 1;
+		curr_p += sizeof(struct nova_range_node_lowhigh);
+	}
+out:
+	nova_free_inode_log(sb, pi, &sih);
+	return ret;
+}
+
+/* Free every range node in the inode_inuse_tree of each inode map. */
+static void nova_destroy_inode_trees(struct super_block *sb)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	struct inode_map *inode_map;
+	int i;
+
+	for (i = 0; i < sbi->cpus; i++) {
+		inode_map = &sbi->inode_maps[i];
+		nova_destroy_range_node_tree(sb,
+				&inode_map->inode_inuse_tree);
+	}
+}
+
+#define CPUID_MASK 0xff00
+
+static int nova_init_inode_list_from_inode(struct super_block *sb)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   struct nova_inode *pi = nova_

[RFC v2 83/83] Sysfs support.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

Sysfs support allows the user to get/post information about a running NOVA
instance. After mount, NOVA creates four entries under the proc directory
/proc/fs/NOVA/pmem#/:

timing_stats    IO_stats    allocator    gc

Show NOVA file operation timing statistics:
cat /proc/fs/NOVA/pmem#/timing_stats

Clear timing statistics:
echo 1 > /proc/fs/NOVA/pmem#/timing_stats

Show NOVA I/O statistics:
cat /proc/fs/NOVA/pmem#/IO_stats

Clear I/O statistics:
echo 1 > /proc/fs/NOVA/pmem#/IO_stats

Show NOVA allocator information:
cat /proc/fs/NOVA/pmem#/allocator

Manual garbage collection:
echo #inode_number > /proc/fs/NOVA/pmem#/gc

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/Makefile |   2 +-
 fs/nova/nova.h   |   6 +
 fs/nova/super.c  |   9 ++
 fs/nova/super.h  |   1 +
 fs/nova/sysfs.c  | 379 +++
 5 files changed, 396 insertions(+), 1 deletion(-)
 create mode 100644 fs/nova/sysfs.c

diff --git a/fs/nova/Makefile b/fs/nova/Makefile
index 7a5fb6d..6e1c29d 100644
--- a/fs/nova/Makefile
+++ b/fs/nova/Makefile
@@ -5,4 +5,4 @@
 obj-$(CONFIG_NOVA_FS) += nova.o
 
 nova-y := balloc.o bbuild.o dax.o dir.o file.o gc.o inode.o ioctl.o journal.o\
- log.o namei.o rebuild.o stats.o super.o symlink.o
+ log.o namei.o rebuild.o stats.o super.o symlink.o sysfs.o
diff --git a/fs/nova/nova.h b/fs/nova/nova.h
index 32b7b2f..0814676 100644
--- a/fs/nova/nova.h
+++ b/fs/nova/nova.h
@@ -546,6 +546,12 @@ int nova_block_symlink(struct super_block *sb, struct 
nova_inode *pi,
struct inode *inode, const char *symname, int len, u64 epoch_id);
 extern const struct inode_operations nova_symlink_inode_operations;
 
+/* sysfs.c */
+extern const char *proc_dirname;
+extern struct proc_dir_entry *nova_proc_root;
+void nova_sysfs_init(struct super_block *sb);
+void nova_sysfs_exit(struct super_block *sb);
+
 /* stats.c */
 void nova_get_timing_stats(void);
 void nova_get_IO_stats(void);
diff --git a/fs/nova/super.c b/fs/nova/super.c
index 14b4af6..039c003 100644
--- a/fs/nova/super.c
+++ b/fs/nova/super.c
@@ -596,6 +596,8 @@ static int nova_fill_super(struct super_block *sb, void 
*data, int silent)
goto out;
}
 
+   nova_sysfs_init(sb);
+
/* Init a new nova instance */
if (sbi->s_mount_opt & NOVA_MOUNT_FORMAT) {
root_pi = nova_init(sb, sbi->initsize);
@@ -680,6 +682,8 @@ static int nova_fill_super(struct super_block *sb, void 
*data, int silent)
kfree(sbi->inode_maps);
sbi->inode_maps = NULL;
 
+   nova_sysfs_exit(sb);
+
kfree(sbi->nova_sb);
kfree(sbi);
nova_dbg("%s failed: return %d\n", __func__, retval);
@@ -783,6 +787,8 @@ static void nova_put_super(struct super_block *sb)
i, inode_map->allocated, inode_map->freed);
}
 
+   nova_sysfs_exit(sb);
+
kfree(sbi->inode_maps);
kfree(sbi->nova_sb);
kfree(sbi);
@@ -1007,6 +1013,8 @@ static int __init init_nova_fs(void)
nova_info("Arch new instructions support: CLWB %s\n",
support_clwb ? "YES" : "NO");
 
+   nova_proc_root = proc_mkdir(proc_dirname, NULL);
+
rc = init_rangenode_cache();
if (rc)
goto out;
@@ -1041,6 +1049,7 @@ static int __init init_nova_fs(void)
 static void __exit exit_nova_fs(void)
 {
unregister_filesystem(_fs_type);
+   remove_proc_entry(proc_dirname, NULL);
destroy_file_write_item_cache();
destroy_inodecache();
destroy_rangenode_cache();
diff --git a/fs/nova/super.h b/fs/nova/super.h
index bcf9548..bcbe862 100644
--- a/fs/nova/super.h
+++ b/fs/nova/super.h
@@ -112,6 +112,7 @@ struct nova_sb_info {
struct mutexs_lock; /* protects the SB's buffer-head */
 
int cpus;
+   struct proc_dir_entry *s_proc;
 
/* Current epoch. volatile guarantees visibility */
volatile u64 s_epoch_id;
diff --git a/fs/nova/sysfs.c b/fs/nova/sysfs.c
new file mode 100644
index 000..0a73ef4
--- /dev/null
+++ b/fs/nova/sysfs.c
@@ -0,0 +1,379 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Proc fs operations
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu <jix...@cs.ucsd.edu>
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.storne...@gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#in

[RFC v2 81/83] Failure recovery: Inode pages recovery routines.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

For each inode, NOVA traverses the inode log and records the allocated
pages in the bitmap. For a directory inode, NOVA only sets the log pages.
For file and symlink inodes, NOVA needs to set the data pages.
NOVA divides the file into 1GB zones, and records the pages that fall into
the current zone, until all the pages have been recorded.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/bbuild.c | 307 +++
 1 file changed, 307 insertions(+)

diff --git a/fs/nova/bbuild.c b/fs/nova/bbuild.c
index 35c661a..75dfcba 100644
--- a/fs/nova/bbuild.c
+++ b/fs/nova/bbuild.c
@@ -665,6 +665,313 @@ static int alloc_bm(struct super_block *sb, unsigned long 
initsize)
return 0;
 }
 
+/************************** NOVA recovery ****************************/
+
+/*
+ * Number of page offsets tracked per recovery zone; page ranges are clipped
+ * to [base, base + MAX_PGOFF) before the ring arrays are indexed
+ * (1GB per zone if pages are 4KB).
+ */
+#define MAX_PGOFF  262144
+
+/*
+ * Scratch state used while recovering the pages of an inode.
+ */
+struct task_ring {
+   /*
+    * NOTE(review): addr0, num and inodes_used_count are not referenced by
+    * the code visible in this patch; their use should be confirmed.
+    */
+   u64 addr0[512];
+   int num;
+   int inodes_used_count;
+   /* Indexed by (pgoff - zone base): address of the write entry for the slot. */
+   u64 *entry_array;
+   /* Indexed by (pgoff - zone base): block number for the page, 0 if unset. */
+   u64 *nvmm_array;
+};
+
+/*
+ * Set the BM_4K bitmap bit for every page of an inode log.
+ *
+ * Starts at @head and follows next_log_page() until it yields 0; a @head
+ * of 0 means there is nothing to record.  A page address that is not
+ * page-aligned only triggers a warning.  @pi is not used.
+ *
+ * Always returns 0.
+ */
+static int nova_traverse_inode_log(struct super_block *sb,
+		struct nova_inode *pi, struct scan_bitmap *bm, u64 head)
+{
+	u64 page;
+
+	for (page = head; page != 0; page = next_log_page(sb, page)) {
+		WARN_ON(page & (PAGE_SIZE - 1));
+		set_bm(page >> PAGE_SHIFT, bm, BM_4K);
+	}
+
+	return 0;
+}
+
+/* Record the log pages of a directory inode, starting at pi->log_head. */
+static void nova_traverse_dir_inode_log(struct super_block *sb,
+		struct nova_inode *pi, struct scan_bitmap *bm)
+{
+	u64 log_head = pi->log_head;
+
+	nova_traverse_inode_log(sb, pi, bm, log_head);
+}
+
+/*
+ * Record the pages covered by a file write entry in the ring's arrays.
+ *
+ * The entry's page range [pgoff, pgoff + num_pages) is clipped to the zone
+ * [base, base + MAX_PGOFF).  For every remaining page offset, slot
+ * (pgoff - base) of entry_array is set to the entry's address and the same
+ * slot of nvmm_array to the block number of that page.  Slots already set
+ * by an earlier entry are overwritten.
+ *
+ * @sb, @sih and @bm are not used.  Always returns 0.
+ */
+static int nova_set_ring_array(struct super_block *sb,
+		struct nova_inode_info_header *sih, struct nova_file_write_entry *entry,
+		struct task_ring *ring,
+		unsigned long base, struct scan_bitmap *bm)
+{
+	unsigned long start, end;
+	unsigned long pgoff;
+	unsigned long index;
+
+	start = entry->pgoff;
+	if (start < base)
+		start = base;
+
+	end = entry->pgoff + entry->num_pages;
+	if (end > base + MAX_PGOFF)
+		end = base + MAX_PGOFF;
+
+	for (pgoff = start; pgoff < end; pgoff++) {
+		index = pgoff - base;
+		ring->entry_array[index] = (u64)entry;
+		ring->nvmm_array[index] = (u64)(entry->block >> PAGE_SHIFT)
+				+ pgoff - entry->pgoff;
+	}
+
+	return 0;
+}
+
+/*
+ * Flush one zone of the ring into the scan bitmap.
+ *
+ * For every slot up to the one corresponding to @last_blocknr (capped at the
+ * last slot of the zone), a non-zero nvmm_array value has its BM_4K bit set
+ * and the slot is then cleared in both ring arrays.
+ *
+ * @sb and @sih are not used.  Always returns 0.
+ *
+ * NOTE(review): a @last_blocknr below @base would wrap the slot limit;
+ * presumably callers never pass one -- confirm.
+ */
+static int nova_set_file_bm(struct super_block *sb,
+		struct nova_inode_info_header *sih, struct task_ring *ring,
+		struct scan_bitmap *bm, unsigned long base, unsigned long last_blocknr)
+{
+	unsigned long blocknr, idx;
+	unsigned long last_idx;
+
+	last_idx = (last_blocknr >= base + MAX_PGOFF) ?
+			MAX_PGOFF - 1 : last_blocknr - base;
+
+	for (idx = 0; idx <= last_idx; idx++) {
+		blocknr = ring->nvmm_array[idx];
+		if (!blocknr)
+			continue;
+
+		set_bm(blocknr, bm, BM_4K);
+		ring->nvmm_array[idx] = 0;
+		ring->entry_array[idx] = 0;
+	}
+
+	return 0;
+}
+
+/* entry given to this function is a copy in dram */
+static void nova_ring_setattr_entry(struct super_block *sb,
+   struct nova_inode_info_header *sih,
+   struct nova_setattr_logentry *entry, struct task_ring *ring,
+   unsigned long base, unsigned int data_bits, struct scan_bitmap *bm)
+{
+   unsigned long first_blocknr, last_blocknr;
+   unsigned long pgoff, old_pgoff = 0;
+   unsigned long index;
+   unsigned int num_free = 0;
+   u64 old_entry = 0;
+   loff_t start, end;
+
+   if (sih->i_size <= entry->size)
+   goto out;
+
+   start = entry->size;
+   end = sih->i_size;
+
+   first_blocknr = (start + (1UL << data_bits) - 1) >> data_bits;
+
+   if (end > 0)
+   last_blocknr = (end - 1) >> data_bits;
+   else
+

[RFC v2 83/83] Sysfs support.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

Sysfs support allows the user to get/post information about a running NOVA
instance. After mount, NOVA creates four entries under the proc directory
/proc/fs/NOVA/pmem#/:

timing_stats    IO_stats    allocator    gc

Show NOVA file operation timing statistics:
cat /proc/fs/NOVA/pmem#/timing_stats

Clear timing statistics:
echo 1 > /proc/fs/NOVA/pmem#/timing_stats

Show NOVA I/O statistics:
cat /proc/fs/NOVA/pmem#/IO_stats

Clear I/O statistics:
echo 1 > /proc/fs/NOVA/pmem#/IO_stats

Show NOVA allocator information:
cat /proc/fs/NOVA/pmem#/allocator

Manual garbage collection:
echo #inode_number > /proc/fs/NOVA/pmem#/gc

Signed-off-by: Andiry Xu 
---
 fs/nova/Makefile |   2 +-
 fs/nova/nova.h   |   6 +
 fs/nova/super.c  |   9 ++
 fs/nova/super.h  |   1 +
 fs/nova/sysfs.c  | 379 +++
 5 files changed, 396 insertions(+), 1 deletion(-)
 create mode 100644 fs/nova/sysfs.c

diff --git a/fs/nova/Makefile b/fs/nova/Makefile
index 7a5fb6d..6e1c29d 100644
--- a/fs/nova/Makefile
+++ b/fs/nova/Makefile
@@ -5,4 +5,4 @@
 obj-$(CONFIG_NOVA_FS) += nova.o
 
 nova-y := balloc.o bbuild.o dax.o dir.o file.o gc.o inode.o ioctl.o journal.o\
- log.o namei.o rebuild.o stats.o super.o symlink.o
+ log.o namei.o rebuild.o stats.o super.o symlink.o sysfs.o
diff --git a/fs/nova/nova.h b/fs/nova/nova.h
index 32b7b2f..0814676 100644
--- a/fs/nova/nova.h
+++ b/fs/nova/nova.h
@@ -546,6 +546,12 @@ int nova_block_symlink(struct super_block *sb, struct 
nova_inode *pi,
struct inode *inode, const char *symname, int len, u64 epoch_id);
 extern const struct inode_operations nova_symlink_inode_operations;
 
+/* sysfs.c */
+extern const char *proc_dirname;
+extern struct proc_dir_entry *nova_proc_root;
+void nova_sysfs_init(struct super_block *sb);
+void nova_sysfs_exit(struct super_block *sb);
+
 /* stats.c */
 void nova_get_timing_stats(void);
 void nova_get_IO_stats(void);
diff --git a/fs/nova/super.c b/fs/nova/super.c
index 14b4af6..039c003 100644
--- a/fs/nova/super.c
+++ b/fs/nova/super.c
@@ -596,6 +596,8 @@ static int nova_fill_super(struct super_block *sb, void 
*data, int silent)
goto out;
}
 
+   nova_sysfs_init(sb);
+
/* Init a new nova instance */
if (sbi->s_mount_opt & NOVA_MOUNT_FORMAT) {
root_pi = nova_init(sb, sbi->initsize);
@@ -680,6 +682,8 @@ static int nova_fill_super(struct super_block *sb, void 
*data, int silent)
kfree(sbi->inode_maps);
sbi->inode_maps = NULL;
 
+   nova_sysfs_exit(sb);
+
kfree(sbi->nova_sb);
kfree(sbi);
nova_dbg("%s failed: return %d\n", __func__, retval);
@@ -783,6 +787,8 @@ static void nova_put_super(struct super_block *sb)
i, inode_map->allocated, inode_map->freed);
}
 
+   nova_sysfs_exit(sb);
+
kfree(sbi->inode_maps);
kfree(sbi->nova_sb);
kfree(sbi);
@@ -1007,6 +1013,8 @@ static int __init init_nova_fs(void)
nova_info("Arch new instructions support: CLWB %s\n",
support_clwb ? "YES" : "NO");
 
+   nova_proc_root = proc_mkdir(proc_dirname, NULL);
+
rc = init_rangenode_cache();
if (rc)
goto out;
@@ -1041,6 +1049,7 @@ static int __init init_nova_fs(void)
 static void __exit exit_nova_fs(void)
 {
unregister_filesystem(_fs_type);
+   remove_proc_entry(proc_dirname, NULL);
destroy_file_write_item_cache();
destroy_inodecache();
destroy_rangenode_cache();
diff --git a/fs/nova/super.h b/fs/nova/super.h
index bcf9548..bcbe862 100644
--- a/fs/nova/super.h
+++ b/fs/nova/super.h
@@ -112,6 +112,7 @@ struct nova_sb_info {
struct mutexs_lock; /* protects the SB's buffer-head */
 
int cpus;
+   struct proc_dir_entry *s_proc;
 
/* Current epoch. volatile guarantees visibility */
volatile u64 s_epoch_id;
diff --git a/fs/nova/sysfs.c b/fs/nova/sysfs.c
new file mode 100644
index 000..0a73ef4
--- /dev/null
+++ b/fs/nova/sysfs.c
@@ -0,0 +1,379 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Proc fs operations
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu 
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli 
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include "nova.h"
+#include "inode.h"
+
+const char *proc_dirname = "fs/NOVA";

[RFC v2 81/83] Failure recovery: Inode pages recovery routines.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

For each inode, NOVA traverses the inode log and records the allocated
pages in the bitmap. For a directory inode, NOVA only sets the log pages.
For file and symlink inodes, NOVA needs to set the data pages.
NOVA divides the file into 1GB zones, and records the pages that fall into
the current zone, until all the pages have been recorded.

Signed-off-by: Andiry Xu 
---
 fs/nova/bbuild.c | 307 +++
 1 file changed, 307 insertions(+)

diff --git a/fs/nova/bbuild.c b/fs/nova/bbuild.c
index 35c661a..75dfcba 100644
--- a/fs/nova/bbuild.c
+++ b/fs/nova/bbuild.c
@@ -665,6 +665,313 @@ static int alloc_bm(struct super_block *sb, unsigned long 
initsize)
return 0;
 }
 
+/************************** NOVA recovery ****************************/
+
+/*
+ * Number of page offsets tracked per recovery zone; page ranges are clipped
+ * to [base, base + MAX_PGOFF) before the ring arrays are indexed
+ * (1GB per zone if pages are 4KB).
+ */
+#define MAX_PGOFF  262144
+
+/*
+ * Scratch state used while recovering the pages of an inode.
+ */
+struct task_ring {
+   /*
+    * NOTE(review): addr0, num and inodes_used_count are not referenced by
+    * the code visible in this patch; their use should be confirmed.
+    */
+   u64 addr0[512];
+   int num;
+   int inodes_used_count;
+   /* Indexed by (pgoff - zone base): address of the write entry for the slot. */
+   u64 *entry_array;
+   /* Indexed by (pgoff - zone base): block number for the page, 0 if unset. */
+   u64 *nvmm_array;
+};
+
+/*
+ * Set the BM_4K bitmap bit for every page of an inode log.
+ *
+ * Starts at @head and follows next_log_page() until it yields 0; a @head
+ * of 0 means there is nothing to record.  A page address that is not
+ * page-aligned only triggers a warning.  @pi is not used.
+ *
+ * Always returns 0.
+ */
+static int nova_traverse_inode_log(struct super_block *sb,
+		struct nova_inode *pi, struct scan_bitmap *bm, u64 head)
+{
+	u64 page;
+
+	for (page = head; page != 0; page = next_log_page(sb, page)) {
+		WARN_ON(page & (PAGE_SIZE - 1));
+		set_bm(page >> PAGE_SHIFT, bm, BM_4K);
+	}
+
+	return 0;
+}
+
+/* Record the log pages of a directory inode, starting at pi->log_head. */
+static void nova_traverse_dir_inode_log(struct super_block *sb,
+		struct nova_inode *pi, struct scan_bitmap *bm)
+{
+	u64 log_head = pi->log_head;
+
+	nova_traverse_inode_log(sb, pi, bm, log_head);
+}
+
+/*
+ * Record the pages covered by a file write entry in the ring's arrays.
+ *
+ * The entry's page range [pgoff, pgoff + num_pages) is clipped to the zone
+ * [base, base + MAX_PGOFF).  For every remaining page offset, slot
+ * (pgoff - base) of entry_array is set to the entry's address and the same
+ * slot of nvmm_array to the block number of that page.  Slots already set
+ * by an earlier entry are overwritten.
+ *
+ * @sb, @sih and @bm are not used.  Always returns 0.
+ */
+static int nova_set_ring_array(struct super_block *sb,
+		struct nova_inode_info_header *sih, struct nova_file_write_entry *entry,
+		struct task_ring *ring,
+		unsigned long base, struct scan_bitmap *bm)
+{
+	unsigned long start, end;
+	unsigned long pgoff;
+	unsigned long index;
+
+	start = entry->pgoff;
+	if (start < base)
+		start = base;
+
+	end = entry->pgoff + entry->num_pages;
+	if (end > base + MAX_PGOFF)
+		end = base + MAX_PGOFF;
+
+	for (pgoff = start; pgoff < end; pgoff++) {
+		index = pgoff - base;
+		ring->entry_array[index] = (u64)entry;
+		ring->nvmm_array[index] = (u64)(entry->block >> PAGE_SHIFT)
+				+ pgoff - entry->pgoff;
+	}
+
+	return 0;
+}
+
+/*
+ * Flush one zone of the ring into the scan bitmap.
+ *
+ * For every slot up to the one corresponding to @last_blocknr (capped at the
+ * last slot of the zone), a non-zero nvmm_array value has its BM_4K bit set
+ * and the slot is then cleared in both ring arrays.
+ *
+ * @sb and @sih are not used.  Always returns 0.
+ *
+ * NOTE(review): a @last_blocknr below @base would wrap the slot limit;
+ * presumably callers never pass one -- confirm.
+ */
+static int nova_set_file_bm(struct super_block *sb,
+		struct nova_inode_info_header *sih, struct task_ring *ring,
+		struct scan_bitmap *bm, unsigned long base, unsigned long last_blocknr)
+{
+	unsigned long blocknr, idx;
+	unsigned long last_idx;
+
+	last_idx = (last_blocknr >= base + MAX_PGOFF) ?
+			MAX_PGOFF - 1 : last_blocknr - base;
+
+	for (idx = 0; idx <= last_idx; idx++) {
+		blocknr = ring->nvmm_array[idx];
+		if (!blocknr)
+			continue;
+
+		set_bm(blocknr, bm, BM_4K);
+		ring->nvmm_array[idx] = 0;
+		ring->entry_array[idx] = 0;
+	}
+
+	return 0;
+}
+
+/* entry given to this function is a copy in dram */
+static void nova_ring_setattr_entry(struct super_block *sb,
+   struct nova_inode_info_header *sih,
+   struct nova_setattr_logentry *entry, struct task_ring *ring,
+   unsigned long base, unsigned int data_bits, struct scan_bitmap *bm)
+{
+   unsigned long first_blocknr, last_blocknr;
+   unsigned long pgoff, old_pgoff = 0;
+   unsigned long index;
+   unsigned int num_free = 0;
+   u64 old_entry = 0;
+   loff_t start, end;
+
+   if (sih->i_size <= entry->size)
+   goto out;
+
+   start = entry->size;
+   end = sih->i_size;
+
+   first_blocknr = (start + (1UL << data_bits) - 1) >> data_bits;
+
+   if (end > 0)
+   last_blocknr = (end - 1) >> data_bits;
+   else
+   last_blocknr = 0;
+
+   if (fir

[RFC v2 82/83] Failure recovery: Per-CPU recovery.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

NOVA starts a recovery thread on each CPU and scans all the inodes
in parallel. It also recovers the inode in-use list during the
scan.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/bbuild.c | 396 +++
 1 file changed, 396 insertions(+)

diff --git a/fs/nova/bbuild.c b/fs/nova/bbuild.c
index 75dfcba..3271166 100644
--- a/fs/nova/bbuild.c
+++ b/fs/nova/bbuild.c
@@ -677,6 +677,11 @@ struct task_ring {
u64 *nvmm_array;
 };
 
+static struct task_ring *task_rings;
+static struct task_struct **threads;
+wait_queue_head_t finish_wq;
+int *finished;
+
 static int nova_traverse_inode_log(struct super_block *sb,
struct nova_inode *pi, struct scan_bitmap *bm, u64 head)
 {
@@ -973,6 +978,378 @@ static int nova_recover_inode_pages(struct super_block 
*sb,
 }
 
 
+/*
+ * Release everything allocate_resources() set up: the per-ring arrays, the
+ * ring array itself, the thread pointer array and the finished flags.
+ *
+ * The file-scope pointers are reset to NULL afterwards so that a later call
+ * (for example from a failed allocate_resources()) cannot free them twice.
+ *
+ * NOTE(review): the rings are walked using sbi->cpus while
+ * allocate_resources() sizes them from its @cpus argument -- confirm that
+ * the two are always equal.
+ */
+static void free_resources(struct super_block *sb)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	struct task_ring *ring;
+	int i;
+
+	if (task_rings) {
+		for (i = 0; i < sbi->cpus; i++) {
+			ring = &task_rings[i];
+			vfree(ring->entry_array);
+			vfree(ring->nvmm_array);
+			ring->entry_array = NULL;
+			ring->nvmm_array = NULL;
+		}
+	}
+
+	kfree(task_rings);
+	kfree(threads);
+	kfree(finished);
+
+	/* Do not leave dangling pointers behind in the globals. */
+	task_rings = NULL;
+	threads = NULL;
+	finished = NULL;
+}
+
+static int failure_thread_func(void *data);
+
+/*
+ * Allocate the state needed for failure recovery on @cpus CPUs:
+ * one task_ring per CPU (each with MAX_PGOFF-slot nvmm and entry arrays),
+ * the thread pointer array, the finished flags, and one recovery kthread
+ * per CPU, created with failure_thread_func and bound to its CPU.  The
+ * threads are created but not woken here.
+ *
+ * Returns 0 on success.  On failure everything allocated so far is released
+ * and a negative errno is returned: -ENOMEM for a failed allocation, or the
+ * error reported by kthread_create().
+ */
+static int allocate_resources(struct super_block *sb, int cpus)
+{
+	struct task_ring *ring;
+	int ret = -ENOMEM;
+	int i;
+
+	task_rings = kcalloc(cpus, sizeof(struct task_ring), GFP_KERNEL);
+	if (!task_rings)
+		goto fail;
+
+	for (i = 0; i < cpus; i++) {
+		ring = &task_rings[i];
+
+		/* Zeroed: a zero slot means "no block recorded". */
+		ring->nvmm_array = vzalloc(sizeof(u64) * MAX_PGOFF);
+		if (!ring->nvmm_array)
+			goto fail;
+
+		ring->entry_array = vmalloc(sizeof(u64) * MAX_PGOFF);
+		if (!ring->entry_array)
+			goto fail;
+	}
+
+	threads = kcalloc(cpus, sizeof(struct task_struct *), GFP_KERNEL);
+	if (!threads)
+		goto fail;
+
+	finished = kcalloc(cpus, sizeof(int), GFP_KERNEL);
+	if (!finished)
+		goto fail;
+
+	init_waitqueue_head(&finish_wq);
+
+	for (i = 0; i < cpus; i++) {
+		threads[i] = kthread_create(failure_thread_func,
+				sb, "recovery thread");
+		if (IS_ERR(threads[i])) {
+			/* Never hand an ERR_PTR to kthread_bind(). */
+			ret = PTR_ERR(threads[i]);
+			threads[i] = NULL;
+			goto fail_threads;
+		}
+		kthread_bind(threads[i], i);
+	}
+
+	return 0;
+
+fail_threads:
+	/* Reap the threads created so far; none of them has been woken. */
+	while (--i >= 0)
+		kthread_stop(threads[i]);
+fail:
+	free_resources(sb);
+	return ret;
+}
+
+/*
+ * Block until finished[i] is non-zero for every i in [0, @cpus).
+ *
+ * The wait condition is the flag itself, so a wake-up on finish_wq ends the
+ * sleep as soon as the flag is set; the 1ms timeout keeps the flag polled
+ * even if no wake-up arrives.
+ */
+static void wait_to_finish(int cpus)
+{
+	int i;
+
+	for (i = 0; i < cpus; i++) {
+		while (finished[i] == 0) {
+			wait_event_interruptible_timeout(finish_wq,
+					finished[i] != 0,
+					msecs_to_jiffies(1));
+		}
+	}
+}
+
+/*** Failure recovery */
+
+static int nova_failure_insert_inodetree(struct super_block *sb,
+   unsigned long ino_low, unsigned long ino_high)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   struct inode_map *inode_map;
+   struct nova_range_node *prev = NULL, *next = NULL;
+   struct nova_range_node *new_node;
+   unsigned long internal_low, internal_high;
+   int cpu;
+   struct rb_root *tree;
+   int ret;
+
+   if (ino_low > ino_high) {
+   nova_err(sb, "%s: ino low %lu, ino high %lu\n",
+   __func__, ino_low, ino_high);
+   return -EINVAL;
+   }
+
+   cpu = ino_low % sbi->cpus;
+   if (ino_high % sbi->cpus != cpu) {
+   nova_err(sb, "%s: ino low %lu, ino high %lu\n",
+   __func__, ino_low, ino_high);
+   return -EINVAL;
+   }
+
+   internal_low = ino_low / sbi->cpus;
+   internal_high = ino_high / sbi->cpus;
+   inode_map = >inode_maps[cpu];
+   tree = _map->inode_inuse_tree;
+   mutex_lock(_map->inode_table_mutex);
+
+   ret = nova_find_free_slot(sbi, tree, internal_low, internal_high,
+   , );
+   if (ret) {
+   nova_dbg("%s: ino %lu - %lu already exists!: %d\n",
+   __func__, ino_low, ino_high, ret);
+   mutex_unlock(_map->inode_table_mutex);
+   return ret;
+   }
+
+   if (prev && next && (internal_low == prev->range_high + 1) &&
+   (internal_high + 1 == next->range_low)) {
+   /* fits the hole */
+   rb_erase(>node, tree);
+   inode_map-&g

[RFC v2 82/83] Failure recovery: Per-CPU recovery.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

NOVA starts a recovery thread on each CPU and scans all the inodes
in parallel. It also recovers the inode in-use list during the
scan.

Signed-off-by: Andiry Xu 
---
 fs/nova/bbuild.c | 396 +++
 1 file changed, 396 insertions(+)

diff --git a/fs/nova/bbuild.c b/fs/nova/bbuild.c
index 75dfcba..3271166 100644
--- a/fs/nova/bbuild.c
+++ b/fs/nova/bbuild.c
@@ -677,6 +677,11 @@ struct task_ring {
u64 *nvmm_array;
 };
 
+static struct task_ring *task_rings;
+static struct task_struct **threads;
+wait_queue_head_t finish_wq;
+int *finished;
+
 static int nova_traverse_inode_log(struct super_block *sb,
struct nova_inode *pi, struct scan_bitmap *bm, u64 head)
 {
@@ -973,6 +978,378 @@ static int nova_recover_inode_pages(struct super_block 
*sb,
 }
 
 
+/*
+ * Release everything allocate_resources() set up: the per-ring arrays, the
+ * ring array itself, the thread pointer array and the finished flags.
+ *
+ * The file-scope pointers are reset to NULL afterwards so that a later call
+ * (for example from a failed allocate_resources()) cannot free them twice.
+ *
+ * NOTE(review): the rings are walked using sbi->cpus while
+ * allocate_resources() sizes them from its @cpus argument -- confirm that
+ * the two are always equal.
+ */
+static void free_resources(struct super_block *sb)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	struct task_ring *ring;
+	int i;
+
+	if (task_rings) {
+		for (i = 0; i < sbi->cpus; i++) {
+			ring = &task_rings[i];
+			vfree(ring->entry_array);
+			vfree(ring->nvmm_array);
+			ring->entry_array = NULL;
+			ring->nvmm_array = NULL;
+		}
+	}
+
+	kfree(task_rings);
+	kfree(threads);
+	kfree(finished);
+
+	/* Do not leave dangling pointers behind in the globals. */
+	task_rings = NULL;
+	threads = NULL;
+	finished = NULL;
+}
+
+static int failure_thread_func(void *data);
+
+/*
+ * Allocate the state needed for failure recovery on @cpus CPUs:
+ * one task_ring per CPU (each with MAX_PGOFF-slot nvmm and entry arrays),
+ * the thread pointer array, the finished flags, and one recovery kthread
+ * per CPU, created with failure_thread_func and bound to its CPU.  The
+ * threads are created but not woken here.
+ *
+ * Returns 0 on success.  On failure everything allocated so far is released
+ * and a negative errno is returned: -ENOMEM for a failed allocation, or the
+ * error reported by kthread_create().
+ */
+static int allocate_resources(struct super_block *sb, int cpus)
+{
+	struct task_ring *ring;
+	int ret = -ENOMEM;
+	int i;
+
+	task_rings = kcalloc(cpus, sizeof(struct task_ring), GFP_KERNEL);
+	if (!task_rings)
+		goto fail;
+
+	for (i = 0; i < cpus; i++) {
+		ring = &task_rings[i];
+
+		/* Zeroed: a zero slot means "no block recorded". */
+		ring->nvmm_array = vzalloc(sizeof(u64) * MAX_PGOFF);
+		if (!ring->nvmm_array)
+			goto fail;
+
+		ring->entry_array = vmalloc(sizeof(u64) * MAX_PGOFF);
+		if (!ring->entry_array)
+			goto fail;
+	}
+
+	threads = kcalloc(cpus, sizeof(struct task_struct *), GFP_KERNEL);
+	if (!threads)
+		goto fail;
+
+	finished = kcalloc(cpus, sizeof(int), GFP_KERNEL);
+	if (!finished)
+		goto fail;
+
+	init_waitqueue_head(&finish_wq);
+
+	for (i = 0; i < cpus; i++) {
+		threads[i] = kthread_create(failure_thread_func,
+				sb, "recovery thread");
+		if (IS_ERR(threads[i])) {
+			/* Never hand an ERR_PTR to kthread_bind(). */
+			ret = PTR_ERR(threads[i]);
+			threads[i] = NULL;
+			goto fail_threads;
+		}
+		kthread_bind(threads[i], i);
+	}
+
+	return 0;
+
+fail_threads:
+	/* Reap the threads created so far; none of them has been woken. */
+	while (--i >= 0)
+		kthread_stop(threads[i]);
+fail:
+	free_resources(sb);
+	return ret;
+}
+
+/*
+ * Block until finished[i] is non-zero for every i in [0, @cpus).
+ *
+ * The wait condition is the flag itself, so a wake-up on finish_wq ends the
+ * sleep as soon as the flag is set; the 1ms timeout keeps the flag polled
+ * even if no wake-up arrives.
+ */
+static void wait_to_finish(int cpus)
+{
+	int i;
+
+	for (i = 0; i < cpus; i++) {
+		while (finished[i] == 0) {
+			wait_event_interruptible_timeout(finish_wq,
+					finished[i] != 0,
+					msecs_to_jiffies(1));
+		}
+	}
+}
+
+/*** Failure recovery */
+
+static int nova_failure_insert_inodetree(struct super_block *sb,
+   unsigned long ino_low, unsigned long ino_high)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   struct inode_map *inode_map;
+   struct nova_range_node *prev = NULL, *next = NULL;
+   struct nova_range_node *new_node;
+   unsigned long internal_low, internal_high;
+   int cpu;
+   struct rb_root *tree;
+   int ret;
+
+   if (ino_low > ino_high) {
+   nova_err(sb, "%s: ino low %lu, ino high %lu\n",
+   __func__, ino_low, ino_high);
+   return -EINVAL;
+   }
+
+   cpu = ino_low % sbi->cpus;
+   if (ino_high % sbi->cpus != cpu) {
+   nova_err(sb, "%s: ino low %lu, ino high %lu\n",
+   __func__, ino_low, ino_high);
+   return -EINVAL;
+   }
+
+   internal_low = ino_low / sbi->cpus;
+   internal_high = ino_high / sbi->cpus;
+   inode_map = >inode_maps[cpu];
+   tree = _map->inode_inuse_tree;
+   mutex_lock(_map->inode_table_mutex);
+
+   ret = nova_find_free_slot(sbi, tree, internal_low, internal_high,
+   , );
+   if (ret) {
+   nova_dbg("%s: ino %lu - %lu already exists!: %d\n",
+   __func__, ino_low, ino_high, ret);
+   mutex_unlock(_map->inode_table_mutex);
+   return ret;
+   }
+
+   if (prev && next && (internal_low == prev->range_high + 1) &&
+   (internal_high + 1 == next->range_low)) {
+   /* fits the hole */
+   rb_erase(>node, tree);
+   inode_map->num_range_node_inode--;
+   prev->range_high = n

[RFC v2 77/83] GC: Fast garbage collection.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

NOVA cleans and compacts the log when the log is full.
The log is a linked list of 4KB pmem pages, and NOVA performs
fast garbage collection by deleting dead log pages (all the entries are invalid)
from the linked list.

Example:
I = Invalid, V = Valid

VIIV -> IIII -> VVII

||
||  fast gc
\/

VIIV -> VVII

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/Makefile |   2 +-
 fs/nova/gc.c | 186 +++
 fs/nova/log.c|   3 +
 fs/nova/nova.h   |   7 +++
 4 files changed, 197 insertions(+), 1 deletion(-)
 create mode 100644 fs/nova/gc.c

diff --git a/fs/nova/Makefile b/fs/nova/Makefile
index 87e56c6..7a5fb6d 100644
--- a/fs/nova/Makefile
+++ b/fs/nova/Makefile
@@ -4,5 +4,5 @@
 
 obj-$(CONFIG_NOVA_FS) += nova.o
 
-nova-y := balloc.o bbuild.o dax.o dir.o file.o inode.o ioctl.o journal.o\
+nova-y := balloc.o bbuild.o dax.o dir.o file.o gc.o inode.o ioctl.o journal.o\
  log.o namei.o rebuild.o stats.o super.o symlink.o
diff --git a/fs/nova/gc.c b/fs/nova/gc.c
new file mode 100644
index 000..1634c04
--- /dev/null
+++ b/fs/nova/gc.c
@@ -0,0 +1,186 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Garbage collection methods
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu <jix...@cs.ucsd.edu>
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.storne...@gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include "nova.h"
+#include "inode.h"
+
+
+/*
+ * Check whether a log page contains no valid entries.
+ *
+ * @sb:        super block of the file system
+ * @pi:        inode owning the log (not used by this function)
+ * @sih:       inode info header; when the page still has valid entries its
+ *             num_entries and valid_entries counters are increased by the
+ *             page's totals
+ * @page_head: log page location, translated through nova_get_block()
+ *
+ * Returns true when the page's invalid-entry count equals its entry count.
+ * Returns false when valid entries remain or when the page tail cannot be
+ * copied out.
+ */
+static bool curr_page_invalid(struct super_block *sb,
+	struct nova_inode *pi, struct nova_inode_info_header *sih,
+	u64 page_head)
+{
+	struct nova_inode_log_page *curr_page;
+	struct nova_inode_page_tail page_tail;
+	unsigned int num_entries;
+	unsigned int invalid_entries;
+	bool ret = false;
+	timing_t check_time;
+	int rc;
+
+	NOVA_START_TIMING(check_invalid_t, check_time);
+
+	curr_page = (struct nova_inode_log_page *)
+			nova_get_block(sb, page_head);
+	/* Work on a private copy of the tail; a failed copy means "keep". */
+	rc = memcpy_mcsafe(&page_tail, &curr_page->page_tail,
+			sizeof(struct nova_inode_page_tail));
+	if (rc) {
+		nova_err(sb, "check page failed\n");
+		goto out;
+	}
+
+	num_entries = le32_to_cpu(page_tail.num_entries);
+	invalid_entries = le32_to_cpu(page_tail.invalid_entries);
+
+	ret = (invalid_entries == num_entries);
+	if (!ret) {
+		/* Page stays in the log: account for its entries. */
+		sih->num_entries += num_entries;
+		sih->valid_entries += num_entries - invalid_entries;
+	}
+
+out:
+	/* Single exit so the timing section is closed on the error path too. */
+	NOVA_END_TIMING(check_invalid_t, check_time);
+	return ret;
+}
+
+/*
+ * Unlink one page from the log's page list and release its block.
+ *
+ * @sb:        super block of the file system
+ * @sih:       inode info header; supplies the block type and is passed on
+ *             to nova_free_log_blocks()
+ * @curr_page: page being removed
+ * @last_page: page whose next-page link is redirected past @curr_page
+ * @curr_head: location of @curr_page, converted with nova_get_blocknr()
+ */
+static void free_curr_page(struct super_block *sb,
+	struct nova_inode_info_header *sih,
+	struct nova_inode_log_page *curr_page,
+	struct nova_inode_log_page *last_page, u64 curr_head)
+{
+	const u8 blk_type = sih->i_blk_type;
+
+	/* Make last_page point at whatever curr_page pointed at. */
+	nova_set_next_page_address(sb, last_page,
+			curr_page->page_tail.next_page, 1);
+
+	/* curr_page is now bypassed by last_page; give its block back. */
+	nova_free_log_blocks(sb, sih,
+			nova_get_blocknr(sb, curr_head, blk_type), 1);
+}
+
+
+/*
+ * Scan pages in the log and remove those with no valid log entries.
+ */
+int nova_inode_log_fast_gc(struct super_block *sb,
+   struct nova_inode *pi, struct nova_inode_info_header *sih,
+   u64 curr_tail, u64 new_block,
+   int num_pages, int force_thorough)
+{
+   u64 curr, next, possible_head = 0;
+   int found_head = 0;
+   struct nova_inode_log_page *last_page = NULL;
+   struct nova_inode_log_page *curr_page = NULL;
+   int first_need_free = 0;
+   int num_logs;
+   u8 btype = sih->i_blk_type;
+   unsigned long blocks;
+   unsigned long checked_pages = 0;
+   int freed_pages = 0;
+   timing_t gc_time;
+
+   NOVA_START_TIMING(fast_gc_t, gc_time);
+   curr = sih->log_head;
+   sih->valid_entries = 0;
+   sih->num_entries = 0;
+
+   num_logs = 1;
+
+   nova_dbgv("%s: log head 0x%llx, tail 0x%llx\n",
+   __func__, curr, curr_tail);
+   while (1) {
+   if (curr >> PAGE_SHIFT == sih->log_tail >> PAGE_SHIFT) {
+   /* Don't recycle tail page */
+   if (found_head == 0) {
+   possible_head = cpu_to_le64(curr);
+   }
+   break;
+   }

[RFC v2 77/83] GC: Fast garbage collection.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

NOVA cleans and compacts the log when the log is full.
The log is a linked list of 4KB pmem pages, and NOVA performs
fast garbage collection by deleting dead log pages (all the entries are invalid)
from the linked list.

Example:
I = Invalid, V = Valid

VIIV -> IIII -> VVII

||
||  fast gc
\/

VIIV -> VVII

Signed-off-by: Andiry Xu 
---
 fs/nova/Makefile |   2 +-
 fs/nova/gc.c | 186 +++
 fs/nova/log.c|   3 +
 fs/nova/nova.h   |   7 +++
 4 files changed, 197 insertions(+), 1 deletion(-)
 create mode 100644 fs/nova/gc.c

diff --git a/fs/nova/Makefile b/fs/nova/Makefile
index 87e56c6..7a5fb6d 100644
--- a/fs/nova/Makefile
+++ b/fs/nova/Makefile
@@ -4,5 +4,5 @@
 
 obj-$(CONFIG_NOVA_FS) += nova.o
 
-nova-y := balloc.o bbuild.o dax.o dir.o file.o inode.o ioctl.o journal.o\
+nova-y := balloc.o bbuild.o dax.o dir.o file.o gc.o inode.o ioctl.o journal.o\
  log.o namei.o rebuild.o stats.o super.o symlink.o
diff --git a/fs/nova/gc.c b/fs/nova/gc.c
new file mode 100644
index 000..1634c04
--- /dev/null
+++ b/fs/nova/gc.c
@@ -0,0 +1,186 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Garbage collection methods
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu 
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli 
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include "nova.h"
+#include "inode.h"
+
+
+/*
+ * Check whether a log page contains no valid entries.
+ *
+ * @sb:        super block of the file system
+ * @pi:        inode owning the log (not used by this function)
+ * @sih:       inode info header; when the page still has valid entries its
+ *             num_entries and valid_entries counters are increased by the
+ *             page's totals
+ * @page_head: log page location, translated through nova_get_block()
+ *
+ * Returns true when the page's invalid-entry count equals its entry count.
+ * Returns false when valid entries remain or when the page tail cannot be
+ * copied out.
+ */
+static bool curr_page_invalid(struct super_block *sb,
+	struct nova_inode *pi, struct nova_inode_info_header *sih,
+	u64 page_head)
+{
+	struct nova_inode_log_page *curr_page;
+	struct nova_inode_page_tail page_tail;
+	unsigned int num_entries;
+	unsigned int invalid_entries;
+	bool ret = false;
+	timing_t check_time;
+	int rc;
+
+	NOVA_START_TIMING(check_invalid_t, check_time);
+
+	curr_page = (struct nova_inode_log_page *)
+			nova_get_block(sb, page_head);
+	/* Work on a private copy of the tail; a failed copy means "keep". */
+	rc = memcpy_mcsafe(&page_tail, &curr_page->page_tail,
+			sizeof(struct nova_inode_page_tail));
+	if (rc) {
+		nova_err(sb, "check page failed\n");
+		goto out;
+	}
+
+	num_entries = le32_to_cpu(page_tail.num_entries);
+	invalid_entries = le32_to_cpu(page_tail.invalid_entries);
+
+	ret = (invalid_entries == num_entries);
+	if (!ret) {
+		/* Page stays in the log: account for its entries. */
+		sih->num_entries += num_entries;
+		sih->valid_entries += num_entries - invalid_entries;
+	}
+
+out:
+	/* Single exit so the timing section is closed on the error path too. */
+	NOVA_END_TIMING(check_invalid_t, check_time);
+	return ret;
+}
+
+/*
+ * Unlink one page from the log's page list and release its block.
+ *
+ * @sb:        super block of the file system
+ * @sih:       inode info header; supplies the block type and is passed on
+ *             to nova_free_log_blocks()
+ * @curr_page: page being removed
+ * @last_page: page whose next-page link is redirected past @curr_page
+ * @curr_head: location of @curr_page, converted with nova_get_blocknr()
+ */
+static void free_curr_page(struct super_block *sb,
+	struct nova_inode_info_header *sih,
+	struct nova_inode_log_page *curr_page,
+	struct nova_inode_log_page *last_page, u64 curr_head)
+{
+	const u8 blk_type = sih->i_blk_type;
+
+	/* Make last_page point at whatever curr_page pointed at. */
+	nova_set_next_page_address(sb, last_page,
+			curr_page->page_tail.next_page, 1);
+
+	/* curr_page is now bypassed by last_page; give its block back. */
+	nova_free_log_blocks(sb, sih,
+			nova_get_blocknr(sb, curr_head, blk_type), 1);
+}
+
+
+/*
+ * Scan pages in the log and remove those with no valid log entries.
+ */
+int nova_inode_log_fast_gc(struct super_block *sb,
+   struct nova_inode *pi, struct nova_inode_info_header *sih,
+   u64 curr_tail, u64 new_block,
+   int num_pages, int force_thorough)
+{
+   u64 curr, next, possible_head = 0;
+   int found_head = 0;
+   struct nova_inode_log_page *last_page = NULL;
+   struct nova_inode_log_page *curr_page = NULL;
+   int first_need_free = 0;
+   int num_logs;
+   u8 btype = sih->i_blk_type;
+   unsigned long blocks;
+   unsigned long checked_pages = 0;
+   int freed_pages = 0;
+   timing_t gc_time;
+
+   NOVA_START_TIMING(fast_gc_t, gc_time);
+   curr = sih->log_head;
+   sih->valid_entries = 0;
+   sih->num_entries = 0;
+
+   num_logs = 1;
+
+   nova_dbgv("%s: log head 0x%llx, tail 0x%llx\n",
+   __func__, curr, curr_tail);
+   while (1) {
+   if (curr >> PAGE_SHIFT == sih->log_tail >> PAGE_SHIFT) {
+   /* Don't recycle tail page */
+   if (found_head == 0) {
+   possible_head = cpu_to_le64(curr);
+   }
+   break;
+   }
+
+   curr_page = (struct nova_inode_log_page *)
+   n

[RFC v2 78/83] GC: Thorough garbage collection.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

After fast gc, if the valid log entries still account for less
than half of the log size, NOVA starts thorough garbage collection,
allocates a new log, copies the live log entries to it, and switches
to the new log atomically. The radix tree needs to be updated to point
to the new log.

Example:
I = Invalid, V = Valid

VIIV -> IIII -> VVII

 ||
 ||  fast gc
 \/

VIIV -> VVII

 ||
 ||  thorough gc
 \/



Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/gc.c | 273 +++
 1 file changed, 273 insertions(+)

diff --git a/fs/nova/gc.c b/fs/nova/gc.c
index 1634c04..d74286e 100644
--- a/fs/nova/gc.c
+++ b/fs/nova/gc.c
@@ -18,6 +18,62 @@
 #include "nova.h"
 #include "inode.h"
 
+/*
+ * Report whether the log entry at @curr_p is dead, and how long it is.
+ *
+ * @sb:     super block of the file system
+ * @pi:     inode owning the log (not used by this function)
+ * @sih:    inode info header; its last_dentry is compared against @curr_p
+ * @curr_p: location of the log entry, translated through nova_get_block()
+ * @length: out parameter.  Receives the entry's size; for NEXT_PAGE and
+ *          unknown entry types it receives PAGE_SIZE - ENTRY_LOC(curr_p).
+ *
+ * Returns false when the entry is still live, true otherwise.  NEXT_PAGE and
+ * unknown entry types are always reported as true.
+ */
+static bool curr_log_entry_invalid(struct super_block *sb,
+	struct nova_inode *pi, struct nova_inode_info_header *sih,
+	u64 curr_p, size_t *length)
+{
+	struct nova_file_write_entry *write_entry;
+	struct nova_dentry *dir_entry;
+	struct nova_setattr_logentry *attr_entry;
+	struct nova_link_change_entry *link_entry;
+	void *addr = (void *)nova_get_block(sb, curr_p);
+	u8 type = nova_get_entry_type(addr);
+	bool dead = true;
+
+	switch (type) {
+	case SET_ATTR:
+		attr_entry = (struct nova_setattr_logentry *)addr;
+		dead = (attr_entry->invalid != 0);
+		*length = sizeof(struct nova_setattr_logentry);
+		break;
+	case LINK_CHANGE:
+		link_entry = (struct nova_link_change_entry *)addr;
+		dead = (link_entry->invalid != 0);
+		*length = sizeof(struct nova_link_change_entry);
+		break;
+	case FILE_WRITE:
+		/* Dead only once every page it covers has been invalidated. */
+		write_entry = (struct nova_file_write_entry *)addr;
+		dead = (write_entry->num_pages == write_entry->invalid_pages);
+		*length = sizeof(struct nova_file_write_entry);
+		break;
+	case DIR_LOG:
+		/*
+		 * The dentry recorded in sih->last_dentry is kept alive
+		 * regardless of its invalid flag.
+		 */
+		dir_entry = (struct nova_dentry *)addr;
+		dead = (dir_entry->invalid != 0) &&
+			(sih->last_dentry != curr_p);
+		*length = le16_to_cpu(dir_entry->de_len);
+		break;
+	case NEXT_PAGE:
+		/* No more entries in this page */
+		*length = PAGE_SIZE - ENTRY_LOC(curr_p);
+		break;
+	default:
+		nova_dbg("%s: unknown type %d, 0x%llx\n",
+				__func__, type, curr_p);
+		NOVA_ASSERT(0);
+		/* Skip whatever is left of the page. */
+		*length = PAGE_SIZE - ENTRY_LOC(curr_p);
+		break;
+	}
+
+	return dead;
+}
 
 static bool curr_page_invalid(struct super_block *sb,
struct nova_inode *pi, struct nova_inode_info_header *sih,
@@ -68,6 +124,210 @@ static void free_curr_page(struct super_block *sb,
nova_get_blocknr(sb, curr_head, btype), 1);
 }
 
+/*
+ * Redirect radix tree slots from an old file write entry to its new copy.
+ *
+ * @sb:        super block of the file system (not used by this function)
+ * @sih:       inode info header whose tree is updated
+ * @old_entry: entry being replaced; its pgoff and num_pages select the
+ *             range of tree indices that is scanned
+ * @new_entry: entry stored in every slot that still held @old_entry
+ *
+ * Slots that are missing, or that already point at some other entry, are
+ * left untouched.  Always returns 0.
+ */
+static int nova_gc_assign_file_entry(struct super_block *sb,
+	struct nova_inode_info_header *sih,
+	struct nova_file_write_entry *old_entry,
+	struct nova_file_write_entry *new_entry)
+{
+	struct nova_file_write_entry *temp;
+	void **pentry;
+	unsigned long start_pgoff = old_entry->pgoff;
+	unsigned int num = old_entry->num_pages;
+	unsigned int i;	/* unsigned, to match num in the loop bound */
+
+	for (i = 0; i < num; i++) {
+		pentry = radix_tree_lookup_slot(&sih->tree, start_pgoff + i);
+		if (!pentry)
+			continue;
+
+		temp = radix_tree_deref_slot(pentry);
+		if (temp == old_entry)
+			radix_tree_replace_slot(&sih->tree, pentry,
+						new_entry);
+	}
+
+	return 0;
+}
+
+static int nova_gc_assign_dentry(struct super_block *sb,
+   struct nova_inode_info_header *sih, struct nova_dentry *old_dentry,
+   struct nova_dentry *new_dentry)
+{
+   struct nova_dentry *temp;
+   void **pentry;
+   unsigned long hash;
+   int ret = 0;
+
+   hash = BKDRHash(old_dentry->name, old_dentry->name_len);
+   nova_dbgv("%s: assign %s hash %lu\n", __func__,
+   old_dentry->name, hash);
+
+   /* FIXME: hash collision ignored here */
+   pentry = radix_tree_lookup_slot(>tree, hash);
+   if (pentry) {
+   temp = radix_tree_deref_slot(pentry);
+   if (temp == old_dentry)
+   radix_tree_replace_slot(>tree, pentr

[RFC v2 78/83] GC: Thorough garbage collection.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

After fast gc, if the valid log entries still account for less
than half of the log size, NOVA starts thorough garbage collection,
allocates a new log, copies the live log entries to it, and switches
to the new log atomically. The radix tree needs to be updated to point
to the new log.

Example:
I = Invalid, V = Valid

VIIV -> IIII -> VVII

 ||
 ||  fast gc
 \/

VIIV -> VVII

 ||
 ||  thorough gc
 \/



Signed-off-by: Andiry Xu 
---
 fs/nova/gc.c | 273 +++
 1 file changed, 273 insertions(+)

diff --git a/fs/nova/gc.c b/fs/nova/gc.c
index 1634c04..d74286e 100644
--- a/fs/nova/gc.c
+++ b/fs/nova/gc.c
@@ -18,6 +18,62 @@
 #include "nova.h"
 #include "inode.h"
 
+/*
+ * Report whether the log entry at @curr_p is dead, and how long it is.
+ *
+ * @sb:     super block of the file system
+ * @pi:     inode owning the log (not used by this function)
+ * @sih:    inode info header; its last_dentry is compared against @curr_p
+ * @curr_p: location of the log entry, translated through nova_get_block()
+ * @length: out parameter.  Receives the entry's size; for NEXT_PAGE and
+ *          unknown entry types it receives PAGE_SIZE - ENTRY_LOC(curr_p).
+ *
+ * Returns false when the entry is still live, true otherwise.  NEXT_PAGE and
+ * unknown entry types are always reported as true.
+ */
+static bool curr_log_entry_invalid(struct super_block *sb,
+	struct nova_inode *pi, struct nova_inode_info_header *sih,
+	u64 curr_p, size_t *length)
+{
+	struct nova_file_write_entry *write_entry;
+	struct nova_dentry *dir_entry;
+	struct nova_setattr_logentry *attr_entry;
+	struct nova_link_change_entry *link_entry;
+	void *addr = (void *)nova_get_block(sb, curr_p);
+	u8 type = nova_get_entry_type(addr);
+	bool dead = true;
+
+	switch (type) {
+	case SET_ATTR:
+		attr_entry = (struct nova_setattr_logentry *)addr;
+		dead = (attr_entry->invalid != 0);
+		*length = sizeof(struct nova_setattr_logentry);
+		break;
+	case LINK_CHANGE:
+		link_entry = (struct nova_link_change_entry *)addr;
+		dead = (link_entry->invalid != 0);
+		*length = sizeof(struct nova_link_change_entry);
+		break;
+	case FILE_WRITE:
+		/* Dead only once every page it covers has been invalidated. */
+		write_entry = (struct nova_file_write_entry *)addr;
+		dead = (write_entry->num_pages == write_entry->invalid_pages);
+		*length = sizeof(struct nova_file_write_entry);
+		break;
+	case DIR_LOG:
+		/*
+		 * The dentry recorded in sih->last_dentry is kept alive
+		 * regardless of its invalid flag.
+		 */
+		dir_entry = (struct nova_dentry *)addr;
+		dead = (dir_entry->invalid != 0) &&
+			(sih->last_dentry != curr_p);
+		*length = le16_to_cpu(dir_entry->de_len);
+		break;
+	case NEXT_PAGE:
+		/* No more entries in this page */
+		*length = PAGE_SIZE - ENTRY_LOC(curr_p);
+		break;
+	default:
+		nova_dbg("%s: unknown type %d, 0x%llx\n",
+				__func__, type, curr_p);
+		NOVA_ASSERT(0);
+		/* Skip whatever is left of the page. */
+		*length = PAGE_SIZE - ENTRY_LOC(curr_p);
+		break;
+	}
+
+	return dead;
+}
 
 static bool curr_page_invalid(struct super_block *sb,
struct nova_inode *pi, struct nova_inode_info_header *sih,
@@ -68,6 +124,210 @@ static void free_curr_page(struct super_block *sb,
nova_get_blocknr(sb, curr_head, btype), 1);
 }
 
+/*
+ * Redirect radix tree slots from an old file write entry to its new copy.
+ *
+ * @sb:        super block of the file system (not used by this function)
+ * @sih:       inode info header whose tree is updated
+ * @old_entry: entry being replaced; its pgoff and num_pages select the
+ *             range of tree indices that is scanned
+ * @new_entry: entry stored in every slot that still held @old_entry
+ *
+ * Slots that are missing, or that already point at some other entry, are
+ * left untouched.  Always returns 0.
+ */
+static int nova_gc_assign_file_entry(struct super_block *sb,
+	struct nova_inode_info_header *sih,
+	struct nova_file_write_entry *old_entry,
+	struct nova_file_write_entry *new_entry)
+{
+	struct nova_file_write_entry *temp;
+	void **pentry;
+	unsigned long start_pgoff = old_entry->pgoff;
+	unsigned int num = old_entry->num_pages;
+	unsigned int i;	/* unsigned, to match num in the loop bound */
+
+	for (i = 0; i < num; i++) {
+		pentry = radix_tree_lookup_slot(&sih->tree, start_pgoff + i);
+		if (!pentry)
+			continue;
+
+		temp = radix_tree_deref_slot(pentry);
+		if (temp == old_entry)
+			radix_tree_replace_slot(&sih->tree, pentry,
+						new_entry);
+	}
+
+	return 0;
+}
+
+/*
+ * Redirect a directory's radix tree slot from an old dentry to its new copy.
+ *
+ * @sb:         super block of the file system (not used by this function)
+ * @sih:        inode info header whose tree is updated
+ * @old_dentry: dentry being replaced; the BKDRHash of its name is the tree
+ *              index that is looked up
+ * @new_dentry: dentry stored in the slot if it still held @old_dentry
+ *
+ * A missing slot, or one that points at a different dentry, is left
+ * untouched.  Always returns 0.
+ */
+static int nova_gc_assign_dentry(struct super_block *sb,
+	struct nova_inode_info_header *sih, struct nova_dentry *old_dentry,
+	struct nova_dentry *new_dentry)
+{
+	struct nova_dentry *temp;
+	void **pentry;
+	unsigned long hash;
+
+	hash = BKDRHash(old_dentry->name, old_dentry->name_len);
+	nova_dbgv("%s: assign %s hash %lu\n", __func__,
+			old_dentry->name, hash);
+
+	/* FIXME: hash collision ignored here */
+	pentry = radix_tree_lookup_slot(&sih->tree, hash);
+	if (pentry) {
+		temp = radix_tree_deref_slot(pentry);
+		if (temp == old_dentry)
+			radix_tree_replace_slot(&sih->tree, pentry,
+						new_dentry);
+	}
+
+	return 0;
+}

[RFC v2 48/83] Dir: Readdir operation.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

NOVA reads the directory by traversing the log and reports
the valid dentries. Valid dentries have an inode number greater than zero,
meaning they are create dentries.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/dir.c   | 153 
 fs/nova/inode.c |   2 +
 fs/nova/nova.h  |   1 +
 3 files changed, 156 insertions(+)

diff --git a/fs/nova/dir.c b/fs/nova/dir.c
index 377d2da..35a66f9 100644
--- a/fs/nova/dir.c
+++ b/fs/nova/dir.c
@@ -221,3 +221,156 @@ int nova_append_dir_init_entries(struct super_block *sb,
 
return 0;
 }
+
+/*
+ * Find where to resume a directory scan from a saved position.
+ *
+ * @sb:  super block of the file system
+ * @sih: inode info header of the directory; its tree is searched
+ * @pos: tree index to start the lookup from
+ *
+ * Asks radix_tree_gang_lookup() for a single item starting at index @pos.
+ * Returns nova_get_addr_off() of that item, or 0 when the lookup finds
+ * nothing.
+ *
+ * NOTE(review): the item is typed as a file write entry here, although the
+ * callers in this file are directory code -- confirm the intended type.
+ */
+static u64 nova_find_next_dentry_addr(struct super_block *sb,
+	struct nova_inode_info_header *sih, u64 pos)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	struct nova_file_write_entry *entries[1];
+	int nr_entries;
+
+	nr_entries = radix_tree_gang_lookup(&sih->tree,
+					(void **)entries, pos, 1);
+	if (nr_entries != 1)
+		return 0;
+
+	return nova_get_addr_off(sbi, entries[0]);
+}
+
+static int nova_readdir(struct file *file, struct dir_context *ctx)
+{
+   struct inode *inode = file_inode(file);
+   struct super_block *sb = inode->i_sb;
+   struct nova_inode *pidir;
+   struct nova_inode_info *si = NOVA_I(inode);
+   struct nova_inode_info_header *sih = >header;
+   struct nova_inode *child_pi;
+   struct nova_inode *prev_child_pi = NULL;
+   struct nova_dentry *entry = NULL;
+   struct nova_dentry *prev_entry = NULL;
+   unsigned short de_len;
+   u64 pi_addr;
+   unsigned long pos = 0;
+   ino_t ino;
+   void *addr;
+   u64 curr_p;
+   u8 type;
+   int ret = 0;
+   timing_t readdir_time;
+
+   NOVA_START_TIMING(readdir_t, readdir_time);
+   pidir = nova_get_inode(sb, inode);
+   nova_dbgv("%s: ino %llu, size %llu, pos 0x%llx\n",
+   __func__, (u64)inode->i_ino,
+   pidir->i_size, ctx->pos);
+
+   if (sih->log_head == 0) {
+   nova_err(sb, "Dir %lu log is NULL!\n", inode->i_ino);
+   ret = -ENOSPC;
+   goto out;
+   }
+
+   pos = ctx->pos;
+
+   if (pos == 0)
+   curr_p = sih->log_head;
+   else if (pos == READDIR_END)
+   goto out;
+   else {
+   curr_p = nova_find_next_dentry_addr(sb, sih, pos);
+   if (curr_p == 0)
+   goto out;
+   }
+
+   while (curr_p != sih->log_tail) {
+   if (goto_next_page(sb, curr_p))
+   curr_p = next_log_page(sb, curr_p);
+
+
+   if (curr_p == 0) {
+   nova_err(sb, "Dir %lu log is NULL!\n", inode->i_ino);
+   ret = -EINVAL;
+   goto out;
+   }
+
+   addr = (void *)nova_get_block(sb, curr_p);
+   type = nova_get_entry_type(addr);
+   switch (type) {
+   case SET_ATTR:
+   curr_p += sizeof(struct nova_setattr_logentry);
+   continue;
+   case LINK_CHANGE:
+   curr_p += sizeof(struct nova_link_change_entry);
+   continue;
+   case DIR_LOG:
+   break;
+   default:
+   nova_err(sb, "%s: unknown type %d, 0x%llx\n",
+__func__, type, curr_p);
+   ret = -EINVAL;
+   goto out;
+   }
+
+   entry = (struct nova_dentry *)nova_get_block(sb, curr_p);
+   nova_dbgv("curr_p: 0x%llx, type %d, ino %llu, name %s, namelen 
%u, rec len %u\n",
+ curr_p, entry->entry_type, le64_to_cpu(entry->ino),
+ entry->name, entry->name_len,
+ le16_to_cpu(entry->de_len));
+
+   de_len = le16_to_cpu(entry->de_len);
+   if (entry->ino > 0 && entry->invalid == 0
+   && entry->reassigned == 0) {
+   ino = __le64_to_cpu(entry->ino);
+   pos = BKDRHash(entry->name, entry->name_len);
+
+   ret = nova_get_inode_address(sb, ino,
+_addr, 0);
+   if (ret) {
+   nova_dbg("%s: get child inode %lu address 
failed %d\n",
+__func__, ino, ret);
+   ctx->pos = READDIR_END;
+   goto o

[RFC v2 48/83] Dir: Readdir operation.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

NOVA reads the directory by traversing the log and reports
the valid dentries. Valid dentries have an inode number greater than zero,
meaning they are create dentries.

Signed-off-by: Andiry Xu 
---
 fs/nova/dir.c   | 153 
 fs/nova/inode.c |   2 +
 fs/nova/nova.h  |   1 +
 3 files changed, 156 insertions(+)

diff --git a/fs/nova/dir.c b/fs/nova/dir.c
index 377d2da..35a66f9 100644
--- a/fs/nova/dir.c
+++ b/fs/nova/dir.c
@@ -221,3 +221,156 @@ int nova_append_dir_init_entries(struct super_block *sb,
 
return 0;
 }
+
+/*
+ * Find where to resume a directory scan from a saved position.
+ *
+ * @sb:  super block of the file system
+ * @sih: inode info header of the directory; its tree is searched
+ * @pos: tree index to start the lookup from
+ *
+ * Asks radix_tree_gang_lookup() for a single item starting at index @pos.
+ * Returns nova_get_addr_off() of that item, or 0 when the lookup finds
+ * nothing.
+ *
+ * NOTE(review): the item is typed as a file write entry here, although the
+ * callers in this file are directory code -- confirm the intended type.
+ */
+static u64 nova_find_next_dentry_addr(struct super_block *sb,
+	struct nova_inode_info_header *sih, u64 pos)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	struct nova_file_write_entry *entries[1];
+	int nr_entries;
+
+	nr_entries = radix_tree_gang_lookup(&sih->tree,
+					(void **)entries, pos, 1);
+	if (nr_entries != 1)
+		return 0;
+
+	return nova_get_addr_off(sbi, entries[0]);
+}
+
+static int nova_readdir(struct file *file, struct dir_context *ctx)
+{
+   struct inode *inode = file_inode(file);
+   struct super_block *sb = inode->i_sb;
+   struct nova_inode *pidir;
+   struct nova_inode_info *si = NOVA_I(inode);
+   struct nova_inode_info_header *sih = >header;
+   struct nova_inode *child_pi;
+   struct nova_inode *prev_child_pi = NULL;
+   struct nova_dentry *entry = NULL;
+   struct nova_dentry *prev_entry = NULL;
+   unsigned short de_len;
+   u64 pi_addr;
+   unsigned long pos = 0;
+   ino_t ino;
+   void *addr;
+   u64 curr_p;
+   u8 type;
+   int ret = 0;
+   timing_t readdir_time;
+
+   NOVA_START_TIMING(readdir_t, readdir_time);
+   pidir = nova_get_inode(sb, inode);
+   nova_dbgv("%s: ino %llu, size %llu, pos 0x%llx\n",
+   __func__, (u64)inode->i_ino,
+   pidir->i_size, ctx->pos);
+
+   if (sih->log_head == 0) {
+   nova_err(sb, "Dir %lu log is NULL!\n", inode->i_ino);
+   ret = -ENOSPC;
+   goto out;
+   }
+
+   pos = ctx->pos;
+
+   if (pos == 0)
+   curr_p = sih->log_head;
+   else if (pos == READDIR_END)
+   goto out;
+   else {
+   curr_p = nova_find_next_dentry_addr(sb, sih, pos);
+   if (curr_p == 0)
+   goto out;
+   }
+
+   while (curr_p != sih->log_tail) {
+   if (goto_next_page(sb, curr_p))
+   curr_p = next_log_page(sb, curr_p);
+
+
+   if (curr_p == 0) {
+   nova_err(sb, "Dir %lu log is NULL!\n", inode->i_ino);
+   ret = -EINVAL;
+   goto out;
+   }
+
+   addr = (void *)nova_get_block(sb, curr_p);
+   type = nova_get_entry_type(addr);
+   switch (type) {
+   case SET_ATTR:
+   curr_p += sizeof(struct nova_setattr_logentry);
+   continue;
+   case LINK_CHANGE:
+   curr_p += sizeof(struct nova_link_change_entry);
+   continue;
+   case DIR_LOG:
+   break;
+   default:
+   nova_err(sb, "%s: unknown type %d, 0x%llx\n",
+__func__, type, curr_p);
+   ret = -EINVAL;
+   goto out;
+   }
+
+   entry = (struct nova_dentry *)nova_get_block(sb, curr_p);
+   nova_dbgv("curr_p: 0x%llx, type %d, ino %llu, name %s, namelen 
%u, rec len %u\n",
+ curr_p, entry->entry_type, le64_to_cpu(entry->ino),
+ entry->name, entry->name_len,
+ le16_to_cpu(entry->de_len));
+
+   de_len = le16_to_cpu(entry->de_len);
+   if (entry->ino > 0 && entry->invalid == 0
+   && entry->reassigned == 0) {
+   ino = __le64_to_cpu(entry->ino);
+   pos = BKDRHash(entry->name, entry->name_len);
+
+   ret = nova_get_inode_address(sb, ino,
+_addr, 0);
+   if (ret) {
+   nova_dbg("%s: get child inode %lu address 
failed %d\n",
+__func__, ino, ret);
+   ctx->pos = READDIR_END;
+   goto out;
+ 

[RFC v2 01/83] Introduction and documentation of NOVA filesystem.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

NOVA is a log-structured file system tailored for byte-addressable non-volatile 
memories.
It was designed and developed at the Non-Volatile Systems Laboratory in the 
Computer
Science and Engineering Department at the University of California, San Diego.
Its primary authors are Andiry Xu <jix...@eng.ucsd.edu>, Lu Zhang
<l...@eng.ucsd.edu>, and Steven Swanson <swan...@eng.ucsd.edu>.

These two papers provide a detailed, high-level description of NOVA's design 
goals and approach:

   NOVA: A Log-structured File system for Hybrid Volatile/Non-volatile Main 
Memories
   In The 14th USENIX Conference on File and Storage Technologies (FAST '16)
   (http://cseweb.ucsd.edu/~swanson/papers/FAST2016NOVA.pdf)

   NOVA-Fortis: A Fault-Tolerant Non-Volatile Main Memory File System
   In The 26th ACM Symposium on Operating Systems Principles (SOSP '17)
   (http://cseweb.ucsd.edu/~swanson/papers/SOSP2017-NOVAFortis.pdf)

This patchset contains features from the FAST paper. We leave NOVA-Fortis 
features,
such as snapshot, metadata and data replication and RAID parity for
future submission.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 Documentation/filesystems/00-INDEX |   2 +
 Documentation/filesystems/nova.txt | 498 +
 MAINTAINERS|   8 +
 3 files changed, 508 insertions(+)
 create mode 100644 Documentation/filesystems/nova.txt

diff --git a/Documentation/filesystems/00-INDEX 
b/Documentation/filesystems/00-INDEX
index b7bd6c9..dc5c722 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -95,6 +95,8 @@ nfs/
- nfs-related documentation.
 nilfs2.txt
- info and mount options for the NILFS2 filesystem.
+nova.txt
+   - info on the NOVA filesystem.
 ntfs.txt
- info and mount options for the NTFS filesystem (Windows NT).
 ocfs2.txt
diff --git a/Documentation/filesystems/nova.txt 
b/Documentation/filesystems/nova.txt
new file mode 100644
index 000..4728f50
--- /dev/null
+++ b/Documentation/filesystems/nova.txt
@@ -0,0 +1,498 @@
+The NOVA Filesystem
+===================
+
+NOn-Volatile memory Accelerated file system (NOVA) is a DAX file system
+designed to provide a high performance and production-ready file system
+tailored for byte-addressable non-volatile memories (e.g., NVDIMMs
+and Intel's soon-to-be-released 3DXPoint DIMMs).
+NOVA combines design elements from many other file systems
+and adapts conventional log-structured file system techniques to
+exploit the fast random access that NVMs provide. In particular, NOVA maintains
+separate logs for each inode to improve concurrency, and stores file data
+outside the log to minimize log size and reduce garbage collection costs. 
NOVA's
+logs provide metadata and data atomicity and focus on simplicity and
+reliability, keeping complex metadata structures in DRAM to accelerate lookup
+operations.
+
+NOVA was developed by the Non-Volatile Systems Laboratory (NVSL) in
+the Computer Science and Engineering Department at the University of
+California, San Diego.
+
+A more thorough discussion of NOVA's design is available in these two papers:
+
+NOVA: A Log-structured File System for Hybrid Volatile/Non-volatile Main 
Memories
+Jian Xu and Steven Swanson
+In The 14th USENIX Conference on File and Storage Technologies (FAST '16)
+
+NOVA-Fortis: A Fault-Tolerant Non-Volatile Main Memory File System
+Jian Xu, Lu Zhang, Amirsaman Memaripour, Akshatha Gangadharaiah, Amit Borase,
+Tamires Brito Da Silva, Andy Rudoff and Steven Swanson
+In The 26th ACM Symposium on Operating Systems Principles (SOSP '17)
+
+This version of NOVA contains features from the FAST paper.
+NOVA-Fortis features, such as snapshot, metadata and data protection and 
replication
+are left for future submission.
+
+The main NOVA features include:
+
+  * POSIX semantics
+  * Directly access (DAX) byte-addressable NVMM without page caching
+  * Per-CPU NVMM pool to maximize concurrency
+  * Strong consistency guarantees with 8-byte atomic stores
+
+
+Filesystem Design
+=================
+
+NOVA divides NVMM into several regions. NOVA's 512B superblock contains global
+file system information and the recovery inode. The recovery inode represents a
+special file that stores recovery information (e.g., the list of unallocated
+NVMM pages). NOVA divides its inode tables into per-CPU stripes. It also
+provides per-CPU journals for complex file operations that involve multiple
+inodes. The rest of the available NVMM stores logs and file data.
+
+NOVA is log-structured and stores a separate log for each inode to maximize
+concurrency and provide atomicity for operations that affect a single file. The
+logs only store metadata and comprise a linked list of 4 KB pages. Log entries
+are small – between 32 and 64 bytes. Logs are generally non-contiguous, and log
+pages may reside anywhere in NVMM.
+
+NOVA keeps copies of mos

[RFC v2 00/83] NOVA: a new file system for persistent memory

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

This is the second version of the RFC patch series that implements
NOVA (NOn-Volatile memory Accelerated file system), a new file system built for
PMEM.

NOVA's goal is to provide a high performance, production-ready
file system tailored for byte-addressable non-volatile memories (e.g., NVDIMMs
and Intel's soon-to-be-released 3DXpoint DIMMs).
 
NOVA was developed at the Non-Volatile Systems Laboratory in the Computer
Science and Engineering Department at the University of California, San Diego.
Its primary authors are Andiry Xu <jix...@cs.ucsd.edu>, Lu Zhang
<l...@eng.ucsd.edu>, and Steven Swanson <swan...@eng.ucsd.edu>.
 
NOVA is stable enough to run complex applications, but there is substantial
work left to do.  This RFC is intended to gather feedback to guide its
development toward eventual inclusion upstream.
 
The patches are based on Linux 4.16-rc4.


Changes from v1:

* Remove snapshot, metadata replication and data parity for future submission.
  This significantly reduces complexity and LOC: 22129 -> 13834.

* Break down the code in a more reviewer-friendly way:
  The patchset starts with a simple skeleton and adds more features gradually.
  Each patch leaves the tree in a compilable and working state,
  and is self-contained and small, so easier to review.

* Fix bugs so that NOVA passes xfstests: https://github.com/NVSL/xfstests


Overview


NOVA is primarily a log-structured file system, but rather than maintain a
single global log for the entire file system, it maintains separate logs for
each inode.  NOVA breaks the logs into 4KB pages, which need not be
contiguous in memory.  The logs only contain metadata.

File data pages reside outside the log, and log entries for write operations
point to data pages they modify.  File modifications can be done either
in place or via copy-on-write (COW) to provide atomic file updates.

For file operations that involve multiple inodes, NOVA uses small, fixed-size
redo logs to atomically append log entries to the logs of the inodes involved.

This structure keeps logs small and makes garbage collection very fast.  It also
enables enormous parallelism during recovery from an unclean unmount, since
threads can scan logs in parallel.

Documentation/filesystems/nova.txt contains some lower-level implementation and
usage information.  A more thorough discussion of NOVA's goals and design is
available in two papers:

NOVA: A Log-structured File system for Hybrid Volatile/Non-volatile Main 
Memories
http://cseweb.ucsd.edu/~swanson/papers/FAST2016NOVA.pdf
Jian Xu and Steven Swanson
Published in FAST 2016

NOVA-Fortis: A Fault-Tolerant Non-Volatile Main Memory File System
http://cseweb.ucsd.edu/~swanson/papers/SOSP2017-NOVAFortis.pdf
Jian Xu, Lu Zhang, Amirsaman Memaripour, Akshatha Gangadharaiah,
Amit Borase, Tamires Brito Da Silva, Andy Rudoff, Steven Swanson
Published in SOSP 2017

This version contains features from the FAST paper. We leave NOVA-Fortis
features for future.


Build and Run
=============

To build NOVA, build the kernel with PMEM (`CONFIG_BLK_DEV_PMEM`),
DAX (`CONFIG_FS_DAX`) and NOVA (`CONFIG_NOVA_FS`) support.  Install as usual.

NOVA runs on a pmem non-volatile memory region created by memmap kernel option.
For instance, adding 'memmap=16G!8G' to the kernel boot parameters will reserve
16GB memory starting from address 8GB, and the kernel will create a pmem0 
block device under the /dev directory.

After the OS has booted, initialize a NOVA instance with the following commands:

# modprobe nova
# mount -t NOVA -o init /dev/pmem0 /mnt/nova

The above commands create a NOVA instance on /dev/pmem0 and mounts it on
/mnt/nova. Currently NOVA does not have mkfs or fsck support.


Performance
===========

Compared to other DAX file systems such as ext4-DAX and xfs-DAX,
NOVA provides fine-grained, byte-granularity metadata operations,
and it performs better in metadata-intensive and write-intensive applications.
NOVA also excels in the append-fsync access pattern, i.e. write-ahead logging,
which is very common in DBMS and key-value stores.

The following test is performed on Intel i7-3770K with 16GB DRAM
and 8GB PMEM emulated with DRAM. The kernel is 4.16-rc4 64bit on Ubuntu 16.04.
Performance may vary on different platforms.


Filebench throughput (ops/s):
xfs-DAX ext4-DAXNOVA
Fileserver  86971   177826  334166
Varmail 148032  288033  999794
Webserver   370245  370144  374130
Webproxy315084  737544  927216

Webserver is read-intensive and all the file systems have similar performance.


SQLite test:
SQLite has four journaling modes:
Delete: delete the undo log file after transaction commit
Truncate: truncate the undo log file to zero after transaction commit
Persist: write a flag at the beginning of the log file after transaction commit
WAL: write-a

[RFC v2 01/83] Introduction and documentation of NOVA filesystem.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

NOVA is a log-structured file system tailored for byte-addressable non-volatile 
memories.
It was designed and developed at the Non-Volatile Systems Laboratory in the 
Computer
Science and Engineering Department at the University of California, San Diego.
Its primary authors are Andiry Xu, Lu Zhang,
and Steven Swanson.

These two papers provide a detailed, high-level description of NOVA's design 
goals and approach:

   NOVA: A Log-structured File system for Hybrid Volatile/Non-volatile Main 
Memories
   In The 14th USENIX Conference on File and Storage Technologies (FAST '16)
   (http://cseweb.ucsd.edu/~swanson/papers/FAST2016NOVA.pdf)

   NOVA-Fortis: A Fault-Tolerant Non-Volatile Main Memory File System
   In The 26th ACM Symposium on Operating Systems Principles (SOSP '17)
   (http://cseweb.ucsd.edu/~swanson/papers/SOSP2017-NOVAFortis.pdf)

This patchset contains features from the FAST paper. We leave NOVA-Fortis 
features,
such as snapshot, metadata and data replication and RAID parity for
future submission.

Signed-off-by: Andiry Xu 
---
 Documentation/filesystems/00-INDEX |   2 +
 Documentation/filesystems/nova.txt | 498 +
 MAINTAINERS|   8 +
 3 files changed, 508 insertions(+)
 create mode 100644 Documentation/filesystems/nova.txt

diff --git a/Documentation/filesystems/00-INDEX 
b/Documentation/filesystems/00-INDEX
index b7bd6c9..dc5c722 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -95,6 +95,8 @@ nfs/
- nfs-related documentation.
 nilfs2.txt
- info and mount options for the NILFS2 filesystem.
+nova.txt
+   - info on the NOVA filesystem.
 ntfs.txt
- info and mount options for the NTFS filesystem (Windows NT).
 ocfs2.txt
diff --git a/Documentation/filesystems/nova.txt 
b/Documentation/filesystems/nova.txt
new file mode 100644
index 000..4728f50
--- /dev/null
+++ b/Documentation/filesystems/nova.txt
@@ -0,0 +1,498 @@
+The NOVA Filesystem
+===================
+
+NOn-Volatile memory Accelerated file system (NOVA) is a DAX file system
+designed to provide a high performance and production-ready file system
+tailored for byte-addressable non-volatile memories (e.g., NVDIMMs
+and Intel's soon-to-be-released 3DXPoint DIMMs).
+NOVA combines design elements from many other file systems
+and adapts conventional log-structured file system techniques to
+exploit the fast random access that NVMs provide. In particular, NOVA maintains
+separate logs for each inode to improve concurrency, and stores file data
+outside the log to minimize log size and reduce garbage collection costs. 
NOVA's
+logs provide metadata and data atomicity and focus on simplicity and
+reliability, keeping complex metadata structures in DRAM to accelerate lookup
+operations.
+
+NOVA was developed by the Non-Volatile Systems Laboratory (NVSL) in
+the Computer Science and Engineering Department at the University of
+California, San Diego.
+
+A more thorough discussion of NOVA's design is available in these two papers:
+
+NOVA: A Log-structured File System for Hybrid Volatile/Non-volatile Main 
Memories
+Jian Xu and Steven Swanson
+In The 14th USENIX Conference on File and Storage Technologies (FAST '16)
+
+NOVA-Fortis: A Fault-Tolerant Non-Volatile Main Memory File System
+Jian Xu, Lu Zhang, Amirsaman Memaripour, Akshatha Gangadharaiah, Amit Borase,
+Tamires Brito Da Silva, Andy Rudoff and Steven Swanson
+In The 26th ACM Symposium on Operating Systems Principles (SOSP '17)
+
+This version of NOVA contains features from the FAST paper.
+NOVA-Fortis features, such as snapshot, metadata and data protection and 
replication
+are left for future submission.
+
+The main NOVA features include:
+
+  * POSIX semantics
+  * Direct access (DAX) to byte-addressable NVMM without page caching
+  * Per-CPU NVMM pool to maximize concurrency
+  * Strong consistency guarantees with 8-byte atomic stores
+
+
+Filesystem Design
+=================
+
+NOVA divides NVMM into several regions. NOVA's 512B superblock contains global
+file system information and the recovery inode. The recovery inode represents a
+special file that stores recovery information (e.g., the list of unallocated
+NVMM pages). NOVA divides its inode tables into per-CPU stripes. It also
+provides per-CPU journals for complex file operations that involve multiple
+inodes. The rest of the available NVMM stores logs and file data.
+
+NOVA is log-structured and stores a separate log for each inode to maximize
+concurrency and provide atomicity for operations that affect a single file. The
+logs only store metadata and comprise a linked list of 4 KB pages. Log entries
+are small – between 32 and 64 bytes. Logs are generally non-contiguous, and log
+pages may reside anywhere in NVMM.
+
+NOVA keeps copies of most file metadata in DRAM during normal
+operations, eliminating the need to access metadata in NVMM during reads.
+
+NOVA supports

[RFC v2 00/83] NOVA: a new file system for persistent memory

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

This is the second version of the RFC patch series that implements
NOVA (NOn-Volatile memory Accelerated file system), a new file system built for 
PMEM.

NOVA's goal is to provide a high performance, production-ready
file system tailored for byte-addressable non-volatile memories (e.g., NVDIMMs
and Intel's soon-to-be-released 3DXpoint DIMMs).
 
NOVA was developed at the Non-Volatile Systems Laboratory in the Computer
Science and Engineering Department at the University of California, San Diego.
Its primary authors are Andiry Xu, Lu Zhang, and Steven Swanson.
 
NOVA is stable enough to run complex applications, but there is substantial
work left to do.  This RFC is intended to gather feedback to guide its
development toward eventual inclusion upstream.
 
The patches are based on Linux 4.16-rc4.


Changes from v1:

* Remove snapshot, metadata replication and data parity for future submission.
  This significantly reduces complexity and LOC: 22129 -> 13834.

* Break down the code in a more reviewer-friendly way:
  The patchset starts with a simple skeleton and adds more features gradually.
  Each patch leaves the tree in a compilable and working state,
  and is self-contained and small, so easier to review.

* Fix bugs so that NOVA passes xfstests: https://github.com/NVSL/xfstests


Overview


NOVA is primarily a log-structured file system, but rather than maintain a
single global log for the entire file system, it maintains separate logs for
each inode.  NOVA breaks the logs into 4KB pages, which need not be
contiguous in memory.  The logs only contain metadata.

File data pages reside outside the log, and log entries for write operations
point to data pages they modify.  File modification can be done either
in place or by copy-on-write (COW) to provide atomic file updates.

For file operations that involve multiple inodes, NOVA uses small, fixed-size
redo logs to atomically append log entries to the logs of the inodes involved.

This structure keeps logs small and makes garbage collection very fast.  It also
enables enormous parallelism during recovery from an unclean unmount, since
threads can scan logs in parallel.

Documentation/filesystems/nova.txt contains some lower-level implementation and
usage information.  A more thorough discussion of NOVA's goals and design is
available in two papers:

NOVA: A Log-structured File system for Hybrid Volatile/Non-volatile Main 
Memories
http://cseweb.ucsd.edu/~swanson/papers/FAST2016NOVA.pdf
Jian Xu and Steven Swanson
Published in FAST 2016

NOVA-Fortis: A Fault-Tolerant Non-Volatile Main Memory File System
http://cseweb.ucsd.edu/~swanson/papers/SOSP2017-NOVAFortis.pdf
Jian Xu, Lu Zhang, Amirsaman Memaripour, Akshatha Gangadharaiah,
Amit Borase, Tamires Brito Da Silva, Andy Rudoff, Steven Swanson
Published in SOSP 2017

This version contains features from the FAST paper. We leave NOVA-Fortis
features for future submission.


Build and Run
=============

To build NOVA, build the kernel with PMEM (`CONFIG_BLK_DEV_PMEM`),
DAX (`CONFIG_FS_DAX`) and NOVA (`CONFIG_NOVA_FS`) support.  Install as usual.

NOVA runs on a pmem non-volatile memory region created by memmap kernel option.
For instance, adding 'memmap=16G!8G' to the kernel boot parameters will reserve
16GB memory starting from address 8GB, and the kernel will create a pmem0 
block device under the /dev directory.

After the OS has booted, initialize a NOVA instance with the following commands:

# modprobe nova
# mount -t NOVA -o init /dev/pmem0 /mnt/nova

The above commands create a NOVA instance on /dev/pmem0 and mount it on
/mnt/nova. Currently NOVA does not have mkfs or fsck support.


Performance
===========

Compared to other DAX file systems such as ext4-DAX and xfs-DAX,
NOVA provides fine-grained, byte-granularity metadata operations,
and it performs better in metadata-intensive and write-intensive applications.
NOVA also excels in the append-fsync access pattern, i.e. write-ahead logging,
which is very common in DBMS and key-value stores.

The following tests were performed on an Intel i7-3770K with 16GB DRAM
and 8GB PMEM emulated with DRAM. The kernel is 4.16-rc4 64bit on Ubuntu 16.04.
Performance may vary on different platforms.


Filebench throughput (ops/s):
            xfs-DAX ext4-DAX    NOVA
Fileserver  86971   177826      334166
Varmail     148032  288033      999794
Webserver   370245  370144      374130
Webproxy    315084  737544      927216

Webserver is read-intensive and all the file systems have similar performance.


SQLite test:
SQLite has four journaling modes:
Delete: delete the undo log file after transaction commit
Truncate: truncate the undo log file to zero after transaction commit
Persist: write a flag at the beginning of the log file after transaction commit
WAL: write-ahead logging

SQLite insert (transactions/s):
xfs-DAX ext4-DAX    NOVA
Del

[RFC v2 10/83] Add superblock integrity check.

2018-03-10 Thread Andiry Xu
From: Andiry Xu <jix...@cs.ucsd.edu>

Repair broken primary superblock with redundant superblock.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/super.c | 102 
 1 file changed, 102 insertions(+)

diff --git a/fs/nova/super.c b/fs/nova/super.c
index 552fe5d..e0e38ab 100644
--- a/fs/nova/super.c
+++ b/fs/nova/super.c
@@ -276,6 +276,21 @@ static bool nova_check_size(struct super_block *sb, 
unsigned long size)
return true;
 }
 
+static inline int nova_check_super_checksum(struct super_block *sb)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   u32 crc = 0;
+
+   // Check CRC but skip s_sum, which is the 4 bytes at the beginning
+   crc = nova_crc32c(~0, (__u8 *)sbi->nova_sb + sizeof(__le32),
+   sizeof(struct nova_super_block) - sizeof(__le32));
+
+   if (sbi->nova_sb->s_sum == cpu_to_le32(crc))
+   return 0;
+   else
+   return 1;
+}
+
 static inline void nova_sync_super(struct super_block *sb)
 {
struct nova_sb_info *sbi = NOVA_SB(sb);
@@ -293,6 +308,34 @@ static inline void nova_sync_super(struct super_block *sb)
PERSISTENT_BARRIER();
 }
 
+/* Update checksum for the DRAM copy */
+static inline void nova_update_super_crc(struct super_block *sb)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   u32 crc = 0;
+
+   sbi->nova_sb->s_wtime = cpu_to_le32(get_seconds());
+   sbi->nova_sb->s_sum = 0;
+   crc = nova_crc32c(~0, (__u8 *)sbi->nova_sb + sizeof(__le32),
+   sizeof(struct nova_super_block) - sizeof(__le32));
+   sbi->nova_sb->s_sum = cpu_to_le32(crc);
+}
+
+
+static inline void nova_update_mount_time(struct super_block *sb)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   u64 mnt_write_time;
+
+   mnt_write_time = (get_seconds() & 0x);
+   mnt_write_time = mnt_write_time | (mnt_write_time << 32);
+
+   sbi->nova_sb->s_mtime = cpu_to_le64(mnt_write_time);
+   nova_update_super_crc(sb);
+
+   nova_sync_super(sb);
+}
+
 static struct nova_inode *nova_init(struct super_block *sb,
  unsigned long size)
 {
@@ -328,6 +371,7 @@ static struct nova_inode *nova_init(struct super_block *sb,
sbi->nova_sb->s_blocksize = cpu_to_le32(blocksize);
sbi->nova_sb->s_magic = cpu_to_le32(NOVA_SUPER_MAGIC);
sbi->nova_sb->s_epoch_id = 0;
+   nova_update_super_crc(sb);
 
nova_sync_super(sb);
 
@@ -369,6 +413,54 @@ static void nova_root_check(struct super_block *sb, struct 
nova_inode *root_pi)
nova_warn("root is not a directory!\n");
 }
 
+/* Check super block magic and checksum */
+static int nova_check_super(struct super_block *sb,
+   struct nova_super_block *ps)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   int rc;
+
+   rc = memcpy_mcsafe(sbi->nova_sb, ps,
+   sizeof(struct nova_super_block));
+
+   if (rc < 0)
+   return rc;
+
+   if (le32_to_cpu(sbi->nova_sb->s_magic) != NOVA_SUPER_MAGIC)
+   return -EIO;
+
+   if (nova_check_super_checksum(sb))
+   return -EIO;
+
+   return 0;
+}
+
+static int nova_check_integrity(struct super_block *sb)
+{
+   struct nova_super_block *super = nova_get_super(sb);
+   struct nova_super_block *super_redund;
+   int rc;
+
+   super_redund = nova_get_redund_super(sb);
+
+   /* Do sanity checks on the superblock */
+   rc = nova_check_super(sb, super);
+   if (rc < 0) {
+   rc = nova_check_super(sb, super_redund);
+   if (rc < 0) {
+   nova_err(sb, "Can't find a valid nova partition\n");
+   return rc;
+   } else {
+   nova_warn("Error in super block: try to repair it with 
the other copy\n");
+   memcpy_to_pmem_nocache((void *)super, (void 
*)super_redund,
+   sizeof(struct nova_super_block));
+   PERSISTENT_BARRIER();
+   }
+   }
+
+   return 0;
+}
+
 static int nova_fill_super(struct super_block *sb, void *data, int silent)
 {
struct nova_sb_info *sbi = NULL;
@@ -446,6 +538,13 @@ static int nova_fill_super(struct super_block *sb, void 
*data, int silent)
goto setup_sb;
}
 
+   if (nova_check_integrity(sb) < 0) {
+   retval = -EINVAL;
+   nova_dbg("Memory contains invalid nova %x:%x\n",
+   le32_to_cpu(sbi->nova_sb->s_magic), NOVA_SUPER_MAGIC);
+   goto out;
+   }
+
blocksize = le32_to_cpu(sbi->nova_sb->s_blocksize);
nova_set_blocksize(sb, blocksize);
 
@@ -482,6 +581,9 @@ st

[RFC v2 10/83] Add superblock integrity check.

2018-03-10 Thread Andiry Xu
From: Andiry Xu 

Repair broken primary superblock with redundant superblock.

Signed-off-by: Andiry Xu 
---
 fs/nova/super.c | 102 
 1 file changed, 102 insertions(+)

diff --git a/fs/nova/super.c b/fs/nova/super.c
index 552fe5d..e0e38ab 100644
--- a/fs/nova/super.c
+++ b/fs/nova/super.c
@@ -276,6 +276,21 @@ static bool nova_check_size(struct super_block *sb, 
unsigned long size)
return true;
 }
 
+static inline int nova_check_super_checksum(struct super_block *sb)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   u32 crc = 0;
+
+   // Check CRC but skip s_sum, which is the 4 bytes at the beginning
+   crc = nova_crc32c(~0, (__u8 *)sbi->nova_sb + sizeof(__le32),
+   sizeof(struct nova_super_block) - sizeof(__le32));
+
+   if (sbi->nova_sb->s_sum == cpu_to_le32(crc))
+   return 0;
+   else
+   return 1;
+}
+
 static inline void nova_sync_super(struct super_block *sb)
 {
struct nova_sb_info *sbi = NOVA_SB(sb);
@@ -293,6 +308,34 @@ static inline void nova_sync_super(struct super_block *sb)
PERSISTENT_BARRIER();
 }
 
+/* Update checksum for the DRAM copy */
+static inline void nova_update_super_crc(struct super_block *sb)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   u32 crc = 0;
+
+   sbi->nova_sb->s_wtime = cpu_to_le32(get_seconds());
+   sbi->nova_sb->s_sum = 0;
+   crc = nova_crc32c(~0, (__u8 *)sbi->nova_sb + sizeof(__le32),
+   sizeof(struct nova_super_block) - sizeof(__le32));
+   sbi->nova_sb->s_sum = cpu_to_le32(crc);
+}
+
+
+static inline void nova_update_mount_time(struct super_block *sb)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   u64 mnt_write_time;
+
+   mnt_write_time = (get_seconds() & 0x);
+   mnt_write_time = mnt_write_time | (mnt_write_time << 32);
+
+   sbi->nova_sb->s_mtime = cpu_to_le64(mnt_write_time);
+   nova_update_super_crc(sb);
+
+   nova_sync_super(sb);
+}
+
 static struct nova_inode *nova_init(struct super_block *sb,
  unsigned long size)
 {
@@ -328,6 +371,7 @@ static struct nova_inode *nova_init(struct super_block *sb,
sbi->nova_sb->s_blocksize = cpu_to_le32(blocksize);
sbi->nova_sb->s_magic = cpu_to_le32(NOVA_SUPER_MAGIC);
sbi->nova_sb->s_epoch_id = 0;
+   nova_update_super_crc(sb);
 
nova_sync_super(sb);
 
@@ -369,6 +413,54 @@ static void nova_root_check(struct super_block *sb, struct 
nova_inode *root_pi)
nova_warn("root is not a directory!\n");
 }
 
+/* Check super block magic and checksum */
+static int nova_check_super(struct super_block *sb,
+   struct nova_super_block *ps)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   int rc;
+
+   rc = memcpy_mcsafe(sbi->nova_sb, ps,
+   sizeof(struct nova_super_block));
+
+   if (rc < 0)
+   return rc;
+
+   if (le32_to_cpu(sbi->nova_sb->s_magic) != NOVA_SUPER_MAGIC)
+   return -EIO;
+
+   if (nova_check_super_checksum(sb))
+   return -EIO;
+
+   return 0;
+}
+
+static int nova_check_integrity(struct super_block *sb)
+{
+   struct nova_super_block *super = nova_get_super(sb);
+   struct nova_super_block *super_redund;
+   int rc;
+
+   super_redund = nova_get_redund_super(sb);
+
+   /* Do sanity checks on the superblock */
+   rc = nova_check_super(sb, super);
+   if (rc < 0) {
+   rc = nova_check_super(sb, super_redund);
+   if (rc < 0) {
+   nova_err(sb, "Can't find a valid nova partition\n");
+   return rc;
+   } else {
+   nova_warn("Error in super block: try to repair it with 
the other copy\n");
+   memcpy_to_pmem_nocache((void *)super, (void 
*)super_redund,
+   sizeof(struct nova_super_block));
+   PERSISTENT_BARRIER();
+   }
+   }
+
+   return 0;
+}
+
 static int nova_fill_super(struct super_block *sb, void *data, int silent)
 {
struct nova_sb_info *sbi = NULL;
@@ -446,6 +538,13 @@ static int nova_fill_super(struct super_block *sb, void 
*data, int silent)
goto setup_sb;
}
 
+   if (nova_check_integrity(sb) < 0) {
+   retval = -EINVAL;
+   nova_dbg("Memory contains invalid nova %x:%x\n",
+   le32_to_cpu(sbi->nova_sb->s_magic), NOVA_SUPER_MAGIC);
+   goto out;
+   }
+
blocksize = le32_to_cpu(sbi->nova_sb->s_blocksize);
nova_set_blocksize(sb, blocksize);
 
@@ -482,6 +581,9 @@ static int nova_fill_super(struct super_block *sb, void 
*

Filebench failure on ramdisk with Ext4-DAX

2015-07-07 Thread Andiry Xu
Hi,

I am running into failures when running filebench on a ramdisk (/dev/ram0)
with Ext4-DAX.
The kernel version is 4.0, and I also verified it occurs on 4.2-rc1.

The issue reproduction steps:

// Set ramdisk size to 2GB
# mkfs.ext4 /dev/ram0
# mount -o dax /dev/ram0 /mnt/ramdisk
# filebench
filebench> load fileserver
filebench> set $dir=/mnt/ramdisk
filebench> run 30

And filebench fails in a few seconds like this:

8163: 22.992: Failed to pre-allocate file
/mnt/ramdisk/bigfileset/0001/0006/0001/0024/0005/0002/0006:
No such file or directory on line 128
 8163: 22.992: Failed to create filesets on line 128

Or like this:

8141: 16.372: Failed to write 51967 bytes on fd 23: Success
 8151: 16.372: Failed to write 136735 bytes on fd 18: Success
 8148: 16.372: Failed to write 123317 bytes on fd 31: Success
 8141: 16.381: filereaderthread-36: flowop wrtfile1-1 failed
 8151: 16.381: filereaderthread-46: flowop wrtfile1-1 failed
 8148: 16.381: filereaderthread-43: flowop wrtfile1-1 failed
 8098: 16.521: Run took 1 seconds...
 8098: 16.521: NO VALID RESULTS! Filebench run terminated prematurely on line 65
 8098: 16.521: Shutting down processes

Sometimes it succeeds, but the chance is low. The failure rate is 80%+.

Note:
The issue does not occur with normal Ext4.
The issue does not occur with Ext4-DAX on pmem driver (from 01org/prd).

The only significant difference between brd.c and pmem.c is that brd.c
uses alloc_page() and pmem.c reserves a memory range and uses ioremap()
to get virtual address. I assume that the memcpy
operation(copy_from/to_user) directly between user buffer and page by
alloc_page() does not work correctly somehow. I wonder if this is a
bug? If it is, how to fix it? Thanks.

Thanks,
Andiry
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Filebench failure on ramdisk with Ext4-DAX

2015-07-07 Thread Andiry Xu
Hi,

I am running into failures when running filebench on a ramdisk (/dev/ram0)
with Ext4-DAX.
The kernel version is 4.0, and I also verified it occurs on 4.2-rc1.

The issue reproduction steps:

// Set ramdisk size to 2GB
# mkfs.ext4 /dev/ram0
# mount -o dax /dev/ram0 /mnt/ramdisk
# filebench
filebench> load fileserver
filebench> set $dir=/mnt/ramdisk
filebench> run 30

And filebench fails in a few seconds like this:

8163: 22.992: Failed to pre-allocate file
/mnt/ramdisk/bigfileset/0001/0006/0001/0024/0005/0002/0006:
No such file or directory on line 128
 8163: 22.992: Failed to create filesets on line 128

Or like this:

8141: 16.372: Failed to write 51967 bytes on fd 23: Success
 8151: 16.372: Failed to write 136735 bytes on fd 18: Success
 8148: 16.372: Failed to write 123317 bytes on fd 31: Success
 8141: 16.381: filereaderthread-36: flowop wrtfile1-1 failed
 8151: 16.381: filereaderthread-46: flowop wrtfile1-1 failed
 8148: 16.381: filereaderthread-43: flowop wrtfile1-1 failed
 8098: 16.521: Run took 1 seconds...
 8098: 16.521: NO VALID RESULTS! Filebench run terminated prematurely on line 65
 8098: 16.521: Shutting down processes

Sometimes it succeeds, but the chance is low. The failure rate is 80%+.

Note:
The issue does not occur with normal Ext4.
The issue does not occur with Ext4-DAX on pmem driver (from 01org/prd).

The only significant difference between brd.c and pmem.c is that brd.c
uses alloc_page() and pmem.c reserves a memory range and uses ioremap()
to get virtual address. I assume that the memcpy
operation(copy_from/to_user) directly between user buffer and page by
alloc_page() does not work correctly somehow. I wonder if this is a
bug? If it is, how to fix it? Thanks.

Thanks,
Andiry
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


<    1   2   3   >