from:"Andiry Xu"

Re: [RFC v2 06/83] Add inode get/read methods.

2018-04-23 Thread Andiry Xu

On Sun, Apr 22, 2018 at 11:12 PM, Darrick J. Wong
<darrick.w...@oracle.com> wrote:
> [haaa, I finally found time to read more of these]
>
> On Sat, Mar 10, 2018 at 10:17:47AM -0800, Andiry Xu wrote:
>> From: Andiry Xu <jix...@cs.ucsd.edu>
>>
>> These routines are incomplete and currently only support reserved inodes,
>> whose addresses are fixed. This is necessary for fill_super to work.
>> File/dir operations are left NULL.
>>
>> Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
>> ---
>>  fs/nova/inode.c | 176 
>> 
>>  fs/nova/inode.h |   3 +
>>  2 files changed, 179 insertions(+)
>>  create mode 100644 fs/nova/inode.c
>>
>> diff --git a/fs/nova/inode.c b/fs/nova/inode.c
>> new file mode 100644
>> index 000..bfdc5dc
>> --- /dev/null
>> +++ b/fs/nova/inode.c
>> @@ -0,0 +1,176 @@
>> +/*
>> + * BRIEF DESCRIPTION
>> + *
>> + * Inode methods (allocate/free/read/write).
>> + *
>> + * Copyright 2015-2016 Regents of the University of California,
>> + * UCSD Non-Volatile Systems Lab, Andiry Xu <jix...@cs.ucsd.edu>
>> + * Copyright 2012-2013 Intel Corporation
>> + * Copyright 2009-2011 Marco Stornelli <marco.storne...@gmail.com>
>> + * Copyright 2003 Sony Corporation
>> + * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
>> + * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
>> + * This file is licensed under the terms of the GNU General Public
>> + * License version 2. This program is licensed "as is" without any
>> + * warranty of any kind, whether express or implied.
>> + */
>> +
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include "nova.h"
>> +#include "inode.h"
>> +
>> +unsigned int blk_type_to_shift[NOVA_BLOCK_TYPE_MAX] = {12, 21, 30};
>> +uint32_t blk_type_to_size[NOVA_BLOCK_TYPE_MAX] = {0x1000, 0x20, 
>> 0x4000};
>> +
>> +void nova_set_inode_flags(struct inode *inode, struct nova_inode *pi,
>> + unsigned int flags)
>> +{
>> + inode->i_flags &=
>> + ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC);
>> + if (flags & FS_SYNC_FL)
>> + inode->i_flags |= S_SYNC;
>> + if (flags & FS_APPEND_FL)
>> + inode->i_flags |= S_APPEND;
>> + if (flags & FS_IMMUTABLE_FL)
>> + inode->i_flags |= S_IMMUTABLE;
>> + if (flags & FS_NOATIME_FL)
>> + inode->i_flags |= S_NOATIME;
>> + if (flags & FS_DIRSYNC_FL)
>> + inode->i_flags |= S_DIRSYNC;
>> + if (!pi->i_xattr)
>> + inode_has_no_xattr(inode);
>> + inode->i_flags |= S_DAX;
>> +}
>> +
>> +/* copy persistent state to struct inode */
>> +static int nova_read_inode(struct super_block *sb, struct inode *inode,
>> + u64 pi_addr)
>> +{
>> + struct nova_inode_info *si = NOVA_I(inode);
>> + struct nova_inode *pi, fake_pi;
>> + struct nova_inode_info_header *sih = >header;
>> + int ret = -EIO;
>> + unsigned long ino;
>> +
>> + ret = nova_get_reference(sb, pi_addr, _pi,
>> + (void **), sizeof(struct nova_inode));
>> + if (ret) {
>> + nova_dbg("%s: read pi @ 0x%llx failed\n",
>> + __func__, pi_addr);
>> + goto bad_inode;
>> + }
>> +
>> + inode->i_mode = sih->i_mode;
>
> Hm, do you validate the on-pmem metadata as it's read in?  What if
> i_mode is garbage?
>

I have checksums for the inode and all metadata structures in the
NOVA-Fortis code. I removed them from this patchset to make the code
shorter and simpler.

>> + i_uid_write(inode, le32_to_cpu(pi->i_uid));
>> + i_gid_write(inode, le32_to_cpu(pi->i_gid));
>> +//   set_nlink(inode, le16_to_cpu(pi->i_links_count));
>
> C++ comment?
>

Will fix.

>> + inode->i_generation = le32_to_cpu(pi->i_generation);
>> + nova_set_inode_flags(inode, pi, le32_to_cpu(pi->i_flags));
>> + ino = inode->i_ino;
>> +
>> + /* check if the inode is active. */
>> + if (inode->i_mode == 0 || pi->deleted == 1) {
>> + /* this inode is deleted */
>> + ret = -ESTALE;
>> + goto bad_inode;
>> + }
>> +
>> +

Re: [RFC v2 06/83] Add inode get/read methods.

2018-04-23 Thread Andiry Xu

On Sun, Apr 22, 2018 at 11:12 PM, Darrick J. Wong
 wrote:
> [haaa, I finally found time to read more of these]
>
> On Sat, Mar 10, 2018 at 10:17:47AM -0800, Andiry Xu wrote:
>> From: Andiry Xu 
>>
>> These routines are incomplete and currently only support reserved inodes,
>> whose addresses are fixed. This is necessary for fill_super to work.
>> File/dir operations are left NULL.
>>
>> Signed-off-by: Andiry Xu 
>> ---
>>  fs/nova/inode.c | 176 
>> 
>>  fs/nova/inode.h |   3 +
>>  2 files changed, 179 insertions(+)
>>  create mode 100644 fs/nova/inode.c
>>
>> diff --git a/fs/nova/inode.c b/fs/nova/inode.c
>> new file mode 100644
>> index 000..bfdc5dc
>> --- /dev/null
>> +++ b/fs/nova/inode.c
>> @@ -0,0 +1,176 @@
>> +/*
>> + * BRIEF DESCRIPTION
>> + *
>> + * Inode methods (allocate/free/read/write).
>> + *
>> + * Copyright 2015-2016 Regents of the University of California,
>> + * UCSD Non-Volatile Systems Lab, Andiry Xu 
>> + * Copyright 2012-2013 Intel Corporation
>> + * Copyright 2009-2011 Marco Stornelli 
>> + * Copyright 2003 Sony Corporation
>> + * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
>> + * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
>> + * This file is licensed under the terms of the GNU General Public
>> + * License version 2. This program is licensed "as is" without any
>> + * warranty of any kind, whether express or implied.
>> + */
>> +
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include "nova.h"
>> +#include "inode.h"
>> +
>> +unsigned int blk_type_to_shift[NOVA_BLOCK_TYPE_MAX] = {12, 21, 30};
>> +uint32_t blk_type_to_size[NOVA_BLOCK_TYPE_MAX] = {0x1000, 0x20, 
>> 0x4000};
>> +
>> +void nova_set_inode_flags(struct inode *inode, struct nova_inode *pi,
>> + unsigned int flags)
>> +{
>> + inode->i_flags &=
>> + ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC);
>> + if (flags & FS_SYNC_FL)
>> + inode->i_flags |= S_SYNC;
>> + if (flags & FS_APPEND_FL)
>> + inode->i_flags |= S_APPEND;
>> + if (flags & FS_IMMUTABLE_FL)
>> + inode->i_flags |= S_IMMUTABLE;
>> + if (flags & FS_NOATIME_FL)
>> + inode->i_flags |= S_NOATIME;
>> + if (flags & FS_DIRSYNC_FL)
>> + inode->i_flags |= S_DIRSYNC;
>> + if (!pi->i_xattr)
>> + inode_has_no_xattr(inode);
>> + inode->i_flags |= S_DAX;
>> +}
>> +
>> +/* copy persistent state to struct inode */
>> +static int nova_read_inode(struct super_block *sb, struct inode *inode,
>> + u64 pi_addr)
>> +{
>> + struct nova_inode_info *si = NOVA_I(inode);
>> + struct nova_inode *pi, fake_pi;
>> + struct nova_inode_info_header *sih = >header;
>> + int ret = -EIO;
>> + unsigned long ino;
>> +
>> + ret = nova_get_reference(sb, pi_addr, _pi,
>> + (void **), sizeof(struct nova_inode));
>> + if (ret) {
>> + nova_dbg("%s: read pi @ 0x%llx failed\n",
>> + __func__, pi_addr);
>> + goto bad_inode;
>> + }
>> +
>> + inode->i_mode = sih->i_mode;
>
> Hm, do you validate the on-pmem metadata as it's read in?  What if
> i_mode is garbage?
>

I have checksums for the inode and all metadata structures in the
NOVA-Fortis code. I removed them from this patchset to make the code
shorter and simpler.

>> + i_uid_write(inode, le32_to_cpu(pi->i_uid));
>> + i_gid_write(inode, le32_to_cpu(pi->i_gid));
>> +//   set_nlink(inode, le16_to_cpu(pi->i_links_count));
>
> C++ comment?
>

Will fix.

>> + inode->i_generation = le32_to_cpu(pi->i_generation);
>> + nova_set_inode_flags(inode, pi, le32_to_cpu(pi->i_flags));
>> + ino = inode->i_ino;
>> +
>> + /* check if the inode is active. */
>> + if (inode->i_mode == 0 || pi->deleted == 1) {
>> + /* this inode is deleted */
>> + ret = -ESTALE;
>> + goto bad_inode;
>> + }
>> +
>> + inode->i_blocks = sih->i_blocks;
>
> Not le64_to_cpu(sih->i_blocks)?  Or is that somewhere else I'm
> missing...
>

sih i

Re: [RFC v2 83/83] Sysfs support.

2018-03-22 Thread Andiry Xu

On Thu, Mar 22, 2018 at 8:00 AM, David Sterba <dste...@suse.cz> wrote:
> On Sat, Mar 10, 2018 at 10:19:04AM -0800, Andiry Xu wrote:
>> From: Andiry Xu <jix...@cs.ucsd.edu>
>>
>> Sysfs support allows user to get/post information of running NOVA instance.
>> After mount, NOVA creates four entries under proc directory
>> /proc/fs/nova/pmem#/:
>>
>> timing_stats  IO_stats  allocator  gc
>>
>> Show NOVA file operation timing statistics:
>> cat /proc/fs/NOVA/pmem#/timing_stats
>>
>> Clear timing statistics:
>> echo 1 > /proc/fs/NOVA/pmem#/timing_stats
>>
>> Show NOVA I/O statistics:
>> cat /proc/fs/NOVA/pmem#/IO_stats
>>
>> Clear I/O statistics:
>> echo 1 > /proc/fs/NOVA/pmem#/IO_stats
>>
>> Show NOVA allocator information:
>> cat /proc/fs/NOVA/pmem#/allocator
>>
>> Manual garbage collection:
>> echo #inode_number > /proc/fs/NOVA/pmem#/gc
>
> IIRC no new entries should be added to /proc, /sys is supposed to be
> used. I can't find it documented though, so you'd better check with
> sysfs people.

Thanks. I will try to switch to sysfs.

Thanks,
Andiry

Re: [RFC v2 83/83] Sysfs support.

2018-03-22 Thread Andiry Xu

On Thu, Mar 22, 2018 at 8:00 AM, David Sterba  wrote:
> On Sat, Mar 10, 2018 at 10:19:04AM -0800, Andiry Xu wrote:
>> From: Andiry Xu 
>>
>> Sysfs support allows user to get/post information of running NOVA instance.
>> After mount, NOVA creates four entries under proc directory
>> /proc/fs/nova/pmem#/:
>>
>> timing_stats  IO_stats  allocator  gc
>>
>> Show NOVA file operation timing statistics:
>> cat /proc/fs/NOVA/pmem#/timing_stats
>>
>> Clear timing statistics:
>> echo 1 > /proc/fs/NOVA/pmem#/timing_stats
>>
>> Show NOVA I/O statistics:
>> cat /proc/fs/NOVA/pmem#/IO_stats
>>
>> Clear I/O statistics:
>> echo 1 > /proc/fs/NOVA/pmem#/IO_stats
>>
>> Show NOVA allocator information:
>> cat /proc/fs/NOVA/pmem#/allocator
>>
>> Manual garbage collection:
>> echo #inode_number > /proc/fs/NOVA/pmem#/gc
>
> IIRC no new entries should be added to /proc, /sys is supposed to be
> used. I can't find it documented though, so you'd better check with
> sysfs people.

Thanks. I will try to switch to sysfs.

Thanks,
Andiry

Re: [RFC v2 01/83] Introduction and documentation of NOVA filesystem.

2018-03-19 Thread Andiry Xu

Thanks for all the comments.

On Mon, Mar 19, 2018 at 1:43 PM, Randy Dunlap <rdun...@infradead.org> wrote:
> On 03/10/2018 10:17 AM, Andiry Xu wrote:
>> From: Andiry Xu <jix...@cs.ucsd.edu>
>>
>> NOVA is a log-structured file system tailored for byte-addressable 
>> non-volatile memories.
>> It was designed and developed at the Non-Volatile Systems Laboratory in the 
>> Computer
>> Science and Engineering Department at the University of California, San 
>> Diego.
>> Its primary authors are Andiry Xu <jix...@eng.ucsd.edu>, Lu Zhang
>> <l...@eng.ucsd.edu>, and Steven Swanson <swan...@eng.ucsd.edu>.
>>
>> These two papers provide a detailed, high-level description of NOVA's design 
>> goals and approach:
>>
>>NOVA: A Log-structured File system for Hybrid Volatile/Non-volatile Main 
>> Memories
>>In The 14th USENIX Conference on File and Storage Technologies (FAST '16)
>>(http://cseweb.ucsd.edu/~swanson/papers/FAST2016NOVA.pdf)
>>
>>NOVA-Fortis: A Fault-Tolerant Non-Volatile Main Memory File System
>>In The 26th ACM Symposium on Operating Systems Principles (SOSP '17)
>>(http://cseweb.ucsd.edu/~swanson/papers/SOSP2017-NOVAFortis.pdf)
>>
>> This patchset contains features from the FAST paper. We leave NOVA-Fortis 
>> features,
>> such as snapshot, metadata and data replication and RAID parity for
>> future submission.
>>
>> Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
>> ---
>>  Documentation/filesystems/00-INDEX |   2 +
>>  Documentation/filesystems/nova.txt | 498 
>> +
>>  MAINTAINERS|   8 +
>>  3 files changed, 508 insertions(+)
>>  create mode 100644 Documentation/filesystems/nova.txt
>
>> diff --git a/Documentation/filesystems/nova.txt 
>> b/Documentation/filesystems/nova.txt
>> new file mode 100644
>> index 000..4728f50
>> --- /dev/null
>> +++ b/Documentation/filesystems/nova.txt
>> @@ -0,0 +1,498 @@
>> +The NOVA Filesystem
>> +===================
>> +
>> +NOn-Volatile memory Accelerated file system (NOVA) is a DAX file system
>> +designed to provide a high performance and production-ready file system
>> +tailored for byte-addressable non-volatile memories (e.g., NVDIMMs
>> +and Intel's soon-to-be-released 3DXPoint DIMMs).
>> +NOVA combines design elements from many other file systems
>> +and adapts conventional log-structured file system techniques to
>> +exploit the fast random access that NVMs provide. In particular, NOVA 
>> maintains
>> +separate logs for each inode to improve concurrency, and stores file data
>> +outside the log to minimize log size and reduce garbage collection costs. 
>> NOVA's
>> +logs provide metadata and data atomicity and focus on simplicity and
>> +reliability, keeping complex metadata structures in DRAM to accelerate 
>> lookup
>> +operations.
>> +
>> +NOVA was developed by the Non-Volatile Systems Laboratory (NVSL) in
>> +the Computer Science and Engineering Department at the University of
>> +California, San Diego.
>> +
>> +A more thorough discussion of NOVA's design is avaialable in these two 
>> papers:
>
>   available
>
>> +
>> +NOVA: A Log-structured File System for Hybrid Volatile/Non-volatile Main 
>> Memories
>> +Jian Xu and Steven Swanson
>> +In The 14th USENIX Conference on File and Storage Technologies (FAST '16)
>> +
>> +NOVA-Fortis: A Fault-Tolerant Non-Volatile Main Memory File System
>> +Jian Xu, Lu Zhang, Amirsaman Memaripour, Akshatha Gangadharaiah, Amit 
>> Borase,
>> +Tamires Brito Da Silva, Andy Rudoff and Steven Swanson
>> +In The 26th ACM Symposium on Operating Systems Principles (SOSP '17)
>> +
>> +This version of NOVA contains features from the FAST paper.
>> +NOVA-Fortis features, such as snapshot, metadata and data protection and 
>> replication
>> +are left for future submission.
>> +
>> +The main NOVA features include:
>> +
>> +  * POSIX semantics
>> +  * Directly access (DAX) byte-addressable NVMM without page caching
>> +  * Per-CPU NVMM pool to maximize concurrency
>> +  * Strong consistency guarantees with 8-byte atomic stores
>> +
>> +
>> +Filesystem Design
>> +=================
>> +
>> +NOVA divides NVMM into several regions. NOVA's 512B superblock contains 
>> global
>
> (prefer:) 512-byte
>
>> +file system information

Re: [RFC v2 01/83] Introduction and documentation of NOVA filesystem.

2018-03-19 Thread Andiry Xu

Thanks for all the comments.

On Mon, Mar 19, 2018 at 1:43 PM, Randy Dunlap  wrote:
> On 03/10/2018 10:17 AM, Andiry Xu wrote:
>> From: Andiry Xu 
>>
>> NOVA is a log-structured file system tailored for byte-addressable 
>> non-volatile memories.
>> It was designed and developed at the Non-Volatile Systems Laboratory in the 
>> Computer
>> Science and Engineering Department at the University of California, San 
>> Diego.
>> Its primary authors are Andiry Xu , Lu Zhang
>> , and Steven Swanson .
>>
>> These two papers provide a detailed, high-level description of NOVA's design 
>> goals and approach:
>>
>>NOVA: A Log-structured File system for Hybrid Volatile/Non-volatile Main 
>> Memories
>>In The 14th USENIX Conference on File and Storage Technologies (FAST '16)
>>(http://cseweb.ucsd.edu/~swanson/papers/FAST2016NOVA.pdf)
>>
>>NOVA-Fortis: A Fault-Tolerant Non-Volatile Main Memory File System
>>In The 26th ACM Symposium on Operating Systems Principles (SOSP '17)
>>(http://cseweb.ucsd.edu/~swanson/papers/SOSP2017-NOVAFortis.pdf)
>>
>> This patchset contains features from the FAST paper. We leave NOVA-Fortis 
>> features,
>> such as snapshot, metadata and data replication and RAID parity for
>> future submission.
>>
>> Signed-off-by: Andiry Xu 
>> ---
>>  Documentation/filesystems/00-INDEX |   2 +
>>  Documentation/filesystems/nova.txt | 498 
>> +
>>  MAINTAINERS|   8 +
>>  3 files changed, 508 insertions(+)
>>  create mode 100644 Documentation/filesystems/nova.txt
>
>> diff --git a/Documentation/filesystems/nova.txt 
>> b/Documentation/filesystems/nova.txt
>> new file mode 100644
>> index 000..4728f50
>> --- /dev/null
>> +++ b/Documentation/filesystems/nova.txt
>> @@ -0,0 +1,498 @@
>> +The NOVA Filesystem
>> +===================
>> +
>> +NOn-Volatile memory Accelerated file system (NOVA) is a DAX file system
>> +designed to provide a high performance and production-ready file system
>> +tailored for byte-addressable non-volatile memories (e.g., NVDIMMs
>> +and Intel's soon-to-be-released 3DXPoint DIMMs).
>> +NOVA combines design elements from many other file systems
>> +and adapts conventional log-structured file system techniques to
>> +exploit the fast random access that NVMs provide. In particular, NOVA 
>> maintains
>> +separate logs for each inode to improve concurrency, and stores file data
>> +outside the log to minimize log size and reduce garbage collection costs. 
>> NOVA's
>> +logs provide metadata and data atomicity and focus on simplicity and
>> +reliability, keeping complex metadata structures in DRAM to accelerate 
>> lookup
>> +operations.
>> +
>> +NOVA was developed by the Non-Volatile Systems Laboratory (NVSL) in
>> +the Computer Science and Engineering Department at the University of
>> +California, San Diego.
>> +
>> +A more thorough discussion of NOVA's design is avaialable in these two 
>> papers:
>
>   available
>
>> +
>> +NOVA: A Log-structured File System for Hybrid Volatile/Non-volatile Main 
>> Memories
>> +Jian Xu and Steven Swanson
>> +In The 14th USENIX Conference on File and Storage Technologies (FAST '16)
>> +
>> +NOVA-Fortis: A Fault-Tolerant Non-Volatile Main Memory File System
>> +Jian Xu, Lu Zhang, Amirsaman Memaripour, Akshatha Gangadharaiah, Amit 
>> Borase,
>> +Tamires Brito Da Silva, Andy Rudoff and Steven Swanson
>> +In The 26th ACM Symposium on Operating Systems Principles (SOSP '17)
>> +
>> +This version of NOVA contains features from the FAST paper.
>> +NOVA-Fortis features, such as snapshot, metadata and data protection and 
>> replication
>> +are left for future submission.
>> +
>> +The main NOVA features include:
>> +
>> +  * POSIX semantics
>> +  * Directly access (DAX) byte-addressable NVMM without page caching
>> +  * Per-CPU NVMM pool to maximize concurrency
>> +  * Strong consistency guarantees with 8-byte atomic stores
>> +
>> +
>> +Filesystem Design
>> +=================
>> +
>> +NOVA divides NVMM into several regions. NOVA's 512B superblock contains 
>> global
>
> (prefer:) 512-byte
>
>> +file system information and the recovery inode. The recovery inode 
>> represents a
>> +special file that stores recovery information (e.g., the list of unallocated
>> +

Re: [RFC v2 05/83] Add NOVA filesystem definitions and useful helper routines.

2018-03-19 Thread Andiry Xu

On Mon, Mar 19, 2018 at 1:30 PM, Eric Biggers <ebigge...@gmail.com> wrote:
> On Mon, Mar 19, 2018 at 12:39:55PM -0700, Andiry Xu wrote:
>> On Sun, Mar 11, 2018 at 12:22 PM, Eric Biggers <ebigge...@gmail.com> wrote:
>> > On Sun, Mar 11, 2018 at 02:00:13PM +0200, Nikolay Borisov wrote:
>> >> [Adding Herbert Xu to CC since he is the maintainer of the crypto subsys
>> >> maintainer]
>> >>
>> >> On 10.03.2018 20:17, Andiry Xu wrote:
>> >> 
>> >>
>> >> > +static inline u32 nova_crc32c(u32 crc, const u8 *data, size_t len)
>> >> > +{
>> >> > +   u8 *ptr = (u8 *) data;
>> >> > +   u64 acc = crc; /* accumulator, crc32c value in lower 32b */
>> >> > +   u32 csum;
>> >> > +
>> >> > +   /* x86 instruction crc32 is part of SSE-4.2 */
>> >> > +   if (static_cpu_has(X86_FEATURE_XMM4_2)) {
>> >> > +   /* This inline assembly implementation should be equivalent
>> >> > +* to the kernel's crc32c_intel_le_hw() function used by
>> >> > +* crc32c(), but this performs better on test machines.
>> >> > +*/
>> >> > +   while (len > 8) {
>> >> > +   asm volatile(/* 64b quad words */
>> >> > +   "crc32q (%1), %0"
>> >> > +   : "=r" (acc)
>> >> > +   : "r"  (ptr), "0" (acc)
>> >> > +   );
>> >> > +   ptr += 8;
>> >> > +   len -= 8;
>> >> > +   }
>> >> > +
>> >> > +   while (len > 0) {
>> >> > +   asm volatile(/* trailing bytes */
>> >> > +   "crc32b (%1), %0"
>> >> > +   : "=r" (acc)
>> >> > +   : "r"  (ptr), "0" (acc)
>> >> > +   );
>> >> > +   ptr++;
>> >> > +   len--;
>> >> > +   }
>> >> > +
>> >> > +   csum = (u32) acc;
>> >> > +   } else {
>> >> > +   /* The kernel's crc32c() function should also detect and 
>> >> > use the
>> >> > +* crc32 instruction of SSE-4.2. But calling in to this 
>> >> > function
>> >> > +* is about 3x to 5x slower than the inline assembly 
>> >> > version on
>> >> > +* some test machines.
>> >>
>> >> That is really odd. Did you try to characterize why this is the case? Is
>> >> it purely the overhead of dispatching to the correct backend function?
>> >> That's a rather big performance hit.
>> >>
>> >> > +*/
>> >> > +   csum = crc32c(crc, data, len);
>> >> > +   }
>> >> > +
>> >> > +   return csum;
>> >> > +}
>> >> > +
>> >
>> > Are you sure that CONFIG_CRYPTO_CRC32C_INTEL was enabled during your tests 
>> > and
>> > that the accelerated version was being called?  Or, perhaps 
>> > CRC32C_PCL_BREAKEVEN
>> > (defined in arch/x86/crypto/crc32c-intel_glue.c) needs to be adjusted.  
>> > Please
>> > don't hack around performance problems like this; if they exist, they need 
>> > to be
>> > fixed for everyone.
>> >
>>
>> I have performed the crc32c test on a Xeon X5647 at 2.93GHz, 14G DDR3
>> memory at 1066MHz platform.
>> You are right that enabling CONFIG_CRYPTO_CRC32C_INTEL improves the
>> performance significantly. nova_crc32c() is still slightly faster than
>> crc32c() with the flag enabled.
>>
>> Result numbers are as follows: data size in bytes, latency in ns, column
>> 3 is crc32c() with  CONFIG_CRYPTO_CRC32C_INTEL enabled and column 4
>> disabled.
>>
>> data size (bytes)    nova_crc32c()    crc32c() -enabled    crc32c() -disabled
>> 64                   19               21                   56
>> 128                  28               29                   99
>> 256                  46               43                   182
>> 512                  82               149                  354
>> 1024                 157              232                  728
>> 2048                 305              415                  1440
>> 4096                 603              725                  2869
>>
>
> Probably CRC32C_PCL_BREAKEVEN needs to be adjusted for that CPU, as I 
> suggested
> may be the case; notice that your measured speeds are about the same before 
> 512
> (CRC32C_PCL_BREAKEVEN) bytes, but the crypto API version is slower at >= 512
> bytes.   It would be possible to set the breakeven point in
> crc32c_intel_mod_init() depending on the CPU.  Again, if the performance is 
> not
> good enough you need to fix it for everyone, not hack around it.
>

We verified that, with CRC32C_PCL_BREAKEVEN set to 8192, the
performance difference between nova_crc32c() and the kernel's crc32c() is
negligible. Thanks for the comments, and I will use kernel's crc32c()
in the next version.

Thanks,
Andiry

Re: [RFC v2 05/83] Add NOVA filesystem definitions and useful helper routines.

2018-03-19 Thread Andiry Xu

On Mon, Mar 19, 2018 at 1:30 PM, Eric Biggers  wrote:
> On Mon, Mar 19, 2018 at 12:39:55PM -0700, Andiry Xu wrote:
>> On Sun, Mar 11, 2018 at 12:22 PM, Eric Biggers  wrote:
>> > On Sun, Mar 11, 2018 at 02:00:13PM +0200, Nikolay Borisov wrote:
>> >> [Adding Herbert Xu to CC since he is the maintainer of the crypto subsys
>> >> maintainer]
>> >>
>> >> On 10.03.2018 20:17, Andiry Xu wrote:
>> >> 
>> >>
>> >> > +static inline u32 nova_crc32c(u32 crc, const u8 *data, size_t len)
>> >> > +{
>> >> > +   u8 *ptr = (u8 *) data;
>> >> > +   u64 acc = crc; /* accumulator, crc32c value in lower 32b */
>> >> > +   u32 csum;
>> >> > +
>> >> > +   /* x86 instruction crc32 is part of SSE-4.2 */
>> >> > +   if (static_cpu_has(X86_FEATURE_XMM4_2)) {
>> >> > +   /* This inline assembly implementation should be equivalent
>> >> > +* to the kernel's crc32c_intel_le_hw() function used by
>> >> > +* crc32c(), but this performs better on test machines.
>> >> > +*/
>> >> > +   while (len > 8) {
>> >> > +   asm volatile(/* 64b quad words */
>> >> > +   "crc32q (%1), %0"
>> >> > +   : "=r" (acc)
>> >> > +   : "r"  (ptr), "0" (acc)
>> >> > +   );
>> >> > +   ptr += 8;
>> >> > +   len -= 8;
>> >> > +   }
>> >> > +
>> >> > +   while (len > 0) {
>> >> > +   asm volatile(/* trailing bytes */
>> >> > +   "crc32b (%1), %0"
>> >> > +   : "=r" (acc)
>> >> > +   : "r"  (ptr), "0" (acc)
>> >> > +   );
>> >> > +   ptr++;
>> >> > +   len--;
>> >> > +   }
>> >> > +
>> >> > +   csum = (u32) acc;
>> >> > +   } else {
>> >> > +   /* The kernel's crc32c() function should also detect and 
>> >> > use the
>> >> > +* crc32 instruction of SSE-4.2. But calling in to this 
>> >> > function
>> >> > +* is about 3x to 5x slower than the inline assembly 
>> >> > version on
>> >> > +* some test machines.
>> >>
>> >> That is really odd. Did you try to characterize why this is the case? Is
>> >> it purely the overhead of dispatching to the correct backend function?
>> >> That's a rather big performance hit.
>> >>
>> >> > +*/
>> >> > +   csum = crc32c(crc, data, len);
>> >> > +   }
>> >> > +
>> >> > +   return csum;
>> >> > +}
>> >> > +
>> >
>> > Are you sure that CONFIG_CRYPTO_CRC32C_INTEL was enabled during your tests 
>> > and
>> > that the accelerated version was being called?  Or, perhaps 
>> > CRC32C_PCL_BREAKEVEN
>> > (defined in arch/x86/crypto/crc32c-intel_glue.c) needs to be adjusted.  
>> > Please
>> > don't hack around performance problems like this; if they exist, they need 
>> > to be
>> > fixed for everyone.
>> >
>>
>> I have performed the crc32c test on a Xeon X5647 at 2.93GHz, 14G DDR3
>> memory at 1066MHz platform.
>> You are right that enabling CONFIG_CRYPTO_CRC32C_INTEL improves the
>> performance significantly. nova_crc32c() is still slightly faster than
>> crc32c() with the flag enabled.
>>
>> Result numbers are as follows: data size in bytes, latency in ns, column
>> 3 is crc32c() with  CONFIG_CRYPTO_CRC32C_INTEL enabled and column 4
>> disabled.
>>
>> data size (bytes)    nova_crc32c()    crc32c() -enabled    crc32c() -disabled
>> 64                   19               21                   56
>> 128                  28               29                   99
>> 256                  46               43                   182
>> 512                  82               149                  354
>> 1024                 157              232                  728
>> 2048                 305              415                  1440
>> 4096                 603              725                  2869
>>
>
> Probably CRC32C_PCL_BREAKEVEN needs to be adjusted for that CPU, as I 
> suggested
> may be the case; notice that your measured speeds are about the same before 
> 512
> (CRC32C_PCL_BREAKEVEN) bytes, but the crypto API version is slower at >= 512
> bytes.   It would be possible to set the breakeven point in
> crc32c_intel_mod_init() depending on the CPU.  Again, if the performance is 
> not
> good enough you need to fix it for everyone, not hack around it.
>

We verified that, with CRC32C_PCL_BREAKEVEN set to 8192, the
performance difference between nova_crc32c() and the kernel's crc32c() is
negligible. Thanks for the comments, and I will use kernel's crc32c()
in the next version.

Thanks,
Andiry

Re: [RFC v2 05/83] Add NOVA filesystem definitions and useful helper routines.

2018-03-19 Thread Andiry Xu

On Sun, Mar 11, 2018 at 12:22 PM, Eric Biggers <ebigge...@gmail.com> wrote:
> On Sun, Mar 11, 2018 at 02:00:13PM +0200, Nikolay Borisov wrote:
>> [Adding Herbert Xu to CC since he is the maintainer of the crypto subsys
>> maintainer]
>>
>> On 10.03.2018 20:17, Andiry Xu wrote:
>> 
>>
>> > +static inline u32 nova_crc32c(u32 crc, const u8 *data, size_t len)
>> > +{
>> > +   u8 *ptr = (u8 *) data;
>> > +   u64 acc = crc; /* accumulator, crc32c value in lower 32b */
>> > +   u32 csum;
>> > +
>> > +   /* x86 instruction crc32 is part of SSE-4.2 */
>> > +   if (static_cpu_has(X86_FEATURE_XMM4_2)) {
>> > +   /* This inline assembly implementation should be equivalent
>> > +* to the kernel's crc32c_intel_le_hw() function used by
>> > +* crc32c(), but this performs better on test machines.
>> > +*/
>> > +   while (len > 8) {
>> > +   asm volatile(/* 64b quad words */
>> > +   "crc32q (%1), %0"
>> > +   : "=r" (acc)
>> > +   : "r"  (ptr), "0" (acc)
>> > +   );
>> > +   ptr += 8;
>> > +   len -= 8;
>> > +   }
>> > +
>> > +   while (len > 0) {
>> > +   asm volatile(/* trailing bytes */
>> > +   "crc32b (%1), %0"
>> > +   : "=r" (acc)
>> > +   : "r"  (ptr), "0" (acc)
>> > +   );
>> > +   ptr++;
>> > +   len--;
>> > +   }
>> > +
>> > +   csum = (u32) acc;
>> > +   } else {
>> > +   /* The kernel's crc32c() function should also detect and use 
>> > the
>> > +* crc32 instruction of SSE-4.2. But calling in to this 
>> > function
>> > +* is about 3x to 5x slower than the inline assembly version on
>> > +* some test machines.
>>
>> That is really odd. Did you try to characterize why this is the case? Is
>> it purely the overhead of dispatching to the correct backend function?
>> That's a rather big performance hit.
>>
>> > +*/
>> > +   csum = crc32c(crc, data, len);
>> > +   }
>> > +
>> > +   return csum;
>> > +}
>> > +
>
> Are you sure that CONFIG_CRYPTO_CRC32C_INTEL was enabled during your tests and
> that the accelerated version was being called?  Or, perhaps 
> CRC32C_PCL_BREAKEVEN
> (defined in arch/x86/crypto/crc32c-intel_glue.c) needs to be adjusted.  Please
> don't hack around performance problems like this; if they exist, they need to 
> be
> fixed for everyone.
>

I have performed the crc32c test on a Xeon X5647 at 2.93GHz, 14G DDR3
memory at 1066MHz platform.
You are right that enabling CONFIG_CRYPTO_CRC32C_INTEL improves the
performance significantly. nova_crc32c() is still slightly faster than
crc32c() with the flag enabled.

Result numbers are as follows: data size in bytes, latency in ns, column
3 is crc32c() with  CONFIG_CRYPTO_CRC32C_INTEL enabled and column 4
disabled.

data size (bytes)    nova_crc32c()    crc32c() -enabled    crc32c() -disabled
64                   19               21                   56
128                  28               29                   99
256                  46               43                   182
512                  82               149                  354
1024                 157              232                  728
2048                 305              415                  1440
4096                 603              725                  2869

Thanks,
Andiry

Re: [RFC v2 05/83] Add NOVA filesystem definitions and useful helper routines.

2018-03-19 Thread Andiry Xu

On Sun, Mar 11, 2018 at 12:22 PM, Eric Biggers  wrote:
> On Sun, Mar 11, 2018 at 02:00:13PM +0200, Nikolay Borisov wrote:
>> [Adding Herbert Xu to CC since he is the maintainer of the crypto subsys
>> maintainer]
>>
>> On 10.03.2018 20:17, Andiry Xu wrote:
>> 
>>
>> > +static inline u32 nova_crc32c(u32 crc, const u8 *data, size_t len)
>> > +{
>> > +   u8 *ptr = (u8 *) data;
>> > +   u64 acc = crc; /* accumulator, crc32c value in lower 32b */
>> > +   u32 csum;
>> > +
>> > +   /* x86 instruction crc32 is part of SSE-4.2 */
>> > +   if (static_cpu_has(X86_FEATURE_XMM4_2)) {
>> > +   /* This inline assembly implementation should be equivalent
>> > +* to the kernel's crc32c_intel_le_hw() function used by
>> > +* crc32c(), but this performs better on test machines.
>> > +*/
>> > +   while (len > 8) {
>> > +   asm volatile(/* 64b quad words */
>> > +   "crc32q (%1), %0"
>> > +   : "=r" (acc)
>> > +   : "r"  (ptr), "0" (acc)
>> > +   );
>> > +   ptr += 8;
>> > +   len -= 8;
>> > +   }
>> > +
>> > +   while (len > 0) {
>> > +   asm volatile(/* trailing bytes */
>> > +   "crc32b (%1), %0"
>> > +   : "=r" (acc)
>> > +   : "r"  (ptr), "0" (acc)
>> > +   );
>> > +   ptr++;
>> > +   len--;
>> > +   }
>> > +
>> > +   csum = (u32) acc;
>> > +   } else {
>> > +   /* The kernel's crc32c() function should also detect and use 
>> > the
>> > +* crc32 instruction of SSE-4.2. But calling in to this 
>> > function
>> > +* is about 3x to 5x slower than the inline assembly version on
>> > +* some test machines.
>>
>> That is really odd. Did you try to characterize why this is the case? Is
>> it purely the overhead of dispatching to the correct backend function?
>> That's a rather big performance hit.
>>
>> > +*/
>> > +   csum = crc32c(crc, data, len);
>> > +   }
>> > +
>> > +   return csum;
>> > +}
>> > +
>
> Are you sure that CONFIG_CRYPTO_CRC32C_INTEL was enabled during your tests and
> that the accelerated version was being called?  Or, perhaps 
> CRC32C_PCL_BREAKEVEN
> (defined in arch/x86/crypto/crc32c-intel_glue.c) needs to be adjusted.  Please
> don't hack around performance problems like this; if they exist, they need to 
> be
> fixed for everyone.
>

I have performed the crc32c test on a Xeon X5647 at 2.93GHz, 14G DDR3
memory at 1066MHz platform.
You are right that enabling CONFIG_CRYPTO_CRC32C_INTEL improves the
performance significantly. nova_crc32c() is still slightly faster than
crc32c() with the flag enabled.

Result numbers are as follows: data size in bytes, latency in ns, column
3 is crc32c() with  CONFIG_CRYPTO_CRC32C_INTEL enabled and column 4
disabled.

data size (bytes)nova_crc32c()crc32c() -enabled
crc32c() -disabled
64  19   21
56
12828   29
   99
25646   43
   182
51282   149
  354
1024  157 232
728
2048  305 415
1440
4096  603 725
2869

Thanks,
Andiry

Re: [RFC v2 03/83] Add super.h.

2018-03-16 Thread Andiry Xu

On Thu, Mar 15, 2018 at 7:59 PM, Theodore Y. Ts'o  wrote:
> On Thu, Mar 15, 2018 at 09:38:29PM +0100, Arnd Bergmann wrote:
>>
>> You could also have a resolution of less than a nanosecond. Note
>> that today, the file time stamps generated by the kernel are in
>> jiffies resolution, so at best one millisecond. However, most modern
>> file systems go with the 64+32 bit timestamps because it's not all
>> that expensive.
>
> It actually depends on the architecture and the accuracy/granularity
> of the timekeeping hardware available to the system, but it's possible
> for the granularity of file time stamps to be up to one nanosecond.
> So you can get results like this:
>
> % stat unix_io.o
>   File: unix_io.o
>   Size: 55000   Blocks: 112IO Block: 4096   regular file
> Device: fc01h/64513dInode: 19931278Links: 1
> Access: (0644/-rw-r--r--)  Uid: (15806/   tytso)   Gid: (15806/   tytso)
> Access: 2018-03-15 18:09:21.679914182 -0400
> Modify: 2018-03-15 18:09:21.639914089 -0400
> Change: 2018-03-15 18:09:21.639914089 -0400
>

Thanks for all the suggestions. I think I will follow ext4's timestamp
format; the year 2446 should be far enough away.

Thanks,
Andiry

Re: [RFC v2 03/83] Add super.h.

2018-03-16 Thread Andiry Xu

On Thu, Mar 15, 2018 at 7:59 PM, Theodore Y. Ts'o  wrote:
> On Thu, Mar 15, 2018 at 09:38:29PM +0100, Arnd Bergmann wrote:
>>
>> You could also have a resolution of less than a nanosecond. Note
>> that today, the file time stamps generated by the kernel are in
>> jiffies resolution, so at best one millisecond. However, most modern
>> file systems go with the 64+32 bit timestamps because it's not all
>> that expensive.
>
> It actually depends on the architecture and the accuracy/granularity
> of the timekeeping hardware available to the system, but it's possible
> for the granularity of file time stamps to be up to one nanosecond.
> So you can get results like this:
>
> % stat unix_io.o
>   File: unix_io.o
>   Size: 55000   Blocks: 112IO Block: 4096   regular file
> Device: fc01h/64513dInode: 19931278Links: 1
> Access: (0644/-rw-r--r--)  Uid: (15806/   tytso)   Gid: (15806/   tytso)
> Access: 2018-03-15 18:09:21.679914182 -0400
> Modify: 2018-03-15 18:09:21.639914089 -0400
> Change: 2018-03-15 18:09:21.639914089 -0400
>

Thanks for all the suggestions. I think I will follow ext4's timestamp
format; the year 2446 should be far enough away.

Thanks,
Andiry

Re: [RFC v2 03/83] Add super.h.

2018-03-15 Thread Andiry Xu

On Thu, Mar 15, 2018 at 2:05 AM, Arnd Bergmann <a...@arndb.de> wrote:
> On Thu, Mar 15, 2018 at 7:11 AM, Andiry Xu <jix...@eng.ucsd.edu> wrote:
>> On Wed, Mar 14, 2018 at 9:54 PM, Darrick J. Wong
>> <darrick.w...@oracle.com> wrote:
>>> On Sat, Mar 10, 2018 at 10:17:44AM -0800, Andiry Xu wrote:
>
>>>> + /* s_mtime and s_wtime should be together and their order should not 
>>>> be
>>>> +  * changed. we use an 8 byte write to update both of them atomically
>>>> +  */
>>>> + __le32  s_mtime;/* mount time */
>>>> + __le32  s_wtime;/* write time */
>>>
>>> Hmmm, 32-bit timestamps?  2038 isn't that far away...
>>>
>>
>> I will try fixing this in the next version.
>
> I would also recommend adding nanosecond-resolution timestamps.
> In theory, a signed 64-bit nanosecond field is sufficient for each timestamp
> (it's good for several hundred years), but the more common format uses
> 64-bit seconds and 32-bit nanoseconds in other file systems.
>
> Unfortunately, it looks like you will have to come up with a more
> sophisticated update method above: even if you leave out the nanoseconds,
> you can't easily rely on a 16-byte atomic update across architectures to
> deal with the two 64-bit timestamps. For the superblock fields, you might
> be able to get away with using second resolution, and then encoding the
> timestamps as a signed 64-bit 'mkfs time' along with two unsigned
> 32-bit times added on top, which gives you a range of 136 years in which
> to mount a file system after its creation.
>

I will take a look at other file systems.

Superblock mtime is not a big problem as it is updated rarely. 64-bit
seconds and 32-bit nanoseconds make the inode and log entry bigger,
and updating file->atime cannot be done with a single 64bit update.
That may be annoying and needs to use journaling.

Thanks,
Andiry

>   Arnd

Re: [RFC v2 03/83] Add super.h.

2018-03-15 Thread Andiry Xu

On Thu, Mar 15, 2018 at 2:05 AM, Arnd Bergmann  wrote:
> On Thu, Mar 15, 2018 at 7:11 AM, Andiry Xu  wrote:
>> On Wed, Mar 14, 2018 at 9:54 PM, Darrick J. Wong
>>  wrote:
>>> On Sat, Mar 10, 2018 at 10:17:44AM -0800, Andiry Xu wrote:
>
>>>> + /* s_mtime and s_wtime should be together and their order should not 
>>>> be
>>>> +  * changed. we use an 8 byte write to update both of them atomically
>>>> +  */
>>>> + __le32  s_mtime;/* mount time */
>>>> + __le32  s_wtime;/* write time */
>>>
>>> Hmmm, 32-bit timestamps?  2038 isn't that far away...
>>>
>>
>> I will try fixing this in the next version.
>
> I would also recommend adding nanosecond-resolution timestamps.
> In theory, a signed 64-bit nanosecond field is sufficient for each timestamp
> (it's good for several hundred years), but the more common format uses
> 64-bit seconds and 32-bit nanoseconds in other file systems.
>
> Unfortunately, it looks like you will have to come up with a more
> sophisticated update method above: even if you leave out the nanoseconds,
> you can't easily rely on a 16-byte atomic update across architectures to
> deal with the two 64-bit timestamps. For the superblock fields, you might
> be able to get away with using second resolution, and then encoding the
> timestamps as a signed 64-bit 'mkfs time' along with two unsigned
> 32-bit times added on top, which gives you a range of 136 years in which
> to mount a file system after its creation.
>

I will take a look at other file systems.

Superblock mtime is not a big problem as it is updated rarely. 64-bit
seconds and 32-bit nanoseconds make the inode and log entry bigger,
and updating file->atime cannot be done with a single 64bit update.
That may be annoying and needs to use journaling.

Thanks,
Andiry

>   Arnd

Re: [RFC v2 04/83] NOVA inode definition.

2018-03-15 Thread Andiry Xu

On Wed, Mar 14, 2018 at 10:06 PM, Darrick J. Wong
<darrick.w...@oracle.com> wrote:
> On Sat, Mar 10, 2018 at 10:17:45AM -0800, Andiry Xu wrote:
>> From: Andiry Xu <jix...@cs.ucsd.edu>
>>
>> inode.h defines the non-volatile and volatile NOVA inode data structures.
>>
>> The non-volatile NOVA inode (nova_inode) is aligned to 128 bytes and contains
>> file/directory metadata information. The most important fields
>> are log_head and log_tail. log_head points to the start of
>> the log, and log_tail points to the end of the latest committed
>> log entry. NOVA makes updates to the inode by appending
>> to the log tail and updating the log_tail pointer atomically.
>>
>> The volatile NOVA inode (nova_inode_info) contains necessary
>> information to limit access to the non-volatile NOVA inode during runtime.
>> It has a radix tree to map file offset or filenames to the corresponding
>> log entries.
>>
>> Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
>> ---
>>  fs/nova/inode.h | 187 
>> 
>>  1 file changed, 187 insertions(+)
>>  create mode 100644 fs/nova/inode.h
>>
>> diff --git a/fs/nova/inode.h b/fs/nova/inode.h
>> new file mode 100644
>> index 000..f9187e3
>> --- /dev/null
>> +++ b/fs/nova/inode.h
>> @@ -0,0 +1,187 @@
>> +#ifndef __INODE_H
>> +#define __INODE_H
>> +
>> +struct nova_inode_info_header;
>> +struct nova_inode;
>> +
>> +#include "super.h"
>> +
>> +enum nova_new_inode_type {
>> + TYPE_CREATE = 0,
>> + TYPE_MKNOD,
>> + TYPE_SYMLINK,
>> + TYPE_MKDIR
>> +};
>> +
>> +
>> +/*
>> + * Structure of an inode in PMEM
>> + * Keep the inode size to within 120 bytes: We use the last eight bytes
>> + * as inode table tail pointer.
>
> I would've expected a
> BUILD_BUG_ON(NOVA_INODE_SIZE - sizeof(struct nova_inode) == 8);
> or something to enforce this.
>

Thanks, will do.

> (Or just equate inode number with byte offset?  I looked ahead at the
> directory entries and they seem to be 64-bit...)
>
> I guess I'm being lazy and doing a on-disk-format-only review. :)
>
>> + */
>> +struct nova_inode {
>> +
>> + /* first 40 bytes */
>> + u8  i_rsvd;  /* reserved. used to be checksum */
>
> Magic number?
>

OK.

>> + u8  valid;   /* Is this inode valid? */
>> + u8  deleted; /* Is this inode deleted? */
>
> Would i_mode == 0 cover these?
>

Deleted flag comes from NOVA-Fortis code. I will check if i_mode can cover it.

>> + u8  i_blk_type;  /* data block size this inode uses */
>
> I would've thought these would just be bits of i_flags?
>
> Also, if I have a 1G blocksize file and free space fragments to the
> point that there's > 1G of free space but none of it contiguous, I guess
> I can expect ENOSPC?
>

Yes, but 1G blocksize has not been tested.

>> + __le32  i_flags; /* Inode flags */
>> + __le64  i_size;  /* Size of data in bytes */
>> + __le32  i_ctime; /* Inode modification time */
>> + __le32  i_mtime; /* Inode b-tree Modification time */
>> + __le32  i_atime; /* Access time */
>
> Same y2038 grumble from the previous patch.
>

Will fix.

>> + __le16  i_mode;  /* File mode */
>> + __le16  i_links_count;   /* Links count */
>> +
>> + __le64  i_xattr; /* Extended attribute block */
>> +
>> + /* second 40 bytes */
>> + __le32  i_uid;   /* Owner Uid */
>> + __le32  i_gid;   /* Group Id */
>> + __le32  i_generation;/* File version (for NFS) */
>> + __le32  i_create_time;   /* Create time */
>> + __le64  nova_ino;/* nova inode number */
>> +
>> + __le64  log_head;/* Log head pointer */
>> + __le64  log_tail;/* Log tail pointer */
>> +
>> + /* last 40 bytes */
>> + __le64  create_epoch_id; /* Transaction ID when create */
>> + __le64  delete_epoch_id; /* Transaction ID when deleted */
>> +
>> + struct {
>> + __le32 rdev; /* major/minor # */
>> + } dev;   /* device inode */
>> +
>> + __le32  csum;/* CRC32 checksum */
>> + /* Leave 8 bytes for inode table tail pointer */
>> +} __attribute((__packed__));
>> +
>> +/*
>> + * NOVA-specific inode state kept in DRAM
>> + */
>> +struct nova_inode_info_heade

Re: [RFC v2 04/83] NOVA inode definition.

2018-03-15 Thread Andiry Xu

On Wed, Mar 14, 2018 at 10:06 PM, Darrick J. Wong
 wrote:
> On Sat, Mar 10, 2018 at 10:17:45AM -0800, Andiry Xu wrote:
>> From: Andiry Xu 
>>
>> inode.h defines the non-volatile and volatile NOVA inode data structures.
>>
>> The non-volatile NOVA inode (nova_inode) is aligned to 128 bytes and contains
>> file/directory metadata information. The most important fields
>> are log_head and log_tail. log_head points to the start of
>> the log, and log_tail points to the end of the latest committed
>> log entry. NOVA makes updates to the inode by appending
>> to the log tail and updating the log_tail pointer atomically.
>>
>> The volatile NOVA inode (nova_inode_info) contains necessary
>> information to limit access to the non-volatile NOVA inode during runtime.
>> It has a radix tree to map file offset or filenames to the corresponding
>> log entries.
>>
>> Signed-off-by: Andiry Xu 
>> ---
>>  fs/nova/inode.h | 187 
>> 
>>  1 file changed, 187 insertions(+)
>>  create mode 100644 fs/nova/inode.h
>>
>> diff --git a/fs/nova/inode.h b/fs/nova/inode.h
>> new file mode 100644
>> index 000..f9187e3
>> --- /dev/null
>> +++ b/fs/nova/inode.h
>> @@ -0,0 +1,187 @@
>> +#ifndef __INODE_H
>> +#define __INODE_H
>> +
>> +struct nova_inode_info_header;
>> +struct nova_inode;
>> +
>> +#include "super.h"
>> +
>> +enum nova_new_inode_type {
>> + TYPE_CREATE = 0,
>> + TYPE_MKNOD,
>> + TYPE_SYMLINK,
>> + TYPE_MKDIR
>> +};
>> +
>> +
>> +/*
>> + * Structure of an inode in PMEM
>> + * Keep the inode size to within 120 bytes: We use the last eight bytes
>> + * as inode table tail pointer.
>
> I would've expected a
> BUILD_BUG_ON(NOVA_INODE_SIZE - sizeof(struct nova_inode) == 8);
> or something to enforce this.
>

Thanks, will do.

> (Or just equate inode number with byte offset?  I looked ahead at the
> directory entries and they seem to be 64-bit...)
>
> I guess I'm being lazy and doing a on-disk-format-only review. :)
>
>> + */
>> +struct nova_inode {
>> +
>> + /* first 40 bytes */
>> + u8  i_rsvd;  /* reserved. used to be checksum */
>
> Magic number?
>

OK.

>> + u8  valid;   /* Is this inode valid? */
>> + u8  deleted; /* Is this inode deleted? */
>
> Would i_mode == 0 cover these?
>

Deleted flag comes from NOVA-Fortis code. I will check if i_mode can cover it.

>> + u8  i_blk_type;  /* data block size this inode uses */
>
> I would've thought these would just be bits of i_flags?
>
> Also, if I have a 1G blocksize file and free space fragments to the
> point that there's > 1G of free space but none of it contiguous, I guess
> I can expect ENOSPC?
>

Yes, but 1G blocksize has not been tested.

>> + __le32  i_flags; /* Inode flags */
>> + __le64  i_size;  /* Size of data in bytes */
>> + __le32  i_ctime; /* Inode modification time */
>> + __le32  i_mtime; /* Inode b-tree Modification time */
>> + __le32  i_atime; /* Access time */
>
> Same y2038 grumble from the previous patch.
>

Will fix.

>> + __le16  i_mode;  /* File mode */
>> + __le16  i_links_count;   /* Links count */
>> +
>> + __le64  i_xattr; /* Extended attribute block */
>> +
>> + /* second 40 bytes */
>> + __le32  i_uid;   /* Owner Uid */
>> + __le32  i_gid;   /* Group Id */
>> + __le32  i_generation;/* File version (for NFS) */
>> + __le32  i_create_time;   /* Create time */
>> + __le64  nova_ino;/* nova inode number */
>> +
>> + __le64  log_head;/* Log head pointer */
>> + __le64  log_tail;/* Log tail pointer */
>> +
>> + /* last 40 bytes */
>> + __le64  create_epoch_id; /* Transaction ID when create */
>> + __le64  delete_epoch_id; /* Transaction ID when deleted */
>> +
>> + struct {
>> + __le32 rdev; /* major/minor # */
>> + } dev;   /* device inode */
>> +
>> + __le32  csum;/* CRC32 checksum */
>> + /* Leave 8 bytes for inode table tail pointer */
>> +} __attribute((__packed__));
>> +
>> +/*
>> + * NOVA-specific inode state kept in DRAM
>> + */
>> +struct nova_inode_info_header {
>> + /* For files, tree holds a map from file offsets to
>> +

Re: [RFC v2 03/83] Add super.h.

2018-03-15 Thread Andiry Xu

On Wed, Mar 14, 2018 at 9:54 PM, Darrick J. Wong
<darrick.w...@oracle.com> wrote:
> On Sat, Mar 10, 2018 at 10:17:44AM -0800, Andiry Xu wrote:
>> From: Andiry Xu <jix...@cs.ucsd.edu>
>>
>> This header file defines NOVA persistent and volatile superblock
>> data structures.
>>
>> It also defines NOVA block layout:
>>
>> Page 0: Superblock
>> Page 1: Reserved inodes
>> Page 2 - 15: Reserved
>> Page 16 - 31: Inode table pointers
>> Page 32 - 47: Journal address pointers
>> Page 48 - 63: Reserved
>> Pages n-2: Replicate reserved inodes
>> Pages n-1: Replicate superblock
>>
>> Other pages are for normal inodes, logs and data.
>>
>> Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
>> ---
>>  fs/nova/super.h | 149 
>> 
>>  1 file changed, 149 insertions(+)
>>  create mode 100644 fs/nova/super.h
>>
>> diff --git a/fs/nova/super.h b/fs/nova/super.h
>> new file mode 100644
>> index 000..cb53908
>> --- /dev/null
>> +++ b/fs/nova/super.h
>> @@ -0,0 +1,149 @@
>> +#ifndef __SUPER_H
>> +#define __SUPER_H
>> +/*
>> + * Structure of the NOVA super block in PMEM
>> + *
>> + * The fields are partitioned into static and dynamic fields. The static 
>> fields
>> + * never change after file system creation. This was primarily done because
>> + * nova_get_block() returns NULL if the block offset is 0 (helps in catching
>> + * bugs). So if we modify any field using journaling (for consistency), we
>> + * will have to modify s_sum which is at offset 0. So journaling code fails.
>> + * This (static+dynamic fields) is a temporary solution and can be avoided
>> + * once the file system becomes stable and nova_get_block() returns correct
>> + * pointers even for offset 0.
>> + */
>> +struct nova_super_block {
>> + /* static fields. they never change after file system creation.
>> +  * checksum only validates up to s_start_dynamic field below
>> +  */
>> + __le32  s_sum;  /* checksum of this sb */
>> + __le32  s_magic;/* magic signature */
>> + __le32  s_padding32;
>> + __le32  s_blocksize;/* blocksize in bytes */
>> + __le64  s_size; /* total size of fs in bytes */
>> + char    s_volume_name[16];  /* volume name */
>> +
>> + /* all the dynamic fields should go here */
>> + __le64  s_epoch_id; /* Epoch ID */
>> +
>> + /* s_mtime and s_wtime should be together and their order should not be
>> +  * changed. we use an 8 byte write to update both of them atomically
>> +  */
>> + __le32  s_mtime;/* mount time */
>> + __le32  s_wtime;/* write time */
>
> Hmmm, 32-bit timestamps?  2038 isn't that far away...
>

I will try fixing this in the next version.

>> +} __attribute((__packed__));
>> +
>> +#define NOVA_SB_SIZE 512   /* must be power of two */
>> +
>> +/* === Reserved blocks = */
>> +
>> +/*
>> + * Page 0 contains super blocks;
>> + * Page 1 contains reserved inodes;
>> + * Page 2 - 15 are reserved.
>> + * Page 16 - 31 contain pointers to inode tables.
>> + * Page 32 - 47 contain pointers to journal pages.
>> + */
>> +#define  HEAD_RESERVED_BLOCKS    64
>> +#define  NUM_JOURNAL_PAGES   16
>> +
>> +#define  SUPER_BLOCK_START   0 // Superblock
>> +#define  RESERVE_INODE_START 1 // Reserved inodes
>> +#define  INODE_TABLE_START   16 // inode table pointers
>> +#define  JOURNAL_START   32 // journal pointer table
>> +
>> +/* For replica super block and replica reserved inodes */
>> +#define  TAIL_RESERVED_BLOCKS    2
>> +
>> +/* === Reserved inodes = */
>> +
>> +/* We have space for 31 reserved inodes */
>> +#define NOVA_ROOT_INO        (1)
>> +#define NOVA_INODETABLE_INO  (2) /* Fake inode associated with inode
>> +  * storage.  We need this because our
>> +  * allocator requires inode to be
>> +  * associated with each allocation.
>> +  * The data actually lives in linked
>> +  * lists in INODE_T

Re: [RFC v2 03/83] Add super.h.

2018-03-15 Thread Andiry Xu

On Wed, Mar 14, 2018 at 9:54 PM, Darrick J. Wong
 wrote:
> On Sat, Mar 10, 2018 at 10:17:44AM -0800, Andiry Xu wrote:
>> From: Andiry Xu 
>>
>> This header file defines NOVA persistent and volatile superblock
>> data structures.
>>
>> It also defines NOVA block layout:
>>
>> Page 0: Superblock
>> Page 1: Reserved inodes
>> Page 2 - 15: Reserved
>> Page 16 - 31: Inode table pointers
>> Page 32 - 47: Journal address pointers
>> Page 48 - 63: Reserved
>> Pages n-2: Replicate reserved inodes
>> Pages n-1: Replicate superblock
>>
>> Other pages are for normal inodes, logs and data.
>>
>> Signed-off-by: Andiry Xu 
>> ---
>>  fs/nova/super.h | 149 
>> 
>>  1 file changed, 149 insertions(+)
>>  create mode 100644 fs/nova/super.h
>>
>> diff --git a/fs/nova/super.h b/fs/nova/super.h
>> new file mode 100644
>> index 000..cb53908
>> --- /dev/null
>> +++ b/fs/nova/super.h
>> @@ -0,0 +1,149 @@
>> +#ifndef __SUPER_H
>> +#define __SUPER_H
>> +/*
>> + * Structure of the NOVA super block in PMEM
>> + *
>> + * The fields are partitioned into static and dynamic fields. The static 
>> fields
>> + * never change after file system creation. This was primarily done because
>> + * nova_get_block() returns NULL if the block offset is 0 (helps in catching
>> + * bugs). So if we modify any field using journaling (for consistency), we
>> + * will have to modify s_sum which is at offset 0. So journaling code fails.
>> + * This (static+dynamic fields) is a temporary solution and can be avoided
>> + * once the file system becomes stable and nova_get_block() returns correct
>> + * pointers even for offset 0.
>> + */
>> +struct nova_super_block {
>> + /* static fields. they never change after file system creation.
>> +  * checksum only validates up to s_start_dynamic field below
>> +  */
>> + __le32  s_sum;  /* checksum of this sb */
>> + __le32  s_magic;/* magic signature */
>> + __le32  s_padding32;
>> + __le32  s_blocksize;/* blocksize in bytes */
>> + __le64  s_size; /* total size of fs in bytes */
>> + char    s_volume_name[16];  /* volume name */
>> +
>> + /* all the dynamic fields should go here */
>> + __le64  s_epoch_id; /* Epoch ID */
>> +
>> + /* s_mtime and s_wtime should be together and their order should not be
>> +  * changed. we use an 8 byte write to update both of them atomically
>> +  */
>> + __le32  s_mtime;/* mount time */
>> + __le32  s_wtime;/* write time */
>
> Hmmm, 32-bit timestamps?  2038 isn't that far away...
>

I will try fixing this in the next version.

>> +} __attribute((__packed__));
>> +
>> +#define NOVA_SB_SIZE 512   /* must be power of two */
>> +
>> +/* === Reserved blocks = */
>> +
>> +/*
>> + * Page 0 contains super blocks;
>> + * Page 1 contains reserved inodes;
>> + * Page 2 - 15 are reserved.
>> + * Page 16 - 31 contain pointers to inode tables.
>> + * Page 32 - 47 contain pointers to journal pages.
>> + */
>> +#define  HEAD_RESERVED_BLOCKS    64
>> +#define  NUM_JOURNAL_PAGES   16
>> +
>> +#define  SUPER_BLOCK_START   0 // Superblock
>> +#define  RESERVE_INODE_START 1 // Reserved inodes
>> +#define  INODE_TABLE_START   16 // inode table pointers
>> +#define  JOURNAL_START   32 // journal pointer table
>> +
>> +/* For replica super block and replica reserved inodes */
>> +#define  TAIL_RESERVED_BLOCKS    2
>> +
>> +/* === Reserved inodes = */
>> +
>> +/* We have space for 31 reserved inodes */
>> +#define NOVA_ROOT_INO        (1)
>> +#define NOVA_INODETABLE_INO  (2) /* Fake inode associated with inode
>> +  * storage.  We need this because our
>> +  * allocator requires inode to be
>> +  * associated with each allocation.
>> +  * The data actually lives in linked
>> +  * lists in INODE_TABLE_START. */
>> +#define NOVA_BLOCKNODE_INO   (3) /* Storage for alloca

Re: [RFC v2 83/83] Sysfs support.

2018-03-15 Thread Andiry Xu

On Wed, Mar 14, 2018 at 5:33 PM, Randy Dunlap <rdun...@infradead.org> wrote:
> On 03/10/2018 10:19 AM, Andiry Xu wrote:
>> Sysfs support allows user to get/post information of running NOVA instance.
>> After mount, NOVA creates four entries under proc directory
>> /proc/fs/nova/pmem#/:
>>
>> timing_stats  IO_stats  allocator  gc
>
> Hi,
>
> This is all procfs, not sysfs, so the name is (or can be) confusing.
>
> Please change it.
>

Thanks, will fix.

Andiry

> --
> ~Randy

Re: [RFC v2 83/83] Sysfs support.

2018-03-15 Thread Andiry Xu

On Wed, Mar 14, 2018 at 5:33 PM, Randy Dunlap  wrote:
> On 03/10/2018 10:19 AM, Andiry Xu wrote:
>> Sysfs support allows user to get/post information of running NOVA instance.
>> After mount, NOVA creates four entries under proc directory
>> /proc/fs/nova/pmem#/:
>>
>> timing_stats  IO_stats  allocator  gc
>
> Hi,
>
> This is all procfs, not sysfs, so the name is (or can be) confusing.
>
> Please change it.
>

Thanks, will fix.

Andiry

> --
> ~Randy

Re: [RFC v2 05/83] Add NOVA filesystem definitions and useful helper routines.

2018-03-11 Thread Andiry Xu

On Sun, Mar 11, 2018 at 12:22 PM, Eric Biggers <ebigge...@gmail.com> wrote:
> On Sun, Mar 11, 2018 at 02:00:13PM +0200, Nikolay Borisov wrote:
>> [Adding Herbert Xu to CC since he is the maintainer of the crypto
>> subsys]
>>
>> On 10.03.2018 20:17, Andiry Xu wrote:
>> 
>>
>> > +static inline u32 nova_crc32c(u32 crc, const u8 *data, size_t len)
>> > +{
>> > +   u8 *ptr = (u8 *) data;
>> > +   u64 acc = crc; /* accumulator, crc32c value in lower 32b */
>> > +   u32 csum;
>> > +
>> > +   /* x86 instruction crc32 is part of SSE-4.2 */
>> > +   if (static_cpu_has(X86_FEATURE_XMM4_2)) {
>> > +   /* This inline assembly implementation should be equivalent
>> > +* to the kernel's crc32c_intel_le_hw() function used by
>> > +* crc32c(), but this performs better on test machines.
>> > +*/
>> > +   while (len > 8) {
>> > +   asm volatile(/* 64b quad words */
>> > +   "crc32q (%1), %0"
>> > +   : "=r" (acc)
>> > +   : "r"  (ptr), "0" (acc)
>> > +   );
>> > +   ptr += 8;
>> > +   len -= 8;
>> > +   }
>> > +
>> > +   while (len > 0) {
>> > +   asm volatile(/* trailing bytes */
>> > +   "crc32b (%1), %0"
>> > +   : "=r" (acc)
>> > +   : "r"  (ptr), "0" (acc)
>> > +   );
>> > +   ptr++;
>> > +   len--;
>> > +   }
>> > +
>> > +   csum = (u32) acc;
>> > +   } else {
>> > +   /* The kernel's crc32c() function should also detect and use 
>> > the
>> > +* crc32 instruction of SSE-4.2. But calling in to this 
>> > function
>> > +* is about 3x to 5x slower than the inline assembly version on
>> > +* some test machines.
>>
>> That is really odd. Did you try to characterize why this is the case? Is
>> it purely the overhead of dispatching to the correct backend function?
>> That's a rather big performance hit.
>>
>> > +*/
>> > +   csum = crc32c(crc, data, len);
>> > +   }
>> > +
>> > +   return csum;
>> > +}
>> > +
>
> Are you sure that CONFIG_CRYPTO_CRC32C_INTEL was enabled during your tests and
> that the accelerated version was being called?  Or, perhaps 
> CRC32C_PCL_BREAKEVEN
> (defined in arch/x86/crypto/crc32c-intel_glue.c) needs to be adjusted.  Please
> don't hack around performance problems like this; if they exist, they need to 
> be
> fixed for everyone.
>

I think we found the issue when implementing NOVA-Fortis metadata and
data protections, which use crc32c a lot. They have been removed in
this patchset, but I will double check whether the issue exists or
not.

Thanks,
Andiry

> Eric

Re: [RFC v2 05/83] Add NOVA filesystem definitions and useful helper routines.

2018-03-11 Thread Andiry Xu

On Sun, Mar 11, 2018 at 12:22 PM, Eric Biggers  wrote:
> On Sun, Mar 11, 2018 at 02:00:13PM +0200, Nikolay Borisov wrote:
>> [Adding Herbert Xu to CC since he is the maintainer of the crypto
>> subsys]
>>
>> On 10.03.2018 20:17, Andiry Xu wrote:
>> 
>>
>> > +static inline u32 nova_crc32c(u32 crc, const u8 *data, size_t len)
>> > +{
>> > +   u8 *ptr = (u8 *) data;
>> > +   u64 acc = crc; /* accumulator, crc32c value in lower 32b */
>> > +   u32 csum;
>> > +
>> > +   /* x86 instruction crc32 is part of SSE-4.2 */
>> > +   if (static_cpu_has(X86_FEATURE_XMM4_2)) {
>> > +   /* This inline assembly implementation should be equivalent
>> > +* to the kernel's crc32c_intel_le_hw() function used by
>> > +* crc32c(), but this performs better on test machines.
>> > +*/
>> > +   while (len > 8) {
>> > +   asm volatile(/* 64b quad words */
>> > +   "crc32q (%1), %0"
>> > +   : "=r" (acc)
>> > +   : "r"  (ptr), "0" (acc)
>> > +   );
>> > +   ptr += 8;
>> > +   len -= 8;
>> > +   }
>> > +
>> > +   while (len > 0) {
>> > +   asm volatile(/* trailing bytes */
>> > +   "crc32b (%1), %0"
>> > +   : "=r" (acc)
>> > +   : "r"  (ptr), "0" (acc)
>> > +   );
>> > +   ptr++;
>> > +   len--;
>> > +   }
>> > +
>> > +   csum = (u32) acc;
>> > +   } else {
>> > +   /* The kernel's crc32c() function should also detect and use 
>> > the
>> > +* crc32 instruction of SSE-4.2. But calling in to this 
>> > function
>> > +* is about 3x to 5x slower than the inline assembly version on
>> > +* some test machines.
>>
>> That is really odd. Did you try to characterize why this is the case? Is
>> it purely the overhead of dispatching to the correct backend function?
>> That's a rather big performance hit.
>>
>> > +*/
>> > +   csum = crc32c(crc, data, len);
>> > +   }
>> > +
>> > +   return csum;
>> > +}
>> > +
>
> Are you sure that CONFIG_CRYPTO_CRC32C_INTEL was enabled during your tests and
> that the accelerated version was being called?  Or, perhaps 
> CRC32C_PCL_BREAKEVEN
> (defined in arch/x86/crypto/crc32c-intel_glue.c) needs to be adjusted.  Please
> don't hack around performance problems like this; if they exist, they need to 
> be
> fixed for everyone.
>

I think we found the issue when implementing NOVA-Fortis metadata and
data protections, which use crc32c a lot. They have been removed in
this patchset, but I will double check whether the issue exists or
not.

Thanks,
Andiry

> Eric

Re: [RFC v2 09/83] Add Kconfig and Makefile

2018-03-11 Thread Andiry Xu

On Sun, Mar 11, 2018 at 5:15 AM, Nikolay Borisov
<n.borisov.l...@gmail.com> wrote:
>
>
> On 10.03.2018 20:17, Andiry Xu wrote:
>> From: Andiry Xu <jix...@cs.ucsd.edu>
>>
>> Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
>> ---
>>  fs/Kconfig   |  2 ++
>>  fs/Makefile  |  1 +
>>  fs/nova/Kconfig  | 15 +++
>>  fs/nova/Makefile |  7 +++
>>  4 files changed, 25 insertions(+)
>>  create mode 100644 fs/nova/Kconfig
>>  create mode 100644 fs/nova/Makefile
>>
>> diff --git a/fs/Kconfig b/fs/Kconfig
>> index bc821a8..5e9ff3e 100644
>> --- a/fs/Kconfig
>> +++ b/fs/Kconfig
>> @@ -58,6 +58,8 @@ config FS_DAX_PMD
>>   depends on ZONE_DEVICE
>>   depends on TRANSPARENT_HUGEPAGE
>>
>> +source "fs/nova/Kconfig"
>> +
>>  # Selected by DAX drivers that do not expect filesystem DAX to support
>>  # get_user_pages() of DAX mappings. I.e. "limited" indicates no support
>>  # for fork() of processes with MAP_SHARED mappings or support for
>> diff --git a/fs/Makefile b/fs/Makefile
>> index add789e..65ea619 100644
>> --- a/fs/Makefile
>> +++ b/fs/Makefile
>> @@ -113,6 +113,7 @@ obj-$(CONFIG_OMFS_FS) += omfs/
>>  obj-$(CONFIG_JFS_FS) += jfs/
>>  obj-$(CONFIG_XFS_FS) += xfs/
>>  obj-$(CONFIG_9P_FS)  += 9p/
>> +obj-$(CONFIG_NOVA_FS)+= nova/
>>  obj-$(CONFIG_AFS_FS) += afs/
>>  obj-$(CONFIG_NILFS2_FS)  += nilfs2/
>>  obj-$(CONFIG_BEFS_FS)+= befs/
>> diff --git a/fs/nova/Kconfig b/fs/nova/Kconfig
>> new file mode 100644
>> index 000..c1c692e
>> --- /dev/null
>> +++ b/fs/nova/Kconfig
>> @@ -0,0 +1,15 @@
>> +config NOVA_FS
>> + tristate "NOVA: log-structured file system for non-volatile memories"
>> + depends on FS_DAX
>> + select CRC32
>
> What do you need crc32 for? Selecting libcrc32c is enough to do "the
> right thing"
>

I think this is the legacy of the removed NOVA-Fortis code. I will double check.

Thanks,
Andiry

>> + select LIBCRC32C
>> + help
>> +   If your system has a block of fast (comparable in access speed to
>> +   system memory) and non-volatile byte-addressable memory and you wish
>> +   to mount a light-weight filesystem with strong consistency support
>> +   over it, say Y here.
>> +
>> +   To compile this as a module, choose M here: the module will be
>> +   called nova.
>> +
>> +   If unsure, say N.
>> diff --git a/fs/nova/Makefile b/fs/nova/Makefile
>> new file mode 100644
>> index 000..eb19646
>> --- /dev/null
>> +++ b/fs/nova/Makefile
>> @@ -0,0 +1,7 @@
>> +#
>> +# Makefile for the linux NOVA filesystem routines.
>> +#
>> +
>> +obj-$(CONFIG_NOVA_FS) += nova.o
>> +
>> +nova-y := bbuild.o inode.o rebuild.o super.o
>>

Re: [RFC v2 09/83] Add Kconfig and Makefile

2018-03-11 Thread Andiry Xu

On Sun, Mar 11, 2018 at 5:15 AM, Nikolay Borisov
 wrote:
>
>
> On 10.03.2018 20:17, Andiry Xu wrote:
>> From: Andiry Xu 
>>
>> Signed-off-by: Andiry Xu 
>> ---
>>  fs/Kconfig   |  2 ++
>>  fs/Makefile  |  1 +
>>  fs/nova/Kconfig  | 15 +++
>>  fs/nova/Makefile |  7 +++
>>  4 files changed, 25 insertions(+)
>>  create mode 100644 fs/nova/Kconfig
>>  create mode 100644 fs/nova/Makefile
>>
>> diff --git a/fs/Kconfig b/fs/Kconfig
>> index bc821a8..5e9ff3e 100644
>> --- a/fs/Kconfig
>> +++ b/fs/Kconfig
>> @@ -58,6 +58,8 @@ config FS_DAX_PMD
>>   depends on ZONE_DEVICE
>>   depends on TRANSPARENT_HUGEPAGE
>>
>> +source "fs/nova/Kconfig"
>> +
>>  # Selected by DAX drivers that do not expect filesystem DAX to support
>>  # get_user_pages() of DAX mappings. I.e. "limited" indicates no support
>>  # for fork() of processes with MAP_SHARED mappings or support for
>> diff --git a/fs/Makefile b/fs/Makefile
>> index add789e..65ea619 100644
>> --- a/fs/Makefile
>> +++ b/fs/Makefile
>> @@ -113,6 +113,7 @@ obj-$(CONFIG_OMFS_FS) += omfs/
>>  obj-$(CONFIG_JFS_FS) += jfs/
>>  obj-$(CONFIG_XFS_FS) += xfs/
>>  obj-$(CONFIG_9P_FS)  += 9p/
>> +obj-$(CONFIG_NOVA_FS)+= nova/
>>  obj-$(CONFIG_AFS_FS) += afs/
>>  obj-$(CONFIG_NILFS2_FS)  += nilfs2/
>>  obj-$(CONFIG_BEFS_FS)+= befs/
>> diff --git a/fs/nova/Kconfig b/fs/nova/Kconfig
>> new file mode 100644
>> index 000..c1c692e
>> --- /dev/null
>> +++ b/fs/nova/Kconfig
>> @@ -0,0 +1,15 @@
>> +config NOVA_FS
>> + tristate "NOVA: log-structured file system for non-volatile memories"
>> + depends on FS_DAX
>> + select CRC32
>
> What do you need crc32 for? Selecting libcrc32c is enough to do "the
> right thing"
>

I think this is the legacy of the removed NOVA-Fortis code. I will double check.

Thanks,
Andiry

>> + select LIBCRC32C
>> + help
>> +   If your system has a block of fast (comparable in access speed to
>> +   system memory) and non-volatile byte-addressable memory and you wish
>> +   to mount a light-weight filesystem with strong consistency support
>> +   over it, say Y here.
>> +
>> +   To compile this as a module, choose M here: the module will be
>> +   called nova.
>> +
>> +   If unsure, say N.
>> diff --git a/fs/nova/Makefile b/fs/nova/Makefile
>> new file mode 100644
>> index 000..eb19646
>> --- /dev/null
>> +++ b/fs/nova/Makefile
>> @@ -0,0 +1,7 @@
>> +#
>> +# Makefile for the linux NOVA filesystem routines.
>> +#
>> +
>> +obj-$(CONFIG_NOVA_FS) += nova.o
>> +
>> +nova-y := bbuild.o inode.o rebuild.o super.o
>>

Re: [RFC v2 14/83] Add range node kmem cache.

2018-03-11 Thread Andiry Xu

On Sun, Mar 11, 2018 at 4:55 AM, Nikolay Borisov
<n.borisov.l...@gmail.com> wrote:
>
>
> On 10.03.2018 20:17, Andiry Xu wrote:
>> From: Andiry Xu <jix...@cs.ucsd.edu>
>>
>> Range node specifies a range of [start, end] and is managed by a red-black 
>> tree.
>> NOVA uses range node to manage NVM allocator and inodes being used.
>>
>> Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
>> ---
>>  fs/nova/nova.h  |  8 
>>  fs/nova/super.c | 45 ++---
>>  fs/nova/super.h |  2 ++
>>  3 files changed, 52 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/nova/nova.h b/fs/nova/nova.h
>> index ba7ffca..e0e85fb 100644
>> --- a/fs/nova/nova.h
>> +++ b/fs/nova/nova.h
>> @@ -301,6 +301,14 @@ static inline u64 nova_get_epoch_id(struct super_block 
>> *sb)
>>  }
>>
>>  #include "inode.h"
>> +
>> +/* A node in the RB tree representing a range of pages */
>> +struct nova_range_node {
>> + struct rb_node node;
>> + unsigned long range_low;
>> + unsigned long range_high;
>> +};
>> +
>>  #include "bbuild.h"
>>
>>  /* == */
>> diff --git a/fs/nova/super.c b/fs/nova/super.c
>> index f41cc04..aec1cd3 100644
>> --- a/fs/nova/super.c
>> +++ b/fs/nova/super.c
>> @@ -52,6 +52,7 @@ MODULE_PARM_DESC(nova_dbgmask, "Control debugging output");
>>  static struct super_operations nova_sops;
>>
>>  static struct kmem_cache *nova_inode_cachep;
>> +static struct kmem_cache *nova_range_node_cachep;
>>
>>
>>  /* FIXME: should the following variable be one per NOVA instance? */
>> @@ -686,6 +687,20 @@ static void nova_put_super(struct super_block *sb)
>>   sb->s_fs_info = NULL;
>>  }
>>
>> +inline void nova_free_range_node(struct nova_range_node *node)
>> +{
>> + kmem_cache_free(nova_range_node_cachep, node);
>> +}
>> +
>> +inline struct nova_range_node *nova_alloc_range_node(struct super_block *sb)
>> +{
>> + struct nova_range_node *p;
>> +
>> + p = (struct nova_range_node *)
> nit: needless cast

Thanks. Will fix.

Andiry

>> + kmem_cache_zalloc(nova_range_node_cachep, GFP_NOFS);
>> + return p;
>> +}
>> +
>>  static struct inode *nova_alloc_inode(struct super_block *sb)
>>  {
>>   struct nova_inode_info *vi;
>> @@ -719,6 +734,17 @@ static void init_once(void *foo)
>>   inode_init_once(&vi->vfs_inode);
>>  }
>>
>> +static int __init init_rangenode_cache(void)
>> +{
>> + nova_range_node_cachep = kmem_cache_create("nova_range_node_cache",
>> + sizeof(struct nova_range_node),
>> + 0, (SLAB_RECLAIM_ACCOUNT |
>
>> + SLAB_MEM_SPREAD), NULL);
>> + if (nova_range_node_cachep == NULL)
>> + return -ENOMEM;
>> + return 0;
>> +}
>> +
>>  static int __init init_inodecache(void)
>>  {
>>   nova_inode_cachep = kmem_cache_create("nova_inode_cache",
>> @@ -740,6 +766,11 @@ static void destroy_inodecache(void)
>>   kmem_cache_destroy(nova_inode_cachep);
>>  }
>>
>> +static void destroy_rangenode_cache(void)
>> +{
>> + kmem_cache_destroy(nova_range_node_cachep);
>> +}
>> +
>>
>>  /*
>>   * the super block writes are all done "on the fly", so the
>> @@ -781,20 +812,27 @@ static int __init init_nova_fs(void)
>>   nova_info("Arch new instructions support: CLWB %s\n",
>>   support_clwb ? "YES" : "NO");
>>
>> - rc = init_inodecache();
>> + rc = init_rangenode_cache();
>>   if (rc)
>>   goto out;
>>
>> - rc = register_filesystem(&nova_fs_type);
>> + rc = init_inodecache();
>>   if (rc)
>>   goto out1;
>>
>> + rc = register_filesystem(&nova_fs_type);
>> + if (rc)
>> + goto out2;
>> +
>>  out:
>>   NOVA_END_TIMING(init_t, init_time);
>>   return rc;
>>
>> -out1:
>> +out2:
>>   destroy_inodecache();
>> +
>> +out1:
>> + destroy_rangenode_cache();
>>   goto out;
>>  }
>>
>> @@ -802,6 +840,7 @@ static void __exit exit_nova_fs(void)
>>  {
>>   unregister_filesystem(&nova_fs_type);
>>   destroy_inodecache();
>> + destroy_rangenode_cache();
>>  }
>>
>>  MODULE_AUTHOR("Andiry Xu <jix...@cs.ucsd.edu>");
>> diff --git a/fs/nova/super.h b/fs/nova/super.h
>> index cb53908..b478080 100644
>> --- a/fs/nova/super.h
>> +++ b/fs/nova/super.h
>> @@ -145,5 +145,7 @@ static inline struct nova_super_block 
>> *nova_get_super(struct super_block *sb)
>>  }
>>
>>  extern void nova_error_mng(struct super_block *sb, const char *fmt, ...);
>> +extern struct nova_range_node *nova_alloc_range_node(struct super_block 
>> *sb);
>> +extern void nova_free_range_node(struct nova_range_node *node);
>>
>>  #endif
>>

Re: [RFC v2 14/83] Add range node kmem cache.

2018-03-11 Thread Andiry Xu

On Sun, Mar 11, 2018 at 4:55 AM, Nikolay Borisov
<n.borisov.l...@gmail.com> wrote:
>
>
> On 10.03.2018 20:17, Andiry Xu wrote:
>> From: Andiry Xu <jix...@cs.ucsd.edu>
>>
>> Range node specifies a range of [start, end] and is managed by a red-black 
>> tree.
>> NOVA uses range node to manage NVM allocator and inodes being used.
>>
>> Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
>> ---
>>  fs/nova/nova.h  |  8 
>>  fs/nova/super.c | 45 ++---
>>  fs/nova/super.h |  2 ++
>>  3 files changed, 52 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/nova/nova.h b/fs/nova/nova.h
>> index ba7ffca..e0e85fb 100644
>> --- a/fs/nova/nova.h
>> +++ b/fs/nova/nova.h
>> @@ -301,6 +301,14 @@ static inline u64 nova_get_epoch_id(struct super_block 
>> *sb)
>>  }
>>
>>  #include "inode.h"
>> +
>> +/* A node in the RB tree representing a range of pages */
>> +struct nova_range_node {
>> + struct rb_node node;
>> + unsigned long range_low;
>> + unsigned long range_high;
>> +};
>> +
>>  #include "bbuild.h"
>>
>>  /* == */
>> diff --git a/fs/nova/super.c b/fs/nova/super.c
>> index f41cc04..aec1cd3 100644
>> --- a/fs/nova/super.c
>> +++ b/fs/nova/super.c
>> @@ -52,6 +52,7 @@ MODULE_PARM_DESC(nova_dbgmask, "Control debugging output");
>>  static struct super_operations nova_sops;
>>
>>  static struct kmem_cache *nova_inode_cachep;
>> +static struct kmem_cache *nova_range_node_cachep;
>>
>>
>>  /* FIXME: should the following variable be one per NOVA instance? */
>> @@ -686,6 +687,20 @@ static void nova_put_super(struct super_block *sb)
>>   sb->s_fs_info = NULL;
>>  }
>>
>> +inline void nova_free_range_node(struct nova_range_node *node)
>> +{
>> + kmem_cache_free(nova_range_node_cachep, node);
>> +}
>> +
>> +inline struct nova_range_node *nova_alloc_range_node(struct super_block *sb)
>> +{
>> + struct nova_range_node *p;
>> +
>> + p = (struct nova_range_node *)
> nit: needless cast

Thanks. Will fix.

Andiry

>> + kmem_cache_zalloc(nova_range_node_cachep, GFP_NOFS);
>> + return p;
>> +}
>> +
>>  static struct inode *nova_alloc_inode(struct super_block *sb)
>>  {
>>   struct nova_inode_info *vi;
>> @@ -719,6 +734,17 @@ static void init_once(void *foo)
>>   inode_init_once(&vi->vfs_inode);
>>  }
>>
>> +static int __init init_rangenode_cache(void)
>> +{
>> + nova_range_node_cachep = kmem_cache_create("nova_range_node_cache",
>> + sizeof(struct nova_range_node),
>> + 0, (SLAB_RECLAIM_ACCOUNT |
>
>> + SLAB_MEM_SPREAD), NULL);
>> + if (nova_range_node_cachep == NULL)
>> + return -ENOMEM;
>> + return 0;
>> +}
>> +
>>  static int __init init_inodecache(void)
>>  {
>>   nova_inode_cachep = kmem_cache_create("nova_inode_cache",
>> @@ -740,6 +766,11 @@ static void destroy_inodecache(void)
>>   kmem_cache_destroy(nova_inode_cachep);
>>  }
>>
>> +static void destroy_rangenode_cache(void)
>> +{
>> + kmem_cache_destroy(nova_range_node_cachep);
>> +}
>> +
>>
>>  /*
>>   * the super block writes are all done "on the fly", so the
>> @@ -781,20 +812,27 @@ static int __init init_nova_fs(void)
>>   nova_info("Arch new instructions support: CLWB %s\n",
>>   support_clwb ? "YES" : "NO");
>>
>> - rc = init_inodecache();
>> + rc = init_rangenode_cache();
>>   if (rc)
>>   goto out;
>>
>> - rc = register_filesystem(&nova_fs_type);
>> + rc = init_inodecache();
>>   if (rc)
>>   goto out1;
>>
>> + rc = register_filesystem(&nova_fs_type);
>> + if (rc)
>> + goto out2;
>> +
>>  out:
>>   NOVA_END_TIMING(init_t, init_time);
>>   return rc;
>>
>> -out1:
>> +out2:
>>   destroy_inodecache();
>> +
>> +out1:
>> + destroy_rangenode_cache();
>>   goto out;
>>  }
>>
>> @@ -802,6 +840,7 @@ static void __exit exit_nova_fs(void)
>>  {
>>   unregister_filesystem(&nova_fs_type);
>>   destroy_inodecache();
>> + destroy_rangenode_cache();
>>  }
>>
>>  MODULE_AUTHOR("Andiry Xu <jix...@cs.ucsd.edu>");
>> diff --git a/fs/nova/super.h b/fs/nova/super.h
>> index cb53908..b478080 100644
>> --- a/fs/nova/super.h
>> +++ b/fs/nova/super.h
>> @@ -145,5 +145,7 @@ static inline struct nova_super_block 
>> *nova_get_super(struct super_block *sb)
>>  }
>>
>>  extern void nova_error_mng(struct super_block *sb, const char *fmt, ...);
>> +extern struct nova_range_node *nova_alloc_range_node(struct super_block 
>> *sb);
>> +extern void nova_free_range_node(struct nova_range_node *node);
>>
>>  #endif
>>

Re: [RFC v2 16/83] Initialize block map and free lists in nova_init().

2018-03-11 Thread Andiry Xu

On Sun, Mar 11, 2018 at 5:12 AM, Nikolay Borisov
<n.borisov.l...@gmail.com> wrote:
>
>
> On 10.03.2018 20:17, Andiry Xu wrote:
>> From: Andiry Xu <jix...@cs.ucsd.edu>
>>
>> NOVA divides the pmem range equally among per-CPU free lists,
>> and formats the red-black trees by inserting the initial free range.
>>
>> Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
>> ---
>>  fs/nova/balloc.c | 161 
>> +++
>>  fs/nova/balloc.h |  13 -
>>  fs/nova/super.c  |   2 +
>>  3 files changed, 175 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/nova/balloc.c b/fs/nova/balloc.c
>> index 450c942..cb627db 100644
>> --- a/fs/nova/balloc.c
>> +++ b/fs/nova/balloc.c
>> @@ -55,4 +55,165 @@ void nova_delete_free_lists(struct super_block *sb)
>>   sbi->free_lists = NULL;
>>  }
>>
>> +// Initialize a free list.  Each CPU gets an equal share of the block space 
>> to
>> +// manage.
>> +static void nova_init_free_list(struct super_block *sb,
>> + struct free_list *free_list, int index)
>> +{
>> + struct nova_sb_info *sbi = NOVA_SB(sb);
>> + unsigned long per_list_blocks;
>> +
>> + per_list_blocks = sbi->num_blocks / sbi->cpus;
>
> nit: You've already initialised per_list_blocks in nova_init_blockmap,
> which calls this function. So just reference it, rather than performing
> the division every time
>

Thanks for catching this.

>> +
>> + free_list->block_start = per_list_blocks * index;
>> + free_list->block_end = free_list->block_start +
>> + per_list_blocks - 1;
>> + if (index == 0)
>> + free_list->block_start += sbi->head_reserved_blocks;
>> + if (index == sbi->cpus - 1)
>> + free_list->block_end -= sbi->tail_reserved_blocks;
>> +}
>> +
>> +inline struct nova_range_node *nova_alloc_blocknode(struct super_block *sb)
>> +{
>> + return nova_alloc_range_node(sb);
>> +}
>> +
>> +inline void nova_free_blocknode(struct super_block *sb,
>> + struct nova_range_node *node)
>> +{
>> + nova_free_range_node(node);
>> +}
>> +
>> +void nova_init_blockmap(struct super_block *sb, int recovery)
>> +{
>> + struct nova_sb_info *sbi = NOVA_SB(sb);
>> + struct rb_root *tree;
>> + struct nova_range_node *blknode;
>> + struct free_list *free_list;
>> + int i;
>> + int ret;
>> +
>> + /* Divide the block range among per-CPU free lists */
>> + sbi->per_list_blocks = sbi->num_blocks / sbi->cpus;
>> + for (i = 0; i < sbi->cpus; i++) {
>> + free_list = nova_get_free_list(sb, i);
>> + tree = &(free_list->block_free_tree);
>> + nova_init_free_list(sb, free_list, i);
>> +
>> + /* For recovery, update these fields later */
>> + if (recovery == 0) {
>> + free_list->num_free_blocks = free_list->block_end -
>> + free_list->block_start + 1;
>> +
>> + blknode = nova_alloc_blocknode(sb);
>> + if (blknode == NULL)
>> + return;
>> + blknode->range_low = free_list->block_start;
>> + blknode->range_high = free_list->block_end;
>> + ret = nova_insert_blocktree(sbi, tree, blknode);
>> + if (ret) {
>> + nova_err(sb, "%s failed\n", __func__);
>> + nova_free_blocknode(sb, blknode);
>> + return;
>> + }
>> + free_list->first_node = blknode;
>> + free_list->last_node = blknode;
>> + free_list->num_blocknode = 1;
>> + }
>> +
>> + nova_dbgv("%s: free list %d: block start %lu, end %lu, %lu 
>> free blocks\n",
>> +   __func__, i,
>> +   free_list->block_start,
>> +   free_list->block_end,
>> +   free_list->num_free_blocks);
>> + }
>> +}
>> +
>> +static inline int nova_rbtree_compare_rangenode(struct nova_range_node 
>> *curr,
>> + unsigned long range_low)
>> +{
>> + if (range_low <

Re: [RFC v2 16/83] Initialize block map and free lists in nova_init().

2018-03-11 Thread Andiry Xu

On Sun, Mar 11, 2018 at 5:12 AM, Nikolay Borisov
<n.borisov.l...@gmail.com> wrote:
>
>
> On 10.03.2018 20:17, Andiry Xu wrote:
>> From: Andiry Xu <jix...@cs.ucsd.edu>
>>
>> NOVA divides the pmem range equally among per-CPU free lists,
>> and formats the red-black trees by inserting the initial free range.
>>
>> Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
>> ---
>>  fs/nova/balloc.c | 161 
>> +++
>>  fs/nova/balloc.h |  13 -
>>  fs/nova/super.c  |   2 +
>>  3 files changed, 175 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/nova/balloc.c b/fs/nova/balloc.c
>> index 450c942..cb627db 100644
>> --- a/fs/nova/balloc.c
>> +++ b/fs/nova/balloc.c
>> @@ -55,4 +55,165 @@ void nova_delete_free_lists(struct super_block *sb)
>>   sbi->free_lists = NULL;
>>  }
>>
>> +// Initialize a free list.  Each CPU gets an equal share of the block space 
>> to
>> +// manage.
>> +static void nova_init_free_list(struct super_block *sb,
>> + struct free_list *free_list, int index)
>> +{
>> + struct nova_sb_info *sbi = NOVA_SB(sb);
>> + unsigned long per_list_blocks;
>> +
>> + per_list_blocks = sbi->num_blocks / sbi->cpus;
>
> nit: You've already initialised per_list_blocks in nova_init_blockmap,
> which calls this function. So just reference it, rather than performing
> the division every time
>

Thanks for catching this.

>> +
>> + free_list->block_start = per_list_blocks * index;
>> + free_list->block_end = free_list->block_start +
>> + per_list_blocks - 1;
>> + if (index == 0)
>> + free_list->block_start += sbi->head_reserved_blocks;
>> + if (index == sbi->cpus - 1)
>> + free_list->block_end -= sbi->tail_reserved_blocks;
>> +}
>> +
>> +inline struct nova_range_node *nova_alloc_blocknode(struct super_block *sb)
>> +{
>> + return nova_alloc_range_node(sb);
>> +}
>> +
>> +inline void nova_free_blocknode(struct super_block *sb,
>> + struct nova_range_node *node)
>> +{
>> + nova_free_range_node(node);
>> +}
>> +
>> +void nova_init_blockmap(struct super_block *sb, int recovery)
>> +{
>> + struct nova_sb_info *sbi = NOVA_SB(sb);
>> + struct rb_root *tree;
>> + struct nova_range_node *blknode;
>> + struct free_list *free_list;
>> + int i;
>> + int ret;
>> +
>> + /* Divide the block range among per-CPU free lists */
>> + sbi->per_list_blocks = sbi->num_blocks / sbi->cpus;
>> + for (i = 0; i < sbi->cpus; i++) {
>> + free_list = nova_get_free_list(sb, i);
>> + tree = &(free_list->block_free_tree);
>> + nova_init_free_list(sb, free_list, i);
>> +
>> + /* For recovery, update these fields later */
>> + if (recovery == 0) {
>> + free_list->num_free_blocks = free_list->block_end -
>> + free_list->block_start + 1;
>> +
>> + blknode = nova_alloc_blocknode(sb);
>> + if (blknode == NULL)
>> + return;
>> + blknode->range_low = free_list->block_start;
>> + blknode->range_high = free_list->block_end;
>> + ret = nova_insert_blocktree(sbi, tree, blknode);
>> + if (ret) {
>> + nova_err(sb, "%s failed\n", __func__);
>> + nova_free_blocknode(sb, blknode);
>> + return;
>> + }
>> + free_list->first_node = blknode;
>> + free_list->last_node = blknode;
>> + free_list->num_blocknode = 1;
>> + }
>> +
>> + nova_dbgv("%s: free list %d: block start %lu, end %lu, %lu 
>> free blocks\n",
>> +   __func__, i,
>> +   free_list->block_start,
>> +   free_list->block_end,
>> +   free_list->num_free_blocks);
>> + }
>> +}
>> +
>> +static inline int nova_rbtree_compare_rangenode(struct nova_range_node 
>> *curr,
>> + unsigned long range_low)
>> +{
>> + if (range_low < curr->range_low)
>> + return -1;
>> + if (range_

Re: [RFC v2 00/83] NOVA: a new file system for persistent memory

2018-03-10 Thread Andiry Xu

On Sat, Mar 10, 2018 at 6:14 PM, Theodore Y. Ts'o  wrote:
> FYI, your patch set doesn't even compile for me without these fixups.
> I'm not sure why you were trying to declare inline functions in a
> header file without the function body?
>

Thanks for catching this. I will fix it in the next version and adopt
stricter flags next time.

Thanks,
Andiry

> - Ted
>
> diff --git a/fs/nova/balloc.c b/fs/nova/balloc.c
> index 8e992156f28c..9c7b74aa712e 100644
> --- a/fs/nova/balloc.c
> +++ b/fs/nova/balloc.c
> @@ -74,12 +74,12 @@ static void nova_init_free_list(struct super_block *sb,
> free_list->block_end -= sbi->tail_reserved_blocks;
>  }
>
> -inline struct nova_range_node *nova_alloc_blocknode(struct super_block *sb)
> +struct nova_range_node *nova_alloc_blocknode(struct super_block *sb)
>  {
> return nova_alloc_range_node(sb);
>  }
>
> -inline void nova_free_blocknode(struct super_block *sb,
> +void nova_free_blocknode(struct super_block *sb,
> struct nova_range_node *node)
>  {
> nova_free_range_node(node);
> @@ -206,7 +206,7 @@ int nova_insert_range_node(struct rb_root *tree,
> return 0;
>  }
>
> -inline int nova_insert_blocktree(struct nova_sb_info *sbi,
> +int nova_insert_blocktree(struct nova_sb_info *sbi,
> struct rb_root *tree, struct nova_range_node *new_node)
>  {
> int ret;
> @@ -659,7 +659,7 @@ static int nova_new_blocks(struct super_block *sb, 
> unsigned long *blocknr,
>
>  // Allocate data blocks.  The offset for the allocated block comes back in
>  // blocknr.  Return the number of blocks allocated.
> -inline int nova_new_data_blocks(struct super_block *sb,
> +int nova_new_data_blocks(struct super_block *sb,
> struct nova_inode_info_header *sih, unsigned long *blocknr,
> unsigned long start_blk, unsigned int num,
> enum nova_alloc_init zero, int cpu,
> diff --git a/fs/nova/balloc.h b/fs/nova/balloc.h
> index 463fbac99eff..aca7e8c18dde 100644
> --- a/fs/nova/balloc.h
> +++ b/fs/nova/balloc.h
> @@ -62,18 +62,18 @@ enum alloc_type {
>
>  int nova_alloc_block_free_lists(struct super_block *sb);
>  void nova_delete_free_lists(struct super_block *sb);
> -inline struct nova_range_node *nova_alloc_blocknode(struct super_block *sb);
> -inline void nova_free_blocknode(struct super_block *sb,
> +struct nova_range_node *nova_alloc_blocknode(struct super_block *sb);
> +void nova_free_blocknode(struct super_block *sb,
> struct nova_range_node *bnode);
>  extern void nova_init_blockmap(struct super_block *sb, int recovery);
>  extern unsigned long nova_count_free_blocks(struct super_block *sb);
> -inline int nova_insert_blocktree(struct nova_sb_info *sbi,
> +int nova_insert_blocktree(struct nova_sb_info *sbi,
> struct rb_root *tree, struct nova_range_node *new_node);
>  extern int nova_free_data_blocks(struct super_block *sb,
> struct nova_inode_info_header *sih, unsigned long blocknr, int num);
>  extern int nova_free_log_blocks(struct super_block *sb,
> struct nova_inode_info_header *sih, unsigned long blocknr, int num);
> -extern inline int nova_new_data_blocks(struct super_block *sb,
> +extern int nova_new_data_blocks(struct super_block *sb,
> struct nova_inode_info_header *sih, unsigned long *blocknr,
> unsigned long start_blk, unsigned int num,
> enum nova_alloc_init zero, int cpu,
> diff --git a/fs/nova/inode.c b/fs/nova/inode.c
> index 21be31a05d26..31ef258978ba 100644
> --- a/fs/nova/inode.c
> +++ b/fs/nova/inode.c
> @@ -440,7 +440,7 @@ struct inode *nova_iget(struct super_block *sb, unsigned 
> long ino)
> return ERR_PTR(err);
>  }
>
> -inline int nova_insert_inodetree(struct nova_sb_info *sbi,
> +int nova_insert_inodetree(struct nova_sb_info *sbi,
> struct nova_range_node *new_node, int cpu)
>  {
> struct rb_root *tree;
> diff --git a/fs/nova/inode.h b/fs/nova/inode.h
> index 086a7cba8ac3..1097e15ff7af 100644
> --- a/fs/nova/inode.h
> +++ b/fs/nova/inode.h
> @@ -254,7 +254,7 @@ int nova_init_inode_table(struct super_block *sb);
>  int nova_get_inode_address(struct super_block *sb, u64 ino,
> u64 *pi_addr, int extendable);
>  struct inode *nova_iget(struct super_block *sb, unsigned long ino);
> -inline int nova_insert_inodetree(struct nova_sb_info *sbi,
> +int nova_insert_inodetree(struct nova_sb_info *sbi,
> struct nova_range_node *new_node, int cpu);
>  u64 nova_new_nova_inode(struct super_block *sb, u64 *pi_addr);
>  struct inode *nova_new_vfs_inode(enum nova_new_inode_type type,
> diff --git a/fs/nova/super.c b/fs/nova/super.c
> index 039c003b698b..9f06ec847c95 100644
> --- a/fs/nova/super.c
> +++ b/fs/nova/super.c
> @@ -795,23 +795,23 @@ static void nova_put_super(struct super_block *sb)
> sb->s_fs_info = NULL;
>  }
>
> -inline void nova_free_range_node(struct nova_range_node *node)
> +void nova_free_range_node(struct nova_range_node *node)

Re: [RFC v2 00/83] NOVA: a new file system for persistent memory

2018-03-10 Thread Andiry Xu

On Sat, Mar 10, 2018 at 6:14 PM, Theodore Y. Ts'o  wrote:
> FYI, your patch set doesn't even compile for me without these fixups.
> I'm not sure why you were trying to declare inline functions in a
> header file without the function body?
>

Thanks for catching this. I will fix it in the next version and adopt
stricter flags next time.

Thanks,
Andiry

> - Ted
>
> diff --git a/fs/nova/balloc.c b/fs/nova/balloc.c
> index 8e992156f28c..9c7b74aa712e 100644
> --- a/fs/nova/balloc.c
> +++ b/fs/nova/balloc.c
> @@ -74,12 +74,12 @@ static void nova_init_free_list(struct super_block *sb,
> free_list->block_end -= sbi->tail_reserved_blocks;
>  }
>
> -inline struct nova_range_node *nova_alloc_blocknode(struct super_block *sb)
> +struct nova_range_node *nova_alloc_blocknode(struct super_block *sb)
>  {
> return nova_alloc_range_node(sb);
>  }
>
> -inline void nova_free_blocknode(struct super_block *sb,
> +void nova_free_blocknode(struct super_block *sb,
> struct nova_range_node *node)
>  {
> nova_free_range_node(node);
> @@ -206,7 +206,7 @@ int nova_insert_range_node(struct rb_root *tree,
> return 0;
>  }
>
> -inline int nova_insert_blocktree(struct nova_sb_info *sbi,
> +int nova_insert_blocktree(struct nova_sb_info *sbi,
> struct rb_root *tree, struct nova_range_node *new_node)
>  {
> int ret;
> @@ -659,7 +659,7 @@ static int nova_new_blocks(struct super_block *sb, 
> unsigned long *blocknr,
>
>  // Allocate data blocks.  The offset for the allocated block comes back in
>  // blocknr.  Return the number of blocks allocated.
> -inline int nova_new_data_blocks(struct super_block *sb,
> +int nova_new_data_blocks(struct super_block *sb,
> struct nova_inode_info_header *sih, unsigned long *blocknr,
> unsigned long start_blk, unsigned int num,
> enum nova_alloc_init zero, int cpu,
> diff --git a/fs/nova/balloc.h b/fs/nova/balloc.h
> index 463fbac99eff..aca7e8c18dde 100644
> --- a/fs/nova/balloc.h
> +++ b/fs/nova/balloc.h
> @@ -62,18 +62,18 @@ enum alloc_type {
>
>  int nova_alloc_block_free_lists(struct super_block *sb);
>  void nova_delete_free_lists(struct super_block *sb);
> -inline struct nova_range_node *nova_alloc_blocknode(struct super_block *sb);
> -inline void nova_free_blocknode(struct super_block *sb,
> +struct nova_range_node *nova_alloc_blocknode(struct super_block *sb);
> +void nova_free_blocknode(struct super_block *sb,
> struct nova_range_node *bnode);
>  extern void nova_init_blockmap(struct super_block *sb, int recovery);
>  extern unsigned long nova_count_free_blocks(struct super_block *sb);
> -inline int nova_insert_blocktree(struct nova_sb_info *sbi,
> +int nova_insert_blocktree(struct nova_sb_info *sbi,
> struct rb_root *tree, struct nova_range_node *new_node);
>  extern int nova_free_data_blocks(struct super_block *sb,
> struct nova_inode_info_header *sih, unsigned long blocknr, int num);
>  extern int nova_free_log_blocks(struct super_block *sb,
> struct nova_inode_info_header *sih, unsigned long blocknr, int num);
> -extern inline int nova_new_data_blocks(struct super_block *sb,
> +extern int nova_new_data_blocks(struct super_block *sb,
> struct nova_inode_info_header *sih, unsigned long *blocknr,
> unsigned long start_blk, unsigned int num,
> enum nova_alloc_init zero, int cpu,
> diff --git a/fs/nova/inode.c b/fs/nova/inode.c
> index 21be31a05d26..31ef258978ba 100644
> --- a/fs/nova/inode.c
> +++ b/fs/nova/inode.c
> @@ -440,7 +440,7 @@ struct inode *nova_iget(struct super_block *sb, unsigned 
> long ino)
> return ERR_PTR(err);
>  }
>
> -inline int nova_insert_inodetree(struct nova_sb_info *sbi,
> +int nova_insert_inodetree(struct nova_sb_info *sbi,
> struct nova_range_node *new_node, int cpu)
>  {
> struct rb_root *tree;
> diff --git a/fs/nova/inode.h b/fs/nova/inode.h
> index 086a7cba8ac3..1097e15ff7af 100644
> --- a/fs/nova/inode.h
> +++ b/fs/nova/inode.h
> @@ -254,7 +254,7 @@ int nova_init_inode_table(struct super_block *sb);
>  int nova_get_inode_address(struct super_block *sb, u64 ino,
> u64 *pi_addr, int extendable);
>  struct inode *nova_iget(struct super_block *sb, unsigned long ino);
> -inline int nova_insert_inodetree(struct nova_sb_info *sbi,
> +int nova_insert_inodetree(struct nova_sb_info *sbi,
> struct nova_range_node *new_node, int cpu);
>  u64 nova_new_nova_inode(struct super_block *sb, u64 *pi_addr);
>  struct inode *nova_new_vfs_inode(enum nova_new_inode_type type,
> diff --git a/fs/nova/super.c b/fs/nova/super.c
> index 039c003b698b..9f06ec847c95 100644
> --- a/fs/nova/super.c
> +++ b/fs/nova/super.c
> @@ -795,23 +795,23 @@ static void nova_put_super(struct super_block *sb)
> sb->s_fs_info = NULL;
>  }
>
> -inline void nova_free_range_node(struct nova_range_node *node)
> +void nova_free_range_node(struct nova_range_node *node)
>  {
>

[RFC v2 04/83] NOVA inode definition.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

inode.h defines the non-volatile and volatile NOVA inode data structures.

The non-volatile NOVA inode (nova_inode) is aligned to 128 bytes and contains
file/directory metadata information. The most important fields
are log_head and log_tail. log_head points to the start of
the log, and log_tail points to the end of the latest committed
log entry. NOVA makes updates to the inode by appending
to the log tail and updating the log_tail pointer atomically.

The volatile NOVA inode (nova_inode_info) contains necessary
information to limit access to the non-volatile NOVA inode during runtime.
It has a radix tree to map file offset or filenames to the corresponding
log entries.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/inode.h | 187 
 1 file changed, 187 insertions(+)
 create mode 100644 fs/nova/inode.h

diff --git a/fs/nova/inode.h b/fs/nova/inode.h
new file mode 100644
index 000..f9187e3
--- /dev/null
+++ b/fs/nova/inode.h
@@ -0,0 +1,187 @@
+#ifndef __INODE_H
+#define __INODE_H
+
+struct nova_inode_info_header;
+struct nova_inode;
+
+#include "super.h"
+
+enum nova_new_inode_type {
+   TYPE_CREATE = 0,
+   TYPE_MKNOD,
+   TYPE_SYMLINK,
+   TYPE_MKDIR
+};
+
+
+/*
+ * Structure of an inode in PMEM
+ * Keep the inode size to within 120 bytes: We use the last eight bytes
+ * as inode table tail pointer.
+ */
+struct nova_inode {
+
+   /* first 40 bytes */
+   u8  i_rsvd;  /* reserved. used to be checksum */
+   u8  valid;   /* Is this inode valid? */
+   u8  deleted; /* Is this inode deleted? */
+   u8  i_blk_type;  /* data block size this inode uses */
+   __le32  i_flags; /* Inode flags */
+   __le64  i_size;  /* Size of data in bytes */
+   __le32  i_ctime; /* Inode modification time */
+   __le32  i_mtime; /* Inode b-tree Modification time */
+   __le32  i_atime; /* Access time */
+   __le16  i_mode;  /* File mode */
+   __le16  i_links_count;   /* Links count */
+
+   __le64  i_xattr; /* Extended attribute block */
+
+   /* second 40 bytes */
+   __le32  i_uid;   /* Owner Uid */
+   __le32  i_gid;   /* Group Id */
+   __le32  i_generation;/* File version (for NFS) */
+   __le32  i_create_time;   /* Create time */
+   __le64  nova_ino;/* nova inode number */
+
+   __le64  log_head;/* Log head pointer */
+   __le64  log_tail;/* Log tail pointer */
+
+   /* last 40 bytes */
+   __le64  create_epoch_id; /* Transaction ID when create */
+   __le64  delete_epoch_id; /* Transaction ID when deleted */
+
+   struct {
+   __le32 rdev; /* major/minor # */
+   } dev;   /* device inode */
+
+   __le32  csum;/* CRC32 checksum */
+
+   /* Leave 8 bytes for inode table tail pointer */
+} __attribute((__packed__));
+
+/*
+ * NOVA-specific inode state kept in DRAM
+ */
+struct nova_inode_info_header {
+   /* For files, tree holds a map from file offsets to
+* write log entries.
+*
+* For directories, tree holds a map from a hash of the file name to
+* dentry log entry.
+*/
+   struct radix_tree_root tree;
+   struct rw_semaphore i_sem;  /* Protect log and tree */
+   unsigned short i_mode;  /* Dir or file? */
+   unsigned int i_flags;
+   unsigned long log_pages;/* Num of log pages */
+   unsigned long i_size;
+   unsigned long i_blocks;
+   unsigned long ino;
+   unsigned long pi_addr;
+   unsigned long valid_entries;/* For thorough GC */
+   unsigned long num_entries;  /* For thorough GC */
+   u64 last_setattr;   /* Last setattr entry */
+   u64 last_link_change;   /* Last link change entry */
+   u64 last_dentry;/* Last updated dentry */
+   u64 trans_id;   /* Transaction ID */
+   u64 log_head;   /* Log head pointer */
+   u64 log_tail;   /* Log tail pointer */
+   u8  i_blk_type;
+};
+
+/*
+ * DRAM state for inodes
+ */
+struct nova_inode_info {
+   struct nova_inode_info_header header;
+   struct inode vfs_inode;
+};
+
+
+static inline struct nova_inode_info *NOVA_I(struct inode *inode)
+{
+   return container_of(inode, struct nova_inode_info, vfs_inode);
+}
+
+static inline void sih_lock(struct nova_inode_info_header *header)
+{
+   down_write(&header->i_sem);
+}
+
+static inline void sih_unlock(struct nova_inode_info_header *header)
+{
+   up_write(&header->i_sem);
+}
+
+static inline void sih_lock_shared(struct nova_inode_info_header *header)
+{
+   down_read(&header->i_sem);
+}
+
+static inline void sih_unlock_shared(struct

[RFC v2 04/83] NOVA inode definition.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

inode.h defines the non-volatile and volatile NOVA inode data structures.

The non-volatile NOVA inode (nova_inode) is aligned to 128 bytes and contains
file/directory metadata information. The most important fields
are log_head and log_tail. log_head points to the start of
the log, and log_tail points to the end of the latest committed
log entry. NOVA makes updates to the inode by appending
to the log tail and updating the log_tail pointer atomically.

The volatile NOVA inode (nova_inode_info) contains necessary
information to limit access to the non-volatile NOVA inode during runtime.
It has a radix tree to map file offset or filenames to the corresponding
log entries.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/inode.h | 187 
 1 file changed, 187 insertions(+)
 create mode 100644 fs/nova/inode.h

diff --git a/fs/nova/inode.h b/fs/nova/inode.h
new file mode 100644
index 000..f9187e3
--- /dev/null
+++ b/fs/nova/inode.h
@@ -0,0 +1,187 @@
+#ifndef __INODE_H
+#define __INODE_H
+
+struct nova_inode_info_header;
+struct nova_inode;
+
+#include "super.h"
+
+enum nova_new_inode_type {
+   TYPE_CREATE = 0,
+   TYPE_MKNOD,
+   TYPE_SYMLINK,
+   TYPE_MKDIR
+};
+
+
+/*
+ * Structure of an inode in PMEM
+ * Keep the inode size to within 120 bytes: We use the last eight bytes
+ * as inode table tail pointer.
+ */
+struct nova_inode {
+
+   /* first 40 bytes */
+   u8  i_rsvd;  /* reserved. used to be checksum */
+   u8  valid;   /* Is this inode valid? */
+   u8  deleted; /* Is this inode deleted? */
+   u8  i_blk_type;  /* data block size this inode uses */
+   __le32  i_flags; /* Inode flags */
+   __le64  i_size;  /* Size of data in bytes */
+   __le32  i_ctime; /* Inode modification time */
+   __le32  i_mtime; /* Inode b-tree Modification time */
+   __le32  i_atime; /* Access time */
+   __le16  i_mode;  /* File mode */
+   __le16  i_links_count;   /* Links count */
+
+   __le64  i_xattr; /* Extended attribute block */
+
+   /* second 40 bytes */
+   __le32  i_uid;   /* Owner Uid */
+   __le32  i_gid;   /* Group Id */
+   __le32  i_generation;/* File version (for NFS) */
+   __le32  i_create_time;   /* Create time */
+   __le64  nova_ino;/* nova inode number */
+
+   __le64  log_head;/* Log head pointer */
+   __le64  log_tail;/* Log tail pointer */
+
+   /* last 40 bytes */
+   __le64  create_epoch_id; /* Transaction ID when create */
+   __le64  delete_epoch_id; /* Transaction ID when deleted */
+
+   struct {
+   __le32 rdev; /* major/minor # */
+   } dev;   /* device inode */
+
+   __le32  csum;/* CRC32 checksum */
+
+   /* Leave 8 bytes for inode table tail pointer */
+} __attribute((__packed__));
+
+/*
+ * NOVA-specific inode state kept in DRAM
+ */
+struct nova_inode_info_header {
+   /* For files, tree holds a map from file offsets to
+* write log entries.
+*
+* For directories, tree holds a map from a hash of the file name to
+* dentry log entry.
+*/
+   struct radix_tree_root tree;
+   struct rw_semaphore i_sem;  /* Protect log and tree */
+   unsigned short i_mode;  /* Dir or file? */
+   unsigned int i_flags;
+   unsigned long log_pages;/* Num of log pages */
+   unsigned long i_size;
+   unsigned long i_blocks;
+   unsigned long ino;
+   unsigned long pi_addr;
+   unsigned long valid_entries;/* For thorough GC */
+   unsigned long num_entries;  /* For thorough GC */
+   u64 last_setattr;   /* Last setattr entry */
+   u64 last_link_change;   /* Last link change entry */
+   u64 last_dentry;/* Last updated dentry */
+   u64 trans_id;   /* Transaction ID */
+   u64 log_head;   /* Log head pointer */
+   u64 log_tail;   /* Log tail pointer */
+   u8  i_blk_type;
+};
+
+/*
+ * DRAM state for inodes
+ */
+struct nova_inode_info {
+   struct nova_inode_info_header header;
+   struct inode vfs_inode;
+};
+
+
+static inline struct nova_inode_info *NOVA_I(struct inode *inode)
+{
+   return container_of(inode, struct nova_inode_info, vfs_inode);
+}
+
+static inline void sih_lock(struct nova_inode_info_header *header)
+{
+   down_write(&header->i_sem);
+}
+
+static inline void sih_unlock(struct nova_inode_info_header *header)
+{
+   up_write(&header->i_sem);
+}
+
+static inline void sih_lock_shared(struct nova_inode_info_header *header)
+{
+   down_read(&header->i_sem);
+}
+
+static inline void sih_unlock_shared(struct nova_inode_info_header *header)
+{
+   up_read(&header->i_sem);
+}
+
+

[RFC v2 03/83] Add super.h.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

This header file defines NOVA persistent and volatile superblock
data structures.

It also defines NOVA block layout:

Page 0: Superblock
Page 1: Reserved inodes
Page 2 - 15: Reserved
Page 16 - 31: Inode table pointers
Page 32 - 47: Journal address pointers
Page 48 - 63: Reserved
Pages n-2: Replicate reserved inodes
Pages n-1: Replicate superblock

Other pages are for normal inodes, logs and data.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/super.h | 149 
 1 file changed, 149 insertions(+)
 create mode 100644 fs/nova/super.h

diff --git a/fs/nova/super.h b/fs/nova/super.h
new file mode 100644
index 000..cb53908
--- /dev/null
+++ b/fs/nova/super.h
@@ -0,0 +1,149 @@
+#ifndef __SUPER_H
+#define __SUPER_H
+/*
+ * Structure of the NOVA super block in PMEM
+ *
+ * The fields are partitioned into static and dynamic fields. The static fields
+ * never change after file system creation. This was primarily done because
+ * nova_get_block() returns NULL if the block offset is 0 (helps in catching
+ * bugs). So if we modify any field using journaling (for consistency), we
+ * will have to modify s_sum which is at offset 0. So journaling code fails.
+ * This (static+dynamic fields) is a temporary solution and can be avoided
+ * once the file system becomes stable and nova_get_block() returns correct
+ * pointers even for offset 0.
+ */
+struct nova_super_block {
+   /* static fields. they never change after file system creation.
+* checksum only validates up to s_start_dynamic field below
+*/
+   __le32  s_sum;  /* checksum of this sb */
+   __le32  s_magic;/* magic signature */
+   __le32  s_padding32;
+   __le32  s_blocksize;/* blocksize in bytes */
+   __le64  s_size; /* total size of fs in bytes */
+   char    s_volume_name[16];  /* volume name */
+
+   /* all the dynamic fields should go here */
+   __le64  s_epoch_id; /* Epoch ID */
+
+   /* s_mtime and s_wtime should be together and their order should not be
+* changed. we use an 8 byte write to update both of them atomically
+*/
+   __le32  s_mtime;/* mount time */
+   __le32  s_wtime;/* write time */
+} __attribute((__packed__));
+
+#define NOVA_SB_SIZE 512   /* must be power of two */
+
+/* === Reserved blocks = */
+
+/*
+ * Page 0 contains super blocks;
+ * Page 1 contains reserved inodes;
+ * Page 2 - 15 are reserved.
+ * Page 16 - 31 contain pointers to inode tables.
+ * Page 32 - 47 contain pointers to journal pages.
+ */
+#define HEAD_RESERVED_BLOCKS    64
+#define NUM_JOURNAL_PAGES       16
+
+#define SUPER_BLOCK_START       0 // Superblock
+#define RESERVE_INODE_START     1 // Reserved inodes
+#define INODE_TABLE_START       16 // inode table pointers
+#define JOURNAL_START           32 // journal pointer table
+
+/* For replica super block and replica reserved inodes */
+#define TAIL_RESERVED_BLOCKS    2
+
+/* === Reserved inodes = */
+
+/* We have space for 31 reserved inodes */
+#define NOVA_ROOT_INO  (1)
+#define NOVA_INODETABLE_INO    (2) /* Fake inode associated with inode
+* storage.  We need this because our
+* allocator requires inode to be
+* associated with each allocation.
+* The data actually lives in linked
+* lists in INODE_TABLE_START. */
+#define NOVA_BLOCKNODE_INO (3) /* Storage for allocator state */
+#define NOVA_LITEJOURNAL_INO   (4) /* Storage for lightweight journals */
+#define NOVA_INODELIST_INO (5) /* Storage for Inode free list */
+
+
+/* Normal inode starts at 32 */
+#define NOVA_NORMAL_INODE_START  (32)
+
+
+
+/*
+ * NOVA super-block data in DRAM
+ */
+struct nova_sb_info {
+   struct super_block *sb; /* VFS super block */
+   struct nova_super_block *nova_sb;   /* DRAM copy of SB */
+   struct block_device *s_bdev;
+   struct dax_device *s_dax_dev;
+
+   /*
+* base physical and virtual address of NOVA (which is also
+* the pointer to the super block)
+*/
+   phys_addr_t phys_addr;
+   void*virt_addr;
+   void*replica_reserved_inodes_addr;
+   void*replica_sb_addr;
+
+   unsigned long   num_blocks;
+
+   /* Mount options */
+   unsigned long   bpi;
+   unsigned long   blocksize;
+   unsigned long   initsize;
+   unsigned long

[RFC v2 03/83] Add super.h.

2018-03-10 Thread Andiry Xu

From: Andiry Xu 

This header file defines NOVA persistent and volatile superblock
data structures.

It also defines NOVA block layout:

Page 0: Superblock
Page 1: Reserved inodes
Page 2 - 15: Reserved
Page 16 - 31: Inode table pointers
Page 32 - 47: Journal address pointers
Page 48 - 63: Reserved
Pages n-2: Replicate reserved inodes
Pages n-1: Replicate superblock

Other pages are for normal inodes, logs and data.

Signed-off-by: Andiry Xu 
---
 fs/nova/super.h | 149 
 1 file changed, 149 insertions(+)
 create mode 100644 fs/nova/super.h

diff --git a/fs/nova/super.h b/fs/nova/super.h
new file mode 100644
index 000..cb53908
--- /dev/null
+++ b/fs/nova/super.h
@@ -0,0 +1,149 @@
+#ifndef __SUPER_H
+#define __SUPER_H
+/*
+ * Structure of the NOVA super block in PMEM
+ *
+ * The fields are partitioned into static and dynamic fields. The static fields
+ * never change after file system creation. This was primarily done because
+ * nova_get_block() returns NULL if the block offset is 0 (helps in catching
+ * bugs). So if we modify any field using journaling (for consistency), we
+ * will have to modify s_sum which is at offset 0. So journaling code fails.
+ * This (static+dynamic fields) is a temporary solution and can be avoided
+ * once the file system becomes stable and nova_get_block() returns correct
+ * pointers even for offset 0.
+ */
+struct nova_super_block {
+   /* static fields. they never change after file system creation.
+* checksum only validates up to s_start_dynamic field below
+*/
+   __le32  s_sum;  /* checksum of this sb */
+   __le32  s_magic;/* magic signature */
+   __le32  s_padding32;
+   __le32  s_blocksize;/* blocksize in bytes */
+   __le64  s_size; /* total size of fs in bytes */
+   char    s_volume_name[16];  /* volume name */
+
+   /* all the dynamic fields should go here */
+   __le64  s_epoch_id; /* Epoch ID */
+
+   /* s_mtime and s_wtime should be together and their order should not be
+* changed. we use an 8 byte write to update both of them atomically
+*/
+   __le32  s_mtime;/* mount time */
+   __le32  s_wtime;/* write time */
+} __attribute((__packed__));
+
+#define NOVA_SB_SIZE 512   /* must be power of two */
+
+/* === Reserved blocks = */
+
+/*
+ * Page 0 contains super blocks;
+ * Page 1 contains reserved inodes;
+ * Page 2 - 15 are reserved.
+ * Page 16 - 31 contain pointers to inode tables.
+ * Page 32 - 47 contain pointers to journal pages.
+ */
+#define HEAD_RESERVED_BLOCKS    64
+#define NUM_JOURNAL_PAGES       16
+
+#define SUPER_BLOCK_START       0 // Superblock
+#define RESERVE_INODE_START     1 // Reserved inodes
+#define INODE_TABLE_START       16 // inode table pointers
+#define JOURNAL_START           32 // journal pointer table
+
+/* For replica super block and replica reserved inodes */
+#define TAIL_RESERVED_BLOCKS    2
+
+/* === Reserved inodes = */
+
+/* We have space for 31 reserved inodes */
+#define NOVA_ROOT_INO  (1)
+#define NOVA_INODETABLE_INO    (2) /* Fake inode associated with inode
+* storage.  We need this because our
+* allocator requires inode to be
+* associated with each allocation.
+* The data actually lives in linked
+* lists in INODE_TABLE_START. */
+#define NOVA_BLOCKNODE_INO (3) /* Storage for allocator state */
+#define NOVA_LITEJOURNAL_INO   (4) /* Storage for lightweight journals */
+#define NOVA_INODELIST_INO (5) /* Storage for Inode free list */
+
+
+/* Normal inode starts at 32 */
+#define NOVA_NORMAL_INODE_START  (32)
+
+
+
+/*
+ * NOVA super-block data in DRAM
+ */
+struct nova_sb_info {
+   struct super_block *sb; /* VFS super block */
+   struct nova_super_block *nova_sb;   /* DRAM copy of SB */
+   struct block_device *s_bdev;
+   struct dax_device *s_dax_dev;
+
+   /*
+* base physical and virtual address of NOVA (which is also
+* the pointer to the super block)
+*/
+   phys_addr_t phys_addr;
+   void*virt_addr;
+   void*replica_reserved_inodes_addr;
+   void*replica_sb_addr;
+
+   unsigned long   num_blocks;
+
+   /* Mount options */
+   unsigned long   bpi;
+   unsigned long   blocksize;
+   unsigned long   initsize;
+   unsigned long   s_mount_opt;
+   kuid_t  uid;/* Mount uid

[RFC v2 05/83] Add NOVA filesystem definitions and useful helper routines.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

NOVA stores offsets rather than absolute addresses in pmem.
nova_get_block() and nova_get_addr_off() translate
between these two kinds of addresses.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/nova.h | 299 +
 1 file changed, 299 insertions(+)
 create mode 100644 fs/nova/nova.h

diff --git a/fs/nova/nova.h b/fs/nova/nova.h
new file mode 100644
index 000..5eb696c
--- /dev/null
+++ b/fs/nova/nova.h
@@ -0,0 +1,299 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Definitions for the NOVA filesystem.
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu <jix...@cs.ucsd.edu>
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.storne...@gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+#ifndef __NOVA_H
+#define __NOVA_H
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "nova_def.h"
+
+#define PAGE_SHIFT_2M 21
+#define PAGE_SHIFT_1G 30
+
+
+/*
+ * Debug code
+ */
+#ifdef pr_fmt
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#endif
+
+/* #define nova_dbg(s, args...)pr_debug(s, ## args) */
+#define nova_dbg(s, args ...)  pr_info(s, ## args)
+#define nova_err(sb, s, args ...)  nova_error_mng(sb, s, ## args)
+#define nova_warn(s, args ...) pr_warn(s, ## args)
+#define nova_info(s, args ...) pr_info(s, ## args)
+
+extern unsigned int nova_dbgmask;
+#define NOVA_DBGMASK_MMAPHUGE (0x0001)
+#define NOVA_DBGMASK_MMAP4K   (0x0002)
+#define NOVA_DBGMASK_MMAPVERBOSE   (0x0004)
+#define NOVA_DBGMASK_MMAPVVERBOSE  (0x0008)
+#define NOVA_DBGMASK_VERBOSE  (0x0010)
+#define NOVA_DBGMASK_TRANSACTION   (0x0020)
+
+#define nova_dbg_mmap4k(s, args ...)\
+   ((nova_dbgmask & NOVA_DBGMASK_MMAP4K) ? nova_dbg(s, args) : 0)
+#define nova_dbg_mmapv(s, args ...) \
+   ((nova_dbgmask & NOVA_DBGMASK_MMAPVERBOSE) ? nova_dbg(s, args) : 0)
+#define nova_dbg_mmapvv(s, args ...)\
+   ((nova_dbgmask & NOVA_DBGMASK_MMAPVVERBOSE) ? nova_dbg(s, args) : 0)
+
+#define nova_dbg_verbose(s, args ...)   \
+   ((nova_dbgmask & NOVA_DBGMASK_VERBOSE) ? nova_dbg(s, ##args) : 0)
+#define nova_dbgv(s, args ...) nova_dbg_verbose(s, ##args)
+#define nova_dbg_trans(s, args ...) \
+   ((nova_dbgmask & NOVA_DBGMASK_TRANSACTION) ? nova_dbg(s, ##args) : 0)
+
+#define NOVA_ASSERT(x) do {\
+  if (!(x))\
+  nova_warn("assertion failed %s:%d: %s\n", \
+  __FILE__, __LINE__, #x);\
+  } while (0)
+
+#define nova_set_bit  __test_and_set_bit_le
+#define nova_clear_bit    __test_and_clear_bit_le
+#define nova_find_next_zero_bit   find_next_zero_bit_le
+
+#define clear_opt(o, opt)  (o &= ~NOVA_MOUNT_ ## opt)
+#define set_opt(o, opt)(o |= NOVA_MOUNT_ ## opt)
+#define test_opt(sb, opt)  (NOVA_SB(sb)->s_mount_opt & NOVA_MOUNT_ ## opt)
+
+#define NOVA_LARGE_INODE_TABLE_SIZE    (0x200000)
+/* NOVA size threshold for using 2M blocks for inode table */
+#define NOVA_LARGE_INODE_TABLE_THREASHOLD    (0x20000000)
+/*
+ * nova inode flags
+ *
+ * NOVA_EOFBLOCKS_FL   There are blocks allocated beyond eof
+ */
+#define NOVA_EOFBLOCKS_FL  0x20000000
+/* Flags that should be inherited by new inodes from their parent. */
+#define NOVA_FL_INHERITED (FS_SECRM_FL | FS_UNRM_FL | FS_COMPR_FL | \
+   FS_SYNC_FL | FS_NODUMP_FL | FS_NOATIME_FL | \
+   FS_COMPRBLK_FL | FS_NOCOMP_FL | \
+   FS_JOURNAL_DATA_FL | FS_NOTAIL_FL | FS_DIRSYNC_FL)
+/* Flags that are appropriate for regular files (all but dir-specific ones). */
+#define NOVA_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL))
+/* Flags that are appropriate for non-directories/regular files. */
+#define NOVA_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL)
+#define NOVA_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | NOVA_EOFBLOCKS_FL)
+
+/* IOCTLs */
+#define NOVA_PRINT_TIMING   0xBCD00010
+#defineNOVA_CLEAR_STAT

[RFC v2 05/83] Add NOVA filesystem definitions and useful helper routines.

2018-03-10 Thread Andiry Xu

From: Andiry Xu 

NOVA stores offsets rather than absolute addresses in pmem.
nova_get_block() and nova_get_addr_off() translate
between these two kinds of addresses.

Signed-off-by: Andiry Xu 
---
 fs/nova/nova.h | 299 +
 1 file changed, 299 insertions(+)
 create mode 100644 fs/nova/nova.h

diff --git a/fs/nova/nova.h b/fs/nova/nova.h
new file mode 100644
index 000..5eb696c
--- /dev/null
+++ b/fs/nova/nova.h
@@ -0,0 +1,299 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Definitions for the NOVA filesystem.
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu 
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli 
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+#ifndef __NOVA_H
+#define __NOVA_H
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "nova_def.h"
+
+#define PAGE_SHIFT_2M 21
+#define PAGE_SHIFT_1G 30
+
+
+/*
+ * Debug code
+ */
+#ifdef pr_fmt
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#endif
+
+/* #define nova_dbg(s, args...)pr_debug(s, ## args) */
+#define nova_dbg(s, args ...)  pr_info(s, ## args)
+#define nova_err(sb, s, args ...)  nova_error_mng(sb, s, ## args)
+#define nova_warn(s, args ...) pr_warn(s, ## args)
+#define nova_info(s, args ...) pr_info(s, ## args)
+
+extern unsigned int nova_dbgmask;
+#define NOVA_DBGMASK_MMAPHUGE (0x0001)
+#define NOVA_DBGMASK_MMAP4K   (0x0002)
+#define NOVA_DBGMASK_MMAPVERBOSE   (0x0004)
+#define NOVA_DBGMASK_MMAPVVERBOSE  (0x0008)
+#define NOVA_DBGMASK_VERBOSE  (0x0010)
+#define NOVA_DBGMASK_TRANSACTION   (0x0020)
+
+#define nova_dbg_mmap4k(s, args ...)\
+   ((nova_dbgmask & NOVA_DBGMASK_MMAP4K) ? nova_dbg(s, args) : 0)
+#define nova_dbg_mmapv(s, args ...) \
+   ((nova_dbgmask & NOVA_DBGMASK_MMAPVERBOSE) ? nova_dbg(s, args) : 0)
+#define nova_dbg_mmapvv(s, args ...)\
+   ((nova_dbgmask & NOVA_DBGMASK_MMAPVVERBOSE) ? nova_dbg(s, args) : 0)
+
+#define nova_dbg_verbose(s, args ...)   \
+   ((nova_dbgmask & NOVA_DBGMASK_VERBOSE) ? nova_dbg(s, ##args) : 0)
+#define nova_dbgv(s, args ...) nova_dbg_verbose(s, ##args)
+#define nova_dbg_trans(s, args ...) \
+   ((nova_dbgmask & NOVA_DBGMASK_TRANSACTION) ? nova_dbg(s, ##args) : 0)
+
+#define NOVA_ASSERT(x) do {\
+  if (!(x))\
+  nova_warn("assertion failed %s:%d: %s\n", \
+  __FILE__, __LINE__, #x);\
+  } while (0)
+
+#define nova_set_bit  __test_and_set_bit_le
+#define nova_clear_bit    __test_and_clear_bit_le
+#define nova_find_next_zero_bit   find_next_zero_bit_le
+
+#define clear_opt(o, opt)  (o &= ~NOVA_MOUNT_ ## opt)
+#define set_opt(o, opt)(o |= NOVA_MOUNT_ ## opt)
+#define test_opt(sb, opt)  (NOVA_SB(sb)->s_mount_opt & NOVA_MOUNT_ ## opt)
+
+#define NOVA_LARGE_INODE_TABLE_SIZE    (0x200000)
+/* NOVA size threshold for using 2M blocks for inode table */
+#define NOVA_LARGE_INODE_TABLE_THREASHOLD    (0x20000000)
+/*
+ * nova inode flags
+ *
+ * NOVA_EOFBLOCKS_FL   There are blocks allocated beyond eof
+ */
+#define NOVA_EOFBLOCKS_FL  0x20000000
+/* Flags that should be inherited by new inodes from their parent. */
+#define NOVA_FL_INHERITED (FS_SECRM_FL | FS_UNRM_FL | FS_COMPR_FL | \
+   FS_SYNC_FL | FS_NODUMP_FL | FS_NOATIME_FL | \
+   FS_COMPRBLK_FL | FS_NOCOMP_FL | \
+   FS_JOURNAL_DATA_FL | FS_NOTAIL_FL | FS_DIRSYNC_FL)
+/* Flags that are appropriate for regular files (all but dir-specific ones). */
+#define NOVA_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL))
+/* Flags that are appropriate for non-directories/regular files. */
+#define NOVA_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL)
+#define NOVA_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | NOVA_EOFBLOCKS_FL)
+
+/* IOCTLs */
+#define NOVA_PRINT_TIMING   0xBCD00010
+#define NOVA_CLEAR_STATS    0xBCD00011
+#define NOVA_PRINT_LOG      0xBCD00013
+#defineNOVA_PRI

[RFC v2 06/83] Add inode get/read methods.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

These routines are incomplete and currently only support reserved inodes,
whose addresses are fixed. This is necessary for fill_super to work.
File/dir operations are left NULL.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/inode.c | 176 
 fs/nova/inode.h |   3 +
 2 files changed, 179 insertions(+)
 create mode 100644 fs/nova/inode.c

diff --git a/fs/nova/inode.c b/fs/nova/inode.c
new file mode 100644
index 000..bfdc5dc
--- /dev/null
+++ b/fs/nova/inode.c
@@ -0,0 +1,176 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Inode methods (allocate/free/read/write).
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu <jix...@cs.ucsd.edu>
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.storne...@gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "nova.h"
+#include "inode.h"
+
+unsigned int blk_type_to_shift[NOVA_BLOCK_TYPE_MAX] = {12, 21, 30};
+uint32_t blk_type_to_size[NOVA_BLOCK_TYPE_MAX] = {0x1000, 0x200000, 0x40000000};
+
+void nova_set_inode_flags(struct inode *inode, struct nova_inode *pi,
+   unsigned int flags)
+{
+   inode->i_flags &=
+   ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC);
+   if (flags & FS_SYNC_FL)
+   inode->i_flags |= S_SYNC;
+   if (flags & FS_APPEND_FL)
+   inode->i_flags |= S_APPEND;
+   if (flags & FS_IMMUTABLE_FL)
+   inode->i_flags |= S_IMMUTABLE;
+   if (flags & FS_NOATIME_FL)
+   inode->i_flags |= S_NOATIME;
+   if (flags & FS_DIRSYNC_FL)
+   inode->i_flags |= S_DIRSYNC;
+   if (!pi->i_xattr)
+   inode_has_no_xattr(inode);
+   inode->i_flags |= S_DAX;
+}
+
+/* copy persistent state to struct inode */
+static int nova_read_inode(struct super_block *sb, struct inode *inode,
+   u64 pi_addr)
+{
+   struct nova_inode_info *si = NOVA_I(inode);
+   struct nova_inode *pi, fake_pi;
+   struct nova_inode_info_header *sih = &si->header;
+   int ret = -EIO;
+   unsigned long ino;
+
+   ret = nova_get_reference(sb, pi_addr, &fake_pi,
+   (void **)&pi, sizeof(struct nova_inode));
+   if (ret) {
+   nova_dbg("%s: read pi @ 0x%llx failed\n",
+   __func__, pi_addr);
+   goto bad_inode;
+   }
+
+   inode->i_mode = sih->i_mode;
+   i_uid_write(inode, le32_to_cpu(pi->i_uid));
+   i_gid_write(inode, le32_to_cpu(pi->i_gid));
+// set_nlink(inode, le16_to_cpu(pi->i_links_count));
+   inode->i_generation = le32_to_cpu(pi->i_generation);
+   nova_set_inode_flags(inode, pi, le32_to_cpu(pi->i_flags));
+   ino = inode->i_ino;
+
+   /* check if the inode is active. */
+   if (inode->i_mode == 0 || pi->deleted == 1) {
+   /* this inode is deleted */
+   ret = -ESTALE;
+   goto bad_inode;
+   }
+
+   inode->i_blocks = sih->i_blocks;
+
+   switch (inode->i_mode & S_IFMT) {
+   case S_IFREG:
+   break;
+   case S_IFDIR:
+   break;
+   case S_IFLNK:
+   break;
+   default:
+   init_special_inode(inode, inode->i_mode,
+  le32_to_cpu(pi->dev.rdev));
+   break;
+   }
+
+   /* Update size and time after rebuild the tree */
+   inode->i_size = le64_to_cpu(sih->i_size);
+   inode->i_atime.tv_sec = (__s32)le32_to_cpu(pi->i_atime);
+   inode->i_ctime.tv_sec = (__s32)le32_to_cpu(pi->i_ctime);
+   inode->i_mtime.tv_sec = (__s32)le32_to_cpu(pi->i_mtime);
+   inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec =
+inode->i_ctime.tv_nsec = 0;
+   set_nlink(inode, le16_to_cpu(pi->i_links_count));
+   return 0;
+
+bad_inode:
+   make_bad_inode(inode);
+   return ret;
+}
+
+/* Get the address in PMEM of an inode by inode number.  Allocate additional
+ * block to store additional inodes if necessary.
+ */
+int nova_get_inode_address(struct super_block *sb, u64 ino,
+   u64 *pi_addr, int extendable)
+{
+   if (ino < NOVA_NORMAL_INODE_START) {
+   *pi_addr = nova_get_reserved_inode_addr(sb, ino);
+

[RFC v2 06/83] Add inode get/read methods.

2018-03-10 Thread Andiry Xu

From: Andiry Xu 

These routines are incomplete and currently only support reserved inodes,
whose addresses are fixed. This is necessary for fill_super to work.
File/dir operations are left NULL.

Signed-off-by: Andiry Xu 
---
 fs/nova/inode.c | 176 
 fs/nova/inode.h |   3 +
 2 files changed, 179 insertions(+)
 create mode 100644 fs/nova/inode.c

diff --git a/fs/nova/inode.c b/fs/nova/inode.c
new file mode 100644
index 000..bfdc5dc
--- /dev/null
+++ b/fs/nova/inode.c
@@ -0,0 +1,176 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Inode methods (allocate/free/read/write).
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu 
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli 
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "nova.h"
+#include "inode.h"
+
+unsigned int blk_type_to_shift[NOVA_BLOCK_TYPE_MAX] = {12, 21, 30};
+uint32_t blk_type_to_size[NOVA_BLOCK_TYPE_MAX] = {0x1000, 0x200000, 0x40000000};
+
+void nova_set_inode_flags(struct inode *inode, struct nova_inode *pi,
+   unsigned int flags)
+{
+   inode->i_flags &=
+   ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC);
+   if (flags & FS_SYNC_FL)
+   inode->i_flags |= S_SYNC;
+   if (flags & FS_APPEND_FL)
+   inode->i_flags |= S_APPEND;
+   if (flags & FS_IMMUTABLE_FL)
+   inode->i_flags |= S_IMMUTABLE;
+   if (flags & FS_NOATIME_FL)
+   inode->i_flags |= S_NOATIME;
+   if (flags & FS_DIRSYNC_FL)
+   inode->i_flags |= S_DIRSYNC;
+   if (!pi->i_xattr)
+   inode_has_no_xattr(inode);
+   inode->i_flags |= S_DAX;
+}
+
+/* copy persistent state to struct inode */
+static int nova_read_inode(struct super_block *sb, struct inode *inode,
+   u64 pi_addr)
+{
+   struct nova_inode_info *si = NOVA_I(inode);
+   struct nova_inode *pi, fake_pi;
+   struct nova_inode_info_header *sih = &si->header;
+   int ret = -EIO;
+   unsigned long ino;
+
+   ret = nova_get_reference(sb, pi_addr, &fake_pi,
+   (void **)&pi, sizeof(struct nova_inode));
+   if (ret) {
+   nova_dbg("%s: read pi @ 0x%llx failed\n",
+   __func__, pi_addr);
+   goto bad_inode;
+   }
+
+   inode->i_mode = sih->i_mode;
+   i_uid_write(inode, le32_to_cpu(pi->i_uid));
+   i_gid_write(inode, le32_to_cpu(pi->i_gid));
+// set_nlink(inode, le16_to_cpu(pi->i_links_count));
+   inode->i_generation = le32_to_cpu(pi->i_generation);
+   nova_set_inode_flags(inode, pi, le32_to_cpu(pi->i_flags));
+   ino = inode->i_ino;
+
+   /* check if the inode is active. */
+   if (inode->i_mode == 0 || pi->deleted == 1) {
+   /* this inode is deleted */
+   ret = -ESTALE;
+   goto bad_inode;
+   }
+
+   inode->i_blocks = sih->i_blocks;
+
+   switch (inode->i_mode & S_IFMT) {
+   case S_IFREG:
+   break;
+   case S_IFDIR:
+   break;
+   case S_IFLNK:
+   break;
+   default:
+   init_special_inode(inode, inode->i_mode,
+  le32_to_cpu(pi->dev.rdev));
+   break;
+   }
+
+   /* Update size and time after rebuild the tree */
+   inode->i_size = le64_to_cpu(sih->i_size);
+   inode->i_atime.tv_sec = (__s32)le32_to_cpu(pi->i_atime);
+   inode->i_ctime.tv_sec = (__s32)le32_to_cpu(pi->i_ctime);
+   inode->i_mtime.tv_sec = (__s32)le32_to_cpu(pi->i_mtime);
+   inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec =
+inode->i_ctime.tv_nsec = 0;
+   set_nlink(inode, le16_to_cpu(pi->i_links_count));
+   return 0;
+
+bad_inode:
+   make_bad_inode(inode);
+   return ret;
+}
+
+/* Get the address in PMEM of an inode by inode number.  Allocate additional
+ * block to store additional inodes if necessary.
+ */
+int nova_get_inode_address(struct super_block *sb, u64 ino,
+   u64 *pi_addr, int extendable)
+{
+   if (ino < NOVA_NORMAL_INODE_START) {
+   *pi_addr = nova_get_reserved_inode_addr(sb, ino);
+   return 0;
+   }
+
+   *pi_addr = 0;
+   return 0;
+}
+
+struct inode *nova_iget(stru

[RFC v2 02/83] Add nova_def.h.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

This file defines NOVA filesystem macros and routines to persist updates
by using Intel persistent memory instruction CLWB or clflush.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/nova_def.h | 128 +
 1 file changed, 128 insertions(+)
 create mode 100644 fs/nova/nova_def.h

diff --git a/fs/nova/nova_def.h b/fs/nova/nova_def.h
new file mode 100644
index 000..1cbed6f
--- /dev/null
+++ b/fs/nova/nova_def.h
@@ -0,0 +1,128 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Definitions for the NOVA filesystem.
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu <jix...@cs.ucsd.edu>
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.storne...@gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+#ifndef _LINUX_NOVA_DEF_H
+#define _LINUX_NOVA_DEF_H
+
+#include 
+#include 
+
+#define NOVA_SUPER_MAGIC    0x4E4F5641  /* NOVA */
+
+/*
+ * The NOVA filesystem constants/structures
+ */
+
+/*
+ * Mount flags
+ */
+#define NOVA_MOUNT_XATTR_USER   0x000002    /* Extended user attributes */
+#define NOVA_MOUNT_POSIX_ACL    0x000004    /* POSIX Access Control Lists */
+#define NOVA_MOUNT_DAX          0x000008    /* Direct Access */
+#define NOVA_MOUNT_ERRORS_CONT  0x000010    /* Continue on errors */
+#define NOVA_MOUNT_ERRORS_RO    0x000020    /* Remount fs ro on errors */
+#define NOVA_MOUNT_ERRORS_PANIC 0x000040    /* Panic on errors */
+#define NOVA_MOUNT_HUGEMMAP     0x000080    /* Huge mappings with mmap */
+#define NOVA_MOUNT_HUGEIOREMAP  0x000100    /* Huge mappings with ioremap */
+#define NOVA_MOUNT_FORMAT       0x000200    /* was FS formatted on mount? */
+
+/*
+ * Maximal count of links to a file
+ */
+#define NOVA_LINK_MAX  32000
+
+#define NOVA_DEF_BLOCK_SIZE_4K 4096
+
+#define NOVA_INODE_BITS   7
+#define NOVA_INODE_SIZE   128/* must be power of two */
+
+#define NOVA_NAME_LEN 255
+
+#define MAX_CPUS 1024
+
+/* NOVA supported data blocks */
+#define NOVA_BLOCK_TYPE_4K 0
+#define NOVA_BLOCK_TYPE_2M 1
+#define NOVA_BLOCK_TYPE_1G 2
+#define NOVA_BLOCK_TYPE_MAX 3
+
+#define META_BLK_SHIFT 9
+
+/*
+ * Play with this knob to change the default block type.
+ * By changing the NOVA_DEFAULT_BLOCK_TYPE to 2M or 1G,
+ * we should get pretty good coverage in testing.
+ */
+#define NOVA_DEFAULT_BLOCK_TYPE NOVA_BLOCK_TYPE_4K
+
+
+/* === Write ordering = */
+
+#define CACHELINE_SIZE  (64)
+#define CACHELINE_MASK  (~(CACHELINE_SIZE - 1))
+#define CACHELINE_ALIGN(addr) (((addr)+CACHELINE_SIZE-1) & CACHELINE_MASK)
+
+
+static inline bool arch_has_clwb(void)
+{
+   return static_cpu_has(X86_FEATURE_CLWB);
+}
+
+extern int support_clwb;
+
+#define _mm_clflush(addr)\
+   asm volatile("clflush %0" : "+m" (*(volatile char *)(addr)))
+#define _mm_clflushopt(addr)\
+   asm volatile(".byte 0x66; clflush %0" : "+m" \
+(*(volatile char *)(addr)))
+#define _mm_clwb(addr)\
+   asm volatile(".byte 0x66; xsaveopt %0" : "+m" \
+(*(volatile char *)(addr)))
+
+/* Provides ordering from all previous clflush too */
+static inline void PERSISTENT_MARK(void)
+{
+   /* TODO: Fix me. */
+}
+
+static inline void PERSISTENT_BARRIER(void)
+{
+   asm volatile ("sfence\n" : : );
+}
+
+static inline void nova_flush_buffer(void *buf, uint32_t len, bool fence)
+{
+   uint32_t i;
+
+   len = len + ((unsigned long)(buf) & (CACHELINE_SIZE - 1));
+   if (support_clwb) {
+   for (i = 0; i < len; i += CACHELINE_SIZE)
+   _mm_clwb(buf + i);
+   } else {
+   for (i = 0; i < len; i += CACHELINE_SIZE)
+   _mm_clflush(buf + i);
+   }
+   /* Do a fence only if asked. We often don't need to do a fence
+* immediately after clflush because even if we get context switched
+* between clflush and subsequent fence, the context switch operation
+* provides implicit fence.
+*/
+   if (fence)
+   PERSISTENT_BARRIER();
+}
+
+#endif /* _LINUX_NOVA_DEF_H */
-- 
2.7.4

[RFC v2 02/83] Add nova_def.h.

2018-03-10 Thread Andiry Xu

From: Andiry Xu 

This file defines NOVA filesystem macros and routines to persist updates
by using Intel persistent memory instruction CLWB or clflush.

Signed-off-by: Andiry Xu 
---
 fs/nova/nova_def.h | 128 +
 1 file changed, 128 insertions(+)
 create mode 100644 fs/nova/nova_def.h

diff --git a/fs/nova/nova_def.h b/fs/nova/nova_def.h
new file mode 100644
index 000..1cbed6f
--- /dev/null
+++ b/fs/nova/nova_def.h
@@ -0,0 +1,128 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Definitions for the NOVA filesystem.
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu 
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli 
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+#ifndef _LINUX_NOVA_DEF_H
+#define _LINUX_NOVA_DEF_H
+
+#include 
+#include 
+
+#define NOVA_SUPER_MAGIC 0x4E4F5641  /* NOVA */
+
+/*
+ * The NOVA filesystem constants/structures
+ */
+
+/*
+ * Mount flags
+ */
+#define NOVA_MOUNT_XATTR_USER   0x02    /* Extended user attributes */
+#define NOVA_MOUNT_POSIX_ACL    0x04    /* POSIX Access Control Lists */
+#define NOVA_MOUNT_DAX          0x08    /* Direct Access */
+#define NOVA_MOUNT_ERRORS_CONT  0x10    /* Continue on errors */
+#define NOVA_MOUNT_ERRORS_RO    0x20    /* Remount fs ro on errors */
+#define NOVA_MOUNT_ERRORS_PANIC 0x40    /* Panic on errors */
+#define NOVA_MOUNT_HUGEMMAP     0x80    /* Huge mappings with mmap */
+#define NOVA_MOUNT_HUGEIOREMAP  0x000100    /* Huge mappings with ioremap */
+#define NOVA_MOUNT_FORMAT       0x000200    /* was FS formatted on mount? */
+
+/*
+ * Maximal count of links to a file
+ */
+#define NOVA_LINK_MAX  32000
+
+#define NOVA_DEF_BLOCK_SIZE_4K 4096
+
+#define NOVA_INODE_BITS   7
+#define NOVA_INODE_SIZE   128/* must be power of two */
+
+#define NOVA_NAME_LEN 255
+
+#define MAX_CPUS 1024
+
+/* NOVA supported data blocks */
+#define NOVA_BLOCK_TYPE_4K 0
+#define NOVA_BLOCK_TYPE_2M 1
+#define NOVA_BLOCK_TYPE_1G 2
+#define NOVA_BLOCK_TYPE_MAX 3
+
+#define META_BLK_SHIFT 9
+
+/*
+ * Play with this knob to change the default block type.
+ * By changing the NOVA_DEFAULT_BLOCK_TYPE to 2M or 1G,
+ * we should get pretty good coverage in testing.
+ */
+#define NOVA_DEFAULT_BLOCK_TYPE NOVA_BLOCK_TYPE_4K
+
+
+/* === Write ordering = */
+
+#define CACHELINE_SIZE  (64)
+#define CACHELINE_MASK  (~(CACHELINE_SIZE - 1))
+#define CACHELINE_ALIGN(addr) (((addr)+CACHELINE_SIZE-1) & CACHELINE_MASK)
+
+
+static inline bool arch_has_clwb(void)
+{
+   return static_cpu_has(X86_FEATURE_CLWB); /* result of the X86_FEATURE_CLWB CPU feature test */
+}
+
+extern int support_clwb;
+
+#define _mm_clflush(addr)\
+   asm volatile("clflush %0" : "+m" (*(volatile char *)(addr)))
+#define _mm_clflushopt(addr)\
+   asm volatile(".byte 0x66; clflush %0" : "+m" \
+(*(volatile char *)(addr)))
+#define _mm_clwb(addr)\
+   asm volatile(".byte 0x66; xsaveopt %0" : "+m" \
+(*(volatile char *)(addr)))
+
+/* Intended to provide ordering from all previous clflush too (not implemented yet) */
+static inline void PERSISTENT_MARK(void)
+{
+   /* TODO: Fix me. The body is empty, so calling this is currently a no-op. */
+}
+
+static inline void PERSISTENT_BARRIER(void)
+{
+   asm volatile ("sfence\n" : : ); /* x86 SFENCE; no operands and no clobber list are declared */
+}
+
+static inline void nova_flush_buffer(void *buf, uint32_t len, bool fence)
+{
+   uint32_t i;
+
+   len = len + ((unsigned long)(buf) & (CACHELINE_SIZE - 1)); /* add buf's offset within its cache line */
+   if (support_clwb) {
+   for (i = 0; i < len; i += CACHELINE_SIZE)
+   _mm_clwb(buf + i);
+   } else {
+   for (i = 0; i < len; i += CACHELINE_SIZE)
+   _mm_clflush(buf + i);
+   }
+   /* The loops above issue one CLWB (when support_clwb is set) or one
+    * CLFLUSH per CACHELINE_SIZE step; because len was grown by buf's
+    * offset, the last step still lands in the last cache line touched.
+    *
+    * Do a fence only if asked. We often don't need to do a fence
+    * immediately after clflush because even if we get context switched
+    * between clflush and subsequent fence, the context switch operation
+    * provides implicit fence.
+    */
+   if (fence)
+   PERSISTENT_BARRIER();
+}
+
+#endif /* _LINUX_NOVA_DEF_H */
-- 
2.7.4

[RFC v2 08/83] NOVA superblock operations.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

This is the entry point for NOVA filesystem mount and umount.
NOVA works on DAX devices. During initialization it gets the
device information, such as physical/virtual addresses and device size.
It does not access the DAX device during runtime.

During initialization NOVA also initializes the root inode.
The root inode is a reserved inode and resides on the fixed location.

The way to mount and initialize a NOVA instance is:

mount -t NOVA -o init /dev/pmem0 /mnt/NOVA

This creates a NOVA instance on /dev/pmem0 and mounts it on /mnt/NOVA.
Currently it cannot do anything except mount and umount.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/super.c | 630 
 1 file changed, 630 insertions(+)
 create mode 100644 fs/nova/super.c

diff --git a/fs/nova/super.c b/fs/nova/super.c
new file mode 100644
index 000..552fe5d
--- /dev/null
+++ b/fs/nova/super.c
@@ -0,0 +1,630 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Super block operations.
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu <jix...@cs.ucsd.edu>
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.storne...@gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "nova.h"
+#include "super.h"
+
+int support_clwb;
+
+module_param(nova_dbgmask, int, 0444);
+MODULE_PARM_DESC(nova_dbgmask, "Control debugging output");
+
+static struct super_operations nova_sops;
+
+static struct kmem_cache *nova_inode_cachep;
+
+
+/* FIXME: should the following variable be one per NOVA instance? */
+unsigned int nova_dbgmask;
+
+/*
+ * Report a filesystem error and apply the mount's error policy.
+ *
+ * The caller's printf-style message is logged at KERN_CRIT behind a
+ * "nova error: " prefix.  Prefix and message are emitted by one printk()
+ * through %pV: a bare printk() followed by vprintk() carries no KERN_CONT,
+ * so the two parts would be logged as separate records.
+ *
+ * If the ERRORS_PANIC option is set this panics.  If ERRORS_RO is set the
+ * superblock is flagged read-only.
+ */
+void nova_error_mng(struct super_block *sb, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	printk(KERN_CRIT "nova error: %pV", &vaf);
+	va_end(args);
+
+	if (test_opt(sb, ERRORS_PANIC))
+		panic("nova: panic from previous error\n");
+	if (test_opt(sb, ERRORS_RO)) {
+		/* Newline added so this record is terminated on its own. */
+		printk(KERN_CRIT "nova err: remounting filesystem read-only\n");
+		sb->s_flags |= MS_RDONLY;
+	}
+}
+
+static void nova_set_blocksize(struct super_block *sb, unsigned long size)
+{
+   int bits;
+
+   /*
+    * We've already validated the user input and the value here must be
+    * between NOVA_MAX_BLOCK_SIZE and NOVA_MIN_BLOCK_SIZE
+    * and it must be a power of 2.  For such a value fls(size) - 1 is
+    * log2(size), so s_blocksize below ends up equal to size.
+    * NOTE(review): size is unsigned long while bits and the shifted 1
+    * are ints; this relies on the validated range fitting in an int -
+    * confirm against NOVA_MAX_BLOCK_SIZE.
+    */
+   bits = fls(size) - 1;
+   sb->s_blocksize_bits = bits;
+   sb->s_blocksize = (1 << bits);
+}
+
+static int nova_get_nvmm_info(struct super_block *sb,
+   struct nova_sb_info *sbi)
+{
+   void *virt_addr = NULL;
+   pfn_t __pfn_t;
+   long size;
+   struct dax_device *dax_dev;
+   int ret;
+
+   ret = bdev_dax_supported(sb, PAGE_SIZE);
+   nova_dbg_verbose("%s: dax_supported = %d; bdev->super=0x%p",
+__func__, ret, sb->s_bdev->bd_super);
+   if (ret) {
+   nova_err(sb, "device does not support DAX\n");
+   return ret;
+   }
+
+   sbi->s_bdev = sb->s_bdev;
+
+   dax_dev = fs_dax_get_by_host(sb->s_bdev->bd_disk->disk_name);
+   if (!dax_dev) {
+   nova_err(sb, "Couldn't retrieve DAX device.\n");
+   return -EINVAL;
+   }
+   sbi->s_dax_dev = dax_dev;
+
+   size = dax_direct_access(sbi->s_dax_dev, 0, LONG_MAX/PAGE_SIZE,
+   &virt_addr, &__pfn_t) * PAGE_SIZE;
+   if (size <= 0) {
+   nova_err(sb, "direct_access failed\n");
+   return -EINVAL;
+   }
+
+   sbi->virt_addr = virt_addr;
+
+   if (!sbi->virt_addr) {
+   nova_err(sb, "ioremap of the nova image failed(1)\n");
+   return -EINVAL;
+   }
+
+   sbi->phys_addr = pfn_t_to_pfn(__pfn_t) << PAGE_SHIFT;
+   sbi->initsize = size;
+   sbi->replica_reserved_inodes_addr = virt_addr + size -
+   (sbi->tail_reserved_blocks << PAGE_SHIFT);
+   sbi->replica_sb_addr = virt_addr + size - PAGE_SIZE;
+
+   nova_dbg("%s: dev

[RFC v2 08/83] NOVA superblock operations.

2018-03-10 Thread Andiry Xu

From: Andiry Xu 

This is the entry point for NOVA filesystem mount and umount.
NOVA works on DAX devices. During initialization it gets the
device information, such as physical/virtual addresses and device size.
It does not access the DAX device during runtime.

During initialization NOVA also initializes the root inode.
The root inode is a reserved inode and resides on the fixed location.

The way to mount and initialize a NOVA instance is:

mount -t NOVA -o init /dev/pmem0 /mnt/NOVA

This creates a NOVA instance on /dev/pmem0 and mounts it on /mnt/NOVA.
Currently it cannot do anything except mount and umount.

Signed-off-by: Andiry Xu 
---
 fs/nova/super.c | 630 
 1 file changed, 630 insertions(+)
 create mode 100644 fs/nova/super.c

diff --git a/fs/nova/super.c b/fs/nova/super.c
new file mode 100644
index 000..552fe5d
--- /dev/null
+++ b/fs/nova/super.c
@@ -0,0 +1,630 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Super block operations.
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu 
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli 
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "nova.h"
+#include "super.h"
+
+int support_clwb;
+
+module_param(nova_dbgmask, int, 0444);
+MODULE_PARM_DESC(nova_dbgmask, "Control debugging output");
+
+static struct super_operations nova_sops;
+
+static struct kmem_cache *nova_inode_cachep;
+
+
+/* FIXME: should the following variable be one per NOVA instance? */
+unsigned int nova_dbgmask;
+
+/*
+ * Report a filesystem error and apply the mount's error policy.
+ *
+ * The caller's printf-style message is logged at KERN_CRIT behind a
+ * "nova error: " prefix.  Prefix and message are emitted by one printk()
+ * through %pV: a bare printk() followed by vprintk() carries no KERN_CONT,
+ * so the two parts would be logged as separate records.
+ *
+ * If the ERRORS_PANIC option is set this panics.  If ERRORS_RO is set the
+ * superblock is flagged read-only.
+ */
+void nova_error_mng(struct super_block *sb, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	printk(KERN_CRIT "nova error: %pV", &vaf);
+	va_end(args);
+
+	if (test_opt(sb, ERRORS_PANIC))
+		panic("nova: panic from previous error\n");
+	if (test_opt(sb, ERRORS_RO)) {
+		/* Newline added so this record is terminated on its own. */
+		printk(KERN_CRIT "nova err: remounting filesystem read-only\n");
+		sb->s_flags |= MS_RDONLY;
+	}
+}
+
+static void nova_set_blocksize(struct super_block *sb, unsigned long size)
+{
+   int bits;
+
+   /*
+    * We've already validated the user input and the value here must be
+    * between NOVA_MAX_BLOCK_SIZE and NOVA_MIN_BLOCK_SIZE
+    * and it must be a power of 2.  For such a value fls(size) - 1 is
+    * log2(size), so s_blocksize below ends up equal to size.
+    * NOTE(review): size is unsigned long while bits and the shifted 1
+    * are ints; this relies on the validated range fitting in an int -
+    * confirm against NOVA_MAX_BLOCK_SIZE.
+    */
+   bits = fls(size) - 1;
+   sb->s_blocksize_bits = bits;
+   sb->s_blocksize = (1 << bits);
+}
+
+static int nova_get_nvmm_info(struct super_block *sb,
+   struct nova_sb_info *sbi)
+{
+   void *virt_addr = NULL;
+   pfn_t __pfn_t;
+   long size;
+   struct dax_device *dax_dev;
+   int ret;
+
+   ret = bdev_dax_supported(sb, PAGE_SIZE);
+   nova_dbg_verbose("%s: dax_supported = %d; bdev->super=0x%p",
+__func__, ret, sb->s_bdev->bd_super);
+   if (ret) {
+   nova_err(sb, "device does not support DAX\n");
+   return ret;
+   }
+
+   sbi->s_bdev = sb->s_bdev;
+
+   dax_dev = fs_dax_get_by_host(sb->s_bdev->bd_disk->disk_name);
+   if (!dax_dev) {
+   nova_err(sb, "Couldn't retrieve DAX device.\n");
+   return -EINVAL;
+   }
+   sbi->s_dax_dev = dax_dev;
+
+   size = dax_direct_access(sbi->s_dax_dev, 0, LONG_MAX/PAGE_SIZE,
+   &virt_addr, &__pfn_t) * PAGE_SIZE;
+   if (size <= 0) {
+   nova_err(sb, "direct_access failed\n");
+   return -EINVAL;
+   }
+
+   sbi->virt_addr = virt_addr;
+
+   if (!sbi->virt_addr) {
+   nova_err(sb, "ioremap of the nova image failed(1)\n");
+   return -EINVAL;
+   }
+
+   sbi->phys_addr = pfn_t_to_pfn(__pfn_t) << PAGE_SHIFT;
+   sbi->initsize = size;
+   sbi->replica_reserved_inodes_addr = virt_addr + size -
+   (sbi->tail_reserved_blocks << PAGE_SHIFT);
+   sbi->replica_sb_addr = virt_addr + size - PAGE_SIZE;
+
+   nova_dbg("%s: dev %s, phys_addr 0x%llx, virt_addr %p, size %ld\n",
+   __func__, sbi->s_bdev->bd_disk->disk_nam

[RFC v2 09/83] Add Kconfig and Makefile

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/Kconfig   |  2 ++
 fs/Makefile  |  1 +
 fs/nova/Kconfig  | 15 +++
 fs/nova/Makefile |  7 +++
 4 files changed, 25 insertions(+)
 create mode 100644 fs/nova/Kconfig
 create mode 100644 fs/nova/Makefile

diff --git a/fs/Kconfig b/fs/Kconfig
index bc821a8..5e9ff3e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -58,6 +58,8 @@ config FS_DAX_PMD
depends on ZONE_DEVICE
depends on TRANSPARENT_HUGEPAGE
 
+source "fs/nova/Kconfig"
+
 # Selected by DAX drivers that do not expect filesystem DAX to support
 # get_user_pages() of DAX mappings. I.e. "limited" indicates no support
 # for fork() of processes with MAP_SHARED mappings or support for
diff --git a/fs/Makefile b/fs/Makefile
index add789e..65ea619 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -113,6 +113,7 @@ obj-$(CONFIG_OMFS_FS)   += omfs/
 obj-$(CONFIG_JFS_FS)   += jfs/
 obj-$(CONFIG_XFS_FS)   += xfs/
 obj-$(CONFIG_9P_FS)+= 9p/
+obj-$(CONFIG_NOVA_FS)  += nova/
 obj-$(CONFIG_AFS_FS)   += afs/
 obj-$(CONFIG_NILFS2_FS)+= nilfs2/
 obj-$(CONFIG_BEFS_FS)  += befs/
diff --git a/fs/nova/Kconfig b/fs/nova/Kconfig
new file mode 100644
index 000..c1c692e
--- /dev/null
+++ b/fs/nova/Kconfig
@@ -0,0 +1,15 @@
+config NOVA_FS
+   tristate "NOVA: log-structured file system for non-volatile memories"
+   depends on FS_DAX
+   select CRC32
+   select LIBCRC32C
+   help
+ If your system has a block of fast (comparable in access speed to
+ system memory) and non-volatile byte-addressable memory and you wish
+ to mount a light-weight filesystem with strong consistency support
+ over it, say Y here.
+
+ To compile this as a module, choose M here: the module will be
+ called nova.
+
+ If unsure, say N.
diff --git a/fs/nova/Makefile b/fs/nova/Makefile
new file mode 100644
index 000..eb19646
--- /dev/null
+++ b/fs/nova/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the linux NOVA filesystem routines.
+#
+
+obj-$(CONFIG_NOVA_FS) += nova.o
+
+nova-y := bbuild.o inode.o rebuild.o super.o
-- 
2.7.4

[RFC v2 09/83] Add Kconfig and Makefile

2018-03-10 Thread Andiry Xu

From: Andiry Xu 

Signed-off-by: Andiry Xu 
---
 fs/Kconfig   |  2 ++
 fs/Makefile  |  1 +
 fs/nova/Kconfig  | 15 +++
 fs/nova/Makefile |  7 +++
 4 files changed, 25 insertions(+)
 create mode 100644 fs/nova/Kconfig
 create mode 100644 fs/nova/Makefile

diff --git a/fs/Kconfig b/fs/Kconfig
index bc821a8..5e9ff3e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -58,6 +58,8 @@ config FS_DAX_PMD
depends on ZONE_DEVICE
depends on TRANSPARENT_HUGEPAGE
 
+source "fs/nova/Kconfig"
+
 # Selected by DAX drivers that do not expect filesystem DAX to support
 # get_user_pages() of DAX mappings. I.e. "limited" indicates no support
 # for fork() of processes with MAP_SHARED mappings or support for
diff --git a/fs/Makefile b/fs/Makefile
index add789e..65ea619 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -113,6 +113,7 @@ obj-$(CONFIG_OMFS_FS)   += omfs/
 obj-$(CONFIG_JFS_FS)   += jfs/
 obj-$(CONFIG_XFS_FS)   += xfs/
 obj-$(CONFIG_9P_FS)+= 9p/
+obj-$(CONFIG_NOVA_FS)  += nova/
 obj-$(CONFIG_AFS_FS)   += afs/
 obj-$(CONFIG_NILFS2_FS)+= nilfs2/
 obj-$(CONFIG_BEFS_FS)  += befs/
diff --git a/fs/nova/Kconfig b/fs/nova/Kconfig
new file mode 100644
index 000..c1c692e
--- /dev/null
+++ b/fs/nova/Kconfig
@@ -0,0 +1,15 @@
+config NOVA_FS
+   tristate "NOVA: log-structured file system for non-volatile memories"
+   depends on FS_DAX
+   select CRC32
+   select LIBCRC32C
+   help
+ If your system has a block of fast (comparable in access speed to
+ system memory) and non-volatile byte-addressable memory and you wish
+ to mount a light-weight filesystem with strong consistency support
+ over it, say Y here.
+
+ To compile this as a module, choose M here: the module will be
+ called nova.
+
+ If unsure, say N.
diff --git a/fs/nova/Makefile b/fs/nova/Makefile
new file mode 100644
index 000..eb19646
--- /dev/null
+++ b/fs/nova/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the linux NOVA filesystem routines.
+#
+
+obj-$(CONFIG_NOVA_FS) += nova.o
+
+nova-y := bbuild.o inode.o rebuild.o super.o
-- 
2.7.4

[RFC v2 15/83] Add free list data structure.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

The free list is the data structure that NOVA uses to manage free pmem blocks.
Each CPU has its own free list to avoid contention.
Each free list tracks its free pmem blocks (represented as range nodes) in a
red-black tree.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/Makefile |  2 +-
 fs/nova/balloc.c | 58 +
 fs/nova/balloc.h | 66 
 fs/nova/nova.h   |  1 +
 fs/nova/super.c  | 11 ++
 fs/nova/super.h  |  4 
 6 files changed, 141 insertions(+), 1 deletion(-)
 create mode 100644 fs/nova/balloc.c
 create mode 100644 fs/nova/balloc.h

diff --git a/fs/nova/Makefile b/fs/nova/Makefile
index 886356a..e2f7b07 100644
--- a/fs/nova/Makefile
+++ b/fs/nova/Makefile
@@ -4,4 +4,4 @@
 
 obj-$(CONFIG_NOVA_FS) += nova.o
 
-nova-y := bbuild.o inode.o rebuild.o stats.o super.o
+nova-y := balloc.o bbuild.o inode.o rebuild.o stats.o super.o
diff --git a/fs/nova/balloc.c b/fs/nova/balloc.c
new file mode 100644
index 000..450c942
--- /dev/null
+++ b/fs/nova/balloc.c
@@ -0,0 +1,58 @@
+/*
+ * NOVA persistent memory management
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu <jix...@cs.ucsd.edu>
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.storne...@gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include 
+#include 
+#include "nova.h"
+#include "inode.h"
+
+/*
+ * Allocate and initialise the array of free lists, one entry for each of
+ * the sbi->cpus CPUs.
+ *
+ * Every entry gets an empty red-black tree, an initialised spinlock and
+ * its own index; kcalloc() leaves all remaining fields zeroed.
+ *
+ * Returns 0 on success, or -ENOMEM if the array cannot be allocated.
+ */
+int nova_alloc_block_free_lists(struct super_block *sb)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	struct free_list *free_list;
+	int i;
+
+	sbi->free_lists = kcalloc(sbi->cpus, sizeof(struct free_list),
+				  GFP_KERNEL);
+
+	if (!sbi->free_lists)
+		return -ENOMEM;
+
+	for (i = 0; i < sbi->cpus; i++) {
+		free_list = nova_get_free_list(sb, i);
+		free_list->block_free_tree = RB_ROOT;
+		/* The lock is passed by address. */
+		spin_lock_init(&free_list->s_lock);
+		free_list->index = i;
+	}
+
+	return 0;
+}
+
+void nova_delete_free_lists(struct super_block *sb)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+
+   /* Each tree is freed in save_blocknode_mappings; only the free_lists
+    * array itself is released here, and the pointer is then reset to
+    * NULL.
+    */
+   kfree(sbi->free_lists);
+   sbi->free_lists = NULL;
+}
+
+
diff --git a/fs/nova/balloc.h b/fs/nova/balloc.h
new file mode 100644
index 000..e7c7a1d
--- /dev/null
+++ b/fs/nova/balloc.h
@@ -0,0 +1,66 @@
+#ifndef __BALLOC_H
+#define __BALLOC_H
+
+#include "inode.h"
+
+/* DRAM structure to hold a list of free PMEM blocks.
+ * nova_alloc_block_free_lists() allocates an array with one of these per
+ * CPU, and nova_get_free_list() returns the entry for a given CPU index.
+ */
+struct free_list {
+   spinlock_t s_lock;
+   struct rb_root  block_free_tree;
+   struct nova_range_node *first_node; // lowest address free range
+   struct nova_range_node *last_node; // highest address free range
+
+   int index; // Which CPU do I belong to?
+
+   /*
+    * Start and end of allocatable range, inclusive.
+    */
+   unsigned long   block_start;
+   unsigned long   block_end;
+
+   unsigned long   num_free_blocks;
+
+   /* How many nodes in the rb tree? */
+   unsigned long   num_blocknode;
+
+   u32 csum;   /* Protect integrity */
+
+   /* Statistics */
+   unsigned long   alloc_log_count;
+   unsigned long   alloc_data_count;
+   unsigned long   free_log_count;
+   unsigned long   free_data_count;
+   unsigned long   alloc_log_pages;
+   unsigned long   alloc_data_pages;
+   unsigned long   freed_log_pages;
+   unsigned long   freed_data_pages;
+
+   u64 padding[8]; /* Cache line break (8 x u64 = 64 bytes) */
+};
+
+/*
+ * Return a pointer to entry @cpu of the superblock's free_lists array.
+ * The index is not range-checked here.
+ */
+static inline
+struct free_list *nova_get_free_list(struct super_block *sb, int cpu)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+
+	return &sbi->free_lists[cpu];
+}
+
+enum nova_alloc_direction {ALLOC_FROM_HEAD = 0,
+  ALLOC_FROM_TAIL = 1};
+
+enum nova_alloc_init {ALLOC_NO_INIT = 0,
+ ALLOC_INIT_ZERO = 1};
+
+enum alloc_type {
+   LOG = 1,
+   DATA,
+};
+
+
+
+
+int nova_alloc_block_free_lists(struct super_block *sb);
+void nova_delete_free_lists(struct super_block *sb);
+
+#endif
diff --git a/fs/nova/nova.h b/fs/nova/nova.h
i

[RFC v2 07/83] Initialize inode_info and rebuild inode information in nova_iget().

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

Incomplete nova_rebuild_inode() implementation.
nova_rebuild_inode() will go through the inode log and rebuild
radix tree and metadata. Leave for later patches.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/bbuild.c  | 53 +
 fs/nova/bbuild.h  |  7 +++
 fs/nova/inode.c   |  6 ++
 fs/nova/nova.h| 10 ++
 fs/nova/rebuild.c | 48 
 5 files changed, 124 insertions(+)
 create mode 100644 fs/nova/bbuild.c
 create mode 100644 fs/nova/bbuild.h
 create mode 100644 fs/nova/rebuild.c

diff --git a/fs/nova/bbuild.c b/fs/nova/bbuild.c
new file mode 100644
index 000..8bc0545
--- /dev/null
+++ b/fs/nova/bbuild.c
@@ -0,0 +1,53 @@
+/*
+ * NOVA Recovery routines.
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu <jix...@cs.ucsd.edu>
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.storne...@gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "nova.h"
+#include "super.h"
+#include "inode.h"
+
+/*
+ * Reset an inode info header to its initial state.
+ *
+ * All sizes, counters, addresses and log pointers are cleared, the radix
+ * tree and the rw-semaphore are initialised, i_mode is taken from the
+ * caller and the block type is set to NOVA_DEFAULT_BLOCK_TYPE.
+ * @sb is not referenced by this function.
+ */
+void nova_init_header(struct super_block *sb,
+	struct nova_inode_info_header *sih, u16 i_mode)
+{
+	sih->log_pages = 0;
+	sih->i_size = 0;
+	sih->ino = 0;
+	sih->i_blocks = 0;
+	sih->pi_addr = 0;
+	INIT_RADIX_TREE(&sih->tree, GFP_ATOMIC);
+	sih->i_mode = i_mode;
+	sih->i_flags = 0;
+	sih->valid_entries = 0;
+	sih->num_entries = 0;
+	sih->last_setattr = 0;
+	sih->last_link_change = 0;
+	sih->last_dentry = 0;
+	sih->trans_id = 0;
+	sih->log_head = 0;
+	sih->log_tail = 0;
+	sih->i_blk_type = NOVA_DEFAULT_BLOCK_TYPE;
+	init_rwsem(&sih->i_sem);
+}
+
diff --git a/fs/nova/bbuild.h b/fs/nova/bbuild.h
new file mode 100644
index 000..162a832
--- /dev/null
+++ b/fs/nova/bbuild.h
@@ -0,0 +1,7 @@
+#ifndef __BBUILD_H
+#define __BBUILD_H
+
+void nova_init_header(struct super_block *sb,
+   struct nova_inode_info_header *sih, u16 i_mode);
+
+#endif
diff --git a/fs/nova/inode.c b/fs/nova/inode.c
index bfdc5dc..f7d6410 100644
--- a/fs/nova/inode.c
+++ b/fs/nova/inode.c
@@ -158,6 +158,12 @@ struct inode *nova_iget(struct super_block *sb, unsigned 
long ino)
goto fail;
}
 
+   err = nova_rebuild_inode(sb, si, ino, pi_addr, 1);
+   if (err) {
+   nova_dbg("%s: failed to rebuild inode %lu\n", __func__, ino);
+   goto fail;
+   }
+
err = nova_read_inode(sb, inode, pi_addr);
if (unlikely(err)) {
nova_dbg("%s: failed to read inode %lu\n", __func__, ino);
diff --git a/fs/nova/nova.h b/fs/nova/nova.h
index 5eb696c..ded9fe8 100644
--- a/fs/nova/nova.h
+++ b/fs/nova/nova.h
@@ -296,4 +296,14 @@ static inline u64 nova_get_epoch_id(struct super_block *sb)
 }
 
 #include "inode.h"
+#include "bbuild.h"
+
+/* == */
+/* ==  Function prototypes  = */
+/* == */
+
+/* rebuild.c */
+int nova_rebuild_inode(struct super_block *sb, struct nova_inode_info *si,
+   u64 ino, u64 pi_addr, int rebuild_dir);
+
 #endif /* __NOVA_H */
diff --git a/fs/nova/rebuild.c b/fs/nova/rebuild.c
new file mode 100644
index 000..0595851
--- /dev/null
+++ b/fs/nova/rebuild.c
@@ -0,0 +1,48 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Inode rebuild methods.
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu <jix...@cs.ucsd.edu>
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.storne...@gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include "nova.h"
+#include "in

[RFC v2 15/83] Add free list data structure.

2018-03-10 Thread Andiry Xu

From: Andiry Xu 

The free list is the data structure that NOVA uses to manage free pmem blocks.
Each CPU has its own free list to avoid contention.
Each free list tracks its free pmem blocks (represented as range nodes) in a
red-black tree.

Signed-off-by: Andiry Xu 
---
 fs/nova/Makefile |  2 +-
 fs/nova/balloc.c | 58 +
 fs/nova/balloc.h | 66 
 fs/nova/nova.h   |  1 +
 fs/nova/super.c  | 11 ++
 fs/nova/super.h  |  4 
 6 files changed, 141 insertions(+), 1 deletion(-)
 create mode 100644 fs/nova/balloc.c
 create mode 100644 fs/nova/balloc.h

diff --git a/fs/nova/Makefile b/fs/nova/Makefile
index 886356a..e2f7b07 100644
--- a/fs/nova/Makefile
+++ b/fs/nova/Makefile
@@ -4,4 +4,4 @@
 
 obj-$(CONFIG_NOVA_FS) += nova.o
 
-nova-y := bbuild.o inode.o rebuild.o stats.o super.o
+nova-y := balloc.o bbuild.o inode.o rebuild.o stats.o super.o
diff --git a/fs/nova/balloc.c b/fs/nova/balloc.c
new file mode 100644
index 000..450c942
--- /dev/null
+++ b/fs/nova/balloc.c
@@ -0,0 +1,58 @@
+/*
+ * NOVA persistent memory management
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu 
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli 
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include 
+#include 
+#include "nova.h"
+#include "inode.h"
+
+/*
+ * Allocate and initialise the array of free lists, one entry for each of
+ * the sbi->cpus CPUs.
+ *
+ * Every entry gets an empty red-black tree, an initialised spinlock and
+ * its own index; kcalloc() leaves all remaining fields zeroed.
+ *
+ * Returns 0 on success, or -ENOMEM if the array cannot be allocated.
+ */
+int nova_alloc_block_free_lists(struct super_block *sb)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	struct free_list *free_list;
+	int i;
+
+	sbi->free_lists = kcalloc(sbi->cpus, sizeof(struct free_list),
+				  GFP_KERNEL);
+
+	if (!sbi->free_lists)
+		return -ENOMEM;
+
+	for (i = 0; i < sbi->cpus; i++) {
+		free_list = nova_get_free_list(sb, i);
+		free_list->block_free_tree = RB_ROOT;
+		/* The lock is passed by address. */
+		spin_lock_init(&free_list->s_lock);
+		free_list->index = i;
+	}
+
+	return 0;
+}
+
+void nova_delete_free_lists(struct super_block *sb)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+
+   /* Each tree is freed in save_blocknode_mappings; only the free_lists
+    * array itself is released here, and the pointer is then reset to
+    * NULL.
+    */
+   kfree(sbi->free_lists);
+   sbi->free_lists = NULL;
+}
+
+
diff --git a/fs/nova/balloc.h b/fs/nova/balloc.h
new file mode 100644
index 000..e7c7a1d
--- /dev/null
+++ b/fs/nova/balloc.h
@@ -0,0 +1,66 @@
+#ifndef __BALLOC_H
+#define __BALLOC_H
+
+#include "inode.h"
+
+/* DRAM structure to hold a list of free PMEM blocks.
+ * nova_alloc_block_free_lists() allocates an array with one of these per
+ * CPU, and nova_get_free_list() returns the entry for a given CPU index.
+ */
+struct free_list {
+   spinlock_t s_lock;
+   struct rb_root  block_free_tree;
+   struct nova_range_node *first_node; // lowest address free range
+   struct nova_range_node *last_node; // highest address free range
+
+   int index; // Which CPU do I belong to?
+
+   /*
+    * Start and end of allocatable range, inclusive.
+    */
+   unsigned long   block_start;
+   unsigned long   block_end;
+
+   unsigned long   num_free_blocks;
+
+   /* How many nodes in the rb tree? */
+   unsigned long   num_blocknode;
+
+   u32 csum;   /* Protect integrity */
+
+   /* Statistics */
+   unsigned long   alloc_log_count;
+   unsigned long   alloc_data_count;
+   unsigned long   free_log_count;
+   unsigned long   free_data_count;
+   unsigned long   alloc_log_pages;
+   unsigned long   alloc_data_pages;
+   unsigned long   freed_log_pages;
+   unsigned long   freed_data_pages;
+
+   u64 padding[8]; /* Cache line break (8 x u64 = 64 bytes) */
+};
+
+/*
+ * Return a pointer to entry @cpu of the superblock's free_lists array.
+ * The index is not range-checked here.
+ */
+static inline
+struct free_list *nova_get_free_list(struct super_block *sb, int cpu)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+
+	return &sbi->free_lists[cpu];
+}
+
+enum nova_alloc_direction {ALLOC_FROM_HEAD = 0,
+  ALLOC_FROM_TAIL = 1};
+
+enum nova_alloc_init {ALLOC_NO_INIT = 0,
+ ALLOC_INIT_ZERO = 1};
+
+enum alloc_type {
+   LOG = 1,
+   DATA,
+};
+
+
+
+
+int nova_alloc_block_free_lists(struct super_block *sb);
+void nova_delete_free_lists(struct super_block *sb);
+
+#endif
diff --git a/fs/nova/nova.h b/fs/nova/nova.h
index e0e85fb..c4abdd8 100644
--- a/fs/nova/nova.h
+++ b/fs/nova/nova.h
@@ -310,6 +310,7 @@ struct n

[RFC v2 07/83] Initialize inode_info and rebuild inode information in nova_iget().

2018-03-10 Thread Andiry Xu

From: Andiry Xu 

Incomplete nova_rebuild_inode() implementation.
nova_rebuild_inode() will go through the inode log and rebuild
radix tree and metadata. Leave for later patches.

Signed-off-by: Andiry Xu 
---
 fs/nova/bbuild.c  | 53 +
 fs/nova/bbuild.h  |  7 +++
 fs/nova/inode.c   |  6 ++
 fs/nova/nova.h| 10 ++
 fs/nova/rebuild.c | 48 
 5 files changed, 124 insertions(+)
 create mode 100644 fs/nova/bbuild.c
 create mode 100644 fs/nova/bbuild.h
 create mode 100644 fs/nova/rebuild.c

diff --git a/fs/nova/bbuild.c b/fs/nova/bbuild.c
new file mode 100644
index 000..8bc0545
--- /dev/null
+++ b/fs/nova/bbuild.c
@@ -0,0 +1,53 @@
+/*
+ * NOVA Recovery routines.
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu 
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli 
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "nova.h"
+#include "super.h"
+#include "inode.h"
+
+/*
+ * Reset an inode info header to its initial state.
+ *
+ * All sizes, counters, addresses and log pointers are cleared, the radix
+ * tree and the rw-semaphore are initialised, i_mode is taken from the
+ * caller and the block type is set to NOVA_DEFAULT_BLOCK_TYPE.
+ * @sb is not referenced by this function.
+ */
+void nova_init_header(struct super_block *sb,
+	struct nova_inode_info_header *sih, u16 i_mode)
+{
+	sih->log_pages = 0;
+	sih->i_size = 0;
+	sih->ino = 0;
+	sih->i_blocks = 0;
+	sih->pi_addr = 0;
+	INIT_RADIX_TREE(&sih->tree, GFP_ATOMIC);
+	sih->i_mode = i_mode;
+	sih->i_flags = 0;
+	sih->valid_entries = 0;
+	sih->num_entries = 0;
+	sih->last_setattr = 0;
+	sih->last_link_change = 0;
+	sih->last_dentry = 0;
+	sih->trans_id = 0;
+	sih->log_head = 0;
+	sih->log_tail = 0;
+	sih->i_blk_type = NOVA_DEFAULT_BLOCK_TYPE;
+	init_rwsem(&sih->i_sem);
+}
+
diff --git a/fs/nova/bbuild.h b/fs/nova/bbuild.h
new file mode 100644
index 000..162a832
--- /dev/null
+++ b/fs/nova/bbuild.h
@@ -0,0 +1,7 @@
+#ifndef __BBUILD_H
+#define __BBUILD_H
+
+void nova_init_header(struct super_block *sb,
+   struct nova_inode_info_header *sih, u16 i_mode);
+
+#endif
diff --git a/fs/nova/inode.c b/fs/nova/inode.c
index bfdc5dc..f7d6410 100644
--- a/fs/nova/inode.c
+++ b/fs/nova/inode.c
@@ -158,6 +158,12 @@ struct inode *nova_iget(struct super_block *sb, unsigned long ino)
goto fail;
}
 
+   err = nova_rebuild_inode(sb, si, ino, pi_addr, 1);
+   if (err) {
+   nova_dbg("%s: failed to rebuild inode %lu\n", __func__, ino);
+   goto fail;
+   }
+
err = nova_read_inode(sb, inode, pi_addr);
if (unlikely(err)) {
nova_dbg("%s: failed to read inode %lu\n", __func__, ino);
diff --git a/fs/nova/nova.h b/fs/nova/nova.h
index 5eb696c..ded9fe8 100644
--- a/fs/nova/nova.h
+++ b/fs/nova/nova.h
@@ -296,4 +296,14 @@ static inline u64 nova_get_epoch_id(struct super_block *sb)
 }
 
 #include "inode.h"
+#include "bbuild.h"
+
+/* == */
+/* ==  Function prototypes  = */
+/* == */
+
+/* rebuild.c */
+int nova_rebuild_inode(struct super_block *sb, struct nova_inode_info *si,
+   u64 ino, u64 pi_addr, int rebuild_dir);
+
 #endif /* __NOVA_H */
diff --git a/fs/nova/rebuild.c b/fs/nova/rebuild.c
new file mode 100644
index 0000000..0595851
--- /dev/null
+++ b/fs/nova/rebuild.c
@@ -0,0 +1,48 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Inode rebuild methods.
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu <jix...@cs.ucsd.edu>
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.storne...@gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include "nova.h"
+#include "inode.h"
+
+/* initialize nova inode header and other DRAM data structures */
+int nova_rebuild_inode(struct super_block *sb, struct nova_inode_info *si,
+   u64 in

[RFC v2 11/83] Add timing and I/O statistics for performance analysis and profiling.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/Makefile |   2 +-
 fs/nova/nova.h   |  12 +++
 fs/nova/stats.c  | 263 +++
 fs/nova/stats.h  | 178 +
 fs/nova/super.c  |   6 ++
 5 files changed, 460 insertions(+), 1 deletion(-)
 create mode 100644 fs/nova/stats.c
 create mode 100644 fs/nova/stats.h

diff --git a/fs/nova/Makefile b/fs/nova/Makefile
index eb19646..886356a 100644
--- a/fs/nova/Makefile
+++ b/fs/nova/Makefile
@@ -4,4 +4,4 @@
 
 obj-$(CONFIG_NOVA_FS) += nova.o
 
-nova-y := bbuild.o inode.o rebuild.o super.o
+nova-y := bbuild.o inode.o rebuild.o stats.o super.o
diff --git a/fs/nova/nova.h b/fs/nova/nova.h
index ded9fe8..ba7ffca 100644
--- a/fs/nova/nova.h
+++ b/fs/nova/nova.h
@@ -48,6 +48,7 @@
 #include 
 
 #include "nova_def.h"
+#include "stats.h"
 
 #define PAGE_SHIFT_2M 21
 #define PAGE_SHIFT_1G 30
@@ -135,6 +136,10 @@ extern unsigned int nova_dbgmask;
 #define ANY_CPU (65536)
 #define FREE_BATCH  (16)
 
+
+extern int measure_timing;
+
+
 extern unsigned int blk_type_to_shift[NOVA_BLOCK_TYPE_MAX];
 extern unsigned int blk_type_to_size[NOVA_BLOCK_TYPE_MAX];
 
@@ -306,4 +311,11 @@ static inline u64 nova_get_epoch_id(struct super_block *sb)
 int nova_rebuild_inode(struct super_block *sb, struct nova_inode_info *si,
u64 ino, u64 pi_addr, int rebuild_dir);
 
+/* stats.c */
+void nova_get_timing_stats(void);
+void nova_get_IO_stats(void);
+void nova_print_timing_stats(struct super_block *sb);
+void nova_clear_stats(struct super_block *sb);
+void nova_print_inode(struct nova_inode *pi);
+
 #endif /* __NOVA_H */
diff --git a/fs/nova/stats.c b/fs/nova/stats.c
new file mode 100644
index 0000000..4b7c317
--- /dev/null
+++ b/fs/nova/stats.c
@@ -0,0 +1,263 @@
+/*
+ * NOVA File System statistics
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu <jix...@cs.ucsd.edu>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include "nova.h"
+
+const char *Timingstring[TIMING_NUM] = {
+   /* Init */
+   "================ Initialization ================",
+   "init",
+   "mount",
+   "ioremap",
+   "new_init",
+   "recovery",
+
+   /* Namei operations */
+   "============= Directory operations =============",
+   "create",
+   "lookup",
+   "link",
+   "unlink",
+   "symlink",
+   "mkdir",
+   "rmdir",
+   "mknod",
+   "rename",
+   "readdir",
+   "add_dentry",
+   "remove_dentry",
+   "setattr",
+   "setsize",
+
+   /* I/O operations */
+   "================ I/O operations ================",
+   "dax_read",
+   "cow_write",
+   "inplace_write",
+   "copy_to_nvmm",
+   "dax_get_block",
+   "read_iter",
+   "write_iter",
+
+   /* Memory operations */
+   "============== Memory operations ===============",
+   "memcpy_read_nvmm",
+   "memcpy_write_nvmm",
+   "memcpy_write_back_to_nvmm",
+   "handle_partial_block",
+
+   /* Memory management */
+   "============== Memory management ===============",
+   "alloc_blocks",
+   "new_data_blocks",
+   "new_log_blocks",
+   "free_blocks",
+   "free_data_blocks",
+   "free_log_blocks",
+
+   /* Transaction */
+   "================= Transaction ==================",
+   "transaction_new_inode",
+   "transaction_link_change",
+   "update_tail",
+
+   /* Logging */
+   "============= Logging operations ===============",
+   "append_dir_entry",
+   "append_file_entry",
+   "append_link_change",
+   "append_setattr",
+   "inplace_update_entry",
+
+   /* Tree */
+   "=============== Tree operations ================",
+   "checking_entry",
+   "assign_blocks",
+
+   /* GC */
+

[RFC v2 11/83] Add timing and I/O statistics for performance analysis and profiling.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/Makefile |   2 +-
 fs/nova/nova.h   |  12 +++
 fs/nova/stats.c  | 263 +++
 fs/nova/stats.h  | 178 +
 fs/nova/super.c  |   6 ++
 5 files changed, 460 insertions(+), 1 deletion(-)
 create mode 100644 fs/nova/stats.c
 create mode 100644 fs/nova/stats.h

diff --git a/fs/nova/Makefile b/fs/nova/Makefile
index eb19646..886356a 100644
--- a/fs/nova/Makefile
+++ b/fs/nova/Makefile
@@ -4,4 +4,4 @@
 
 obj-$(CONFIG_NOVA_FS) += nova.o
 
-nova-y := bbuild.o inode.o rebuild.o super.o
+nova-y := bbuild.o inode.o rebuild.o stats.o super.o
diff --git a/fs/nova/nova.h b/fs/nova/nova.h
index ded9fe8..ba7ffca 100644
--- a/fs/nova/nova.h
+++ b/fs/nova/nova.h
@@ -48,6 +48,7 @@
 #include 
 
 #include "nova_def.h"
+#include "stats.h"
 
 #define PAGE_SHIFT_2M 21
 #define PAGE_SHIFT_1G 30
@@ -135,6 +136,10 @@ extern unsigned int nova_dbgmask;
 #define ANY_CPU (65536)
 #define FREE_BATCH  (16)
 
+
+extern int measure_timing;
+
+
 extern unsigned int blk_type_to_shift[NOVA_BLOCK_TYPE_MAX];
 extern unsigned int blk_type_to_size[NOVA_BLOCK_TYPE_MAX];
 
@@ -306,4 +311,11 @@ static inline u64 nova_get_epoch_id(struct super_block *sb)
 int nova_rebuild_inode(struct super_block *sb, struct nova_inode_info *si,
u64 ino, u64 pi_addr, int rebuild_dir);
 
+/* stats.c */
+void nova_get_timing_stats(void);
+void nova_get_IO_stats(void);
+void nova_print_timing_stats(struct super_block *sb);
+void nova_clear_stats(struct super_block *sb);
+void nova_print_inode(struct nova_inode *pi);
+
 #endif /* __NOVA_H */
diff --git a/fs/nova/stats.c b/fs/nova/stats.c
new file mode 100644
index 0000000..4b7c317
--- /dev/null
+++ b/fs/nova/stats.c
@@ -0,0 +1,263 @@
+/*
+ * NOVA File System statistics
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu <jix...@cs.ucsd.edu>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include "nova.h"
+
+const char *Timingstring[TIMING_NUM] = {
+   /* Init */
+   "================ Initialization ================",
+   "init",
+   "mount",
+   "ioremap",
+   "new_init",
+   "recovery",
+
+   /* Namei operations */
+   "============= Directory operations =============",
+   "create",
+   "lookup",
+   "link",
+   "unlink",
+   "symlink",
+   "mkdir",
+   "rmdir",
+   "mknod",
+   "rename",
+   "readdir",
+   "add_dentry",
+   "remove_dentry",
+   "setattr",
+   "setsize",
+
+   /* I/O operations */
+   "================ I/O operations ================",
+   "dax_read",
+   "cow_write",
+   "inplace_write",
+   "copy_to_nvmm",
+   "dax_get_block",
+   "read_iter",
+   "write_iter",
+
+   /* Memory operations */
+   "============== Memory operations ===============",
+   "memcpy_read_nvmm",
+   "memcpy_write_nvmm",
+   "memcpy_write_back_to_nvmm",
+   "handle_partial_block",
+
+   /* Memory management */
+   "============== Memory management ===============",
+   "alloc_blocks",
+   "new_data_blocks",
+   "new_log_blocks",
+   "free_blocks",
+   "free_data_blocks",
+   "free_log_blocks",
+
+   /* Transaction */
+   "================= Transaction ==================",
+   "transaction_new_inode",
+   "transaction_link_change",
+   "update_tail",
+
+   /* Logging */
+   "============= Logging operations ===============",
+   "append_dir_entry",
+   "append_file_entry",
+   "append_link_change",
+   "append_setattr",
+   "inplace_update_entry",
+
+   /* Tree */
+   "=============== Tree operations ================",
+   "checking_entry",
+   "assign_blocks",
+
+   /* GC */
+   "============= Garbage collection ===============",

[RFC v2 12/83] Add timing for mount and init.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/super.c | 25 ++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/fs/nova/super.c b/fs/nova/super.c
index 9295d23..3efb560 100644
--- a/fs/nova/super.c
+++ b/fs/nova/super.c
@@ -347,6 +347,9 @@ static struct nova_inode *nova_init(struct super_block *sb,
struct nova_inode *root_i, *pi;
struct nova_super_block *super;
struct nova_sb_info *sbi = NOVA_SB(sb);
+   timing_t init_time;
+
+   NOVA_START_TIMING(new_init_t, init_time);
 
nova_info("creating an empty nova of size %lu\n", size);
sbi->num_blocks = ((unsigned long)(size) >> PAGE_SHIFT);
@@ -357,6 +360,7 @@ static struct nova_inode *nova_init(struct super_block *sb,
 
if (!nova_check_size(sb, size)) {
nova_warn("Specified NOVA size too small 0x%lx.\n", size);
+   NOVA_END_TIMING(new_init_t, init_time);
return ERR_PTR(-EINVAL);
}
 
@@ -399,6 +403,7 @@ static struct nova_inode *nova_init(struct super_block *sb,
PERSISTENT_MARK();
PERSISTENT_BARRIER();
nova_info("NOVA initialization finish\n");
+   NOVA_END_TIMING(new_init_t, init_time);
return root_i;
 }
 
@@ -473,15 +478,22 @@ static int nova_fill_super(struct super_block *sb, void *data, int silent)
unsigned long blocksize;
u32 random = 0;
int retval = -EINVAL;
+   timing_t mount_time;
+
+   NOVA_START_TIMING(mount_t, mount_time);
 
BUILD_BUG_ON(sizeof(struct nova_super_block) > NOVA_SB_SIZE);
 
sbi = kzalloc(sizeof(struct nova_sb_info), GFP_KERNEL);
-   if (!sbi)
+   if (!sbi) {
+   NOVA_END_TIMING(mount_t, mount_time);
return -ENOMEM;
+   }
+
sbi->nova_sb = kzalloc(sizeof(struct nova_super_block), GFP_KERNEL);
if (!sbi->nova_sb) {
kfree(sbi);
+   NOVA_END_TIMING(mount_t, mount_time);
return -ENOMEM;
}
 
@@ -591,6 +603,7 @@ static int nova_fill_super(struct super_block *sb, void *data, int silent)
nova_update_mount_time(sb);
 
retval = 0;
+   NOVA_END_TIMING(mount_t, mount_time);
return retval;
 
 out:
@@ -600,6 +613,7 @@ static int nova_fill_super(struct super_block *sb, void *data, int silent)
kfree(sbi->nova_sb);
kfree(sbi);
nova_dbg("%s failed: return %d\n", __func__, retval);
+   NOVA_END_TIMING(mount_t, mount_time);
return retval;
 }
 
@@ -701,6 +715,9 @@ static struct file_system_type nova_fs_type = {
 static int __init init_nova_fs(void)
 {
int rc = 0;
+   timing_t init_time;
+
+   NOVA_START_TIMING(init_t, init_time);
 
nova_dbg("%s: %d cpus online\n", __func__, num_online_cpus());
if (arch_has_clwb())
@@ -711,17 +728,19 @@ static int __init init_nova_fs(void)
 
rc = init_inodecache();
if (rc)
-   return rc;
+   goto out;
 
rc = register_filesystem(&nova_fs_type);
if (rc)
goto out1;
 
+out:
+   NOVA_END_TIMING(init_t, init_time);
return rc;
 
 out1:
destroy_inodecache();
-   return rc;
+   goto out;
 }
 
 static void __exit exit_nova_fs(void)
-- 
2.7.4

[RFC v2 12/83] Add timing for mount and init.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/super.c | 25 ++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/fs/nova/super.c b/fs/nova/super.c
index 9295d23..3efb560 100644
--- a/fs/nova/super.c
+++ b/fs/nova/super.c
@@ -347,6 +347,9 @@ static struct nova_inode *nova_init(struct super_block *sb,
struct nova_inode *root_i, *pi;
struct nova_super_block *super;
struct nova_sb_info *sbi = NOVA_SB(sb);
+   timing_t init_time;
+
+   NOVA_START_TIMING(new_init_t, init_time);
 
nova_info("creating an empty nova of size %lu\n", size);
sbi->num_blocks = ((unsigned long)(size) >> PAGE_SHIFT);
@@ -357,6 +360,7 @@ static struct nova_inode *nova_init(struct super_block *sb,
 
if (!nova_check_size(sb, size)) {
nova_warn("Specified NOVA size too small 0x%lx.\n", size);
+   NOVA_END_TIMING(new_init_t, init_time);
return ERR_PTR(-EINVAL);
}
 
@@ -399,6 +403,7 @@ static struct nova_inode *nova_init(struct super_block *sb,
PERSISTENT_MARK();
PERSISTENT_BARRIER();
nova_info("NOVA initialization finish\n");
+   NOVA_END_TIMING(new_init_t, init_time);
return root_i;
 }
 
@@ -473,15 +478,22 @@ static int nova_fill_super(struct super_block *sb, void *data, int silent)
unsigned long blocksize;
u32 random = 0;
int retval = -EINVAL;
+   timing_t mount_time;
+
+   NOVA_START_TIMING(mount_t, mount_time);
 
BUILD_BUG_ON(sizeof(struct nova_super_block) > NOVA_SB_SIZE);
 
sbi = kzalloc(sizeof(struct nova_sb_info), GFP_KERNEL);
-   if (!sbi)
+   if (!sbi) {
+   NOVA_END_TIMING(mount_t, mount_time);
return -ENOMEM;
+   }
+
sbi->nova_sb = kzalloc(sizeof(struct nova_super_block), GFP_KERNEL);
if (!sbi->nova_sb) {
kfree(sbi);
+   NOVA_END_TIMING(mount_t, mount_time);
return -ENOMEM;
}
 
@@ -591,6 +603,7 @@ static int nova_fill_super(struct super_block *sb, void *data, int silent)
nova_update_mount_time(sb);
 
retval = 0;
+   NOVA_END_TIMING(mount_t, mount_time);
return retval;
 
 out:
@@ -600,6 +613,7 @@ static int nova_fill_super(struct super_block *sb, void *data, int silent)
kfree(sbi->nova_sb);
kfree(sbi);
nova_dbg("%s failed: return %d\n", __func__, retval);
+   NOVA_END_TIMING(mount_t, mount_time);
return retval;
 }
 
@@ -701,6 +715,9 @@ static struct file_system_type nova_fs_type = {
 static int __init init_nova_fs(void)
 {
int rc = 0;
+   timing_t init_time;
+
+   NOVA_START_TIMING(init_t, init_time);
 
nova_dbg("%s: %d cpus online\n", __func__, num_online_cpus());
if (arch_has_clwb())
@@ -711,17 +728,19 @@ static int __init init_nova_fs(void)
 
rc = init_inodecache();
if (rc)
-   return rc;
+   goto out;
 
rc = register_filesystem(&nova_fs_type);
if (rc)
goto out1;
 
+out:
+   NOVA_END_TIMING(init_t, init_time);
return rc;
 
 out1:
destroy_inodecache();
-   return rc;
+   goto out;
 }
 
 static void __exit exit_nova_fs(void)
-- 
2.7.4

[RFC v2 14/83] Add range node kmem cache.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

Range node specifies a range of [start, end] and is managed by a red-black tree.
NOVA uses range node to manage NVM allocator and inodes being used.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/nova.h  |  8 
 fs/nova/super.c | 45 ++---
 fs/nova/super.h |  2 ++
 3 files changed, 52 insertions(+), 3 deletions(-)

diff --git a/fs/nova/nova.h b/fs/nova/nova.h
index ba7ffca..e0e85fb 100644
--- a/fs/nova/nova.h
+++ b/fs/nova/nova.h
@@ -301,6 +301,14 @@ static inline u64 nova_get_epoch_id(struct super_block *sb)
 }
 
 #include "inode.h"
+
+/* A node in the RB tree representing a range of pages */
+struct nova_range_node {
+   struct rb_node node;
+   unsigned long range_low;
+   unsigned long range_high;
+};
+
 #include "bbuild.h"
 
 /* == */
diff --git a/fs/nova/super.c b/fs/nova/super.c
index f41cc04..aec1cd3 100644
--- a/fs/nova/super.c
+++ b/fs/nova/super.c
@@ -52,6 +52,7 @@ MODULE_PARM_DESC(nova_dbgmask, "Control debugging output");
 static struct super_operations nova_sops;
 
 static struct kmem_cache *nova_inode_cachep;
+static struct kmem_cache *nova_range_node_cachep;
 
 
 /* FIXME: should the following variable be one per NOVA instance? */
@@ -686,6 +687,20 @@ static void nova_put_super(struct super_block *sb)
sb->s_fs_info = NULL;
 }
 
+inline void nova_free_range_node(struct nova_range_node *node)
+{
+   kmem_cache_free(nova_range_node_cachep, node);
+}
+
+inline struct nova_range_node *nova_alloc_range_node(struct super_block *sb)
+{
+   struct nova_range_node *p;
+
+   p = (struct nova_range_node *)
+   kmem_cache_zalloc(nova_range_node_cachep, GFP_NOFS);
+   return p;
+}
+
 static struct inode *nova_alloc_inode(struct super_block *sb)
 {
struct nova_inode_info *vi;
@@ -719,6 +734,17 @@ static void init_once(void *foo)
inode_init_once(&vi->vfs_inode);
 }
 
+static int __init init_rangenode_cache(void)
+{
+   nova_range_node_cachep = kmem_cache_create("nova_range_node_cache",
+   sizeof(struct nova_range_node),
+   0, (SLAB_RECLAIM_ACCOUNT |
+   SLAB_MEM_SPREAD), NULL);
+   if (nova_range_node_cachep == NULL)
+   return -ENOMEM;
+   return 0;
+}
+
 static int __init init_inodecache(void)
 {
nova_inode_cachep = kmem_cache_create("nova_inode_cache",
@@ -740,6 +766,11 @@ static void destroy_inodecache(void)
kmem_cache_destroy(nova_inode_cachep);
 }
 
+static void destroy_rangenode_cache(void)
+{
+   kmem_cache_destroy(nova_range_node_cachep);
+}
+
 
 /*
  * the super block writes are all done "on the fly", so the
@@ -781,20 +812,27 @@ static int __init init_nova_fs(void)
nova_info("Arch new instructions support: CLWB %s\n",
support_clwb ? "YES" : "NO");
 
-   rc = init_inodecache();
+   rc = init_rangenode_cache();
if (rc)
goto out;
 
-   rc = register_filesystem(&nova_fs_type);
+   rc = init_inodecache();
if (rc)
goto out1;
 
+   rc = register_filesystem(&nova_fs_type);
+   if (rc)
+   goto out2;
+
 out:
NOVA_END_TIMING(init_t, init_time);
return rc;
 
-out1:
+out2:
destroy_inodecache();
+
+out1:
+   destroy_rangenode_cache();
goto out;
 }
 
@@ -802,6 +840,7 @@ static void __exit exit_nova_fs(void)
 {
unregister_filesystem(&nova_fs_type);
destroy_inodecache();
+   destroy_rangenode_cache();
 }
 
 MODULE_AUTHOR("Andiry Xu <jix...@cs.ucsd.edu>");
diff --git a/fs/nova/super.h b/fs/nova/super.h
index cb53908..b478080 100644
--- a/fs/nova/super.h
+++ b/fs/nova/super.h
@@ -145,5 +145,7 @@ static inline struct nova_super_block *nova_get_super(struct super_block *sb)
 }
 
 extern void nova_error_mng(struct super_block *sb, const char *fmt, ...);
+extern struct nova_range_node *nova_alloc_range_node(struct super_block *sb);
+extern void nova_free_range_node(struct nova_range_node *node);
 
 #endif
-- 
2.7.4

[RFC v2 14/83] Add range node kmem cache.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

Range node specifies a range of [start, end] and is managed by a red-black tree.
NOVA uses range node to manage NVM allocator and inodes being used.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/nova.h  |  8 
 fs/nova/super.c | 45 ++---
 fs/nova/super.h |  2 ++
 3 files changed, 52 insertions(+), 3 deletions(-)

diff --git a/fs/nova/nova.h b/fs/nova/nova.h
index ba7ffca..e0e85fb 100644
--- a/fs/nova/nova.h
+++ b/fs/nova/nova.h
@@ -301,6 +301,14 @@ static inline u64 nova_get_epoch_id(struct super_block *sb)
 }
 
 #include "inode.h"
+
+/* A node in the RB tree representing a range of pages */
+struct nova_range_node {
+   struct rb_node node;
+   unsigned long range_low;
+   unsigned long range_high;
+};
+
 #include "bbuild.h"
 
 /* == */
diff --git a/fs/nova/super.c b/fs/nova/super.c
index f41cc04..aec1cd3 100644
--- a/fs/nova/super.c
+++ b/fs/nova/super.c
@@ -52,6 +52,7 @@ MODULE_PARM_DESC(nova_dbgmask, "Control debugging output");
 static struct super_operations nova_sops;
 
 static struct kmem_cache *nova_inode_cachep;
+static struct kmem_cache *nova_range_node_cachep;
 
 
 /* FIXME: should the following variable be one per NOVA instance? */
@@ -686,6 +687,20 @@ static void nova_put_super(struct super_block *sb)
sb->s_fs_info = NULL;
 }
 
+inline void nova_free_range_node(struct nova_range_node *node)
+{
+   kmem_cache_free(nova_range_node_cachep, node);
+}
+
+inline struct nova_range_node *nova_alloc_range_node(struct super_block *sb)
+{
+   struct nova_range_node *p;
+
+   p = (struct nova_range_node *)
+   kmem_cache_zalloc(nova_range_node_cachep, GFP_NOFS);
+   return p;
+}
+
 static struct inode *nova_alloc_inode(struct super_block *sb)
 {
struct nova_inode_info *vi;
@@ -719,6 +734,17 @@ static void init_once(void *foo)
inode_init_once(&vi->vfs_inode);
 }
 
+static int __init init_rangenode_cache(void)
+{
+   nova_range_node_cachep = kmem_cache_create("nova_range_node_cache",
+   sizeof(struct nova_range_node),
+   0, (SLAB_RECLAIM_ACCOUNT |
+   SLAB_MEM_SPREAD), NULL);
+   if (nova_range_node_cachep == NULL)
+   return -ENOMEM;
+   return 0;
+}
+
 static int __init init_inodecache(void)
 {
nova_inode_cachep = kmem_cache_create("nova_inode_cache",
@@ -740,6 +766,11 @@ static void destroy_inodecache(void)
kmem_cache_destroy(nova_inode_cachep);
 }
 
+static void destroy_rangenode_cache(void)
+{
+   kmem_cache_destroy(nova_range_node_cachep);
+}
+
 
 /*
  * the super block writes are all done "on the fly", so the
@@ -781,20 +812,27 @@ static int __init init_nova_fs(void)
nova_info("Arch new instructions support: CLWB %s\n",
support_clwb ? "YES" : "NO");
 
-   rc = init_inodecache();
+   rc = init_rangenode_cache();
if (rc)
goto out;
 
-   rc = register_filesystem(&nova_fs_type);
+   rc = init_inodecache();
if (rc)
goto out1;
 
+   rc = register_filesystem(&nova_fs_type);
+   if (rc)
+   goto out2;
+
 out:
NOVA_END_TIMING(init_t, init_time);
return rc;
 
-out1:
+out2:
destroy_inodecache();
+
+out1:
+   destroy_rangenode_cache();
goto out;
 }
 
@@ -802,6 +840,7 @@ static void __exit exit_nova_fs(void)
 {
unregister_filesystem(&nova_fs_type);
destroy_inodecache();
+   destroy_rangenode_cache();
 }
 
 MODULE_AUTHOR("Andiry Xu <jix...@cs.ucsd.edu>");
diff --git a/fs/nova/super.h b/fs/nova/super.h
index cb53908..b478080 100644
--- a/fs/nova/super.h
+++ b/fs/nova/super.h
@@ -145,5 +145,7 @@ static inline struct nova_super_block *nova_get_super(struct super_block *sb)
 }
 
 extern void nova_error_mng(struct super_block *sb, const char *fmt, ...);
+extern struct nova_range_node *nova_alloc_range_node(struct super_block *sb);
+extern void nova_free_range_node(struct nova_range_node *node);
 
 #endif
-- 
2.7.4

[RFC v2 16/83] Initialize block map and free lists in nova_init().

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

NOVA divides the pmem range equally among per-CPU free lists,
and formats the red-black trees by inserting the initial free range.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/balloc.c | 161 +++
 fs/nova/balloc.h |  13 -
 fs/nova/super.c  |   2 +
 3 files changed, 175 insertions(+), 1 deletion(-)

diff --git a/fs/nova/balloc.c b/fs/nova/balloc.c
index 450c942..cb627db 100644
--- a/fs/nova/balloc.c
+++ b/fs/nova/balloc.c
@@ -55,4 +55,165 @@ void nova_delete_free_lists(struct super_block *sb)
sbi->free_lists = NULL;
 }
 
+// Initialize a free list.  Each CPU gets an equal share of the block space to
+// manage.
+static void nova_init_free_list(struct super_block *sb,
+   struct free_list *free_list, int index)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   unsigned long per_list_blocks;
+
+   per_list_blocks = sbi->num_blocks / sbi->cpus;
+
+   free_list->block_start = per_list_blocks * index;
+   free_list->block_end = free_list->block_start +
+   per_list_blocks - 1;
+   if (index == 0)
+   free_list->block_start += sbi->head_reserved_blocks;
+   if (index == sbi->cpus - 1)
+   free_list->block_end -= sbi->tail_reserved_blocks;
+}
+
+inline struct nova_range_node *nova_alloc_blocknode(struct super_block *sb)
+{
+   return nova_alloc_range_node(sb);
+}
+
+inline void nova_free_blocknode(struct super_block *sb,
+   struct nova_range_node *node)
+{
+   nova_free_range_node(node);
+}
+
+void nova_init_blockmap(struct super_block *sb, int recovery)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   struct rb_root *tree;
+   struct nova_range_node *blknode;
+   struct free_list *free_list;
+   int i;
+   int ret;
+
+   /* Divide the block range among per-CPU free lists */
+   sbi->per_list_blocks = sbi->num_blocks / sbi->cpus;
+   for (i = 0; i < sbi->cpus; i++) {
+   free_list = nova_get_free_list(sb, i);
+   tree = &(free_list->block_free_tree);
+   nova_init_free_list(sb, free_list, i);
+
+   /* For recovery, update these fields later */
+   if (recovery == 0) {
+   free_list->num_free_blocks = free_list->block_end -
+   free_list->block_start + 1;
+
+   blknode = nova_alloc_blocknode(sb);
+   if (blknode == NULL)
+   return;
+   blknode->range_low = free_list->block_start;
+   blknode->range_high = free_list->block_end;
+   ret = nova_insert_blocktree(sbi, tree, blknode);
+   if (ret) {
+   nova_err(sb, "%s failed\n", __func__);
+   nova_free_blocknode(sb, blknode);
+   return;
+   }
+   free_list->first_node = blknode;
+   free_list->last_node = blknode;
+   free_list->num_blocknode = 1;
+   }
+
+   nova_dbgv("%s: free list %d: block start %lu, end %lu, %lu free blocks\n",
+ __func__, i,
+ free_list->block_start,
+ free_list->block_end,
+ free_list->num_free_blocks);
+   }
+}
+
+static inline int nova_rbtree_compare_rangenode(struct nova_range_node *curr,
+   unsigned long range_low)
+{
+   if (range_low < curr->range_low)
+   return -1;
+   if (range_low > curr->range_high)
+   return 1;
 
+   return 0;
+}
+
+int nova_find_range_node(struct nova_sb_info *sbi,
+   struct rb_root *tree, unsigned long range_low,
+   struct nova_range_node **ret_node)
+{
+   struct nova_range_node *curr = NULL;
+   struct rb_node *temp;
+   int compVal;
+   int ret = 0;
+
+   temp = tree->rb_node;
+
+   while (temp) {
+   curr = container_of(temp, struct nova_range_node, node);
+   compVal = nova_rbtree_compare_rangenode(curr, range_low);
+
+   if (compVal == -1) {
+   temp = temp->rb_left;
+   } else if (compVal == 1) {
+   temp = temp->rb_right;
+   } else {
+   ret = 1;
+   break;
+   }
+   }
+
+   *ret_node = curr;
+   return ret;
+}
+
+
+int nova_insert_range_node(struct rb_root *tree,
+   struct nova_range_node *new_node)
+{
+   struct nova_range_node *curr;
+   struct rb_node **temp, *parent;
+   int compVal;
+
+

[RFC v2 16/83] Initialize block map and free lists in nova_init().

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

NOVA divides the pmem range equally among per-CPU free lists,
and formats the red-black trees by inserting the initial free range.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/balloc.c | 161 +++
 fs/nova/balloc.h |  13 -
 fs/nova/super.c  |   2 +
 3 files changed, 175 insertions(+), 1 deletion(-)

diff --git a/fs/nova/balloc.c b/fs/nova/balloc.c
index 450c942..cb627db 100644
--- a/fs/nova/balloc.c
+++ b/fs/nova/balloc.c
@@ -55,4 +55,165 @@ void nova_delete_free_lists(struct super_block *sb)
sbi->free_lists = NULL;
 }
 
+// Initialize a free list.  Each CPU gets an equal share of the block space to
+// manage.
+static void nova_init_free_list(struct super_block *sb,
+   struct free_list *free_list, int index)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   unsigned long per_list_blocks;
+
+   per_list_blocks = sbi->num_blocks / sbi->cpus;
+
+   free_list->block_start = per_list_blocks * index;
+   free_list->block_end = free_list->block_start +
+   per_list_blocks - 1;
+   if (index == 0)
+   free_list->block_start += sbi->head_reserved_blocks;
+   if (index == sbi->cpus - 1)
+   free_list->block_end -= sbi->tail_reserved_blocks;
+}
+
+inline struct nova_range_node *nova_alloc_blocknode(struct super_block *sb)
+{
+   return nova_alloc_range_node(sb);
+}
+
+inline void nova_free_blocknode(struct super_block *sb,
+   struct nova_range_node *node)
+{
+   nova_free_range_node(node);
+}
+
+void nova_init_blockmap(struct super_block *sb, int recovery)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   struct rb_root *tree;
+   struct nova_range_node *blknode;
+   struct free_list *free_list;
+   int i;
+   int ret;
+
+   /* Divide the block range among per-CPU free lists */
+   sbi->per_list_blocks = sbi->num_blocks / sbi->cpus;
+   for (i = 0; i < sbi->cpus; i++) {
+   free_list = nova_get_free_list(sb, i);
+   tree = &(free_list->block_free_tree);
+   nova_init_free_list(sb, free_list, i);
+
+   /* For recovery, update these fields later */
+   if (recovery == 0) {
+   free_list->num_free_blocks = free_list->block_end -
+   free_list->block_start + 1;
+
+   blknode = nova_alloc_blocknode(sb);
+   if (blknode == NULL)
+   return;
+   blknode->range_low = free_list->block_start;
+   blknode->range_high = free_list->block_end;
+   ret = nova_insert_blocktree(sbi, tree, blknode);
+   if (ret) {
+   nova_err(sb, "%s failed\n", __func__);
+   nova_free_blocknode(sb, blknode);
+   return;
+   }
+   free_list->first_node = blknode;
+   free_list->last_node = blknode;
+   free_list->num_blocknode = 1;
+   }
+
+   nova_dbgv("%s: free list %d: block start %lu, end %lu, %lu free blocks\n",
+ __func__, i,
+ free_list->block_start,
+ free_list->block_end,
+ free_list->num_free_blocks);
+   }
+}
+
+static inline int nova_rbtree_compare_rangenode(struct nova_range_node *curr,
+   unsigned long range_low)
+{
+   if (range_low < curr->range_low)
+   return -1;
+   if (range_low > curr->range_high)
+   return 1;
 
+   return 0;
+}
+
+int nova_find_range_node(struct nova_sb_info *sbi,
+   struct rb_root *tree, unsigned long range_low,
+   struct nova_range_node **ret_node)
+{
+   struct nova_range_node *curr = NULL;
+   struct rb_node *temp;
+   int compVal;
+   int ret = 0;
+
+   temp = tree->rb_node;
+
+   while (temp) {
+   curr = container_of(temp, struct nova_range_node, node);
+   compVal = nova_rbtree_compare_rangenode(curr, range_low);
+
+   if (compVal == -1) {
+   temp = temp->rb_left;
+   } else if (compVal == 1) {
+   temp = temp->rb_right;
+   } else {
+   ret = 1;
+   break;
+   }
+   }
+
+   *ret_node = curr;
+   return ret;
+}
+
+
+int nova_insert_range_node(struct rb_root *tree,
+   struct nova_range_node *new_node)
+{
+   struct nova_range_node *curr;
+   struct rb_node **temp, *parent;
+   int compVal;
+
+   temp = &(tree->rb_node);
+

[RFC v2 19/83] Add pmem block free routines.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

NOVA allocates/frees log pages and data pages in the same way.
For block free, NOVA first gets the corresponding free list by
checking the block number, and then inserts the freed range in
the red-black tree. NOVA always merges adjacent free ranges if possible.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/balloc.c | 223 +++
 fs/nova/balloc.h |   8 ++
 fs/nova/nova.h   |  23 ++
 3 files changed, 254 insertions(+)

diff --git a/fs/nova/balloc.c b/fs/nova/balloc.c
index 0742fe0..9108721 100644
--- a/fs/nova/balloc.c
+++ b/fs/nova/balloc.c
@@ -218,6 +218,229 @@ inline int nova_insert_blocktree(struct nova_sb_info *sbi,
return ret;
 }
 
+/* Used for both block free tree and inode inuse tree */
+int nova_find_free_slot(struct nova_sb_info *sbi,
+   struct rb_root *tree, unsigned long range_low,
+   unsigned long range_high, struct nova_range_node **prev,
+   struct nova_range_node **next)
+{
+   struct nova_range_node *ret_node = NULL;
+   struct rb_node *tmp;
+   int check_prev = 0, check_next = 0;
+   int ret;
+
+   ret = nova_find_range_node(sbi, tree, range_low, &ret_node);
+   if (ret) {
+   nova_dbg("%s ERROR: %lu - %lu already in free list\n",
+   __func__, range_low, range_high);
+   return -EINVAL;
+   }
+
+   if (!ret_node) {
+   *prev = *next = NULL;
+   } else if (ret_node->range_high < range_low) {
+   *prev = ret_node;
+   tmp = rb_next(&ret_node->node);
+   if (tmp) {
+   *next = container_of(tmp, struct nova_range_node, node);
+   check_next = 1;
+   } else {
+   *next = NULL;
+   }
+   } else if (ret_node->range_low > range_high) {
+   *next = ret_node;
+   tmp = rb_prev(&ret_node->node);
+   if (tmp) {
+   *prev = container_of(tmp, struct nova_range_node, node);
+   check_prev = 1;
+   } else {
+   *prev = NULL;
+   }
+   } else {
+   nova_dbg("%s ERROR: %lu - %lu overlaps with existing node %lu - 
%lu\n",
+__func__, range_low, range_high, ret_node->range_low,
+   ret_node->range_high);
+   return -EINVAL;
+   }
+
+   return 0;
+}
+
+/*
+ * blocknr: start block number
+ * num: number of freed pages
+ * btype: is large page?
+ * log_page: is log page?
+ */
+static int nova_free_blocks(struct super_block *sb, unsigned long blocknr,
+   int num, unsigned short btype, int log_page)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   struct rb_root *tree;
+   unsigned long block_low;
+   unsigned long block_high;
+   unsigned long num_blocks = 0;
+   struct nova_range_node *prev = NULL;
+   struct nova_range_node *next = NULL;
+   struct nova_range_node *curr_node;
+   struct free_list *free_list;
+   int cpuid;
+   int new_node_used = 0;
+   int ret;
+   timing_t free_time;
+
+   if (num <= 0) {
+   nova_dbg("%s ERROR: free %d\n", __func__, num);
+   return -EINVAL;
+   }
+
+   NOVA_START_TIMING(free_blocks_t, free_time);
+   cpuid = blocknr / sbi->per_list_blocks;
+
+   /* Pre-allocate blocknode */
+   curr_node = nova_alloc_blocknode(sb);
+   if (curr_node == NULL) {
+   /* returning without freeing the block*/
+   NOVA_END_TIMING(free_blocks_t, free_time);
+   return -ENOMEM;
+   }
+
+   free_list = nova_get_free_list(sb, cpuid);
+   spin_lock(&free_list->s_lock);
+
+   tree = &(free_list->block_free_tree);
+
+   num_blocks = nova_get_numblocks(btype) * num;
+   block_low = blocknr;
+   block_high = blocknr + num_blocks - 1;
+
+   nova_dbgv("Free: %lu - %lu\n", block_low, block_high);
+
+   if (blocknr < free_list->block_start ||
+   blocknr + num > free_list->block_end + 1) {
+   nova_err(sb, "free blocks %lu to %lu, free list %d, start %lu, 
end %lu\n",
+   blocknr, blocknr + num - 1,
+   free_list->index,
+   free_list->block_start,
+   free_list->block_end);
+   ret = -EIO;
+   goto out;
+   }
+
+   ret = nova_find_free_slot(sbi, tree, block_low,
+   block_high, &prev, &next);
+
+   if (ret) {
+   nova_dbg("%s: find free slot fail: %d\n", __func__, ret);
+   goto out;
+   }
+
+   if (prev && next && (block_low == prev-

[RFC v2 17/83] Add statfs support.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/balloc.c | 18 ++
 fs/nova/balloc.h |  1 +
 fs/nova/super.c  | 19 +++
 3 files changed, 38 insertions(+)

diff --git a/fs/nova/balloc.c b/fs/nova/balloc.c
index cb627db..0742fe0 100644
--- a/fs/nova/balloc.c
+++ b/fs/nova/balloc.c
@@ -217,3 +217,21 @@ inline int nova_insert_blocktree(struct nova_sb_info *sbi,
 
return ret;
 }
+
+/* We do not take locks so it's inaccurate */
+unsigned long nova_count_free_blocks(struct super_block *sb)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   struct free_list *free_list;
+   unsigned long num_free_blocks = 0;
+   int i;
+
+   for (i = 0; i < sbi->cpus; i++) {
+   free_list = nova_get_free_list(sb, i);
+   num_free_blocks += free_list->num_free_blocks;
+   }
+
+   return num_free_blocks;
+}
+
+
diff --git a/fs/nova/balloc.h b/fs/nova/balloc.h
index 57a93e4..537532e 100644
--- a/fs/nova/balloc.h
+++ b/fs/nova/balloc.h
@@ -66,6 +66,7 @@ inline struct nova_range_node *nova_alloc_blocknode(struct 
super_block *sb);
 inline void nova_free_blocknode(struct super_block *sb,
struct nova_range_node *bnode);
 extern void nova_init_blockmap(struct super_block *sb, int recovery);
+extern unsigned long nova_count_free_blocks(struct super_block *sb);
 inline int nova_insert_blocktree(struct nova_sb_info *sbi,
struct rb_root *tree, struct nova_range_node *new_node);
 
diff --git a/fs/nova/super.c b/fs/nova/super.c
index 9762f26..3500d19 100644
--- a/fs/nova/super.c
+++ b/fs/nova/super.c
@@ -629,6 +629,24 @@ static int nova_fill_super(struct super_block *sb, void 
*data, int silent)
return retval;
 }
 
+static int nova_statfs(struct dentry *d, struct kstatfs *buf)
+{
+   struct super_block *sb = d->d_sb;
+   struct nova_sb_info *sbi = (struct nova_sb_info *)sb->s_fs_info;
+
+   buf->f_type = NOVA_SUPER_MAGIC;
+   buf->f_bsize = sb->s_blocksize;
+
+   buf->f_blocks = sbi->num_blocks;
+   buf->f_bfree = buf->f_bavail = nova_count_free_blocks(sb);
+   buf->f_files = LONG_MAX;
+   buf->f_ffree = LONG_MAX - sbi->s_inodes_used_count;
+   buf->f_namelen = NOVA_NAME_LEN;
+   nova_dbg_verbose("nova_stats: total 4k free blocks 0x%llx\n",
+   buf->f_bfree);
+   return 0;
+}
+
 static int nova_show_options(struct seq_file *seq, struct dentry *root)
 {
struct nova_sb_info *sbi = NOVA_SB(root->d_sb);
@@ -794,6 +812,7 @@ static struct super_operations nova_sops = {
.alloc_inode= nova_alloc_inode,
.destroy_inode  = nova_destroy_inode,
.put_super  = nova_put_super,
+   .statfs = nova_statfs,
.remount_fs = nova_remount,
.show_options   = nova_show_options,
 };
-- 
2.7.4

[RFC v2 13/83] Add remount_fs and show_options methods.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/super.c | 55 +++
 1 file changed, 55 insertions(+)

diff --git a/fs/nova/super.c b/fs/nova/super.c
index 3efb560..f41cc04 100644
--- a/fs/nova/super.c
+++ b/fs/nova/super.c
@@ -617,6 +617,59 @@ static int nova_fill_super(struct super_block *sb, void 
*data, int silent)
return retval;
 }
 
+static int nova_show_options(struct seq_file *seq, struct dentry *root)
+{
+   struct nova_sb_info *sbi = NOVA_SB(root->d_sb);
+
+   if (sbi->mode != (0777 | S_ISVTX))
+   seq_printf(seq, ",mode=%03o", sbi->mode);
+   if (uid_valid(sbi->uid))
+   seq_printf(seq, ",uid=%u", from_kuid(&init_user_ns, sbi->uid));
+   if (gid_valid(sbi->gid))
+   seq_printf(seq, ",gid=%u", from_kgid(&init_user_ns, sbi->gid));
+   if (test_opt(root->d_sb, ERRORS_RO))
+   seq_puts(seq, ",errors=remount-ro");
+   if (test_opt(root->d_sb, ERRORS_PANIC))
+   seq_puts(seq, ",errors=panic");
+   if (test_opt(root->d_sb, DAX))
+   seq_puts(seq, ",dax");
+
+   return 0;
+}
+
+static int nova_remount(struct super_block *sb, int *mntflags, char *data)
+{
+   unsigned long old_sb_flags;
+   unsigned long old_mount_opt;
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   int ret = -EINVAL;
+
+   /* Store the old options */
+   mutex_lock(&sbi->s_lock);
+   old_sb_flags = sb->s_flags;
+   old_mount_opt = sbi->s_mount_opt;
+
+   if (nova_parse_options(data, sbi, 1))
+   goto restore_opt;
+
+   sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+ ((sbi->s_mount_opt & NOVA_MOUNT_POSIX_ACL) ?
+  MS_POSIXACL : 0);
+
+   if ((*mntflags & MS_RDONLY) != (sb->s_flags & MS_RDONLY))
+   nova_update_mount_time(sb);
+
+   mutex_unlock(&sbi->s_lock);
+   ret = 0;
+   return ret;
+
+restore_opt:
+   sb->s_flags = old_sb_flags;
+   sbi->s_mount_opt = old_mount_opt;
+   mutex_unlock(&sbi->s_lock);
+   return ret;
+}
+
 static void nova_put_super(struct super_block *sb)
 {
struct nova_sb_info *sbi = NOVA_SB(sb);
@@ -697,6 +750,8 @@ static struct super_operations nova_sops = {
.alloc_inode= nova_alloc_inode,
.destroy_inode  = nova_destroy_inode,
.put_super  = nova_put_super,
+   .remount_fs = nova_remount,
+   .show_options   = nova_show_options,
 };
 
 static struct dentry *nova_mount(struct file_system_type *fs_type,
-- 
2.7.4

[RFC v2 17/83] Add statfs support.

2018-03-10 Thread Andiry Xu

From: Andiry Xu 

Signed-off-by: Andiry Xu 
---
 fs/nova/balloc.c | 18 ++
 fs/nova/balloc.h |  1 +
 fs/nova/super.c  | 19 +++
 3 files changed, 38 insertions(+)

diff --git a/fs/nova/balloc.c b/fs/nova/balloc.c
index cb627db..0742fe0 100644
--- a/fs/nova/balloc.c
+++ b/fs/nova/balloc.c
@@ -217,3 +217,21 @@ inline int nova_insert_blocktree(struct nova_sb_info *sbi,
 
return ret;
 }
+
+/* We do not take locks so it's inaccurate */
+unsigned long nova_count_free_blocks(struct super_block *sb)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   struct free_list *free_list;
+   unsigned long num_free_blocks = 0;
+   int i;
+
+   for (i = 0; i < sbi->cpus; i++) {
+   free_list = nova_get_free_list(sb, i);
+   num_free_blocks += free_list->num_free_blocks;
+   }
+
+   return num_free_blocks;
+}
+
+
diff --git a/fs/nova/balloc.h b/fs/nova/balloc.h
index 57a93e4..537532e 100644
--- a/fs/nova/balloc.h
+++ b/fs/nova/balloc.h
@@ -66,6 +66,7 @@ inline struct nova_range_node *nova_alloc_blocknode(struct 
super_block *sb);
 inline void nova_free_blocknode(struct super_block *sb,
struct nova_range_node *bnode);
 extern void nova_init_blockmap(struct super_block *sb, int recovery);
+extern unsigned long nova_count_free_blocks(struct super_block *sb);
 inline int nova_insert_blocktree(struct nova_sb_info *sbi,
struct rb_root *tree, struct nova_range_node *new_node);
 
diff --git a/fs/nova/super.c b/fs/nova/super.c
index 9762f26..3500d19 100644
--- a/fs/nova/super.c
+++ b/fs/nova/super.c
@@ -629,6 +629,24 @@ static int nova_fill_super(struct super_block *sb, void 
*data, int silent)
return retval;
 }
 
+static int nova_statfs(struct dentry *d, struct kstatfs *buf)
+{
+   struct super_block *sb = d->d_sb;
+   struct nova_sb_info *sbi = (struct nova_sb_info *)sb->s_fs_info;
+
+   buf->f_type = NOVA_SUPER_MAGIC;
+   buf->f_bsize = sb->s_blocksize;
+
+   buf->f_blocks = sbi->num_blocks;
+   buf->f_bfree = buf->f_bavail = nova_count_free_blocks(sb);
+   buf->f_files = LONG_MAX;
+   buf->f_ffree = LONG_MAX - sbi->s_inodes_used_count;
+   buf->f_namelen = NOVA_NAME_LEN;
+   nova_dbg_verbose("nova_stats: total 4k free blocks 0x%llx\n",
+   buf->f_bfree);
+   return 0;
+}
+
 static int nova_show_options(struct seq_file *seq, struct dentry *root)
 {
struct nova_sb_info *sbi = NOVA_SB(root->d_sb);
@@ -794,6 +812,7 @@ static struct super_operations nova_sops = {
.alloc_inode= nova_alloc_inode,
.destroy_inode  = nova_destroy_inode,
.put_super  = nova_put_super,
+   .statfs = nova_statfs,
.remount_fs = nova_remount,
.show_options   = nova_show_options,
 };
-- 
2.7.4

[RFC v2 13/83] Add remount_fs and show_options methods.

2018-03-10 Thread Andiry Xu

From: Andiry Xu 

Signed-off-by: Andiry Xu 
---
 fs/nova/super.c | 55 +++
 1 file changed, 55 insertions(+)

diff --git a/fs/nova/super.c b/fs/nova/super.c
index 3efb560..f41cc04 100644
--- a/fs/nova/super.c
+++ b/fs/nova/super.c
@@ -617,6 +617,59 @@ static int nova_fill_super(struct super_block *sb, void 
*data, int silent)
return retval;
 }
 
+static int nova_show_options(struct seq_file *seq, struct dentry *root)
+{
+   struct nova_sb_info *sbi = NOVA_SB(root->d_sb);
+
+   if (sbi->mode != (0777 | S_ISVTX))
+   seq_printf(seq, ",mode=%03o", sbi->mode);
+   if (uid_valid(sbi->uid))
+   seq_printf(seq, ",uid=%u", from_kuid(&init_user_ns, sbi->uid));
+   if (gid_valid(sbi->gid))
+   seq_printf(seq, ",gid=%u", from_kgid(&init_user_ns, sbi->gid));
+   if (test_opt(root->d_sb, ERRORS_RO))
+   seq_puts(seq, ",errors=remount-ro");
+   if (test_opt(root->d_sb, ERRORS_PANIC))
+   seq_puts(seq, ",errors=panic");
+   if (test_opt(root->d_sb, DAX))
+   seq_puts(seq, ",dax");
+
+   return 0;
+}
+
+static int nova_remount(struct super_block *sb, int *mntflags, char *data)
+{
+   unsigned long old_sb_flags;
+   unsigned long old_mount_opt;
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   int ret = -EINVAL;
+
+   /* Store the old options */
+   mutex_lock(&sbi->s_lock);
+   old_sb_flags = sb->s_flags;
+   old_mount_opt = sbi->s_mount_opt;
+
+   if (nova_parse_options(data, sbi, 1))
+   goto restore_opt;
+
+   sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+ ((sbi->s_mount_opt & NOVA_MOUNT_POSIX_ACL) ?
+  MS_POSIXACL : 0);
+
+   if ((*mntflags & MS_RDONLY) != (sb->s_flags & MS_RDONLY))
+   nova_update_mount_time(sb);
+
+   mutex_unlock(&sbi->s_lock);
+   ret = 0;
+   return ret;
+
+restore_opt:
+   sb->s_flags = old_sb_flags;
+   sbi->s_mount_opt = old_mount_opt;
+   mutex_unlock(&sbi->s_lock);
+   return ret;
+}
+
 static void nova_put_super(struct super_block *sb)
 {
struct nova_sb_info *sbi = NOVA_SB(sb);
@@ -697,6 +750,8 @@ static struct super_operations nova_sops = {
.alloc_inode= nova_alloc_inode,
.destroy_inode  = nova_destroy_inode,
.put_super  = nova_put_super,
+   .remount_fs = nova_remount,
+   .show_options   = nova_show_options,
 };
 
 static struct dentry *nova_mount(struct file_system_type *fs_type,
-- 
2.7.4

[RFC v2 19/83] Add pmem block free routines.

2018-03-10 Thread Andiry Xu

From: Andiry Xu 

NOVA allocates/frees log pages and data pages in the same way.
For block free, NOVA first gets the corresponding free list by
checking the block number, and then inserts the freed range in
the red-black tree. NOVA always merges adjacent free ranges if possible.

Signed-off-by: Andiry Xu 
---
 fs/nova/balloc.c | 223 +++
 fs/nova/balloc.h |   8 ++
 fs/nova/nova.h   |  23 ++
 3 files changed, 254 insertions(+)

diff --git a/fs/nova/balloc.c b/fs/nova/balloc.c
index 0742fe0..9108721 100644
--- a/fs/nova/balloc.c
+++ b/fs/nova/balloc.c
@@ -218,6 +218,229 @@ inline int nova_insert_blocktree(struct nova_sb_info *sbi,
return ret;
 }
 
+/* Used for both block free tree and inode inuse tree */
+int nova_find_free_slot(struct nova_sb_info *sbi,
+   struct rb_root *tree, unsigned long range_low,
+   unsigned long range_high, struct nova_range_node **prev,
+   struct nova_range_node **next)
+{
+   struct nova_range_node *ret_node = NULL;
+   struct rb_node *tmp;
+   int check_prev = 0, check_next = 0;
+   int ret;
+
+   ret = nova_find_range_node(sbi, tree, range_low, &ret_node);
+   if (ret) {
+   nova_dbg("%s ERROR: %lu - %lu already in free list\n",
+   __func__, range_low, range_high);
+   return -EINVAL;
+   }
+
+   if (!ret_node) {
+   *prev = *next = NULL;
+   } else if (ret_node->range_high < range_low) {
+   *prev = ret_node;
+   tmp = rb_next(&ret_node->node);
+   if (tmp) {
+   *next = container_of(tmp, struct nova_range_node, node);
+   check_next = 1;
+   } else {
+   *next = NULL;
+   }
+   } else if (ret_node->range_low > range_high) {
+   *next = ret_node;
+   tmp = rb_prev(&ret_node->node);
+   if (tmp) {
+   *prev = container_of(tmp, struct nova_range_node, node);
+   check_prev = 1;
+   } else {
+   *prev = NULL;
+   }
+   } else {
+   nova_dbg("%s ERROR: %lu - %lu overlaps with existing node %lu - 
%lu\n",
+__func__, range_low, range_high, ret_node->range_low,
+   ret_node->range_high);
+   return -EINVAL;
+   }
+
+   return 0;
+}
+
+/*
+ * blocknr: start block number
+ * num: number of freed pages
+ * btype: is large page?
+ * log_page: is log page?
+ */
+static int nova_free_blocks(struct super_block *sb, unsigned long blocknr,
+   int num, unsigned short btype, int log_page)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   struct rb_root *tree;
+   unsigned long block_low;
+   unsigned long block_high;
+   unsigned long num_blocks = 0;
+   struct nova_range_node *prev = NULL;
+   struct nova_range_node *next = NULL;
+   struct nova_range_node *curr_node;
+   struct free_list *free_list;
+   int cpuid;
+   int new_node_used = 0;
+   int ret;
+   timing_t free_time;
+
+   if (num <= 0) {
+   nova_dbg("%s ERROR: free %d\n", __func__, num);
+   return -EINVAL;
+   }
+
+   NOVA_START_TIMING(free_blocks_t, free_time);
+   cpuid = blocknr / sbi->per_list_blocks;
+
+   /* Pre-allocate blocknode */
+   curr_node = nova_alloc_blocknode(sb);
+   if (curr_node == NULL) {
+   /* returning without freeing the block*/
+   NOVA_END_TIMING(free_blocks_t, free_time);
+   return -ENOMEM;
+   }
+
+   free_list = nova_get_free_list(sb, cpuid);
+   spin_lock(&free_list->s_lock);
+
+   tree = &(free_list->block_free_tree);
+
+   num_blocks = nova_get_numblocks(btype) * num;
+   block_low = blocknr;
+   block_high = blocknr + num_blocks - 1;
+
+   nova_dbgv("Free: %lu - %lu\n", block_low, block_high);
+
+   if (blocknr < free_list->block_start ||
+   blocknr + num > free_list->block_end + 1) {
+   nova_err(sb, "free blocks %lu to %lu, free list %d, start %lu, 
end %lu\n",
+   blocknr, blocknr + num - 1,
+   free_list->index,
+   free_list->block_start,
+   free_list->block_end);
+   ret = -EIO;
+   goto out;
+   }
+
+   ret = nova_find_free_slot(sbi, tree, block_low,
+   block_high, &prev, &next);
+
+   if (ret) {
+   nova_dbg("%s: find free slot fail: %d\n", __func__, ret);
+   goto out;
+   }
+
+   if (prev && next && (block_low == prev->range_high + 1) &&
+

[RFC v2 20/83] Pmem block allocation routines.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

Upon an allocation request, NOVA first tries the free list of the current CPU.
If there are not enough blocks to allocate, NOVA will go to the
free list with the most free blocks.
The caller can specify the allocation direction: from the low address
or from the high address.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/balloc.c | 270 +++
 fs/nova/balloc.h |  10 +++
 2 files changed, 280 insertions(+)

diff --git a/fs/nova/balloc.c b/fs/nova/balloc.c
index 9108721..8e99215 100644
--- a/fs/nova/balloc.c
+++ b/fs/nova/balloc.c
@@ -441,6 +441,276 @@ int nova_free_log_blocks(struct super_block *sb,
return ret;
 }
 
+static int not_enough_blocks(struct free_list *free_list,
+   unsigned long num_blocks, enum alloc_type atype)
+{
+   struct nova_range_node *first = free_list->first_node;
+   struct nova_range_node *last = free_list->last_node;
+
+   if (free_list->num_free_blocks < num_blocks || !first || !last) {
+   nova_dbgv("%s: num_free_blocks=%ld; num_blocks=%ld; first=0x%p; 
last=0x%p",
+ __func__, free_list->num_free_blocks, num_blocks,
+ first, last);
+   return 1;
+   }
+
+   return 0;
+}
+
+/* Return how many blocks allocated */
+static long nova_alloc_blocks_in_free_list(struct super_block *sb,
+   struct free_list *free_list, unsigned short btype,
+   enum alloc_type atype, unsigned long num_blocks,
+   unsigned long *new_blocknr, enum nova_alloc_direction from_tail)
+{
+   struct rb_root *tree;
+   struct nova_range_node *curr, *next = NULL, *prev = NULL;
+   struct rb_node *temp, *next_node, *prev_node;
+   unsigned long curr_blocks;
+   bool found = 0;
+   unsigned long step = 0;
+
+   if (!free_list->first_node || free_list->num_free_blocks == 0) {
+   nova_dbgv("%s: Can't alloc. free_list->first_node=0x%p 
free_list->num_free_blocks = %lu",
+ __func__, free_list->first_node,
+ free_list->num_free_blocks);
+   return -ENOSPC;
+   }
+
+   if (atype == LOG && not_enough_blocks(free_list, num_blocks, atype)) {
+   nova_dbgv("%s: Can't alloc.  not_enough_blocks() == true",
+ __func__);
+   return -ENOSPC;
+   }
+
+   tree = &(free_list->block_free_tree);
+   if (from_tail == ALLOC_FROM_HEAD)
+   temp = &(free_list->first_node->node);
+   else
+   temp = &(free_list->last_node->node);
+
+   while (temp) {
+   step++;
+   curr = container_of(temp, struct nova_range_node, node);
+
+   curr_blocks = curr->range_high - curr->range_low + 1;
+
+   if (num_blocks >= curr_blocks) {
+   /* Superpage allocation must succeed */
+   if (btype > 0 && num_blocks > curr_blocks)
+   goto next;
+
+   /* Otherwise, allocate the whole blocknode */
+   if (curr == free_list->first_node) {
+   next_node = rb_next(temp);
+   if (next_node)
+   next = container_of(next_node,
+   struct nova_range_node, node);
+   free_list->first_node = next;
+   }
+
+   if (curr == free_list->last_node) {
+   prev_node = rb_prev(temp);
+   if (prev_node)
+   prev = container_of(prev_node,
+   struct nova_range_node, node);
+   free_list->last_node = prev;
+   }
+
+   rb_erase(&curr->node, tree);
+   free_list->num_blocknode--;
+   num_blocks = curr_blocks;
+   *new_blocknr = curr->range_low;
+   nova_free_blocknode(sb, curr);
+   found = 1;
+   break;
+   }
+
+   /* Allocate partial blocknode */
+   if (from_tail == ALLOC_FROM_HEAD) {
+   *new_blocknr = curr->range_low;
+   curr->range_low += num_blocks;
+   } else {
+   *new_blocknr = curr->range_high + 1 - num_blocks;
+   curr->range_high -= num_blocks;
+   }
+
+   found = 1;
+   break;
+next:
+   if (from_tail == ALLOC_FROM_HEAD)
+   temp = rb_next(temp);
+   el

[RFC v2 20/83] Pmem block allocation routines.

2018-03-10 Thread Andiry Xu

From: Andiry Xu 

Upon an allocation request, NOVA first tries the free list of the current CPU.
If there are not enough blocks to allocate, NOVA will go to the
free list with the most free blocks.
The caller can specify the allocation direction: from the low address
or from the high address.

Signed-off-by: Andiry Xu 
---
 fs/nova/balloc.c | 270 +++
 fs/nova/balloc.h |  10 +++
 2 files changed, 280 insertions(+)

diff --git a/fs/nova/balloc.c b/fs/nova/balloc.c
index 9108721..8e99215 100644
--- a/fs/nova/balloc.c
+++ b/fs/nova/balloc.c
@@ -441,6 +441,276 @@ int nova_free_log_blocks(struct super_block *sb,
return ret;
 }
 
+static int not_enough_blocks(struct free_list *free_list,
+   unsigned long num_blocks, enum alloc_type atype)
+{
+   struct nova_range_node *first = free_list->first_node;
+   struct nova_range_node *last = free_list->last_node;
+
+   if (free_list->num_free_blocks < num_blocks || !first || !last) {
+   nova_dbgv("%s: num_free_blocks=%ld; num_blocks=%ld; first=0x%p; 
last=0x%p",
+ __func__, free_list->num_free_blocks, num_blocks,
+ first, last);
+   return 1;
+   }
+
+   return 0;
+}
+
+/* Return how many blocks allocated */
+static long nova_alloc_blocks_in_free_list(struct super_block *sb,
+   struct free_list *free_list, unsigned short btype,
+   enum alloc_type atype, unsigned long num_blocks,
+   unsigned long *new_blocknr, enum nova_alloc_direction from_tail)
+{
+   struct rb_root *tree;
+   struct nova_range_node *curr, *next = NULL, *prev = NULL;
+   struct rb_node *temp, *next_node, *prev_node;
+   unsigned long curr_blocks;
+   bool found = 0;
+   unsigned long step = 0;
+
+   if (!free_list->first_node || free_list->num_free_blocks == 0) {
+   nova_dbgv("%s: Can't alloc. free_list->first_node=0x%p 
free_list->num_free_blocks = %lu",
+ __func__, free_list->first_node,
+ free_list->num_free_blocks);
+   return -ENOSPC;
+   }
+
+   if (atype == LOG && not_enough_blocks(free_list, num_blocks, atype)) {
+   nova_dbgv("%s: Can't alloc.  not_enough_blocks() == true",
+ __func__);
+   return -ENOSPC;
+   }
+
+   tree = &(free_list->block_free_tree);
+   if (from_tail == ALLOC_FROM_HEAD)
+   temp = &(free_list->first_node->node);
+   else
+   temp = &(free_list->last_node->node);
+
+   while (temp) {
+   step++;
+   curr = container_of(temp, struct nova_range_node, node);
+
+   curr_blocks = curr->range_high - curr->range_low + 1;
+
+   if (num_blocks >= curr_blocks) {
+   /* Superpage allocation must succeed */
+   if (btype > 0 && num_blocks > curr_blocks)
+   goto next;
+
+   /* Otherwise, allocate the whole blocknode */
+   if (curr == free_list->first_node) {
+   next_node = rb_next(temp);
+   if (next_node)
+   next = container_of(next_node,
+   struct nova_range_node, node);
+   free_list->first_node = next;
+   }
+
+   if (curr == free_list->last_node) {
+   prev_node = rb_prev(temp);
+   if (prev_node)
+   prev = container_of(prev_node,
+   struct nova_range_node, node);
+   free_list->last_node = prev;
+   }
+
+   rb_erase(&curr->node, tree);
+   free_list->num_blocknode--;
+   num_blocks = curr_blocks;
+   *new_blocknr = curr->range_low;
+   nova_free_blocknode(sb, curr);
+   found = 1;
+   break;
+   }
+
+   /* Allocate partial blocknode */
+   if (from_tail == ALLOC_FROM_HEAD) {
+   *new_blocknr = curr->range_low;
+   curr->range_low += num_blocks;
+   } else {
+   *new_blocknr = curr->range_high + 1 - num_blocks;
+   curr->range_high -= num_blocks;
+   }
+
+   found = 1;
+   break;
+next:
+   if (from_tail == ALLOC_FROM_HEAD)
+   temp = rb_next(temp);
+   else
+   temp = rb_prev(temp);
+

[RFC v2 18/83] Add freelist statistics printing.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/nova.h  |   1 +
 fs/nova/stats.c | 103 
 2 files changed, 104 insertions(+)

diff --git a/fs/nova/nova.h b/fs/nova/nova.h
index c4abdd8..404e133 100644
--- a/fs/nova/nova.h
+++ b/fs/nova/nova.h
@@ -326,5 +326,6 @@ void nova_get_IO_stats(void);
 void nova_print_timing_stats(struct super_block *sb);
 void nova_clear_stats(struct super_block *sb);
 void nova_print_inode(struct nova_inode *pi);
+void nova_print_free_lists(struct super_block *sb);
 
 #endif /* __NOVA_H */
diff --git a/fs/nova/stats.c b/fs/nova/stats.c
index 4b7c317..9ddd267 100644
--- a/fs/nova/stats.c
+++ b/fs/nova/stats.c
@@ -128,6 +128,61 @@ DEFINE_PER_CPU(u64[TIMING_NUM], Countstats_percpu);
 u64 IOstats[STATS_NUM];
 DEFINE_PER_CPU(u64[STATS_NUM], IOstats_percpu);
 
+static void nova_print_alloc_stats(struct super_block *sb)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   struct free_list *free_list;
+   unsigned long alloc_log_count = 0;
+   unsigned long alloc_log_pages = 0;
+   unsigned long alloc_data_count = 0;
+   unsigned long alloc_data_pages = 0;
+   unsigned long free_log_count = 0;
+   unsigned long freed_log_pages = 0;
+   unsigned long free_data_count = 0;
+   unsigned long freed_data_pages = 0;
+   int i;
+
+   nova_info("=== NOVA allocation stats ===\n");
+   nova_info("Alloc %llu, alloc steps %llu, average %llu\n",
+   Countstats[new_data_blocks_t], IOstats[alloc_steps],
+   Countstats[new_data_blocks_t] ?
+   IOstats[alloc_steps] / Countstats[new_data_blocks_t]
+   : 0);
+   nova_info("Free %llu\n", Countstats[free_data_t]);
+   nova_info("Fast GC %llu, check pages %llu, free pages %llu, average 
%llu\n",
+   Countstats[fast_gc_t], IOstats[fast_checked_pages],
+   IOstats[fast_gc_pages], Countstats[fast_gc_t] ?
+   IOstats[fast_gc_pages] / Countstats[fast_gc_t] : 0);
+   nova_info("Thorough GC %llu, checked pages %llu, free pages %llu, 
average %llu\n",
+   Countstats[thorough_gc_t],
+   IOstats[thorough_checked_pages], IOstats[thorough_gc_pages],
+   Countstats[thorough_gc_t] ?
+   IOstats[thorough_gc_pages] / Countstats[thorough_gc_t]
+   : 0);
+
+   for (i = 0; i < sbi->cpus; i++) {
+   free_list = nova_get_free_list(sb, i);
+
+   alloc_log_count += free_list->alloc_log_count;
+   alloc_log_pages += free_list->alloc_log_pages;
+   alloc_data_count += free_list->alloc_data_count;
+   alloc_data_pages += free_list->alloc_data_pages;
+   free_log_count += free_list->free_log_count;
+   freed_log_pages += free_list->freed_log_pages;
+   free_data_count += free_list->free_data_count;
+   freed_data_pages += free_list->freed_data_pages;
+   }
+
+   nova_info("alloc log count %lu, allocated log pages %lu, "
+   "alloc data count %lu, allocated data pages %lu, "
+   "free log count %lu, freed log pages %lu, "
+   "free data count %lu, freed data pages %lu\n",
+   alloc_log_count, alloc_log_pages,
+   alloc_data_count, alloc_data_pages,
+   free_log_count, freed_log_pages,
+   free_data_count, freed_data_pages);
+}
+
 static void nova_print_IO_stats(struct super_block *sb)
 {
nova_info("=== NOVA I/O stats ===\n");
@@ -209,6 +264,7 @@ void nova_print_timing_stats(struct super_block *sb)
}
 
nova_info("\n");
+   nova_print_alloc_stats(sb);
nova_print_IO_stats(sb);
 }
 
@@ -229,6 +285,8 @@ static void nova_clear_timing_stats(void)
 
 static void nova_clear_IO_stats(struct super_block *sb)
 {
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   struct free_list *free_list;
int i;
int cpu;
 
@@ -237,6 +295,19 @@ static void nova_clear_IO_stats(struct super_block *sb)
for_each_possible_cpu(cpu)
per_cpu(IOstats_percpu[i], cpu) = 0;
}
+
+   for (i = 0; i < sbi->cpus; i++) {
+   free_list = nova_get_free_list(sb, i);
+
+   free_list->alloc_log_count = 0;
+   free_list->alloc_log_pages = 0;
+   free_list->alloc_data_count = 0;
+   free_list->alloc_data_pages = 0;
+   free_list->free_log_count = 0;
+   free_list->freed_log_pages = 0;
+   free_list->free_data_count = 0;
+   free_list->freed_data_pages = 0;
+

[RFC v2 18/83] Add freelist statistics printing.

2018-03-10 Thread Andiry Xu

From: Andiry Xu 

Signed-off-by: Andiry Xu 
---
 fs/nova/nova.h  |   1 +
 fs/nova/stats.c | 103 
 2 files changed, 104 insertions(+)

diff --git a/fs/nova/nova.h b/fs/nova/nova.h
index c4abdd8..404e133 100644
--- a/fs/nova/nova.h
+++ b/fs/nova/nova.h
@@ -326,5 +326,6 @@ void nova_get_IO_stats(void);
 void nova_print_timing_stats(struct super_block *sb);
 void nova_clear_stats(struct super_block *sb);
 void nova_print_inode(struct nova_inode *pi);
+void nova_print_free_lists(struct super_block *sb);
 
 #endif /* __NOVA_H */
diff --git a/fs/nova/stats.c b/fs/nova/stats.c
index 4b7c317..9ddd267 100644
--- a/fs/nova/stats.c
+++ b/fs/nova/stats.c
@@ -128,6 +128,61 @@ DEFINE_PER_CPU(u64[TIMING_NUM], Countstats_percpu);
 u64 IOstats[STATS_NUM];
 DEFINE_PER_CPU(u64[STATS_NUM], IOstats_percpu);
 
+static void nova_print_alloc_stats(struct super_block *sb)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   struct free_list *free_list;
+   unsigned long alloc_log_count = 0;
+   unsigned long alloc_log_pages = 0;
+   unsigned long alloc_data_count = 0;
+   unsigned long alloc_data_pages = 0;
+   unsigned long free_log_count = 0;
+   unsigned long freed_log_pages = 0;
+   unsigned long free_data_count = 0;
+   unsigned long freed_data_pages = 0;
+   int i;
+
+   nova_info("=== NOVA allocation stats ===\n");
+   nova_info("Alloc %llu, alloc steps %llu, average %llu\n",
+   Countstats[new_data_blocks_t], IOstats[alloc_steps],
+   Countstats[new_data_blocks_t] ?
+   IOstats[alloc_steps] / Countstats[new_data_blocks_t]
+   : 0);
+   nova_info("Free %llu\n", Countstats[free_data_t]);
+   nova_info("Fast GC %llu, check pages %llu, free pages %llu, average 
%llu\n",
+   Countstats[fast_gc_t], IOstats[fast_checked_pages],
+   IOstats[fast_gc_pages], Countstats[fast_gc_t] ?
+   IOstats[fast_gc_pages] / Countstats[fast_gc_t] : 0);
+   nova_info("Thorough GC %llu, checked pages %llu, free pages %llu, 
average %llu\n",
+   Countstats[thorough_gc_t],
+   IOstats[thorough_checked_pages], IOstats[thorough_gc_pages],
+   Countstats[thorough_gc_t] ?
+   IOstats[thorough_gc_pages] / Countstats[thorough_gc_t]
+   : 0);
+
+   for (i = 0; i < sbi->cpus; i++) {
+   free_list = nova_get_free_list(sb, i);
+
+   alloc_log_count += free_list->alloc_log_count;
+   alloc_log_pages += free_list->alloc_log_pages;
+   alloc_data_count += free_list->alloc_data_count;
+   alloc_data_pages += free_list->alloc_data_pages;
+   free_log_count += free_list->free_log_count;
+   freed_log_pages += free_list->freed_log_pages;
+   free_data_count += free_list->free_data_count;
+   freed_data_pages += free_list->freed_data_pages;
+   }
+
+   nova_info("alloc log count %lu, allocated log pages %lu, "
+   "alloc data count %lu, allocated data pages %lu, "
+   "free log count %lu, freed log pages %lu, "
+   "free data count %lu, freed data pages %lu\n",
+   alloc_log_count, alloc_log_pages,
+   alloc_data_count, alloc_data_pages,
+   free_log_count, freed_log_pages,
+   free_data_count, freed_data_pages);
+}
+
 static void nova_print_IO_stats(struct super_block *sb)
 {
nova_info("=== NOVA I/O stats ===\n");
@@ -209,6 +264,7 @@ void nova_print_timing_stats(struct super_block *sb)
}
 
nova_info("\n");
+   nova_print_alloc_stats(sb);
nova_print_IO_stats(sb);
 }
 
@@ -229,6 +285,8 @@ static void nova_clear_timing_stats(void)
 
 static void nova_clear_IO_stats(struct super_block *sb)
 {
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   struct free_list *free_list;
int i;
int cpu;
 
@@ -237,6 +295,19 @@ static void nova_clear_IO_stats(struct super_block *sb)
for_each_possible_cpu(cpu)
per_cpu(IOstats_percpu[i], cpu) = 0;
}
+
+   for (i = 0; i < sbi->cpus; i++) {
+   free_list = nova_get_free_list(sb, i);
+
+   free_list->alloc_log_count = 0;
+   free_list->alloc_log_pages = 0;
+   free_list->alloc_data_count = 0;
+   free_list->alloc_data_pages = 0;
+   free_list->free_log_count = 0;
+   free_list->freed_log_pages = 0;
+   free_list->free_data_count = 0;
+   free_list->freed_data_pages = 0;
+   }
 }
 
 void nova_clear_stats(struct super_block *sb)

[RFC v2 23/83] Save allocator to pmem in put_super.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

We allocate log pages and append the free range nodes to the log of the
reserved blocknode inode.
The allocator state can then be recovered by reading this log during normal recovery.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/bbuild.c | 114 +++
 fs/nova/bbuild.h |   1 +
 fs/nova/inode.h  |  13 +++
 fs/nova/nova.h   |   7 
 fs/nova/super.c  |   2 +
 5 files changed, 137 insertions(+)

diff --git a/fs/nova/bbuild.c b/fs/nova/bbuild.c
index 8bc0545..12a2f11 100644
--- a/fs/nova/bbuild.c
+++ b/fs/nova/bbuild.c
@@ -51,3 +51,117 @@ void nova_init_header(struct super_block *sb,
init_rwsem(>i_sem);
 }
 
+/*
+ * Write one range record into an inode log.
+ *
+ * @curr:  in-DRAM range node whose range_low/range_high are recorded.
+ * @tail:  log offset at which the record should be placed.
+ * @cpuid: when non-zero, OR-ed into bits 56-63 of the stored range_low.
+ *
+ * Returns the log offset the record was written at.  That is @tail itself,
+ * or the address returned by next_log_page() when is_last_entry() reports
+ * that @tail is the last slot of its page.  If @tail is 0, or the log has
+ * no next page to move to, nothing is written and @tail is returned as is.
+ */
+static u64 nova_append_range_node_entry(struct super_block *sb,
+	struct nova_range_node *curr, u64 tail, unsigned long cpuid)
+{
+	u64 curr_p = tail;
+	size_t size = sizeof(struct nova_range_node_lowhigh);
+	struct nova_range_node_lowhigh *entry;
+
+	if (curr_p == 0 || (is_last_entry(curr_p, size) &&
+				next_log_page(sb, curr_p) == 0)) {
+		nova_dbg("%s: inode log reaches end?\n", __func__);
+		goto out;
+	}
+
+	if (is_last_entry(curr_p, size))
+		curr_p = next_log_page(sb, curr_p);
+
+	entry = (struct nova_range_node_lowhigh *)nova_get_block(sb, curr_p);
+	entry->range_low = cpu_to_le64(curr->range_low);
+	/* Widen before shifting: unsigned long may be only 32 bits wide. */
+	if (cpuid)
+		entry->range_low |= cpu_to_le64((u64)cpuid << 56);
+	entry->range_high = cpu_to_le64(curr->range_high);
+	nova_dbgv("append entry block low 0x%lx, high 0x%lx\n",
+			curr->range_low, curr->range_high);
+
+	nova_flush_buffer(entry, sizeof(struct nova_range_node_lowhigh), 0);
+out:
+	return curr_p;
+}
+
+/*
+ * Append every node of @tree to the log, starting at @temp_tail, and tear
+ * the tree down while doing so.
+ *
+ * Nodes are visited from rb_first() onwards, i.e. in increasing order.
+ * Each node is erased from @tree and freed once its record has been
+ * appended, so @tree is empty on return.
+ *
+ * Returns the log offset just past the last record written, or @temp_tail
+ * unchanged when @tree is empty.
+ */
+static u64 nova_save_range_nodes_to_log(struct super_block *sb,
+	struct rb_root *tree, u64 temp_tail, unsigned long cpuid)
+{
+	struct nova_range_node *curr;
+	struct rb_node *temp;
+	size_t size = sizeof(struct nova_range_node_lowhigh);
+	u64 curr_entry = 0;
+
+	/* Save in increasing order */
+	temp = rb_first(tree);
+	while (temp) {
+		curr = container_of(temp, struct nova_range_node, node);
+		curr_entry = nova_append_range_node_entry(sb, curr,
+						temp_tail, cpuid);
+		temp_tail = curr_entry + size;
+		/* Fetch the successor before the current node is erased. */
+		temp = rb_next(temp);
+		rb_erase(&curr->node, tree);
+		nova_free_range_node(curr);
+	}
+
+	return temp_tail;
+}
+
+/*
+ * Save the block_free_tree of CPU @cpu's free list to the log, starting at
+ * @temp_tail.  The records are written with a cpuid of 0.
+ *
+ * Returns the log offset just past the last record written.
+ */
+static u64 nova_save_free_list_blocknodes(struct super_block *sb, int cpu,
+	u64 temp_tail)
+{
+	struct free_list *free_list;
+
+	free_list = nova_get_free_list(sb, cpu);
+	temp_tail = nova_save_range_nodes_to_log(sb,
+				&free_list->block_free_tree, temp_tail, 0);
+	return temp_tail;
+}
+
+/*
+ * Persist every per-CPU free list into the log of the reserved blocknode
+ * inode (NOVA_BLOCKNODE_INO).
+ *
+ * The range nodes of all sbi->cpus free lists are counted, enough log pages
+ * to hold them are allocated, the nodes are written, and only then are the
+ * new log head and tail published in the inode.  If the log pages cannot
+ * all be allocated, nothing is saved and the function returns after
+ * emitting a debug message.
+ */
+void nova_save_blocknode_mappings_to_log(struct super_block *sb)
+{
+	struct nova_inode *pi = nova_get_inode_by_ino(sb, NOVA_BLOCKNODE_INO);
+	struct nova_inode_info_header sih;
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	struct free_list *free_list;
+	unsigned long num_blocknode = 0;
+	unsigned long num_pages;
+	int allocated;
+	u64 new_block = 0;
+	u64 temp_tail;
+	int i;
+
+	/*
+	 * NOTE(review): only these two fields of the on-stack header are
+	 * initialized before it is handed to the allocator - confirm that
+	 * the allocation path reads nothing else from it.
+	 */
+	sih.ino = NOVA_BLOCKNODE_INO;
+	sih.i_blk_type = NOVA_DEFAULT_BLOCK_TYPE;
+
+	/* Allocate log pages before save blocknode mappings */
+	for (i = 0; i < sbi->cpus; i++) {
+		free_list = nova_get_free_list(sb, i);
+		num_blocknode += free_list->num_blocknode;
+		nova_dbgv("%s: free list %d: %lu nodes\n", __func__,
+				i, free_list->num_blocknode);
+	}
+
+	/* Round up to whole log pages. */
+	num_pages = num_blocknode / RANGENODE_PER_PAGE;
+	if (num_blocknode % RANGENODE_PER_PAGE)
+		num_pages++;
+
+	allocated = nova_allocate_inode_log_pages(sb, &sih, num_pages,
+						&new_block, ANY_CPU, 0);
+	if (allocated != num_pages) {
+		nova_dbg("Error saving blocknode mappings: %d\n", allocated);
+		return;
+	}
+
+	temp_tail = new_block;
+	for (i = 0; i < sbi->cpus; i++)
+		temp_tail = nova_save_free_list_blocknodes(sb, i, temp_tail);
+
+	/* Finally update log head and tail */
+	pi->log_head = new_block;
+	nova_update_tail(pi, temp_tail);
+	nova_flush_buffer(&pi->log_head, CACHELINE_SIZE, 0);
+
+	nova_dbg("%s: %lu blocknodes, %lu log pages, pi head 0x%llx, tail 0x%llx\n",
+		  __func__, num_blocknode, num_pages,
+		  pi->log_head, pi->log_tail);
+}
+
diff --git a

[RFC v2 23/83] Save allocator to pmem in put_super.

2018-03-10 Thread Andiry Xu

From: Andiry Xu 

We allocate log pages and append the free range nodes to the log of the
reserved blocknode inode.
The allocator state can then be recovered by reading this log during normal recovery.

Signed-off-by: Andiry Xu 
---
 fs/nova/bbuild.c | 114 +++
 fs/nova/bbuild.h |   1 +
 fs/nova/inode.h  |  13 +++
 fs/nova/nova.h   |   7 
 fs/nova/super.c  |   2 +
 5 files changed, 137 insertions(+)

diff --git a/fs/nova/bbuild.c b/fs/nova/bbuild.c
index 8bc0545..12a2f11 100644
--- a/fs/nova/bbuild.c
+++ b/fs/nova/bbuild.c
@@ -51,3 +51,117 @@ void nova_init_header(struct super_block *sb,
init_rwsem(>i_sem);
 }
 
+/*
+ * Write one range record into an inode log.
+ *
+ * @curr:  in-DRAM range node whose range_low/range_high are recorded.
+ * @tail:  log offset at which the record should be placed.
+ * @cpuid: when non-zero, OR-ed into bits 56-63 of the stored range_low.
+ *
+ * Returns the log offset the record was written at.  That is @tail itself,
+ * or the address returned by next_log_page() when is_last_entry() reports
+ * that @tail is the last slot of its page.  If @tail is 0, or the log has
+ * no next page to move to, nothing is written and @tail is returned as is.
+ */
+static u64 nova_append_range_node_entry(struct super_block *sb,
+	struct nova_range_node *curr, u64 tail, unsigned long cpuid)
+{
+	u64 curr_p = tail;
+	size_t size = sizeof(struct nova_range_node_lowhigh);
+	struct nova_range_node_lowhigh *entry;
+
+	if (curr_p == 0 || (is_last_entry(curr_p, size) &&
+				next_log_page(sb, curr_p) == 0)) {
+		nova_dbg("%s: inode log reaches end?\n", __func__);
+		goto out;
+	}
+
+	if (is_last_entry(curr_p, size))
+		curr_p = next_log_page(sb, curr_p);
+
+	entry = (struct nova_range_node_lowhigh *)nova_get_block(sb, curr_p);
+	entry->range_low = cpu_to_le64(curr->range_low);
+	/* Widen before shifting: unsigned long may be only 32 bits wide. */
+	if (cpuid)
+		entry->range_low |= cpu_to_le64((u64)cpuid << 56);
+	entry->range_high = cpu_to_le64(curr->range_high);
+	nova_dbgv("append entry block low 0x%lx, high 0x%lx\n",
+			curr->range_low, curr->range_high);
+
+	nova_flush_buffer(entry, sizeof(struct nova_range_node_lowhigh), 0);
+out:
+	return curr_p;
+}
+
+/*
+ * Append every node of @tree to the log, starting at @temp_tail, and tear
+ * the tree down while doing so.
+ *
+ * Nodes are visited from rb_first() onwards, i.e. in increasing order.
+ * Each node is erased from @tree and freed once its record has been
+ * appended, so @tree is empty on return.
+ *
+ * Returns the log offset just past the last record written, or @temp_tail
+ * unchanged when @tree is empty.
+ */
+static u64 nova_save_range_nodes_to_log(struct super_block *sb,
+	struct rb_root *tree, u64 temp_tail, unsigned long cpuid)
+{
+	struct nova_range_node *curr;
+	struct rb_node *temp;
+	size_t size = sizeof(struct nova_range_node_lowhigh);
+	u64 curr_entry = 0;
+
+	/* Save in increasing order */
+	temp = rb_first(tree);
+	while (temp) {
+		curr = container_of(temp, struct nova_range_node, node);
+		curr_entry = nova_append_range_node_entry(sb, curr,
+						temp_tail, cpuid);
+		temp_tail = curr_entry + size;
+		/* Fetch the successor before the current node is erased. */
+		temp = rb_next(temp);
+		rb_erase(&curr->node, tree);
+		nova_free_range_node(curr);
+	}
+
+	return temp_tail;
+}
+
+/*
+ * Save the block_free_tree of CPU @cpu's free list to the log, starting at
+ * @temp_tail.  The records are written with a cpuid of 0.
+ *
+ * Returns the log offset just past the last record written.
+ */
+static u64 nova_save_free_list_blocknodes(struct super_block *sb, int cpu,
+	u64 temp_tail)
+{
+	struct free_list *free_list;
+
+	free_list = nova_get_free_list(sb, cpu);
+	temp_tail = nova_save_range_nodes_to_log(sb,
+				&free_list->block_free_tree, temp_tail, 0);
+	return temp_tail;
+}
+
+/*
+ * Persist every per-CPU free list into the log of the reserved blocknode
+ * inode (NOVA_BLOCKNODE_INO).
+ *
+ * The range nodes of all sbi->cpus free lists are counted, enough log pages
+ * to hold them are allocated, the nodes are written, and only then are the
+ * new log head and tail published in the inode.  If the log pages cannot
+ * all be allocated, nothing is saved and the function returns after
+ * emitting a debug message.
+ */
+void nova_save_blocknode_mappings_to_log(struct super_block *sb)
+{
+	struct nova_inode *pi = nova_get_inode_by_ino(sb, NOVA_BLOCKNODE_INO);
+	struct nova_inode_info_header sih;
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	struct free_list *free_list;
+	unsigned long num_blocknode = 0;
+	unsigned long num_pages;
+	int allocated;
+	u64 new_block = 0;
+	u64 temp_tail;
+	int i;
+
+	/*
+	 * NOTE(review): only these two fields of the on-stack header are
+	 * initialized before it is handed to the allocator - confirm that
+	 * the allocation path reads nothing else from it.
+	 */
+	sih.ino = NOVA_BLOCKNODE_INO;
+	sih.i_blk_type = NOVA_DEFAULT_BLOCK_TYPE;
+
+	/* Allocate log pages before save blocknode mappings */
+	for (i = 0; i < sbi->cpus; i++) {
+		free_list = nova_get_free_list(sb, i);
+		num_blocknode += free_list->num_blocknode;
+		nova_dbgv("%s: free list %d: %lu nodes\n", __func__,
+				i, free_list->num_blocknode);
+	}
+
+	/* Round up to whole log pages. */
+	num_pages = num_blocknode / RANGENODE_PER_PAGE;
+	if (num_blocknode % RANGENODE_PER_PAGE)
+		num_pages++;
+
+	allocated = nova_allocate_inode_log_pages(sb, &sih, num_pages,
+						&new_block, ANY_CPU, 0);
+	if (allocated != num_pages) {
+		nova_dbg("Error saving blocknode mappings: %d\n", allocated);
+		return;
+	}
+
+	temp_tail = new_block;
+	for (i = 0; i < sbi->cpus; i++)
+		temp_tail = nova_save_free_list_blocknodes(sb, i, temp_tail);
+
+	/* Finally update log head and tail */
+	pi->log_head = new_block;
+	nova_update_tail(pi, temp_tail);
+	nova_flush_buffer(&pi->log_head, CACHELINE_SIZE, 0);
+
+	nova_dbg("%s: %lu blocknodes, %lu log pages, pi head 0x%llx, tail 0x%llx\n",
+		  __func__, num_blocknode, num_pages,
+		  pi->log_head, pi->log_tail);
+}
+
diff --git a/fs/nova/bbuild.h b/fs/nova/bbuild.h
index 162a832..

[RFC v2 22/83] Inode log pages allocation and reclaimation.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

NOVA allocates one log page for each new inode. When the log is full,
NOVA allocates new log pages and extends the log, either doubling its size
or growing it by a fixed length, depending on the current log size.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/Makefile |   2 +-
 fs/nova/log.c| 327 +++
 fs/nova/log.h|  11 ++
 3 files changed, 339 insertions(+), 1 deletion(-)
 create mode 100644 fs/nova/log.c

diff --git a/fs/nova/Makefile b/fs/nova/Makefile
index e2f7b07..b3638a4 100644
--- a/fs/nova/Makefile
+++ b/fs/nova/Makefile
@@ -4,4 +4,4 @@
 
 obj-$(CONFIG_NOVA_FS) += nova.o
 
-nova-y := balloc.o bbuild.o inode.o rebuild.o stats.o super.o
+nova-y := balloc.o bbuild.o inode.o log.o rebuild.o stats.o super.o
diff --git a/fs/nova/log.c b/fs/nova/log.c
new file mode 100644
index 000..bdd133e
--- /dev/null
+++ b/fs/nova/log.c
@@ -0,0 +1,327 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Log methods
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu <jix...@cs.ucsd.edu>
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.storne...@gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include "nova.h"
+#include "inode.h"
+#include "log.h"
+
+/*
+ * Coalesce log pages to a singly linked list.
+ *
+ * @prev_blocknr:  when non-zero, block whose page tail is pointed at the
+ *                 first new page, chaining the new run onto an existing one.
+ * @first_blocknr: first block of the newly allocated run.
+ * @num_pages:     number of pages in the run.  The pages are walked with
+ *                 pointer arithmetic, so they must be contiguous in memory.
+ *
+ * Every page in the run gets its entry counters cleared and its next_page
+ * set to the following page; the last page gets next_page 0 and a
+ * persistent barrier.  Always returns 0.
+ */
+static int nova_coalesce_log_pages(struct super_block *sb,
+	unsigned long prev_blocknr, unsigned long first_blocknr,
+	unsigned long num_pages)
+{
+	unsigned long next_blocknr;
+	u64 curr_block, next_page;
+	struct nova_inode_log_page *curr_page;
+	unsigned long i;
+
+	/*
+	 * Nothing to link.  Without this guard "num_pages - 1" below wraps
+	 * around and the loop runs far past the allocated pages.
+	 */
+	if (num_pages == 0)
+		return 0;
+
+	if (prev_blocknr) {
+		/* Link prev block and newly allocated head block */
+		curr_block = nova_get_block_off(sb, prev_blocknr,
+						NOVA_BLOCK_TYPE_4K);
+		curr_page = (struct nova_inode_log_page *)
+				nova_get_block(sb, curr_block);
+		next_page = nova_get_block_off(sb, first_blocknr,
+						NOVA_BLOCK_TYPE_4K);
+		nova_set_next_page_address(sb, curr_page, next_page, 0);
+	}
+
+	next_blocknr = first_blocknr + 1;
+	curr_block = nova_get_block_off(sb, first_blocknr,
+						NOVA_BLOCK_TYPE_4K);
+	curr_page = (struct nova_inode_log_page *)
+				nova_get_block(sb, curr_block);
+	for (i = 0; i < num_pages - 1; i++) {
+		next_page = nova_get_block_off(sb, next_blocknr,
+						NOVA_BLOCK_TYPE_4K);
+		nova_set_page_num_entries(sb, curr_page, 0, 0);
+		nova_set_page_invalid_entries(sb, curr_page, 0, 0);
+		nova_set_next_page_address(sb, curr_page, next_page, 0);
+		curr_page++;
+		next_blocknr++;
+	}
+
+	/* Last page */
+	nova_set_page_num_entries(sb, curr_page, 0, 0);
+	nova_set_page_invalid_entries(sb, curr_page, 0, 0);
+	nova_set_next_page_address(sb, curr_page, 0, 1);
+	return 0;
+}
+
+/* Log block resides in NVMM */
+int nova_allocate_inode_log_pages(struct super_block *sb,
+   struct nova_inode_info_header *sih, unsigned long num_pages,
+   u64 *new_block, int cpuid, enum nova_alloc_direction from_tail)
+{
+   unsigned long new_inode_blocknr;
+   unsigned long first_blocknr;
+   unsigned long prev_blocknr;
+   int allocated;
+   int ret_pages = 0;
+
+   allocated = nova_new_log_blocks(sb, sih, _inode_blocknr,
+   num_pages, ALLOC_NO_INIT, cpuid, from_tail);
+
+   if (allocated <= 0) {
+   nova_err(sb, "ERROR: no inode log page available: %d %d\n",
+   num_pages, allocated);
+   return allocated;
+   }
+   ret_pages += allocated;
+   num_pages -= allocated;
+   nova_dbg_verbose("Pi %lu: Alloc %d log blocks @ 0x%lx\n",
+   sih->ino, allocated, new_inode_blocknr);
+
+   /* Coalesce the pages */
+   nova_coalesce_log_pages(sb, 0, new_inode_blocknr, allocated);
+   first_blocknr = new_inode_blocknr;
+   prev_blocknr = new_inode_blocknr + allocated - 1;
+
+   /* Allocate remaining pages */
+   while (num_pages) {
+   allocated = nova_new_log_blocks(sb, sih,
+   _inode_blocknr, num_pages,
+   ALLOC_

[RFC v2 22/83] Inode log pages allocation and reclaimation.

2018-03-10 Thread Andiry Xu

From: Andiry Xu 

NOVA allocates one log page for each new inode. When the log is full,
NOVA allocates new log pages and extends the log, either doubling its size
or growing it by a fixed length, depending on the current log size.

Signed-off-by: Andiry Xu 
---
 fs/nova/Makefile |   2 +-
 fs/nova/log.c| 327 +++
 fs/nova/log.h|  11 ++
 3 files changed, 339 insertions(+), 1 deletion(-)
 create mode 100644 fs/nova/log.c

diff --git a/fs/nova/Makefile b/fs/nova/Makefile
index e2f7b07..b3638a4 100644
--- a/fs/nova/Makefile
+++ b/fs/nova/Makefile
@@ -4,4 +4,4 @@
 
 obj-$(CONFIG_NOVA_FS) += nova.o
 
-nova-y := balloc.o bbuild.o inode.o rebuild.o stats.o super.o
+nova-y := balloc.o bbuild.o inode.o log.o rebuild.o stats.o super.o
diff --git a/fs/nova/log.c b/fs/nova/log.c
new file mode 100644
index 000..bdd133e
--- /dev/null
+++ b/fs/nova/log.c
@@ -0,0 +1,327 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Log methods
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu 
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli 
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include "nova.h"
+#include "inode.h"
+#include "log.h"
+
+/*
+ * Coalesce log pages to a singly linked list.
+ *
+ * @prev_blocknr:  when non-zero, block whose page tail is pointed at the
+ *                 first new page, chaining the new run onto an existing one.
+ * @first_blocknr: first block of the newly allocated run.
+ * @num_pages:     number of pages in the run.  The pages are walked with
+ *                 pointer arithmetic, so they must be contiguous in memory.
+ *
+ * Every page in the run gets its entry counters cleared and its next_page
+ * set to the following page; the last page gets next_page 0 and a
+ * persistent barrier.  Always returns 0.
+ */
+static int nova_coalesce_log_pages(struct super_block *sb,
+	unsigned long prev_blocknr, unsigned long first_blocknr,
+	unsigned long num_pages)
+{
+	unsigned long next_blocknr;
+	u64 curr_block, next_page;
+	struct nova_inode_log_page *curr_page;
+	unsigned long i;
+
+	/*
+	 * Nothing to link.  Without this guard "num_pages - 1" below wraps
+	 * around and the loop runs far past the allocated pages.
+	 */
+	if (num_pages == 0)
+		return 0;
+
+	if (prev_blocknr) {
+		/* Link prev block and newly allocated head block */
+		curr_block = nova_get_block_off(sb, prev_blocknr,
+						NOVA_BLOCK_TYPE_4K);
+		curr_page = (struct nova_inode_log_page *)
+				nova_get_block(sb, curr_block);
+		next_page = nova_get_block_off(sb, first_blocknr,
+						NOVA_BLOCK_TYPE_4K);
+		nova_set_next_page_address(sb, curr_page, next_page, 0);
+	}
+
+	next_blocknr = first_blocknr + 1;
+	curr_block = nova_get_block_off(sb, first_blocknr,
+						NOVA_BLOCK_TYPE_4K);
+	curr_page = (struct nova_inode_log_page *)
+				nova_get_block(sb, curr_block);
+	for (i = 0; i < num_pages - 1; i++) {
+		next_page = nova_get_block_off(sb, next_blocknr,
+						NOVA_BLOCK_TYPE_4K);
+		nova_set_page_num_entries(sb, curr_page, 0, 0);
+		nova_set_page_invalid_entries(sb, curr_page, 0, 0);
+		nova_set_next_page_address(sb, curr_page, next_page, 0);
+		curr_page++;
+		next_blocknr++;
+	}
+
+	/* Last page */
+	nova_set_page_num_entries(sb, curr_page, 0, 0);
+	nova_set_page_invalid_entries(sb, curr_page, 0, 0);
+	nova_set_next_page_address(sb, curr_page, 0, 1);
+	return 0;
+}
+
+/* Log block resides in NVMM */
+int nova_allocate_inode_log_pages(struct super_block *sb,
+   struct nova_inode_info_header *sih, unsigned long num_pages,
+   u64 *new_block, int cpuid, enum nova_alloc_direction from_tail)
+{
+   unsigned long new_inode_blocknr;
+   unsigned long first_blocknr;
+   unsigned long prev_blocknr;
+   int allocated;
+   int ret_pages = 0;
+
+   allocated = nova_new_log_blocks(sb, sih, _inode_blocknr,
+   num_pages, ALLOC_NO_INIT, cpuid, from_tail);
+
+   if (allocated <= 0) {
+   nova_err(sb, "ERROR: no inode log page available: %d %d\n",
+   num_pages, allocated);
+   return allocated;
+   }
+   ret_pages += allocated;
+   num_pages -= allocated;
+   nova_dbg_verbose("Pi %lu: Alloc %d log blocks @ 0x%lx\n",
+   sih->ino, allocated, new_inode_blocknr);
+
+   /* Coalesce the pages */
+   nova_coalesce_log_pages(sb, 0, new_inode_blocknr, allocated);
+   first_blocknr = new_inode_blocknr;
+   prev_blocknr = new_inode_blocknr + allocated - 1;
+
+   /* Allocate remaining pages */
+   while (num_pages) {
+   allocated = nova_new_log_blocks(sb, sih,
+   _inode_blocknr, num_pages,
+   ALLOC_NO_INIT, cpuid, from_tail);
+
+   nova_dbg_verbose("Alloc %d log blocks @ 0x%l

[RFC v2 21/83] Add log structure.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

NOVA log is a singly linked list of 4KB pmem pages.
Each log page consists of two parts: 4064 bytes for log entries,
and 32 bytes for page tail structure. Page tail contains metadata
about the log page and the address of the next log page in the
linked list.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/log.h  | 187 +
 fs/nova/nova.h |   1 +
 2 files changed, 188 insertions(+)
 create mode 100644 fs/nova/log.h

diff --git a/fs/nova/log.h b/fs/nova/log.h
new file mode 100644
index 000..61586a3
--- /dev/null
+++ b/fs/nova/log.h
@@ -0,0 +1,187 @@
+#ifndef __LOG_H
+#define __LOG_H
+
+#include "balloc.h"
+#include "inode.h"
+
+/* === Log entry = */
+/* Inode entry in the log */
+
+#define MAIN_LOG                0
+#define ALTER_LOG               1
+
+/*
+ * Helpers for splitting a log offset into its 4KB page base and the
+ * position inside that page.  PAGE_OFFSET_MASK is deliberately a plain
+ * (signed) int constant: ~4095 then sign-extends to a full-width mask when
+ * it is combined with a 64-bit offset.
+ */
+#define PAGE_OFFSET_MASK        4095
+#define BLOCK_OFF(p)            ((p) & ~PAGE_OFFSET_MASK)
+
+#define ENTRY_LOC(p)            ((p) & PAGE_OFFSET_MASK)
+
+/* Byte offset inside a log page at which the page tail begins. */
+#define LOG_BLOCK_TAIL          4064
+#define PAGE_TAIL(p)            (BLOCK_OFF(p) + LOG_BLOCK_TAIL)
+
+/*
+ * Log page state and pointers to next page and the replica page
+ *
+ * This tail sits after the LOG_BLOCK_TAIL bytes of entry space in a log
+ * page (see struct nova_inode_log_page).  Packed, the fields add up to
+ * 4 + 4 + 8 + 8 + 8 = 32 bytes.
+ *
+ * NOTE(review): no replica pointer is visible among the fields below;
+ * presumably `padding` is reserved for it - confirm.
+ */
+struct nova_inode_page_tail {
+   __le32  invalid_entries;   /* bumped by nova_inc_page_invalid_entries() */
+   __le32  num_entries;   /* bumped by nova_inc_page_num_entries() */
+   __le64  epoch_id;   /* For snapshot list page */
+   __le64  padding;
+   __le64  next_page;   /* read by next_log_page(); set to 0 on the last page */
+} __attribute((__packed__));
+
+/* Fit in PAGE_SIZE: LOG_BLOCK_TAIL (4064) bytes of entry space + 32-byte tail = 4096 */
+struct nova_inode_log_page {
+   char padding[LOG_BLOCK_TAIL];   /* space for the log entries themselves */
+   struct nova_inode_page_tail page_tail;
+} __attribute((__packed__));
+
+
+enum nova_entry_type {   /* kept in the first byte of a log entry, see nova_set_entry_type() */
+   FILE_WRITE = 1,   /* numbering starts at 1, so 0 is not a valid type */
+   DIR_LOG,
+   SET_ATTR,
+   LINK_CHANGE,
+   NEXT_PAGE,   /* marker written by nova_set_next_page_flag() */
+};
+
+/*
+ * Read the type byte of the log entry at @p using memcpy_mcsafe().
+ *
+ * Returns the stored type on success.
+ * NOTE(review): when the copy fails, the non-zero status of
+ * memcpy_mcsafe() is returned truncated to u8, so a caller cannot tell it
+ * apart from a stored byte with the same value - confirm callers treat
+ * unknown values as errors.
+ */
+static inline u8 nova_get_entry_type(void *p)
+{
+	u8 type;
+	int rc;
+
+	rc = memcpy_mcsafe(&type, p, sizeof(u8));
+	if (rc)
+		return rc;
+
+	return type;
+}
+
+/* Store @type in the first byte of the log entry at @p. */
+static inline void nova_set_entry_type(void *p, enum nova_entry_type type)
+{
+	u8 *type_byte = (u8 *)p;
+
+	*type_byte = type;
+}
+
+/*
+ * Return the next_page field stored in the tail of the log page that
+ * contains offset @curr.
+ *
+ * Returns 0 when the page has no successor.  0 is also returned when the
+ * tail cannot be read: handing the int status of memcpy_mcsafe() back
+ * through a u64 would give callers a bogus non-zero "address" to follow,
+ * whereas they already treat 0 as the end of the log.
+ */
+static inline u64 next_log_page(struct super_block *sb, u64 curr)
+{
+	struct nova_inode_log_page *curr_page;
+	u64 next = 0;
+	int rc;
+
+	curr = BLOCK_OFF(curr);
+	curr_page = (struct nova_inode_log_page *)nova_get_block(sb, curr);
+	rc = memcpy_mcsafe(&next, &curr_page->page_tail.next_page,
+				sizeof(u64));
+	if (rc)
+		return 0;
+
+	return next;
+}
+
+/*
+ * Mark the entry slot at log offset @curr_p with the NEXT_PAGE type and
+ * flush it.  Does nothing when @curr_p lies at or beyond LOG_BLOCK_TAIL
+ * within its page.
+ */
+static inline void nova_set_next_page_flag(struct super_block *sb, u64 curr_p)
+{
+	void *entry;
+
+	if (ENTRY_LOC(curr_p) < LOG_BLOCK_TAIL) {
+		entry = nova_get_block(sb, curr_p);
+		nova_set_entry_type(entry, NEXT_PAGE);
+		nova_flush_buffer(entry, CACHELINE_SIZE, 1);
+	}
+}
+
+/*
+ * Point @curr_page's tail at @next_page and flush the whole page tail.
+ * A persistent barrier is issued afterwards when @fence is non-zero.
+ */
+static inline void nova_set_next_page_address(struct super_block *sb,
+	struct nova_inode_log_page *curr_page, u64 next_page, int fence)
+{
+	curr_page->page_tail.next_page = next_page;
+	nova_flush_buffer(&curr_page->page_tail,
+				sizeof(struct nova_inode_page_tail), 0);
+	if (fence)
+		PERSISTENT_BARRIER();
+}
+
+/*
+ * Set the num_entries counter in @curr_page's tail to @num; the page tail
+ * is flushed only when @flush is non-zero.
+ */
+static inline void nova_set_page_num_entries(struct super_block *sb,
+	struct nova_inode_log_page *curr_page, int num, int flush)
+{
+	curr_page->page_tail.num_entries = num;
+	if (flush)
+		nova_flush_buffer(&curr_page->page_tail,
+				sizeof(struct nova_inode_page_tail), 0);
+}
+
+/*
+ * Set the invalid_entries counter in @curr_page's tail to @num; the page
+ * tail is flushed only when @flush is non-zero.
+ */
+static inline void nova_set_page_invalid_entries(struct super_block *sb,
+	struct nova_inode_log_page *curr_page, int num, int flush)
+{
+	curr_page->page_tail.invalid_entries = num;
+	if (flush)
+		nova_flush_buffer(&curr_page->page_tail,
+				sizeof(struct nova_inode_page_tail), 0);
+}
+
+/*
+ * Increment the num_entries counter of the log page containing offset
+ * @curr and flush that page's tail.
+ */
+static inline void nova_inc_page_num_entries(struct super_block *sb,
+	u64 curr)
+{
+	struct nova_inode_log_page *curr_page;
+
+	curr = BLOCK_OFF(curr);
+	curr_page = (struct nova_inode_log_page *)nova_get_block(sb, curr);
+
+	curr_page->page_tail.num_entries++;
+	nova_flush_buffer(&curr_page->page_tail,
+				sizeof(struct nova_inode_page_tail), 0);
+}
+
+static inline void nova_inc_page_invalid_entries(struct super_block *sb,
+   u64 curr)
+{
+   struct nova_inode_log_page *curr_page;
+
+   curr = BLOCK_OFF(curr);
+   curr_page = (struct nova_inode_log_page *)nova_get_block(sb, curr);
+
+   curr_page->page_tail.invalid_entries++;
+   if (curr_page->page_tail.invalid_entries >
+   curr_page->page_tail.num_entries) {
+   nova_dbg("Page 0x%llx has %u entries, %u invalid\n",
+   curr,
+   curr_page-

[RFC v2 21/83] Add log structure.

2018-03-10 Thread Andiry Xu

From: Andiry Xu 

NOVA log is a singly linked list of 4KB pmem pages.
Each log page consists of two parts: 4064 bytes for log entries,
and 32 bytes for page tail structure. Page tail contains metadata
about the log page and the address of the next log page in the
linked list.

Signed-off-by: Andiry Xu 
---
 fs/nova/log.h  | 187 +
 fs/nova/nova.h |   1 +
 2 files changed, 188 insertions(+)
 create mode 100644 fs/nova/log.h

diff --git a/fs/nova/log.h b/fs/nova/log.h
new file mode 100644
index 000..61586a3
--- /dev/null
+++ b/fs/nova/log.h
@@ -0,0 +1,187 @@
+#ifndef __LOG_H
+#define __LOG_H
+
+#include "balloc.h"
+#include "inode.h"
+
+/* === Log entry = */
+/* Inode entry in the log */
+
+#define MAIN_LOG                0
+#define ALTER_LOG               1
+
+/*
+ * Helpers for splitting a log offset into its 4KB page base and the
+ * position inside that page.  PAGE_OFFSET_MASK is deliberately a plain
+ * (signed) int constant: ~4095 then sign-extends to a full-width mask when
+ * it is combined with a 64-bit offset.
+ */
+#define PAGE_OFFSET_MASK        4095
+#define BLOCK_OFF(p)            ((p) & ~PAGE_OFFSET_MASK)
+
+#define ENTRY_LOC(p)            ((p) & PAGE_OFFSET_MASK)
+
+/* Byte offset inside a log page at which the page tail begins. */
+#define LOG_BLOCK_TAIL          4064
+#define PAGE_TAIL(p)            (BLOCK_OFF(p) + LOG_BLOCK_TAIL)
+
+/*
+ * Log page state and pointers to next page and the replica page
+ *
+ * This tail sits after the LOG_BLOCK_TAIL bytes of entry space in a log
+ * page (see struct nova_inode_log_page).  Packed, the fields add up to
+ * 4 + 4 + 8 + 8 + 8 = 32 bytes.
+ *
+ * NOTE(review): no replica pointer is visible among the fields below;
+ * presumably `padding` is reserved for it - confirm.
+ */
+struct nova_inode_page_tail {
+   __le32  invalid_entries;   /* bumped by nova_inc_page_invalid_entries() */
+   __le32  num_entries;   /* bumped by nova_inc_page_num_entries() */
+   __le64  epoch_id;   /* For snapshot list page */
+   __le64  padding;
+   __le64  next_page;   /* read by next_log_page(); set to 0 on the last page */
+} __attribute((__packed__));
+
+/* Fit in PAGE_SIZE: LOG_BLOCK_TAIL (4064) bytes of entry space + 32-byte tail = 4096 */
+struct nova_inode_log_page {
+   char padding[LOG_BLOCK_TAIL];   /* space for the log entries themselves */
+   struct nova_inode_page_tail page_tail;
+} __attribute((__packed__));
+
+
+enum nova_entry_type {   /* kept in the first byte of a log entry, see nova_set_entry_type() */
+   FILE_WRITE = 1,   /* numbering starts at 1, so 0 is not a valid type */
+   DIR_LOG,
+   SET_ATTR,
+   LINK_CHANGE,
+   NEXT_PAGE,   /* marker written by nova_set_next_page_flag() */
+};
+
+/*
+ * Read the type byte of the log entry at @p using memcpy_mcsafe().
+ *
+ * Returns the stored type on success.
+ * NOTE(review): when the copy fails, the non-zero status of
+ * memcpy_mcsafe() is returned truncated to u8, so a caller cannot tell it
+ * apart from a stored byte with the same value - confirm callers treat
+ * unknown values as errors.
+ */
+static inline u8 nova_get_entry_type(void *p)
+{
+	u8 type;
+	int rc;
+
+	rc = memcpy_mcsafe(&type, p, sizeof(u8));
+	if (rc)
+		return rc;
+
+	return type;
+}
+
+/* Store @type in the first byte of the log entry at @p. */
+static inline void nova_set_entry_type(void *p, enum nova_entry_type type)
+{
+	u8 *type_byte = (u8 *)p;
+
+	*type_byte = type;
+}
+
+/*
+ * Return the next_page field stored in the tail of the log page that
+ * contains offset @curr.
+ *
+ * Returns 0 when the page has no successor.  0 is also returned when the
+ * tail cannot be read: handing the int status of memcpy_mcsafe() back
+ * through a u64 would give callers a bogus non-zero "address" to follow,
+ * whereas they already treat 0 as the end of the log.
+ */
+static inline u64 next_log_page(struct super_block *sb, u64 curr)
+{
+	struct nova_inode_log_page *curr_page;
+	u64 next = 0;
+	int rc;
+
+	curr = BLOCK_OFF(curr);
+	curr_page = (struct nova_inode_log_page *)nova_get_block(sb, curr);
+	rc = memcpy_mcsafe(&next, &curr_page->page_tail.next_page,
+				sizeof(u64));
+	if (rc)
+		return 0;
+
+	return next;
+}
+
+/*
+ * Mark the entry slot at log offset @curr_p with the NEXT_PAGE type and
+ * flush it.  Does nothing when @curr_p lies at or beyond LOG_BLOCK_TAIL
+ * within its page.
+ */
+static inline void nova_set_next_page_flag(struct super_block *sb, u64 curr_p)
+{
+	void *entry;
+
+	if (ENTRY_LOC(curr_p) < LOG_BLOCK_TAIL) {
+		entry = nova_get_block(sb, curr_p);
+		nova_set_entry_type(entry, NEXT_PAGE);
+		nova_flush_buffer(entry, CACHELINE_SIZE, 1);
+	}
+}
+
+/*
+ * Point @curr_page's tail at @next_page and flush the whole page tail.
+ * A persistent barrier is issued afterwards when @fence is non-zero.
+ */
+static inline void nova_set_next_page_address(struct super_block *sb,
+	struct nova_inode_log_page *curr_page, u64 next_page, int fence)
+{
+	curr_page->page_tail.next_page = next_page;
+	nova_flush_buffer(&curr_page->page_tail,
+				sizeof(struct nova_inode_page_tail), 0);
+	if (fence)
+		PERSISTENT_BARRIER();
+}
+
+/*
+ * Set the num_entries counter in @curr_page's tail to @num; the page tail
+ * is flushed only when @flush is non-zero.
+ */
+static inline void nova_set_page_num_entries(struct super_block *sb,
+	struct nova_inode_log_page *curr_page, int num, int flush)
+{
+	curr_page->page_tail.num_entries = num;
+	if (flush)
+		nova_flush_buffer(&curr_page->page_tail,
+				sizeof(struct nova_inode_page_tail), 0);
+}
+
+/*
+ * Set the invalid_entries counter in @curr_page's tail to @num; the page
+ * tail is flushed only when @flush is non-zero.
+ */
+static inline void nova_set_page_invalid_entries(struct super_block *sb,
+	struct nova_inode_log_page *curr_page, int num, int flush)
+{
+	curr_page->page_tail.invalid_entries = num;
+	if (flush)
+		nova_flush_buffer(&curr_page->page_tail,
+				sizeof(struct nova_inode_page_tail), 0);
+}
+
+/*
+ * Increment the num_entries counter of the log page containing offset
+ * @curr and flush that page's tail.
+ */
+static inline void nova_inc_page_num_entries(struct super_block *sb,
+	u64 curr)
+{
+	struct nova_inode_log_page *curr_page;
+
+	curr = BLOCK_OFF(curr);
+	curr_page = (struct nova_inode_log_page *)nova_get_block(sb, curr);
+
+	curr_page->page_tail.num_entries++;
+	nova_flush_buffer(&curr_page->page_tail,
+				sizeof(struct nova_inode_page_tail), 0);
+}
+
+static inline void nova_inc_page_invalid_entries(struct super_block *sb,
+   u64 curr)
+{
+   struct nova_inode_log_page *curr_page;
+
+   curr = BLOCK_OFF(curr);
+   curr_page = (struct nova_inode_log_page *)nova_get_block(sb, curr);
+
+   curr_page->page_tail.invalid_entries++;
+   if (curr_page->page_tail.invalid_entries >
+   curr_page->page_tail.num_entries) {
+   nova_dbg("Page 0x%llx has %u entries, %u invalid\n",
+   curr,
+   curr_page->page_tail.num_entries,
+

[RFC v2 26/83] Add inode_map to track inuse inodes.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

NOVA uses a per-CPU inode map to track in-use inodes.
It works in the same way as the allocator; the only difference is that the inode
map tracks in-use inodes, while the free list contains free ranges. NOVA always
tries to allocate the first available inode number.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/inode.c | 190 
 fs/nova/inode.h |   3 +
 fs/nova/nova.h  |  10 +++
 fs/nova/super.c |  44 +
 fs/nova/super.h |   9 +++
 5 files changed, 256 insertions(+)

diff --git a/fs/nova/inode.c b/fs/nova/inode.c
index 4e2842d..7c10d0e 100644
--- a/fs/nova/inode.c
+++ b/fs/nova/inode.c
@@ -29,6 +29,43 @@
 unsigned int blk_type_to_shift[NOVA_BLOCK_TYPE_MAX] = {12, 21, 30};
 uint32_t blk_type_to_size[NOVA_BLOCK_TYPE_MAX] = {0x1000, 0x200000, 0x40000000};
 
+/*
+ * Seed every per-CPU inode map with a single in-use range.
+ *
+ * Each of the sbi->cpus maps receives one range node covering
+ * [0, range_high], where range_high is NOVA_NORMAL_INODE_START divided by
+ * the number of CPUs, rounded up.  sbi->s_inodes_used_count starts at
+ * NOVA_NORMAL_INODE_START.
+ *
+ * Returns 0 on success, -ENOMEM if a range node cannot be allocated, or
+ * the error from nova_insert_inodetree().
+ */
+int nova_init_inode_inuse_list(struct super_block *sb)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	struct nova_range_node *range_node;
+	struct inode_map *inode_map;
+	unsigned long range_high;
+	int i;
+	int ret;
+
+	sbi->s_inodes_used_count = NOVA_NORMAL_INODE_START;
+
+	range_high = NOVA_NORMAL_INODE_START / sbi->cpus;
+	if (NOVA_NORMAL_INODE_START % sbi->cpus)
+		range_high++;
+
+	for (i = 0; i < sbi->cpus; i++) {
+		inode_map = &sbi->inode_maps[i];
+		range_node = nova_alloc_inode_node(sb);
+		if (range_node == NULL)
+			/* FIXME: free allocated memories */
+			return -ENOMEM;
+
+		range_node->range_low = 0;
+		range_node->range_high = range_high;
+		ret = nova_insert_inodetree(sbi, range_node, i);
+		if (ret) {
+			nova_err(sb, "%s failed\n", __func__);
+			nova_free_inode_node(sb, range_node);
+			return ret;
+		}
+		inode_map->num_range_node_inode = 1;
+		inode_map->first_inode_range = range_node;
+	}
+
+	return 0;
+}
+
 static int nova_alloc_inode_table(struct super_block *sb,
struct nova_inode_info_header *sih)
 {
@@ -298,3 +335,156 @@ struct inode *nova_iget(struct super_block *sb, unsigned 
long ino)
return ERR_PTR(err);
 }
 
+/*
+ * Insert @new_node into the in-use inode tree of CPU @cpu.
+ *
+ * Returns the result of nova_insert_range_node(); a non-zero result is
+ * also reported through nova_dbg().
+ */
+inline int nova_insert_inodetree(struct nova_sb_info *sbi,
+	struct nova_range_node *new_node, int cpu)
+{
+	struct rb_root *tree;
+	int ret;
+
+	tree = &sbi->inode_maps[cpu].inode_inuse_tree;
+	ret = nova_insert_range_node(tree, new_node);
+	if (ret)
+		nova_dbg("ERROR: %s failed %d\n", __func__, ret);
+
+	return ret;
+}
+
+/*
+ * Look up inode number @ino in the in-use tree that owns it.
+ *
+ * Inode numbers are striped across the per-CPU maps: the owning map is
+ * ino % sbi->cpus and the number tracked inside that map is
+ * ino / sbi->cpus.  The result of nova_find_range_node() is returned
+ * unchanged, with the matching node handed back through @ret_node.
+ */
+static inline int nova_search_inodetree(struct nova_sb_info *sbi,
+	unsigned long ino, struct nova_range_node **ret_node)
+{
+	struct rb_root *tree;
+	unsigned long internal_ino;
+	int cpu;
+
+	cpu = ino % sbi->cpus;
+	tree = &sbi->inode_maps[cpu].inode_inuse_tree;
+	internal_ino = ino / sbi->cpus;
+	return nova_find_range_node(sbi, tree, internal_ino, ret_node);
+}
+
+/*
+ * Allocate an inode number from the inode map of CPU @cpuid.
+ *
+ * The number handed out is the one directly after the map's first in-use
+ * range.  If it is the last free number before the following range, the
+ * two ranges are merged and the following node is freed; otherwise the
+ * first range simply grows by one.
+ *
+ * On success the global number (per-map number * sbi->cpus + @cpuid) is
+ * stored in *@ino and 0 is returned.  -ENOSPC is returned when the number
+ * does not fit below the next range, or below MAX_INODE when there is no
+ * next range.
+ */
+int nova_alloc_unused_inode(struct super_block *sb, int cpuid,
+	unsigned long *ino)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	struct inode_map *inode_map;
+	struct nova_range_node *i, *next_i;
+	struct rb_node *temp, *next;
+	unsigned long next_range_low;
+	unsigned long new_ino;
+	unsigned long MAX_INODE = 1UL << 31;
+
+	inode_map = &sbi->inode_maps[cpuid];
+	i = inode_map->first_inode_range;
+	NOVA_ASSERT(i);
+
+	temp = &i->node;
+	next = rb_next(temp);
+
+	if (!next) {
+		next_i = NULL;
+		next_range_low = MAX_INODE;
+	} else {
+		next_i = container_of(next, struct nova_range_node, node);
+		next_range_low = next_i->range_low;
+	}
+
+	new_ino = i->range_high + 1;
+
+	if (next_i && new_ino == (next_range_low - 1)) {
+		/* Fill the gap completely */
+		i->range_high = next_i->range_high;
+		rb_erase(&next_i->node, &inode_map->inode_inuse_tree);
+		nova_free_inode_node(sb, next_i);
+		inode_map->num_range_node_inode--;
+	} else if (new_ino < (next_range_low - 1)) {
+		/* Aligns to left */
+		i->range_high = new_ino;
+	} else {
+		nova_dbg("%s: ERROR: new ino %lu, next low %lu\n", __func__,
+				new_ino, next_range_low);
+		return -ENOSPC;
+	}
+
+	*ino = new_ino * sbi->cpus + cpuid;
+	sbi->s_inodes_used_count++;
+	inode_map->allocated++;
+
+	nova_dbg_verbose("Alloc ino %lu\n", *ino);
+	return 0;
+}
+
+int nova_free_inuse_inode(struct supe

[RFC v2 26/83] Add inode_map to track inuse inodes.

2018-03-10 Thread Andiry Xu

From: Andiry Xu 

NOVA uses a per-CPU inode map to track in-use inodes.
It works in the same way as the allocator; the only difference is that the inode
map tracks in-use inodes, while the free list contains free ranges. NOVA always
tries to allocate the first available inode number.

Signed-off-by: Andiry Xu 
---
 fs/nova/inode.c | 190 
 fs/nova/inode.h |   3 +
 fs/nova/nova.h  |  10 +++
 fs/nova/super.c |  44 +
 fs/nova/super.h |   9 +++
 5 files changed, 256 insertions(+)

diff --git a/fs/nova/inode.c b/fs/nova/inode.c
index 4e2842d..7c10d0e 100644
--- a/fs/nova/inode.c
+++ b/fs/nova/inode.c
@@ -29,6 +29,43 @@
 unsigned int blk_type_to_shift[NOVA_BLOCK_TYPE_MAX] = {12, 21, 30};
 uint32_t blk_type_to_size[NOVA_BLOCK_TYPE_MAX] = {0x1000, 0x200000, 0x40000000};
 
+/*
+ * Seed every per-CPU inode map with a single in-use range.
+ *
+ * Each of the sbi->cpus maps receives one range node covering
+ * [0, range_high], where range_high is NOVA_NORMAL_INODE_START divided by
+ * the number of CPUs, rounded up.  sbi->s_inodes_used_count starts at
+ * NOVA_NORMAL_INODE_START.
+ *
+ * Returns 0 on success, -ENOMEM if a range node cannot be allocated, or
+ * the error from nova_insert_inodetree().
+ */
+int nova_init_inode_inuse_list(struct super_block *sb)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	struct nova_range_node *range_node;
+	struct inode_map *inode_map;
+	unsigned long range_high;
+	int i;
+	int ret;
+
+	sbi->s_inodes_used_count = NOVA_NORMAL_INODE_START;
+
+	range_high = NOVA_NORMAL_INODE_START / sbi->cpus;
+	if (NOVA_NORMAL_INODE_START % sbi->cpus)
+		range_high++;
+
+	for (i = 0; i < sbi->cpus; i++) {
+		inode_map = &sbi->inode_maps[i];
+		range_node = nova_alloc_inode_node(sb);
+		if (range_node == NULL)
+			/* FIXME: free allocated memories */
+			return -ENOMEM;
+
+		range_node->range_low = 0;
+		range_node->range_high = range_high;
+		ret = nova_insert_inodetree(sbi, range_node, i);
+		if (ret) {
+			nova_err(sb, "%s failed\n", __func__);
+			nova_free_inode_node(sb, range_node);
+			return ret;
+		}
+		inode_map->num_range_node_inode = 1;
+		inode_map->first_inode_range = range_node;
+	}
+
+	return 0;
+}
+
 static int nova_alloc_inode_table(struct super_block *sb,
struct nova_inode_info_header *sih)
 {
@@ -298,3 +335,156 @@ struct inode *nova_iget(struct super_block *sb, unsigned 
long ino)
return ERR_PTR(err);
 }
 
+inline int nova_insert_inodetree(struct nova_sb_info *sbi,
+   struct nova_range_node *new_node, int cpu)
+{
+   struct rb_root *tree;
+   int ret;
+
+   tree = >inode_maps[cpu].inode_inuse_tree;
+   ret = nova_insert_range_node(tree, new_node);
+   if (ret)
+   nova_dbg("ERROR: %s failed %d\n", __func__, ret);
+
+   return ret;
+}
+
+static inline int nova_search_inodetree(struct nova_sb_info *sbi,
+   unsigned long ino, struct nova_range_node **ret_node)
+{
+   struct rb_root *tree;
+   unsigned long internal_ino;
+   int cpu;
+
+   cpu = ino % sbi->cpus;
+   tree = >inode_maps[cpu].inode_inuse_tree;
+   internal_ino = ino / sbi->cpus;
+   return nova_find_range_node(sbi, tree, internal_ino, ret_node);
+}
+
+int nova_alloc_unused_inode(struct super_block *sb, int cpuid,
+   unsigned long *ino)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   struct inode_map *inode_map;
+   struct nova_range_node *i, *next_i;
+   struct rb_node *temp, *next;
+   unsigned long next_range_low;
+   unsigned long new_ino;
+   unsigned long MAX_INODE = 1UL << 31;
+
+   inode_map = >inode_maps[cpuid];
+   i = inode_map->first_inode_range;
+   NOVA_ASSERT(i);
+
+   temp = >node;
+   next = rb_next(temp);
+
+   if (!next) {
+   next_i = NULL;
+   next_range_low = MAX_INODE;
+   } else {
+   next_i = container_of(next, struct nova_range_node, node);
+   next_range_low = next_i->range_low;
+   }
+
+   new_ino = i->range_high + 1;
+
+   if (next_i && new_ino == (next_range_low - 1)) {
+   /* Fill the gap completely */
+   i->range_high = next_i->range_high;
+   rb_erase(_i->node, _map->inode_inuse_tree);
+   nova_free_inode_node(sb, next_i);
+   inode_map->num_range_node_inode--;
+   } else if (new_ino < (next_range_low - 1)) {
+   /* Aligns to left */
+   i->range_high = new_ino;
+   } else {
+   nova_dbg("%s: ERROR: new ino %lu, next low %lu\n", __func__,
+   new_ino, next_range_low);
+   return -ENOSPC;
+   }
+
+   *ino = new_ino * sbi->cpus + cpuid;
+   sbi->s_inodes_used_count++;
+   inode_map->allocated++;
+
+   nova_dbg_verbose("Alloc ino %lu\n", *ino);
+   return 0;
+}
+
+int nova_free_inuse_inode(struct super_block *sb, unsigned long ino)
+{
+   struct n

[RFC v2 24/83] Initialize and allocate inode table.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

The inode table is a singly linked list of 2MB pages.
Each CPU has one inode table with an initial size of 2MB.
The inode table addresses are stored at
INODE_TABLE_START in the pmem range.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/inode.c | 55 +++
 fs/nova/inode.h | 26 ++
 fs/nova/super.c |  3 +++
 3 files changed, 84 insertions(+)

diff --git a/fs/nova/inode.c b/fs/nova/inode.c
index f7d6410..42816ff 100644
--- a/fs/nova/inode.c
+++ b/fs/nova/inode.c
@@ -29,6 +29,61 @@
 unsigned int blk_type_to_shift[NOVA_BLOCK_TYPE_MAX] = {12, 21, 30};
 uint32_t blk_type_to_size[NOVA_BLOCK_TYPE_MAX] = {0x1000, 0x20, 
0x4000};
 
+static int nova_alloc_inode_table(struct super_block *sb,
+   struct nova_inode_info_header *sih)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   struct inode_table *inode_table;
+   unsigned long blocknr;
+   u64 block;
+   int allocated;
+   int i;
+
+   for (i = 0; i < sbi->cpus; i++) {
+   inode_table = nova_get_inode_table(sb, i);
+   if (!inode_table)
+   return -EINVAL;
+
+   allocated = nova_new_log_blocks(sb, sih, , 1,
+   ALLOC_INIT_ZERO, i, ALLOC_FROM_HEAD);
+
+   nova_dbgv("%s: allocate log @ 0x%lx\n", __func__,
+   blocknr);
+   if (allocated != 1 || blocknr == 0)
+   return -ENOSPC;
+
+   block = nova_get_block_off(sb, blocknr, NOVA_BLOCK_TYPE_2M);
+   inode_table->log_head = block;
+   nova_flush_buffer(inode_table, CACHELINE_SIZE, 0);
+   }
+
+   return 0;
+}
+
+int nova_init_inode_table(struct super_block *sb)
+{
+   struct nova_inode *pi = nova_get_inode_by_ino(sb, NOVA_INODETABLE_INO);
+   struct nova_inode_info_header sih;
+   int ret = 0;
+
+   pi->i_mode = 0;
+   pi->i_uid = 0;
+   pi->i_gid = 0;
+   pi->i_links_count = cpu_to_le16(1);
+   pi->i_flags = 0;
+   pi->nova_ino = NOVA_INODETABLE_INO;
+
+   pi->i_blk_type = NOVA_BLOCK_TYPE_2M;
+
+   sih.ino = NOVA_INODETABLE_INO;
+   sih.i_blk_type = NOVA_BLOCK_TYPE_2M;
+
+   ret = nova_alloc_inode_table(sb, );
+
+   PERSISTENT_BARRIER();
+   return ret;
+}
+
 void nova_set_inode_flags(struct inode *inode, struct nova_inode *pi,
unsigned int flags)
 {
diff --git a/fs/nova/inode.h b/fs/nova/inode.h
index 0594ef3..a88f0a2 100644
--- a/fs/nova/inode.h
+++ b/fs/nova/inode.h
@@ -60,6 +60,13 @@ struct nova_inode {
 } __attribute((__packed__));
 
 /*
+ * Inode table.  It's a linked list of pages.
+ */
+struct inode_table {
+   __le64 log_head;
+};
+
+/*
  * NOVA-specific inode state kept in DRAM
  */
 struct nova_inode_info_header {
@@ -136,6 +143,22 @@ static inline void nova_update_tail(struct nova_inode *pi, 
u64 new_tail)
NOVA_END_TIMING(update_tail_t, update_time);
 }
 
+static inline
+struct inode_table *nova_get_inode_table(struct super_block *sb, int cpu)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   int table_start;
+
+   if (cpu >= sbi->cpus)
+   return NULL;
+
+   table_start = INODE_TABLE_START;
+
+   return (struct inode_table *)((char *)nova_get_block(sb,
+   NOVA_DEF_BLOCK_SIZE_4K * table_start) +
+   cpu * CACHELINE_SIZE);
+}
+
 static inline unsigned int
 nova_inode_blk_shift(struct nova_inode_info_header *sih)
 {
@@ -197,7 +220,10 @@ static inline int nova_persist_inode(struct nova_inode *pi)
return 0;
 }
 
+
+int nova_init_inode_table(struct super_block *sb);
 int nova_get_inode_address(struct super_block *sb, u64 ino,
u64 *pi_addr, int extendable);
 struct inode *nova_iget(struct super_block *sb, unsigned long ino);
+
 #endif
diff --git a/fs/nova/super.c b/fs/nova/super.c
index 7ee3f66..32fe29b 100644
--- a/fs/nova/super.c
+++ b/fs/nova/super.c
@@ -378,6 +378,9 @@ static struct nova_inode *nova_init(struct super_block *sb,
 
nova_init_blockmap(sb, 0);
 
+   if (nova_init_inode_table(sb) < 0)
+   return ERR_PTR(-EINVAL);
+
sbi->nova_sb->s_size = cpu_to_le64(size);
sbi->nova_sb->s_blocksize = cpu_to_le32(blocksize);
sbi->nova_sb->s_magic = cpu_to_le32(NOVA_SUPER_MAGIC);
-- 
2.7.4

[RFC v2 24/83] Initialize and allocate inode table.

2018-03-10 Thread Andiry Xu

From: Andiry Xu 

The inode table is a singly linked list of 2MB pages.
Each CPU has one inode table with an initial size of 2MB.
The inode table addresses are stored at
INODE_TABLE_START in the pmem range.

Signed-off-by: Andiry Xu 
---
 fs/nova/inode.c | 55 +++
 fs/nova/inode.h | 26 ++
 fs/nova/super.c |  3 +++
 3 files changed, 84 insertions(+)

diff --git a/fs/nova/inode.c b/fs/nova/inode.c
index f7d6410..42816ff 100644
--- a/fs/nova/inode.c
+++ b/fs/nova/inode.c
@@ -29,6 +29,61 @@
 unsigned int blk_type_to_shift[NOVA_BLOCK_TYPE_MAX] = {12, 21, 30};
 uint32_t blk_type_to_size[NOVA_BLOCK_TYPE_MAX] = {0x1000, 0x20, 
0x4000};
 
+static int nova_alloc_inode_table(struct super_block *sb,
+   struct nova_inode_info_header *sih)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   struct inode_table *inode_table;
+   unsigned long blocknr;
+   u64 block;
+   int allocated;
+   int i;
+
+   for (i = 0; i < sbi->cpus; i++) {
+   inode_table = nova_get_inode_table(sb, i);
+   if (!inode_table)
+   return -EINVAL;
+
+   allocated = nova_new_log_blocks(sb, sih, , 1,
+   ALLOC_INIT_ZERO, i, ALLOC_FROM_HEAD);
+
+   nova_dbgv("%s: allocate log @ 0x%lx\n", __func__,
+   blocknr);
+   if (allocated != 1 || blocknr == 0)
+   return -ENOSPC;
+
+   block = nova_get_block_off(sb, blocknr, NOVA_BLOCK_TYPE_2M);
+   inode_table->log_head = block;
+   nova_flush_buffer(inode_table, CACHELINE_SIZE, 0);
+   }
+
+   return 0;
+}
+
+int nova_init_inode_table(struct super_block *sb)
+{
+   struct nova_inode *pi = nova_get_inode_by_ino(sb, NOVA_INODETABLE_INO);
+   struct nova_inode_info_header sih;
+   int ret = 0;
+
+   pi->i_mode = 0;
+   pi->i_uid = 0;
+   pi->i_gid = 0;
+   pi->i_links_count = cpu_to_le16(1);
+   pi->i_flags = 0;
+   pi->nova_ino = NOVA_INODETABLE_INO;
+
+   pi->i_blk_type = NOVA_BLOCK_TYPE_2M;
+
+   sih.ino = NOVA_INODETABLE_INO;
+   sih.i_blk_type = NOVA_BLOCK_TYPE_2M;
+
+   ret = nova_alloc_inode_table(sb, );
+
+   PERSISTENT_BARRIER();
+   return ret;
+}
+
 void nova_set_inode_flags(struct inode *inode, struct nova_inode *pi,
unsigned int flags)
 {
diff --git a/fs/nova/inode.h b/fs/nova/inode.h
index 0594ef3..a88f0a2 100644
--- a/fs/nova/inode.h
+++ b/fs/nova/inode.h
@@ -60,6 +60,13 @@ struct nova_inode {
 } __attribute((__packed__));
 
 /*
+ * Inode table.  It's a linked list of pages.
+ */
+struct inode_table {
+   __le64 log_head;
+};
+
+/*
  * NOVA-specific inode state kept in DRAM
  */
 struct nova_inode_info_header {
@@ -136,6 +143,22 @@ static inline void nova_update_tail(struct nova_inode *pi, 
u64 new_tail)
NOVA_END_TIMING(update_tail_t, update_time);
 }
 
+static inline
+struct inode_table *nova_get_inode_table(struct super_block *sb, int cpu)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   int table_start;
+
+   if (cpu >= sbi->cpus)
+   return NULL;
+
+   table_start = INODE_TABLE_START;
+
+   return (struct inode_table *)((char *)nova_get_block(sb,
+   NOVA_DEF_BLOCK_SIZE_4K * table_start) +
+   cpu * CACHELINE_SIZE);
+}
+
 static inline unsigned int
 nova_inode_blk_shift(struct nova_inode_info_header *sih)
 {
@@ -197,7 +220,10 @@ static inline int nova_persist_inode(struct nova_inode *pi)
return 0;
 }
 
+
+int nova_init_inode_table(struct super_block *sb);
 int nova_get_inode_address(struct super_block *sb, u64 ino,
u64 *pi_addr, int extendable);
 struct inode *nova_iget(struct super_block *sb, unsigned long ino);
+
 #endif
diff --git a/fs/nova/super.c b/fs/nova/super.c
index 7ee3f66..32fe29b 100644
--- a/fs/nova/super.c
+++ b/fs/nova/super.c
@@ -378,6 +378,9 @@ static struct nova_inode *nova_init(struct super_block *sb,
 
nova_init_blockmap(sb, 0);
 
+   if (nova_init_inode_table(sb) < 0)
+   return ERR_PTR(-EINVAL);
+
sbi->nova_sb->s_size = cpu_to_le64(size);
sbi->nova_sb->s_blocksize = cpu_to_le32(blocksize);
sbi->nova_sb->s_magic = cpu_to_le32(NOVA_SUPER_MAGIC);
-- 
2.7.4

[RFC v2 27/83] Save the inode inuse list to pmem upon umount

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/bbuild.c | 48 
 fs/nova/bbuild.h |  1 +
 fs/nova/super.c  |  1 +
 3 files changed, 50 insertions(+)

diff --git a/fs/nova/bbuild.c b/fs/nova/bbuild.c
index 12a2f11..66053cb 100644
--- a/fs/nova/bbuild.c
+++ b/fs/nova/bbuild.c
@@ -116,6 +116,54 @@ static u64 nova_save_free_list_blocknodes(struct 
super_block *sb, int cpu,
return temp_tail;
 }
 
+void nova_save_inode_list_to_log(struct super_block *sb)
+{
+   struct nova_inode *pi = nova_get_inode_by_ino(sb, NOVA_INODELIST_INO);
+   struct nova_inode_info_header sih;
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   unsigned long num_blocks;
+   unsigned long num_nodes = 0;
+   struct inode_map *inode_map;
+   unsigned long i;
+   u64 temp_tail;
+   u64 new_block;
+   int allocated;
+
+   sih.ino = NOVA_INODELIST_INO;
+   sih.i_blk_type = NOVA_DEFAULT_BLOCK_TYPE;
+   sih.i_blocks = 0;
+
+   for (i = 0; i < sbi->cpus; i++) {
+   inode_map = >inode_maps[i];
+   num_nodes += inode_map->num_range_node_inode;
+   }
+
+   num_blocks = num_nodes / RANGENODE_PER_PAGE;
+   if (num_nodes % RANGENODE_PER_PAGE)
+   num_blocks++;
+
+   allocated = nova_allocate_inode_log_pages(sb, , num_blocks,
+   _block, ANY_CPU, 0);
+   if (allocated != num_blocks) {
+   nova_dbg("Error saving inode list: %d\n", allocated);
+   return;
+   }
+
+   temp_tail = new_block;
+   for (i = 0; i < sbi->cpus; i++) {
+   inode_map = >inode_maps[i];
+   temp_tail = nova_save_range_nodes_to_log(sb,
+   _map->inode_inuse_tree, temp_tail, i);
+   }
+
+   pi->log_head = new_block;
+   nova_update_tail(pi, temp_tail);
+   nova_flush_buffer(>log_head, CACHELINE_SIZE, 0);
+
+   nova_dbg("%s: %lu inode nodes, pi head 0x%llx, tail 0x%llx\n",
+   __func__, num_nodes, pi->log_head, pi->log_tail);
+}
+
 void nova_save_blocknode_mappings_to_log(struct super_block *sb)
 {
struct nova_inode *pi = nova_get_inode_by_ino(sb, NOVA_BLOCKNODE_INO);
diff --git a/fs/nova/bbuild.h b/fs/nova/bbuild.h
index 59cc379..5d2b5f0 100644
--- a/fs/nova/bbuild.h
+++ b/fs/nova/bbuild.h
@@ -3,6 +3,7 @@
 
 void nova_init_header(struct super_block *sb,
struct nova_inode_info_header *sih, u16 i_mode);
+void nova_save_inode_list_to_log(struct super_block *sb);
 void nova_save_blocknode_mappings_to_log(struct super_block *sb);
 
 #endif
diff --git a/fs/nova/super.c b/fs/nova/super.c
index 9b60873..69e4afc 100644
--- a/fs/nova/super.c
+++ b/fs/nova/super.c
@@ -734,6 +734,7 @@ static void nova_put_super(struct super_block *sb)
int i;
 
if (sbi->virt_addr) {
+   nova_save_inode_list_to_log(sb);
/* Save everything before blocknode mapping! */
nova_save_blocknode_mappings_to_log(sb);
sbi->virt_addr = NULL;
-- 
2.7.4

[RFC v2 27/83] Save the inode inuse list to pmem upon umount

2018-03-10 Thread Andiry Xu

From: Andiry Xu 

Signed-off-by: Andiry Xu 
---
 fs/nova/bbuild.c | 48 
 fs/nova/bbuild.h |  1 +
 fs/nova/super.c  |  1 +
 3 files changed, 50 insertions(+)

diff --git a/fs/nova/bbuild.c b/fs/nova/bbuild.c
index 12a2f11..66053cb 100644
--- a/fs/nova/bbuild.c
+++ b/fs/nova/bbuild.c
@@ -116,6 +116,54 @@ static u64 nova_save_free_list_blocknodes(struct 
super_block *sb, int cpu,
return temp_tail;
 }
 
+void nova_save_inode_list_to_log(struct super_block *sb)
+{
+   struct nova_inode *pi = nova_get_inode_by_ino(sb, NOVA_INODELIST_INO);
+   struct nova_inode_info_header sih;
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   unsigned long num_blocks;
+   unsigned long num_nodes = 0;
+   struct inode_map *inode_map;
+   unsigned long i;
+   u64 temp_tail;
+   u64 new_block;
+   int allocated;
+
+   sih.ino = NOVA_INODELIST_INO;
+   sih.i_blk_type = NOVA_DEFAULT_BLOCK_TYPE;
+   sih.i_blocks = 0;
+
+   for (i = 0; i < sbi->cpus; i++) {
+   inode_map = >inode_maps[i];
+   num_nodes += inode_map->num_range_node_inode;
+   }
+
+   num_blocks = num_nodes / RANGENODE_PER_PAGE;
+   if (num_nodes % RANGENODE_PER_PAGE)
+   num_blocks++;
+
+   allocated = nova_allocate_inode_log_pages(sb, , num_blocks,
+   _block, ANY_CPU, 0);
+   if (allocated != num_blocks) {
+   nova_dbg("Error saving inode list: %d\n", allocated);
+   return;
+   }
+
+   temp_tail = new_block;
+   for (i = 0; i < sbi->cpus; i++) {
+   inode_map = >inode_maps[i];
+   temp_tail = nova_save_range_nodes_to_log(sb,
+   _map->inode_inuse_tree, temp_tail, i);
+   }
+
+   pi->log_head = new_block;
+   nova_update_tail(pi, temp_tail);
+   nova_flush_buffer(>log_head, CACHELINE_SIZE, 0);
+
+   nova_dbg("%s: %lu inode nodes, pi head 0x%llx, tail 0x%llx\n",
+   __func__, num_nodes, pi->log_head, pi->log_tail);
+}
+
 void nova_save_blocknode_mappings_to_log(struct super_block *sb)
 {
struct nova_inode *pi = nova_get_inode_by_ino(sb, NOVA_BLOCKNODE_INO);
diff --git a/fs/nova/bbuild.h b/fs/nova/bbuild.h
index 59cc379..5d2b5f0 100644
--- a/fs/nova/bbuild.h
+++ b/fs/nova/bbuild.h
@@ -3,6 +3,7 @@
 
 void nova_init_header(struct super_block *sb,
struct nova_inode_info_header *sih, u16 i_mode);
+void nova_save_inode_list_to_log(struct super_block *sb);
 void nova_save_blocknode_mappings_to_log(struct super_block *sb);
 
 #endif
diff --git a/fs/nova/super.c b/fs/nova/super.c
index 9b60873..69e4afc 100644
--- a/fs/nova/super.c
+++ b/fs/nova/super.c
@@ -734,6 +734,7 @@ static void nova_put_super(struct super_block *sb)
int i;
 
if (sbi->virt_addr) {
+   nova_save_inode_list_to_log(sb);
/* Save everything before blocknode mapping! */
nova_save_blocknode_mappings_to_log(sb);
sbi->virt_addr = NULL;
-- 
2.7.4

[RFC v2 25/83] Support get normal inode address and inode table extension.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

Inodes are assigned to per-CPU inode tables in a round-robin way:

If there are four cores, then

CPU 0's inode table contains inode 0, inode 4, inode 8, ...
CPU 1's inode table contains inode 1, inode 5, inode 9, ...
CPU 2's inode table contains inode 2, inode 6, inode 10, ...
CPU 3's inode table contains inode 3, inode 7, inode 11, ...

So given an inode number, the inode table and inode position
can be easily calculated.

If a 2MB inode table fills up, NOVA allocates a new
2MB log page and links it to the tail of the previous inode table.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/inode.c | 67 +++--
 1 file changed, 65 insertions(+), 2 deletions(-)

diff --git a/fs/nova/inode.c b/fs/nova/inode.c
index 42816ff..4e2842d 100644
--- a/fs/nova/inode.c
+++ b/fs/nova/inode.c
@@ -167,18 +167,81 @@ static int nova_read_inode(struct super_block *sb, struct 
inode *inode,
return ret;
 }
 
-/* Get the address in PMEM of an inode by inode number.  Allocate additional
+/*
+ * Get the address in PMEM of an inode by inode number.  Allocate additional
  * block to store additional inodes if necessary.
  */
 int nova_get_inode_address(struct super_block *sb, u64 ino,
u64 *pi_addr, int extendable)
 {
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   struct nova_inode_info_header sih;
+   struct inode_table *inode_table;
+   unsigned int data_bits;
+   unsigned int num_inodes_bits;
+   u64 curr;
+   unsigned int superpage_count;
+   u64 internal_ino;
+   int cpuid;
+   int extended = 0;
+   unsigned int index;
+   unsigned int i = 0;
+   unsigned long blocknr;
+   unsigned long curr_addr;
+   int allocated;
+
if (ino < NOVA_NORMAL_INODE_START) {
*pi_addr = nova_get_reserved_inode_addr(sb, ino);
return 0;
}
 
-   *pi_addr = 0;
+   sih.ino = NOVA_INODETABLE_INO;
+   sih.i_blk_type = NOVA_BLOCK_TYPE_2M;
+   data_bits = blk_type_to_shift[sih.i_blk_type];
+   num_inodes_bits = data_bits - NOVA_INODE_BITS;
+
+   cpuid = ino % sbi->cpus;
+   internal_ino = ino / sbi->cpus;
+
+   inode_table = nova_get_inode_table(sb, cpuid);
+   superpage_count = internal_ino >> num_inodes_bits;
+   index = internal_ino & ((1 << num_inodes_bits) - 1);
+
+   curr = inode_table->log_head;
+   if (curr == 0)
+   return -EINVAL;
+
+   for (i = 0; i < superpage_count; i++) {
+   if (curr == 0)
+   return -EINVAL;
+
+   curr_addr = (unsigned long)nova_get_block(sb, curr);
+   /* Next page pointer in the last 8 bytes of the superpage */
+   curr_addr += nova_inode_blk_size() - 8;
+   curr = *(u64 *)(curr_addr);
+
+   if (curr == 0) {
+   if (extendable == 0)
+   return -EINVAL;
+
+   extended = 1;
+
+   allocated = nova_new_log_blocks(sb, , ,
+   1, ALLOC_INIT_ZERO, cpuid, ALLOC_FROM_HEAD);
+
+   if (allocated != 1)
+   return allocated;
+
+   curr = nova_get_block_off(sb, blocknr,
+   NOVA_BLOCK_TYPE_2M);
+   *(u64 *)(curr_addr) = curr;
+   nova_flush_buffer((void *)curr_addr,
+   NOVA_INODE_SIZE, 1);
+   }
+   }
+
+   *pi_addr = curr + index * NOVA_INODE_SIZE;
+
return 0;
 }
 
-- 
2.7.4

[RFC v2 25/83] Support get normal inode address and inode table extension.

2018-03-10 Thread Andiry Xu

From: Andiry Xu 

Inodes are assigned to per-CPU inode tables in a round-robin way:

If there are four cores, then

CPU 0's inode table contains inode 0, inode 4, inode 8, ...
CPU 1's inode table contains inode 1, inode 5, inode 9, ...
CPU 2's inode table contains inode 2, inode 6, inode 10, ...
CPU 3's inode table contains inode 3, inode 7, inode 11, ...

So given an inode number, the inode table and inode position
can be easily calculated.

If a 2MB inode table fills up, NOVA allocates a new
2MB log page and links it to the tail of the previous inode table.

Signed-off-by: Andiry Xu 
---
 fs/nova/inode.c | 67 +++--
 1 file changed, 65 insertions(+), 2 deletions(-)

diff --git a/fs/nova/inode.c b/fs/nova/inode.c
index 42816ff..4e2842d 100644
--- a/fs/nova/inode.c
+++ b/fs/nova/inode.c
@@ -167,18 +167,81 @@ static int nova_read_inode(struct super_block *sb, struct 
inode *inode,
return ret;
 }
 
-/* Get the address in PMEM of an inode by inode number.  Allocate additional
+/*
+ * Get the address in PMEM of an inode by inode number.  Allocate additional
  * block to store additional inodes if necessary.
  */
 int nova_get_inode_address(struct super_block *sb, u64 ino,
u64 *pi_addr, int extendable)
 {
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   struct nova_inode_info_header sih;
+   struct inode_table *inode_table;
+   unsigned int data_bits;
+   unsigned int num_inodes_bits;
+   u64 curr;
+   unsigned int superpage_count;
+   u64 internal_ino;
+   int cpuid;
+   int extended = 0;
+   unsigned int index;
+   unsigned int i = 0;
+   unsigned long blocknr;
+   unsigned long curr_addr;
+   int allocated;
+
if (ino < NOVA_NORMAL_INODE_START) {
*pi_addr = nova_get_reserved_inode_addr(sb, ino);
return 0;
}
 
-   *pi_addr = 0;
+   sih.ino = NOVA_INODETABLE_INO;
+   sih.i_blk_type = NOVA_BLOCK_TYPE_2M;
+   data_bits = blk_type_to_shift[sih.i_blk_type];
+   num_inodes_bits = data_bits - NOVA_INODE_BITS;
+
+   cpuid = ino % sbi->cpus;
+   internal_ino = ino / sbi->cpus;
+
+   inode_table = nova_get_inode_table(sb, cpuid);
+   superpage_count = internal_ino >> num_inodes_bits;
+   index = internal_ino & ((1 << num_inodes_bits) - 1);
+
+   curr = inode_table->log_head;
+   if (curr == 0)
+   return -EINVAL;
+
+   for (i = 0; i < superpage_count; i++) {
+   if (curr == 0)
+   return -EINVAL;
+
+   curr_addr = (unsigned long)nova_get_block(sb, curr);
+   /* Next page pointer in the last 8 bytes of the superpage */
+   curr_addr += nova_inode_blk_size() - 8;
+   curr = *(u64 *)(curr_addr);
+
+   if (curr == 0) {
+   if (extendable == 0)
+   return -EINVAL;
+
+   extended = 1;
+
+   allocated = nova_new_log_blocks(sb, , ,
+   1, ALLOC_INIT_ZERO, cpuid, ALLOC_FROM_HEAD);
+
+   if (allocated != 1)
+   return allocated;
+
+   curr = nova_get_block_off(sb, blocknr,
+   NOVA_BLOCK_TYPE_2M);
+   *(u64 *)(curr_addr) = curr;
+   nova_flush_buffer((void *)curr_addr,
+   NOVA_INODE_SIZE, 1);
+   }
+   }
+
+   *pi_addr = curr + index * NOVA_INODE_SIZE;
+
return 0;
 }
 
-- 
2.7.4

[RFC v2 31/83] Add new vfs inode allocation.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

This routine allocates and initializes a new vfs inode, and sets up
the attributes of the corresponding NOVA inode and inode_info.
Inode operations are still missing at this point.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/inode.c | 144 +++-
 fs/nova/inode.h |   3 ++
 2 files changed, 146 insertions(+), 1 deletion(-)

diff --git a/fs/nova/inode.c b/fs/nova/inode.c
index e4b8960..15517cc 100644
--- a/fs/nova/inode.c
+++ b/fs/nova/inode.c
@@ -363,7 +363,49 @@ static inline int nova_search_inodetree(struct 
nova_sb_info *sbi,
return nova_find_range_node(sbi, tree, internal_ino, ret_node);
 }
 
-int nova_alloc_unused_inode(struct super_block *sb, int cpuid,
+static void nova_get_inode_flags(struct inode *inode, struct nova_inode *pi)
+{
+   unsigned int flags = inode->i_flags;
+   unsigned int nova_flags = le32_to_cpu(pi->i_flags);
+
+   nova_flags &= ~(FS_SYNC_FL | FS_APPEND_FL | FS_IMMUTABLE_FL |
+FS_NOATIME_FL | FS_DIRSYNC_FL);
+   if (flags & S_SYNC)
+   nova_flags |= FS_SYNC_FL;
+   if (flags & S_APPEND)
+   nova_flags |= FS_APPEND_FL;
+   if (flags & S_IMMUTABLE)
+   nova_flags |= FS_IMMUTABLE_FL;
+   if (flags & S_NOATIME)
+   nova_flags |= FS_NOATIME_FL;
+   if (flags & S_DIRSYNC)
+   nova_flags |= FS_DIRSYNC_FL;
+
+   pi->i_flags = cpu_to_le32(nova_flags);
+}
+
+static void nova_init_inode(struct inode *inode, struct nova_inode *pi)
+{
+   pi->i_mode = cpu_to_le16(inode->i_mode);
+   pi->i_uid = cpu_to_le32(i_uid_read(inode));
+   pi->i_gid = cpu_to_le32(i_gid_read(inode));
+   pi->i_links_count = cpu_to_le16(inode->i_nlink);
+   pi->i_size = cpu_to_le64(inode->i_size);
+   pi->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
+   pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+   pi->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+   pi->i_generation = cpu_to_le32(inode->i_generation);
+   pi->log_head = 0;
+   pi->log_tail = 0;
+   pi->deleted = 0;
+   pi->delete_epoch_id = 0;
+   nova_get_inode_flags(inode, pi);
+
+   if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
+   pi->dev.rdev = cpu_to_le32(inode->i_rdev);
+}
+
+static int nova_alloc_unused_inode(struct super_block *sb, int cpuid,
unsigned long *ino)
 {
struct nova_sb_info *sbi = NOVA_SB(sb);
@@ -529,6 +571,106 @@ u64 nova_new_nova_inode(struct super_block *sb, u64 
*pi_addr)
return ino;
 }
 
+struct inode *nova_new_vfs_inode(enum nova_new_inode_type type,
+   struct inode *dir, u64 pi_addr, u64 ino, umode_t mode,
+   size_t size, dev_t rdev, const struct qstr *qstr, u64 epoch_id)
+{
+   struct super_block *sb;
+   struct nova_sb_info *sbi;
+   struct inode *inode;
+   struct nova_inode *diri = NULL;
+   struct nova_inode_info *si;
+   struct nova_inode_info_header *sih = NULL;
+   struct nova_inode *pi;
+   int errval;
+   timing_t new_inode_time;
+
+   NOVA_START_TIMING(new_vfs_inode_t, new_inode_time);
+   sb = dir->i_sb;
+   sbi = (struct nova_sb_info *)sb->s_fs_info;
+   inode = new_inode(sb);
+   if (!inode) {
+   errval = -ENOMEM;
+   goto fail2;
+   }
+
+   inode_init_owner(inode, dir, mode);
+   inode->i_blocks = inode->i_size = 0;
+   inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+
+   inode->i_generation = atomic_add_return(1, >next_generation);
+   inode->i_size = size;
+
+   diri = nova_get_inode(sb, dir);
+   if (!diri) {
+   errval = -EACCES;
+   goto fail1;
+   }
+
+   pi = (struct nova_inode *)nova_get_block(sb, pi_addr);
+   nova_dbg_verbose("%s: allocating inode %llu @ 0x%llx\n",
+   __func__, ino, pi_addr);
+
+   /* chosen inode is in ino */
+   inode->i_ino = ino;
+
+   switch (type) {
+   case TYPE_CREATE:
+   inode->i_mapping->a_ops = _aops_dax;
+   break;
+   case TYPE_MKNOD:
+   init_special_inode(inode, mode, rdev);
+   break;
+   case TYPE_SYMLINK:
+   inode->i_mapping->a_ops = _aops_dax;
+   break;
+   case TYPE_MKDIR:
+   inode->i_mapping->a_ops = _aops_dax;
+   set_nlink(inode, 2);
+   break;
+   default:
+   nova_dbg("Unknown new inode type %d\n", type);
+   break;
+   }
+
+   /*
+* Pi is part of the dir log so no transaction is needed,
+* but we need to flush to NVMM.
+*/
+   pi->i_blk_type = NOVA_

[RFC v2 28/83] Add NOVA address space operations

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

direct_IO and writepages support.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/inode.c | 24 
 fs/nova/inode.h |  1 +
 2 files changed, 25 insertions(+)

diff --git a/fs/nova/inode.c b/fs/nova/inode.c
index 7c10d0e..a30b6aa 100644
--- a/fs/nova/inode.c
+++ b/fs/nova/inode.c
@@ -175,6 +175,7 @@ static int nova_read_inode(struct super_block *sb, struct 
inode *inode,
}
 
inode->i_blocks = sih->i_blocks;
+   inode->i_mapping->a_ops = _aops_dax;
 
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
@@ -488,3 +489,26 @@ int nova_free_inuse_inode(struct super_block *sb, unsigned 
long ino)
return ret;
 }
 
+static ssize_t nova_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+{
+   /* DAX does not support direct IO */
+   return -EIO;
+}
+
+static int nova_writepages(struct address_space *mapping,
+   struct writeback_control *wbc)
+{
+   int ret;
+   timing_t wp_time;
+
+   NOVA_START_TIMING(write_pages_t, wp_time);
+   ret = dax_writeback_mapping_range(mapping,
+   mapping->host->i_sb->s_bdev, wbc);
+   NOVA_END_TIMING(write_pages_t, wp_time);
+   return ret;
+}
+
+const struct address_space_operations nova_aops_dax = {
+   .writepages = nova_writepages,
+   .direct_IO  = nova_direct_IO,
+};
diff --git a/fs/nova/inode.h b/fs/nova/inode.h
index 497343d..e00b3b9 100644
--- a/fs/nova/inode.h
+++ b/fs/nova/inode.h
@@ -221,6 +221,7 @@ static inline int nova_persist_inode(struct nova_inode *pi)
 }
 
 
+extern const struct address_space_operations nova_aops_dax;
 int nova_init_inode_inuse_list(struct super_block *sb);
 int nova_init_inode_table(struct super_block *sb);
 int nova_get_inode_address(struct super_block *sb, u64 ino,
-- 
2.7.4

[RFC v2 31/83] Add new vfs inode allocation.

2018-03-10 Thread Andiry Xu

From: Andiry Xu 

This routine allocates and initializes a new vfs inode, and sets up
the attributes of the corresponding NOVA inode and inode_info.
Inode operations are still missing at this point.

Signed-off-by: Andiry Xu 
---
 fs/nova/inode.c | 144 +++-
 fs/nova/inode.h |   3 ++
 2 files changed, 146 insertions(+), 1 deletion(-)

diff --git a/fs/nova/inode.c b/fs/nova/inode.c
index e4b8960..15517cc 100644
--- a/fs/nova/inode.c
+++ b/fs/nova/inode.c
@@ -363,7 +363,49 @@ static inline int nova_search_inodetree(struct 
nova_sb_info *sbi,
return nova_find_range_node(sbi, tree, internal_ino, ret_node);
 }
 
-int nova_alloc_unused_inode(struct super_block *sb, int cpuid,
+static void nova_get_inode_flags(struct inode *inode, struct nova_inode *pi)
+{
+   unsigned int flags = inode->i_flags;
+   unsigned int nova_flags = le32_to_cpu(pi->i_flags);
+
+   nova_flags &= ~(FS_SYNC_FL | FS_APPEND_FL | FS_IMMUTABLE_FL |
+FS_NOATIME_FL | FS_DIRSYNC_FL);
+   if (flags & S_SYNC)
+   nova_flags |= FS_SYNC_FL;
+   if (flags & S_APPEND)
+   nova_flags |= FS_APPEND_FL;
+   if (flags & S_IMMUTABLE)
+   nova_flags |= FS_IMMUTABLE_FL;
+   if (flags & S_NOATIME)
+   nova_flags |= FS_NOATIME_FL;
+   if (flags & S_DIRSYNC)
+   nova_flags |= FS_DIRSYNC_FL;
+
+   pi->i_flags = cpu_to_le32(nova_flags);
+}
+
+static void nova_init_inode(struct inode *inode, struct nova_inode *pi)
+{
+   pi->i_mode = cpu_to_le16(inode->i_mode);
+   pi->i_uid = cpu_to_le32(i_uid_read(inode));
+   pi->i_gid = cpu_to_le32(i_gid_read(inode));
+   pi->i_links_count = cpu_to_le16(inode->i_nlink);
+   pi->i_size = cpu_to_le64(inode->i_size);
+   pi->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
+   pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+   pi->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+   pi->i_generation = cpu_to_le32(inode->i_generation);
+   pi->log_head = 0;
+   pi->log_tail = 0;
+   pi->deleted = 0;
+   pi->delete_epoch_id = 0;
+   nova_get_inode_flags(inode, pi);
+
+   if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
+   pi->dev.rdev = cpu_to_le32(inode->i_rdev);
+}
+
+static int nova_alloc_unused_inode(struct super_block *sb, int cpuid,
unsigned long *ino)
 {
struct nova_sb_info *sbi = NOVA_SB(sb);
@@ -529,6 +571,106 @@ u64 nova_new_nova_inode(struct super_block *sb, u64 
*pi_addr)
return ino;
 }
 
+struct inode *nova_new_vfs_inode(enum nova_new_inode_type type,
+   struct inode *dir, u64 pi_addr, u64 ino, umode_t mode,
+   size_t size, dev_t rdev, const struct qstr *qstr, u64 epoch_id)
+{
+   struct super_block *sb;
+   struct nova_sb_info *sbi;
+   struct inode *inode;
+   struct nova_inode *diri = NULL;
+   struct nova_inode_info *si;
+   struct nova_inode_info_header *sih = NULL;
+   struct nova_inode *pi;
+   int errval;
+   timing_t new_inode_time;
+
+   NOVA_START_TIMING(new_vfs_inode_t, new_inode_time);
+   sb = dir->i_sb;
+   sbi = (struct nova_sb_info *)sb->s_fs_info;
+   inode = new_inode(sb);
+   if (!inode) {
+   errval = -ENOMEM;
+   goto fail2;
+   }
+
+   inode_init_owner(inode, dir, mode);
+   inode->i_blocks = inode->i_size = 0;
+   inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+
+   inode->i_generation = atomic_add_return(1, >next_generation);
+   inode->i_size = size;
+
+   diri = nova_get_inode(sb, dir);
+   if (!diri) {
+   errval = -EACCES;
+   goto fail1;
+   }
+
+   pi = (struct nova_inode *)nova_get_block(sb, pi_addr);
+   nova_dbg_verbose("%s: allocating inode %llu @ 0x%llx\n",
+   __func__, ino, pi_addr);
+
+   /* chosen inode is in ino */
+   inode->i_ino = ino;
+
+   switch (type) {
+   case TYPE_CREATE:
+   inode->i_mapping->a_ops = _aops_dax;
+   break;
+   case TYPE_MKNOD:
+   init_special_inode(inode, mode, rdev);
+   break;
+   case TYPE_SYMLINK:
+   inode->i_mapping->a_ops = _aops_dax;
+   break;
+   case TYPE_MKDIR:
+   inode->i_mapping->a_ops = _aops_dax;
+   set_nlink(inode, 2);
+   break;
+   default:
+   nova_dbg("Unknown new inode type %d\n", type);
+   break;
+   }
+
+   /*
+* Pi is part of the dir log so no transaction is needed,
+* but we need to flush to NVMM.
+*/
+   pi->i_blk_type = NOVA_DEFAULT_BLOCK_TYPE;
+   pi->i_flags = nova_mask_fla

[RFC v2 28/83] Add NOVA address space operations

2018-03-10 Thread Andiry Xu

From: Andiry Xu 

direct_IO and writepages support.

Signed-off-by: Andiry Xu 
---
 fs/nova/inode.c | 24 
 fs/nova/inode.h |  1 +
 2 files changed, 25 insertions(+)

diff --git a/fs/nova/inode.c b/fs/nova/inode.c
index 7c10d0e..a30b6aa 100644
--- a/fs/nova/inode.c
+++ b/fs/nova/inode.c
@@ -175,6 +175,7 @@ static int nova_read_inode(struct super_block *sb, struct 
inode *inode,
}
 
inode->i_blocks = sih->i_blocks;
+   inode->i_mapping->a_ops = &nova_aops_dax;
 
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
@@ -488,3 +489,26 @@ int nova_free_inuse_inode(struct super_block *sb, unsigned 
long ino)
return ret;
 }
 
+static ssize_t nova_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+{
+   /* DAX does not support direct IO */
+   return -EIO;
+}
+
+static int nova_writepages(struct address_space *mapping,
+   struct writeback_control *wbc)
+{
+   int ret;
+   timing_t wp_time;
+
+   NOVA_START_TIMING(write_pages_t, wp_time);
+   ret = dax_writeback_mapping_range(mapping,
+   mapping->host->i_sb->s_bdev, wbc);
+   NOVA_END_TIMING(write_pages_t, wp_time);
+   return ret;
+}
+
+const struct address_space_operations nova_aops_dax = {
+   .writepages = nova_writepages,
+   .direct_IO  = nova_direct_IO,
+};
diff --git a/fs/nova/inode.h b/fs/nova/inode.h
index 497343d..e00b3b9 100644
--- a/fs/nova/inode.h
+++ b/fs/nova/inode.h
@@ -221,6 +221,7 @@ static inline int nova_persist_inode(struct nova_inode *pi)
 }
 
 
+extern const struct address_space_operations nova_aops_dax;
 int nova_init_inode_inuse_list(struct super_block *sb);
 int nova_init_inode_table(struct super_block *sb);
 int nova_get_inode_address(struct super_block *sb, u64 ino,
-- 
2.7.4

[RFC v2 29/83] Add write_inode and dirty_inode routines.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/inode.c | 33 +
 fs/nova/inode.h |  2 ++
 fs/nova/super.c |  2 ++
 3 files changed, 37 insertions(+)

diff --git a/fs/nova/inode.c b/fs/nova/inode.c
index a30b6aa..29d172a 100644
--- a/fs/nova/inode.c
+++ b/fs/nova/inode.c
@@ -489,6 +489,39 @@ int nova_free_inuse_inode(struct super_block *sb, unsigned 
long ino)
return ret;
 }
 
+int nova_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+   /* write_inode should never be called because we always keep our inodes
+* clean. So let us know if write_inode ever gets called.
+*/
+// BUG();
+   return 0;
+}
+
+/*
+ * dirty_inode() is called from mark_inode_dirty_sync()
+ * usually dirty_inode should not be called because NOVA always keeps its 
inodes
+ * clean. Only exception is touch_atime which calls dirty_inode to update the
+ * i_atime field.
+ */
+void nova_dirty_inode(struct inode *inode, int flags)
+{
+   struct super_block *sb = inode->i_sb;
+   struct nova_inode_info *si = NOVA_I(inode);
+   struct nova_inode_info_header *sih = &si->header;
+   struct nova_inode *pi;
+
+   pi = nova_get_block(sb, sih->pi_addr);
+
+   /* only i_atime should have changed if at all.
+* we can do in-place atomic update
+*/
+   pi->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
+   nova_persist_inode(pi);
+   /* Relax atime persistency */
+   nova_flush_buffer(&pi->i_atime, sizeof(pi->i_atime), 0);
+}
+
 static ssize_t nova_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
/* DAX does not support direct IO */
diff --git a/fs/nova/inode.h b/fs/nova/inode.h
index e00b3b9..f9f5c14 100644
--- a/fs/nova/inode.h
+++ b/fs/nova/inode.h
@@ -229,5 +229,7 @@ int nova_get_inode_address(struct super_block *sb, u64 ino,
 struct inode *nova_iget(struct super_block *sb, unsigned long ino);
 inline int nova_insert_inodetree(struct nova_sb_info *sbi,
struct nova_range_node *new_node, int cpu);
+extern int nova_write_inode(struct inode *inode, struct writeback_control 
*wbc);
+extern void nova_dirty_inode(struct inode *inode, int flags);
 
 #endif
diff --git a/fs/nova/super.c b/fs/nova/super.c
index 69e4afc..c0427fd 100644
--- a/fs/nova/super.c
+++ b/fs/nova/super.c
@@ -861,6 +861,8 @@ static void destroy_rangenode_cache(void)
 static struct super_operations nova_sops = {
.alloc_inode= nova_alloc_inode,
.destroy_inode  = nova_destroy_inode,
+   .write_inode= nova_write_inode,
+   .dirty_inode= nova_dirty_inode,
.put_super  = nova_put_super,
.statfs = nova_statfs,
.remount_fs = nova_remount,
-- 
2.7.4

[RFC v2 32/83] Add log entry definitions.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

NOVA appends log entries to the inode log upon metadata change.

NOVA has four kinds of log entries:

File write entry describes a write to a contiguous range of pmem pages,
Dentry describes a file/directory being added or removed from a directory,
Setattr entry is used for updating inode attributes,
Link change entry describes link changes to an inode, e.g. link/unlink.
All of them are aligned to 8 bytes.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/log.h | 180 ++
 1 file changed, 180 insertions(+)

diff --git a/fs/nova/log.h b/fs/nova/log.h
index 2bc131f..6b4a085 100644
--- a/fs/nova/log.h
+++ b/fs/nova/log.h
@@ -62,6 +62,175 @@ static inline void nova_set_entry_type(void *p, enum 
nova_entry_type type)
*(u8 *)p = type;
 }
 
+/*
+ * Write log entry.  Records a write to a contiguous range of PMEM pages.
+ *
+ * Documentation/filesystems/nova.txt contains descriptions of some fields.
+ */
+struct nova_file_write_entry {
+   u8  entry_type;
+   u8  reassigned; /* Data is not latest */
+   u8  padding[2];
+   __le32  num_pages;
+   __le64  block;  /* offset of first block in this write */
+   __le64  pgoff;  /* file offset at the beginning of this write */
+   __le32  invalid_pages;  /* For GC */
+   /* For both ctime and mtime */
+   __le32  mtime;
+   __le64  size;   /* Write size for non-aligned writes */
+   __le64  epoch_id;
+   __le64  trans_id;
+   __le32  csumpadding;
+   __le32  csum;
+} __attribute((__packed__));
+
+#define WENTRY(entry)  ((struct nova_file_write_entry *) entry)
+
+/* List of file write entries */
+struct nova_file_write_item {
+   struct nova_file_write_entryentry;
+   struct list_headlist;
+};
+
+/*
+ * Log entry for adding a file/directory to a directory.
+ *
+ * Update DIR_LOG_REC_LEN if modify this struct!
+ */
+struct nova_dentry {
+   u8  entry_type;
+   u8  name_len;   /* length of the dentry name */
+   u8  reassigned; /* Currently deleted */
+   u8  invalid;/* Invalid now? */
+   __le16  de_len; /* length of this dentry */
+   __le16  links_count;
+   __le32  mtime;  /* For both mtime and ctime */
+   __le32  csum;   /* entry checksum */
+   __le64  ino;/* inode no pointed to by this entry */
+   __le64  padding;
+   __le64  epoch_id;
+   __le64  trans_id;
+   charname[NOVA_NAME_LEN + 1];/* File name */
+} __attribute((__packed__));
+
+#define DENTRY(entry)  ((struct nova_dentry *) entry)
+
+#define NOVA_DIR_PAD   8   /* Align to 8 bytes boundary */
+#define NOVA_DIR_ROUND (NOVA_DIR_PAD - 1)
+#define NOVA_DENTRY_HEADER_LEN 48
+#define NOVA_DIR_LOG_REC_LEN(name_len) \
+   (((name_len + 1) + NOVA_DENTRY_HEADER_LEN \
++ NOVA_DIR_ROUND) & ~NOVA_DIR_ROUND)
+
+#define NOVA_MAX_ENTRY_LEN NOVA_DIR_LOG_REC_LEN(NOVA_NAME_LEN)
+
+/*
+ * Log entry for updating file attributes.
+ */
+struct nova_setattr_logentry {
+   u8  entry_type;
+   u8  attr;   /* bitmap of which attributes to update */
+   __le16  mode;
+   __le32  uid;
+   __le32  gid;
+   __le32  atime;
+   __le32  mtime;
+   __le32  ctime;
+   __le64  size;/* File size after truncation */
+   __le64  epoch_id;
+   __le64  trans_id;
+   u8  invalid;
+   u8  paddings[3];
+   __le32  csum;
+} __attribute((__packed__));
+
+#define SENTRY(entry)  ((struct nova_setattr_logentry *) entry)
+
+/* Link change log entry.
+ *
+ * TODO: Do we need this to be 32 bytes?
+ */
+struct nova_link_change_entry {
+   u8  entry_type;
+   u8  invalid;
+   __le16  links;
+   __le32  ctime;
+   __le32  flags;
+   __le32  generation;/* for NFS handles */
+   __le64  epoch_id;
+   __le64  trans_id;
+   __le32  csumpadding;
+   __le32  csum;
+} __attribute((__packed__));
+
+#define LCENTRY(entry) ((struct nova_link_change_entry *) entry)
+
+
+/*
+ * Transient DRAM structure that describes changes needed to append a log entry
+ * to an inode
+ */
+struct nova_inode_update {
+   u64 head;
+   u64 tail;
+   u64 curr_entry;
+   struct nova_dentry *create_dentry;
+   struct nova_dentry *delete_dentry;
+};
+
+
+/*
+ * Transient DRAM structure to parameterize the creation of a log entry.
+ */
+struct nova_log_entry_info {
+   enum nova_entry_type type;
+   struct iattr *attr;
+   struct nova_inode_update *update;
+   void *data; /* struct dentry */
+   u64 epoch_id;
+   u64 trans_id;
+   u64 curr_p; /* output */
+   u64 file_size;  /* de_len for dentry */

[RFC v2 29/83] Add write_inode and dirty_inode routines.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/inode.c | 33 +
 fs/nova/inode.h |  2 ++
 fs/nova/super.c |  2 ++
 3 files changed, 37 insertions(+)

diff --git a/fs/nova/inode.c b/fs/nova/inode.c
index a30b6aa..29d172a 100644
--- a/fs/nova/inode.c
+++ b/fs/nova/inode.c
@@ -489,6 +489,39 @@ int nova_free_inuse_inode(struct super_block *sb, unsigned 
long ino)
return ret;
 }
 
+int nova_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+   /* write_inode should never be called because we always keep our inodes
+* clean. So let us know if write_inode ever gets called.
+*/
+// BUG();
+   return 0;
+}
+
+/*
+ * dirty_inode() is called from mark_inode_dirty_sync()
+ * usually dirty_inode should not be called because NOVA always keeps its 
inodes
+ * clean. Only exception is touch_atime which calls dirty_inode to update the
+ * i_atime field.
+ */
+void nova_dirty_inode(struct inode *inode, int flags)
+{
+   struct super_block *sb = inode->i_sb;
+   struct nova_inode_info *si = NOVA_I(inode);
+   struct nova_inode_info_header *sih = &si->header;
+   struct nova_inode *pi;
+
+   pi = nova_get_block(sb, sih->pi_addr);
+
+   /* only i_atime should have changed if at all.
+* we can do in-place atomic update
+*/
+   pi->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
+   nova_persist_inode(pi);
+   /* Relax atime persistency */
+   nova_flush_buffer(&pi->i_atime, sizeof(pi->i_atime), 0);
+}
+
 static ssize_t nova_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
/* DAX does not support direct IO */
diff --git a/fs/nova/inode.h b/fs/nova/inode.h
index e00b3b9..f9f5c14 100644
--- a/fs/nova/inode.h
+++ b/fs/nova/inode.h
@@ -229,5 +229,7 @@ int nova_get_inode_address(struct super_block *sb, u64 ino,
 struct inode *nova_iget(struct super_block *sb, unsigned long ino);
 inline int nova_insert_inodetree(struct nova_sb_info *sbi,
struct nova_range_node *new_node, int cpu);
+extern int nova_write_inode(struct inode *inode, struct writeback_control 
*wbc);
+extern void nova_dirty_inode(struct inode *inode, int flags);
 
 #endif
diff --git a/fs/nova/super.c b/fs/nova/super.c
index 69e4afc..c0427fd 100644
--- a/fs/nova/super.c
+++ b/fs/nova/super.c
@@ -861,6 +861,8 @@ static void destroy_rangenode_cache(void)
 static struct super_operations nova_sops = {
.alloc_inode= nova_alloc_inode,
.destroy_inode  = nova_destroy_inode,
+   .write_inode= nova_write_inode,
+   .dirty_inode= nova_dirty_inode,
.put_super  = nova_put_super,
.statfs = nova_statfs,
.remount_fs = nova_remount,
-- 
2.7.4

[RFC v2 32/83] Add log entry definitions.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

NOVA appends log entries to the inode log upon metadata change.

NOVA has four kinds of log entries:

File write entry describes a write to a contiguous range of pmem pages,
Dentry describes a file/directory being added or removed from a directory,
Setattr entry is used for updating inode attributes,
Link change entry describes link changes to an inode, e.g. link/unlink.
All of them are aligned to 8 bytes.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/log.h | 180 ++
 1 file changed, 180 insertions(+)

diff --git a/fs/nova/log.h b/fs/nova/log.h
index 2bc131f..6b4a085 100644
--- a/fs/nova/log.h
+++ b/fs/nova/log.h
@@ -62,6 +62,175 @@ static inline void nova_set_entry_type(void *p, enum 
nova_entry_type type)
*(u8 *)p = type;
 }
 
+/*
+ * Write log entry.  Records a write to a contiguous range of PMEM pages.
+ *
+ * Documentation/filesystems/nova.txt contains descriptions of some fields.
+ */
+struct nova_file_write_entry {
+   u8  entry_type;
+   u8  reassigned; /* Data is not latest */
+   u8  padding[2];
+   __le32  num_pages;
+   __le64  block;  /* offset of first block in this write */
+   __le64  pgoff;  /* file offset at the beginning of this write */
+   __le32  invalid_pages;  /* For GC */
+   /* For both ctime and mtime */
+   __le32  mtime;
+   __le64  size;   /* Write size for non-aligned writes */
+   __le64  epoch_id;
+   __le64  trans_id;
+   __le32  csumpadding;
+   __le32  csum;
+} __attribute((__packed__));
+
+#define WENTRY(entry)  ((struct nova_file_write_entry *) entry)
+
+/* List of file write entries */
+struct nova_file_write_item {
+   struct nova_file_write_entryentry;
+   struct list_headlist;
+};
+
+/*
+ * Log entry for adding a file/directory to a directory.
+ *
+ * Update DIR_LOG_REC_LEN if modify this struct!
+ */
+struct nova_dentry {
+   u8  entry_type;
+   u8  name_len;   /* length of the dentry name */
+   u8  reassigned; /* Currently deleted */
+   u8  invalid;/* Invalid now? */
+   __le16  de_len; /* length of this dentry */
+   __le16  links_count;
+   __le32  mtime;  /* For both mtime and ctime */
+   __le32  csum;   /* entry checksum */
+   __le64  ino;/* inode no pointed to by this entry */
+   __le64  padding;
+   __le64  epoch_id;
+   __le64  trans_id;
+   charname[NOVA_NAME_LEN + 1];/* File name */
+} __attribute((__packed__));
+
+#define DENTRY(entry)  ((struct nova_dentry *) entry)
+
+#define NOVA_DIR_PAD   8   /* Align to 8 bytes boundary */
+#define NOVA_DIR_ROUND (NOVA_DIR_PAD - 1)
+#define NOVA_DENTRY_HEADER_LEN 48
+#define NOVA_DIR_LOG_REC_LEN(name_len) \
+   (((name_len + 1) + NOVA_DENTRY_HEADER_LEN \
++ NOVA_DIR_ROUND) & ~NOVA_DIR_ROUND)
+
+#define NOVA_MAX_ENTRY_LEN NOVA_DIR_LOG_REC_LEN(NOVA_NAME_LEN)
+
+/*
+ * Log entry for updating file attributes.
+ */
+struct nova_setattr_logentry {
+   u8  entry_type;
+   u8  attr;   /* bitmap of which attributes to update */
+   __le16  mode;
+   __le32  uid;
+   __le32  gid;
+   __le32  atime;
+   __le32  mtime;
+   __le32  ctime;
+   __le64  size;/* File size after truncation */
+   __le64  epoch_id;
+   __le64  trans_id;
+   u8  invalid;
+   u8  paddings[3];
+   __le32  csum;
+} __attribute((__packed__));
+
+#define SENTRY(entry)  ((struct nova_setattr_logentry *) entry)
+
+/* Link change log entry.
+ *
+ * TODO: Do we need this to be 32 bytes?
+ */
+struct nova_link_change_entry {
+   u8  entry_type;
+   u8  invalid;
+   __le16  links;
+   __le32  ctime;
+   __le32  flags;
+   __le32  generation;/* for NFS handles */
+   __le64  epoch_id;
+   __le64  trans_id;
+   __le32  csumpadding;
+   __le32  csum;
+} __attribute((__packed__));
+
+#define LCENTRY(entry) ((struct nova_link_change_entry *) entry)
+
+
+/*
+ * Transient DRAM structure that describes changes needed to append a log entry
+ * to an inode
+ */
+struct nova_inode_update {
+   u64 head;
+   u64 tail;
+   u64 curr_entry;
+   struct nova_dentry *create_dentry;
+   struct nova_dentry *delete_dentry;
+};
+
+
+/*
+ * Transient DRAM structure to parameterize the creation of a log entry.
+ */
+struct nova_log_entry_info {
+   enum nova_entry_type type;
+   struct iattr *attr;
+   struct nova_inode_update *update;
+   void *data; /* struct dentry */
+   u64 epoch_id;
+   u64 trans_id;
+   u64 curr_p; /* output */
+   u64 file_size;  /* de_len for dentry */
+   u64 ino;
+   u32 time;
+   int link_ch

[RFC v2 33/83] Inode log and entry printing for debug purpose.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/nova.h  |   3 +
 fs/nova/stats.c | 234 
 2 files changed, 237 insertions(+)

diff --git a/fs/nova/nova.h b/fs/nova/nova.h
index bf4b6ac..03c4991 100644
--- a/fs/nova/nova.h
+++ b/fs/nova/nova.h
@@ -367,6 +367,9 @@ void nova_get_IO_stats(void);
 void nova_print_timing_stats(struct super_block *sb);
 void nova_clear_stats(struct super_block *sb);
 void nova_print_inode(struct nova_inode *pi);
+void nova_print_inode_log(struct super_block *sb, struct inode *inode);
+void nova_print_inode_log_pages(struct super_block *sb, struct inode *inode);
+int nova_check_inode_logs(struct super_block *sb, struct nova_inode *pi);
 void nova_print_free_lists(struct super_block *sb);
 
 #endif /* __NOVA_H */
diff --git a/fs/nova/stats.c b/fs/nova/stats.c
index 9ddd267..990e964 100644
--- a/fs/nova/stats.c
+++ b/fs/nova/stats.c
@@ -333,6 +333,240 @@ void nova_print_inode(struct nova_inode *pi)
pi->create_epoch_id, pi->delete_epoch_id);
 }
 
+static inline void nova_print_file_write_entry(struct super_block *sb,
+   u64 curr, struct nova_file_write_entry *entry)
+{
+   nova_dbg("file write entry @ 0x%llx: epoch %llu, trans %llu, "
+   "pgoff %llu, pages %u, blocknr %llu, reassigned %u, "
+   "invalid count %u, size %llu, mtime %u\n",
+   curr, entry->epoch_id, entry->trans_id,
+   entry->pgoff, entry->num_pages,
+   entry->block >> PAGE_SHIFT,
+   entry->reassigned,
+   entry->invalid_pages, entry->size, entry->mtime);
+}
+
+static inline void nova_print_set_attr_entry(struct super_block *sb,
+   u64 curr, struct nova_setattr_logentry *entry)
+{
+   nova_dbg("set attr entry @ 0x%llx: epoch %llu, trans %llu, invalid %u, "
+   "mode %u, size %llu, atime %u, mtime %u, ctime %u\n",
+   curr, entry->epoch_id, entry->trans_id,
+   entry->invalid, entry->mode,
+   entry->size, entry->atime, entry->mtime, entry->ctime);
+}
+
+static inline void nova_print_link_change_entry(struct super_block *sb,
+   u64 curr, struct nova_link_change_entry *entry)
+{
+   nova_dbg("link change entry @ 0x%llx: epoch %llu, trans %llu, "
+   "invalid %u, links %u, flags %u, ctime %u\n",
+   curr, entry->epoch_id, entry->trans_id,
+   entry->invalid, entry->links,
+   entry->flags, entry->ctime);
+}
+
+static inline size_t nova_print_dentry(struct super_block *sb,
+   u64 curr, struct nova_dentry *entry)
+{
+   nova_dbg("dir logentry @ 0x%llx: epoch %llu, trans %llu, "
+   "reassigned %u, invalid %u, inode %llu, links %u, "
+   "namelen %u, rec len %u, name %s, mtime %u\n",
+   curr, entry->epoch_id, entry->trans_id,
+   entry->reassigned, entry->invalid,
+   le64_to_cpu(entry->ino),
+   entry->links_count, entry->name_len,
+   le16_to_cpu(entry->de_len), entry->name,
+   entry->mtime);
+
+   return le16_to_cpu(entry->de_len);
+}
+
+u64 nova_print_log_entry(struct super_block *sb, u64 curr)
+{
+   void *addr;
+   size_t size;
+   u8 type;
+
+   addr = (void *)nova_get_block(sb, curr);
+   type = nova_get_entry_type(addr);
+   switch (type) {
+   case SET_ATTR:
+   nova_print_set_attr_entry(sb, curr, addr);
+   curr += sizeof(struct nova_setattr_logentry);
+   break;
+   case LINK_CHANGE:
+   nova_print_link_change_entry(sb, curr, addr);
+   curr += sizeof(struct nova_link_change_entry);
+   break;
+   case FILE_WRITE:
+   nova_print_file_write_entry(sb, curr, addr);
+   curr += sizeof(struct nova_file_write_entry);
+   break;
+   case DIR_LOG:
+   size = nova_print_dentry(sb, curr, addr);
+   curr += size;
+   if (size == 0) {
+   nova_dbg("%s: dentry with size 0 @ 0x%llx\n",
+   __func__, curr);
+   curr += sizeof(struct nova_file_write_entry);
+   NOVA_ASSERT(0);
+   }
+   break;
+   case NEXT_PAGE:
+   nova_dbg("%s: next page sign @ 0x%llx\n", __func__, curr);
+   curr = PAGE_TAIL(curr);
+   break;
+   defa

[RFC v2 33/83] Inode log and entry printing for debug purpose.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/nova.h  |   3 +
 fs/nova/stats.c | 234 
 2 files changed, 237 insertions(+)

diff --git a/fs/nova/nova.h b/fs/nova/nova.h
index bf4b6ac..03c4991 100644
--- a/fs/nova/nova.h
+++ b/fs/nova/nova.h
@@ -367,6 +367,9 @@ void nova_get_IO_stats(void);
 void nova_print_timing_stats(struct super_block *sb);
 void nova_clear_stats(struct super_block *sb);
 void nova_print_inode(struct nova_inode *pi);
+void nova_print_inode_log(struct super_block *sb, struct inode *inode);
+void nova_print_inode_log_pages(struct super_block *sb, struct inode *inode);
+int nova_check_inode_logs(struct super_block *sb, struct nova_inode *pi);
 void nova_print_free_lists(struct super_block *sb);
 
 #endif /* __NOVA_H */
diff --git a/fs/nova/stats.c b/fs/nova/stats.c
index 9ddd267..990e964 100644
--- a/fs/nova/stats.c
+++ b/fs/nova/stats.c
@@ -333,6 +333,240 @@ void nova_print_inode(struct nova_inode *pi)
pi->create_epoch_id, pi->delete_epoch_id);
 }
 
+static inline void nova_print_file_write_entry(struct super_block *sb,
+   u64 curr, struct nova_file_write_entry *entry)
+{
+   nova_dbg("file write entry @ 0x%llx: epoch %llu, trans %llu, "
+   "pgoff %llu, pages %u, blocknr %llu, reassigned %u, "
+   "invalid count %u, size %llu, mtime %u\n",
+   curr, entry->epoch_id, entry->trans_id,
+   entry->pgoff, entry->num_pages,
+   entry->block >> PAGE_SHIFT,
+   entry->reassigned,
+   entry->invalid_pages, entry->size, entry->mtime);
+}
+
+static inline void nova_print_set_attr_entry(struct super_block *sb,
+   u64 curr, struct nova_setattr_logentry *entry)
+{
+   nova_dbg("set attr entry @ 0x%llx: epoch %llu, trans %llu, invalid %u, "
+   "mode %u, size %llu, atime %u, mtime %u, ctime %u\n",
+   curr, entry->epoch_id, entry->trans_id,
+   entry->invalid, entry->mode,
+   entry->size, entry->atime, entry->mtime, entry->ctime);
+}
+
+static inline void nova_print_link_change_entry(struct super_block *sb,
+   u64 curr, struct nova_link_change_entry *entry)
+{
+   nova_dbg("link change entry @ 0x%llx: epoch %llu, trans %llu, "
+   "invalid %u, links %u, flags %u, ctime %u\n",
+   curr, entry->epoch_id, entry->trans_id,
+   entry->invalid, entry->links,
+   entry->flags, entry->ctime);
+}
+
+static inline size_t nova_print_dentry(struct super_block *sb,
+   u64 curr, struct nova_dentry *entry)
+{
+   nova_dbg("dir logentry @ 0x%llx: epoch %llu, trans %llu, "
+   "reassigned %u, invalid %u, inode %llu, links %u, "
+   "namelen %u, rec len %u, name %s, mtime %u\n",
+   curr, entry->epoch_id, entry->trans_id,
+   entry->reassigned, entry->invalid,
+   le64_to_cpu(entry->ino),
+   entry->links_count, entry->name_len,
+   le16_to_cpu(entry->de_len), entry->name,
+   entry->mtime);
+
+   return le16_to_cpu(entry->de_len);
+}
+
+u64 nova_print_log_entry(struct super_block *sb, u64 curr)
+{
+   void *addr;
+   size_t size;
+   u8 type;
+
+   addr = (void *)nova_get_block(sb, curr);
+   type = nova_get_entry_type(addr);
+   switch (type) {
+   case SET_ATTR:
+   nova_print_set_attr_entry(sb, curr, addr);
+   curr += sizeof(struct nova_setattr_logentry);
+   break;
+   case LINK_CHANGE:
+   nova_print_link_change_entry(sb, curr, addr);
+   curr += sizeof(struct nova_link_change_entry);
+   break;
+   case FILE_WRITE:
+   nova_print_file_write_entry(sb, curr, addr);
+   curr += sizeof(struct nova_file_write_entry);
+   break;
+   case DIR_LOG:
+   size = nova_print_dentry(sb, curr, addr);
+   curr += size;
+   if (size == 0) {
+   nova_dbg("%s: dentry with size 0 @ 0x%llx\n",
+   __func__, curr);
+   curr += sizeof(struct nova_file_write_entry);
+   NOVA_ASSERT(0);
+   }
+   break;
+   case NEXT_PAGE:
+   nova_dbg("%s: next page sign @ 0x%llx\n", __func__, curr);
+   curr = PAGE_TAIL(curr);
+   break;
+   default:
+   nova_dbg("%s: unknown type %d, 0x%

[RFC v2 36/83] Journal: Lite journal recovery.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/journal.c | 55 +++
 1 file changed, 55 insertions(+)

diff --git a/fs/nova/journal.c b/fs/nova/journal.c
index 75d590f..f31de97 100644
--- a/fs/nova/journal.c
+++ b/fs/nova/journal.c
@@ -106,3 +106,58 @@ static int nova_check_journal_entries(struct super_block 
*sb,
 
return 0;
 }
+
+/**************************** Journal Recovery ******************************/
+
+static void nova_undo_journal_entry(struct super_block *sb,
+   struct nova_lite_journal_entry *entry)
+{
+   u64 addr, value;
+
+   addr = le64_to_cpu(entry->data1);
+   value = le64_to_cpu(entry->data2);
+
+   *(u64 *)nova_get_block(sb, addr) = (u64)value;
+   nova_flush_buffer((void *)nova_get_block(sb, addr), CACHELINE_SIZE, 0);
+}
+
+static void nova_undo_lite_journal_entry(struct super_block *sb,
+   struct nova_lite_journal_entry *entry)
+{
+   u64 type;
+
+   type = le64_to_cpu(entry->type);
+
+   switch (type) {
+   case JOURNAL_INODE:
+   /* Currently unused */
+   break;
+   case JOURNAL_ENTRY:
+   nova_undo_journal_entry(sb, entry);
+   break;
+   default:
+   nova_dbg("%s: unknown data type %llu\n", __func__, type);
+   break;
+   }
+}
+
+/* Roll back all journal entries */
+static int nova_recover_lite_journal(struct super_block *sb,
+   struct journal_ptr_pair *pair)
+{
+   struct nova_lite_journal_entry *entry;
+   u64 temp;
+
+   temp = pair->journal_head;
+   while (temp != pair->journal_tail) {
+   entry = (struct nova_lite_journal_entry *)nova_get_block(sb,
+   temp);
+   nova_undo_lite_journal_entry(sb, entry);
+   temp = next_lite_journal(temp);
+   }
+
+   pair->journal_tail = pair->journal_head;
+   nova_flush_buffer(&pair->journal_head, CACHELINE_SIZE, 1);
+
+   return 0;
+}
-- 
2.7.4

[RFC v2 34/83] Journal: NOVA light weight journal definitions.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

NOVA uses per-CPU lite journals to provide fast atomicity guarantees
for multi-log appending and multi-word inplace updates.

NOVA uses undo journaling. Each journal is a circular buffer
of 4KB pmem page. Two pointers, journal_head and journal_tail
reside in the reserved journal block, and point to the journal page.
If the two pointers are not equal, there are uncommitted transactions
and NOVA recovers the data by replaying the journal entries.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/bbuild.c  |  1 +
 fs/nova/journal.h | 43 +++
 fs/nova/log.c |  1 +
 fs/nova/super.c   |  1 +
 4 files changed, 46 insertions(+)
 create mode 100644 fs/nova/journal.h

diff --git a/fs/nova/bbuild.c b/fs/nova/bbuild.c
index 66053cb..af1b352 100644
--- a/fs/nova/bbuild.c
+++ b/fs/nova/bbuild.c
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include "nova.h"
+#include "journal.h"
 #include "super.h"
 #include "inode.h"
 
diff --git a/fs/nova/journal.h b/fs/nova/journal.h
new file mode 100644
index 0000000..d1d0ffb
--- /dev/null
+++ b/fs/nova/journal.h
@@ -0,0 +1,43 @@
+#ifndef __JOURNAL_H
+#define __JOURNAL_H
+
+#include 
+#include 
+#include "nova.h"
+#include "super.h"
+
+
+/* === Lite journal = */
+
+#define NOVA_MAX_JOURNAL_LENGTH 128
+
+#defineJOURNAL_INODE   1
+#defineJOURNAL_ENTRY   2
+
+/* Lightweight journal entry */
+struct nova_lite_journal_entry {
+   __le64 type;   // JOURNAL_INODE or JOURNAL_ENTRY
+   __le64 data1;
+   __le64 data2;
+   __le32 padding;
+   __le32 csum;
+} __attribute((__packed__));
+
+/* Head and tail pointers into a circular queue of journal entries.  There's
+ * one of these per CPU.
+ */
+struct journal_ptr_pair {
+   __le64 journal_head;
+   __le64 journal_tail;
+};
+
+static inline
+struct journal_ptr_pair *nova_get_journal_pointers(struct super_block *sb,
+   int cpu)
+{
+   return (struct journal_ptr_pair *)((char *)nova_get_block(sb,
+   NOVA_DEF_BLOCK_SIZE_4K * JOURNAL_START) + cpu * CACHELINE_SIZE);
+}
+
+
+#endif
diff --git a/fs/nova/log.c b/fs/nova/log.c
index bdd133e..f01b7c8 100644
--- a/fs/nova/log.c
+++ b/fs/nova/log.c
@@ -16,6 +16,7 @@
  */
 
 #include "nova.h"
+#include "journal.h"
 #include "inode.h"
 #include "log.h"
 
diff --git a/fs/nova/super.c b/fs/nova/super.c
index c0427fd..d73c202 100644
--- a/fs/nova/super.c
+++ b/fs/nova/super.c
@@ -38,6 +38,7 @@
 #include 
 #include 
 #include "nova.h"
+#include "journal.h"
 #include "super.h"
 
 int measure_timing;
-- 
2.7.4

[RFC v2 36/83] Journal: Lite journal recovery.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/journal.c | 55 +++
 1 file changed, 55 insertions(+)

diff --git a/fs/nova/journal.c b/fs/nova/journal.c
index 75d590f..f31de97 100644
--- a/fs/nova/journal.c
+++ b/fs/nova/journal.c
@@ -106,3 +106,58 @@ static int nova_check_journal_entries(struct super_block 
*sb,
 
return 0;
 }
+
+/**************************** Journal Recovery ******************************/
+
+static void nova_undo_journal_entry(struct super_block *sb,
+   struct nova_lite_journal_entry *entry)
+{
+   u64 addr, value;
+
+   addr = le64_to_cpu(entry->data1);
+   value = le64_to_cpu(entry->data2);
+
+   *(u64 *)nova_get_block(sb, addr) = (u64)value;
+   nova_flush_buffer((void *)nova_get_block(sb, addr), CACHELINE_SIZE, 0);
+}
+
+static void nova_undo_lite_journal_entry(struct super_block *sb,
+   struct nova_lite_journal_entry *entry)
+{
+   u64 type;
+
+   type = le64_to_cpu(entry->type);
+
+   switch (type) {
+   case JOURNAL_INODE:
+   /* Currently unused */
+   break;
+   case JOURNAL_ENTRY:
+   nova_undo_journal_entry(sb, entry);
+   break;
+   default:
+   nova_dbg("%s: unknown data type %llu\n", __func__, type);
+   break;
+   }
+}
+
+/* Roll back all journal entries */
+static int nova_recover_lite_journal(struct super_block *sb,
+   struct journal_ptr_pair *pair)
+{
+   struct nova_lite_journal_entry *entry;
+   u64 temp;
+
+   temp = pair->journal_head;
+   while (temp != pair->journal_tail) {
+   entry = (struct nova_lite_journal_entry *)nova_get_block(sb,
+   temp);
+   nova_undo_lite_journal_entry(sb, entry);
+   temp = next_lite_journal(temp);
+   }
+
+   pair->journal_tail = pair->journal_head;
+   nova_flush_buffer(&pair->journal_head, CACHELINE_SIZE, 1);
+
+   return 0;
+}
-- 
2.7.4

[RFC v2 34/83] Journal: NOVA light weight journal definitions.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

NOVA uses per-CPU lite journals to provide fast atomicity guarantees
for multi-log appending and multi-word inplace updates.

NOVA uses undo journaling. Each journal is a circular buffer
of 4KB pmem page. Two pointers, journal_head and journal_tail
reside in the reserved journal block, and point to the journal page.
If the two pointers are not equal, there are uncommitted transactions
and NOVA recovers the data by replaying the journal entries.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/bbuild.c  |  1 +
 fs/nova/journal.h | 43 +++
 fs/nova/log.c |  1 +
 fs/nova/super.c   |  1 +
 4 files changed, 46 insertions(+)
 create mode 100644 fs/nova/journal.h

diff --git a/fs/nova/bbuild.c b/fs/nova/bbuild.c
index 66053cb..af1b352 100644
--- a/fs/nova/bbuild.c
+++ b/fs/nova/bbuild.c
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include "nova.h"
+#include "journal.h"
 #include "super.h"
 #include "inode.h"
 
diff --git a/fs/nova/journal.h b/fs/nova/journal.h
new file mode 100644
index 0000000..d1d0ffb
--- /dev/null
+++ b/fs/nova/journal.h
@@ -0,0 +1,43 @@
+#ifndef __JOURNAL_H
+#define __JOURNAL_H
+
+#include 
+#include 
+#include "nova.h"
+#include "super.h"
+
+
+/* === Lite journal = */
+
+#define NOVA_MAX_JOURNAL_LENGTH 128
+
+#defineJOURNAL_INODE   1
+#defineJOURNAL_ENTRY   2
+
+/* Lightweight journal entry */
+struct nova_lite_journal_entry {
+   __le64 type;   // JOURNAL_INODE or JOURNAL_ENTRY
+   __le64 data1;
+   __le64 data2;
+   __le32 padding;
+   __le32 csum;
+} __attribute((__packed__));
+
+/* Head and tail pointers into a circular queue of journal entries.  There's
+ * one of these per CPU.
+ */
+struct journal_ptr_pair {
+   __le64 journal_head;
+   __le64 journal_tail;
+};
+
+static inline
+struct journal_ptr_pair *nova_get_journal_pointers(struct super_block *sb,
+   int cpu)
+{
+   return (struct journal_ptr_pair *)((char *)nova_get_block(sb,
+   NOVA_DEF_BLOCK_SIZE_4K * JOURNAL_START) + cpu * CACHELINE_SIZE);
+}
+
+
+#endif
diff --git a/fs/nova/log.c b/fs/nova/log.c
index bdd133e..f01b7c8 100644
--- a/fs/nova/log.c
+++ b/fs/nova/log.c
@@ -16,6 +16,7 @@
  */
 
 #include "nova.h"
+#include "journal.h"
 #include "inode.h"
 #include "log.h"
 
diff --git a/fs/nova/super.c b/fs/nova/super.c
index c0427fd..d73c202 100644
--- a/fs/nova/super.c
+++ b/fs/nova/super.c
@@ -38,6 +38,7 @@
 #include 
 #include 
 #include "nova.h"
+#include "journal.h"
 #include "super.h"
 
 int measure_timing;
-- 
2.7.4

[RFC v2 38/83] Journal: NOVA lite journal initialization.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

NOVA uses per-CPU spinlocks to protect the journals.
Lite journal initialization consists of two parts:
for a new NOVA instance, hard_init allocates the journal pages;
soft_init initializes the locks and performs journal recovery.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/journal.c | 70 +++
 fs/nova/journal.h |  2 ++
 fs/nova/super.c   | 15 
 fs/nova/super.h   |  3 +++
 4 files changed, 90 insertions(+)

diff --git a/fs/nova/journal.c b/fs/nova/journal.c
index 0e203fa..d2578e2 100644
--- a/fs/nova/journal.c
+++ b/fs/nova/journal.c
@@ -340,3 +340,73 @@ void nova_commit_lite_transaction(struct super_block *sb, 
u64 tail, int cpu)
pair->journal_head = tail;
nova_flush_buffer(&pair->journal_head, CACHELINE_SIZE, 1);
 }
+
+/**************************** Initialization ******************************/
+
+// Initialized DRAM journal state, validate, and recover
+int nova_lite_journal_soft_init(struct super_block *sb)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   struct journal_ptr_pair *pair;
+   int i;
+   int ret = 0;
+
+   sbi->journal_locks = kcalloc(sbi->cpus, sizeof(spinlock_t),
+GFP_KERNEL);
+   if (!sbi->journal_locks)
+   return -ENOMEM;
+
+   for (i = 0; i < sbi->cpus; i++)
+   spin_lock_init(&sbi->journal_locks[i]);
+
+   for (i = 0; i < sbi->cpus; i++) {
+   pair = nova_get_journal_pointers(sb, i);
+   if (pair->journal_head == pair->journal_tail)
+   continue;
+
+   /* Ensure all entries are genuine */
+   ret = nova_check_journal_entries(sb, pair);
+   if (ret) {
+   nova_err(sb, "Journal %d checksum failure\n", i);
+   ret = -EINVAL;
+   break;
+   }
+
+   ret = nova_recover_lite_journal(sb, pair);
+   }
+
+   return ret;
+}
+
+/* Initialized persistent journal state */
+int nova_lite_journal_hard_init(struct super_block *sb)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   struct nova_inode_info_header sih;
+   struct journal_ptr_pair *pair;
+   unsigned long blocknr = 0;
+   int allocated;
+   int i;
+   u64 block;
+
+   sih.ino = NOVA_LITEJOURNAL_INO;
+   sih.i_blk_type = NOVA_BLOCK_TYPE_4K;
+
+   for (i = 0; i < sbi->cpus; i++) {
+   pair = nova_get_journal_pointers(sb, i);
+
+   allocated = nova_new_log_blocks(sb, &sih, &blocknr, 1,
+   ALLOC_INIT_ZERO, ANY_CPU, ALLOC_FROM_HEAD);
+   nova_dbg_verbose("%s: allocate log @ 0x%lx\n", __func__,
+   blocknr);
+   if (allocated != 1 || blocknr == 0)
+   return -ENOSPC;
+
+   block = nova_get_block_off(sb, blocknr, NOVA_BLOCK_TYPE_4K);
+   pair->journal_head = pair->journal_tail = block;
+   nova_flush_buffer(pair, CACHELINE_SIZE, 0);
+   }
+
+   PERSISTENT_BARRIER();
+   return nova_lite_journal_soft_init(sb);
+}
diff --git a/fs/nova/journal.h b/fs/nova/journal.h
index 2259880..6e3a528 100644
--- a/fs/nova/journal.h
+++ b/fs/nova/journal.h
@@ -50,5 +50,7 @@ u64 nova_create_rename_transaction(struct super_block *sb,
 u64 nova_create_logentry_transaction(struct super_block *sb,
void *entry, enum nova_entry_type type, int cpu);
 void nova_commit_lite_transaction(struct super_block *sb, u64 tail, int cpu);
+int nova_lite_journal_soft_init(struct super_block *sb);
+int nova_lite_journal_hard_init(struct super_block *sb);
 
 #endif
diff --git a/fs/nova/super.c b/fs/nova/super.c
index d73c202..216d396 100644
--- a/fs/nova/super.c
+++ b/fs/nova/super.c
@@ -379,6 +379,11 @@ static struct nova_inode *nova_init(struct super_block *sb,
 
nova_init_blockmap(sb, 0);
 
+   if (nova_lite_journal_hard_init(sb) < 0) {
+   nova_err(sb, "Lite journal hard initialization failed\n");
+   return ERR_PTR(-EINVAL);
+   }
+
if (nova_init_inode_inuse_list(sb) < 0)
return ERR_PTR(-EINVAL);
 
@@ -598,6 +603,12 @@ static int nova_fill_super(struct super_block *sb, void 
*data, int silent)
goto out;
}
 
+   if (nova_lite_journal_soft_init(sb)) {
+   retval = -EINVAL;
+   nova_err(sb, "Lite journal initialization failed\n");
+   goto out;
+   }
+
blocksize = le32_to_cpu(sbi->nova_sb->s_blocksize);
nova_set_blocksize(sb, blocksize);
 
@@ -647,6 +658,9 @@ static int nova_fill_super(struct super_block *sb, void 
*data, int silent)
 
nova_delete_free_lists(sb);
 
+   kfree(sbi->journal_locks);
+   sbi->journal_locks = NULL;

[RFC v2 30/83] New NOVA inode allocation.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

Allocate the new inode in a round-robin way.
Extend the inode table if needed.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/inode.c | 40 
 fs/nova/inode.h |  1 +
 2 files changed, 41 insertions(+)

diff --git a/fs/nova/inode.c b/fs/nova/inode.c
index 29d172a..e4b8960 100644
--- a/fs/nova/inode.c
+++ b/fs/nova/inode.c
@@ -489,6 +489,46 @@ int nova_free_inuse_inode(struct super_block *sb, unsigned 
long ino)
return ret;
 }
 
+/* Returns 0 on failure */
+u64 nova_new_nova_inode(struct super_block *sb, u64 *pi_addr)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   struct inode_map *inode_map;
+   unsigned long free_ino = 0;
+   int map_id;
+   u64 ino = 0;
+   int ret;
+   timing_t new_inode_time;
+
+   NOVA_START_TIMING(new_nova_inode_t, new_inode_time);
+   map_id = sbi->map_id;
+   sbi->map_id = (sbi->map_id + 1) % sbi->cpus;
+
+   inode_map = &sbi->inode_maps[map_id];
+
+   mutex_lock(&inode_map->inode_table_mutex);
+   ret = nova_alloc_unused_inode(sb, map_id, &free_ino);
+   if (ret) {
+   nova_dbg("%s: alloc inode number failed %d\n", __func__, ret);
+   mutex_unlock(&inode_map->inode_table_mutex);
+   return 0;
+   }
+
+   ret = nova_get_inode_address(sb, free_ino, pi_addr, 1);
+   if (ret) {
+   nova_dbg("%s: get inode address failed %d\n", __func__, ret);
+   mutex_unlock(&inode_map->inode_table_mutex);
+   return 0;
+   }
+
+   mutex_unlock(&inode_map->inode_table_mutex);
+
+   ino = free_ino;
+
+   NOVA_END_TIMING(new_nova_inode_t, new_inode_time);
+   return ino;
+}
+
 int nova_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
/* write_inode should never be called because we always keep our inodes
diff --git a/fs/nova/inode.h b/fs/nova/inode.h
index f9f5c14..fc1876c 100644
--- a/fs/nova/inode.h
+++ b/fs/nova/inode.h
@@ -229,6 +229,7 @@ int nova_get_inode_address(struct super_block *sb, u64 ino,
 struct inode *nova_iget(struct super_block *sb, unsigned long ino);
 inline int nova_insert_inodetree(struct nova_sb_info *sbi,
struct nova_range_node *new_node, int cpu);
+u64 nova_new_nova_inode(struct super_block *sb, u64 *pi_addr);
 extern int nova_write_inode(struct inode *inode, struct writeback_control 
*wbc);
 extern void nova_dirty_inode(struct inode *inode, int flags);
 
-- 
2.7.4

[RFC v2 38/83] Journal: NOVA lite journal initialization.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

NOVA uses per-CPU spinlocks to protect the journals.
Lite journal initialization consists of two parts:
for a new NOVA instance, hard_init allocates the journal pages;
soft_init initializes the locks and performs journal recovery.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/journal.c | 70 +++
 fs/nova/journal.h |  2 ++
 fs/nova/super.c   | 15 
 fs/nova/super.h   |  3 +++
 4 files changed, 90 insertions(+)

diff --git a/fs/nova/journal.c b/fs/nova/journal.c
index 0e203fa..d2578e2 100644
--- a/fs/nova/journal.c
+++ b/fs/nova/journal.c
@@ -340,3 +340,73 @@ void nova_commit_lite_transaction(struct super_block *sb, 
u64 tail, int cpu)
pair->journal_head = tail;
nova_flush_buffer(&pair->journal_head, CACHELINE_SIZE, 1);
 }
+
+/**************************** Initialization ******************************/
+
+// Initialized DRAM journal state, validate, and recover
+int nova_lite_journal_soft_init(struct super_block *sb)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   struct journal_ptr_pair *pair;
+   int i;
+   int ret = 0;
+
+   sbi->journal_locks = kcalloc(sbi->cpus, sizeof(spinlock_t),
+GFP_KERNEL);
+   if (!sbi->journal_locks)
+   return -ENOMEM;
+
+   for (i = 0; i < sbi->cpus; i++)
+   spin_lock_init(&sbi->journal_locks[i]);
+
+   for (i = 0; i < sbi->cpus; i++) {
+   pair = nova_get_journal_pointers(sb, i);
+   if (pair->journal_head == pair->journal_tail)
+   continue;
+
+   /* Ensure all entries are genuine */
+   ret = nova_check_journal_entries(sb, pair);
+   if (ret) {
+   nova_err(sb, "Journal %d checksum failure\n", i);
+   ret = -EINVAL;
+   break;
+   }
+
+   ret = nova_recover_lite_journal(sb, pair);
+   }
+
+   return ret;
+}
+
+/* Initialized persistent journal state */
+int nova_lite_journal_hard_init(struct super_block *sb)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   struct nova_inode_info_header sih;
+   struct journal_ptr_pair *pair;
+   unsigned long blocknr = 0;
+   int allocated;
+   int i;
+   u64 block;
+
+   sih.ino = NOVA_LITEJOURNAL_INO;
+   sih.i_blk_type = NOVA_BLOCK_TYPE_4K;
+
+   for (i = 0; i < sbi->cpus; i++) {
+   pair = nova_get_journal_pointers(sb, i);
+
+   allocated = nova_new_log_blocks(sb, &sih, &blocknr, 1,
+   ALLOC_INIT_ZERO, ANY_CPU, ALLOC_FROM_HEAD);
+   nova_dbg_verbose("%s: allocate log @ 0x%lx\n", __func__,
+   blocknr);
+   if (allocated != 1 || blocknr == 0)
+   return -ENOSPC;
+
+   block = nova_get_block_off(sb, blocknr, NOVA_BLOCK_TYPE_4K);
+   pair->journal_head = pair->journal_tail = block;
+   nova_flush_buffer(pair, CACHELINE_SIZE, 0);
+   }
+
+   PERSISTENT_BARRIER();
+   return nova_lite_journal_soft_init(sb);
+}
diff --git a/fs/nova/journal.h b/fs/nova/journal.h
index 2259880..6e3a528 100644
--- a/fs/nova/journal.h
+++ b/fs/nova/journal.h
@@ -50,5 +50,7 @@ u64 nova_create_rename_transaction(struct super_block *sb,
 u64 nova_create_logentry_transaction(struct super_block *sb,
void *entry, enum nova_entry_type type, int cpu);
 void nova_commit_lite_transaction(struct super_block *sb, u64 tail, int cpu);
+int nova_lite_journal_soft_init(struct super_block *sb);
+int nova_lite_journal_hard_init(struct super_block *sb);
 
 #endif
diff --git a/fs/nova/super.c b/fs/nova/super.c
index d73c202..216d396 100644
--- a/fs/nova/super.c
+++ b/fs/nova/super.c
@@ -379,6 +379,11 @@ static struct nova_inode *nova_init(struct super_block *sb,
 
nova_init_blockmap(sb, 0);
 
+   if (nova_lite_journal_hard_init(sb) < 0) {
+   nova_err(sb, "Lite journal hard initialization failed\n");
+   return ERR_PTR(-EINVAL);
+   }
+
if (nova_init_inode_inuse_list(sb) < 0)
return ERR_PTR(-EINVAL);
 
@@ -598,6 +603,12 @@ static int nova_fill_super(struct super_block *sb, void 
*data, int silent)
goto out;
}
 
+   if (nova_lite_journal_soft_init(sb)) {
+   retval = -EINVAL;
+   nova_err(sb, "Lite journal initialization failed\n");
+   goto out;
+   }
+
blocksize = le32_to_cpu(sbi->nova_sb->s_blocksize);
nova_set_blocksize(sb, blocksize);
 
@@ -647,6 +658,9 @@ static int nova_fill_super(struct super_block *sb, void 
*data, int silent)
 
nova_delete_free_lists(sb);
 
+   kfree(sbi->journal_locks);
+   sbi->journal_locks = NULL;
+
kfree(sbi->ino

[RFC v2 30/83] New NOVA inode allocation.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

Allocate the new inode in a round-robin way.
Extend the inode table if needed.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/inode.c | 40 
 fs/nova/inode.h |  1 +
 2 files changed, 41 insertions(+)

diff --git a/fs/nova/inode.c b/fs/nova/inode.c
index 29d172a..e4b8960 100644
--- a/fs/nova/inode.c
+++ b/fs/nova/inode.c
@@ -489,6 +489,46 @@ int nova_free_inuse_inode(struct super_block *sb, unsigned 
long ino)
return ret;
 }
 
+/* Returns 0 on failure */
+u64 nova_new_nova_inode(struct super_block *sb, u64 *pi_addr)
+{
+   struct nova_sb_info *sbi = NOVA_SB(sb);
+   struct inode_map *inode_map;
+   unsigned long free_ino = 0;
+   int map_id;
+   u64 ino = 0;
+   int ret;
+   timing_t new_inode_time;
+
+   NOVA_START_TIMING(new_nova_inode_t, new_inode_time);
+   map_id = sbi->map_id;
+   sbi->map_id = (sbi->map_id + 1) % sbi->cpus;
+
+   inode_map = &sbi->inode_maps[map_id];
+
+   mutex_lock(&inode_map->inode_table_mutex);
+   ret = nova_alloc_unused_inode(sb, map_id, &free_ino);
+   if (ret) {
+   nova_dbg("%s: alloc inode number failed %d\n", __func__, ret);
+   mutex_unlock(&inode_map->inode_table_mutex);
+   return 0;
+   }
+
+   ret = nova_get_inode_address(sb, free_ino, pi_addr, 1);
+   if (ret) {
+   nova_dbg("%s: get inode address failed %d\n", __func__, ret);
+   mutex_unlock(&inode_map->inode_table_mutex);
+   return 0;
+   }
+
+   mutex_unlock(&inode_map->inode_table_mutex);
+
+   ino = free_ino;
+
+   NOVA_END_TIMING(new_nova_inode_t, new_inode_time);
+   return ino;
+}
+
 int nova_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
/* write_inode should never be called because we always keep our inodes
diff --git a/fs/nova/inode.h b/fs/nova/inode.h
index f9f5c14..fc1876c 100644
--- a/fs/nova/inode.h
+++ b/fs/nova/inode.h
@@ -229,6 +229,7 @@ int nova_get_inode_address(struct super_block *sb, u64 ino,
 struct inode *nova_iget(struct super_block *sb, unsigned long ino);
 inline int nova_insert_inodetree(struct nova_sb_info *sbi,
struct nova_range_node *new_node, int cpu);
+u64 nova_new_nova_inode(struct super_block *sb, u64 *pi_addr);
 extern int nova_write_inode(struct inode *inode, struct writeback_control 
*wbc);
 extern void nova_dirty_inode(struct inode *inode, int flags);
 
-- 
2.7.4

[RFC v2 39/83] Log operation: dentry append.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

NOVA performs atomic log appending by first appending the entry
to the tail of the log, and then atomically updating the log tail pointer.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/log.c | 162 ++
 fs/nova/log.h |   4 ++
 2 files changed, 166 insertions(+)

diff --git a/fs/nova/log.c b/fs/nova/log.c
index f01b7c8..13f9597 100644
--- a/fs/nova/log.c
+++ b/fs/nova/log.c
@@ -20,6 +20,168 @@
 #include "inode.h"
 #include "log.h"
 
+static int nova_update_old_dentry(struct super_block *sb,
+   struct inode *dir, struct nova_dentry *dentry,
+   struct nova_log_entry_info *entry_info)
+{
+   unsigned short links_count;
+   int link_change = entry_info->link_change;
+   u64 addr;
+
+   dentry->epoch_id = entry_info->epoch_id;
+   dentry->trans_id = entry_info->trans_id;
+   /* Remove_dentry */
+   dentry->ino = cpu_to_le64(0);
+   dentry->invalid = 1;
+   dentry->mtime = cpu_to_le32(dir->i_mtime.tv_sec);
+
+   links_count = cpu_to_le16(dir->i_nlink);
+   if (links_count == 0 && link_change == -1)
+   links_count = 0;
+   else
+   links_count += link_change;
+   dentry->links_count = cpu_to_le16(links_count);
+
+   addr = nova_get_addr_off(NOVA_SB(sb), dentry);
+   nova_inc_page_invalid_entries(sb, addr);
+
+   nova_persist_entry(dentry);
+
+   return 0;
+}
+
+static int nova_update_new_dentry(struct super_block *sb,
+   struct inode *dir, struct nova_dentry *entry,
+   struct nova_log_entry_info *entry_info)
+{
+   struct dentry *dentry = entry_info->data;
+   unsigned short links_count;
+   int link_change = entry_info->link_change;
+
+   entry->entry_type = DIR_LOG;
+   entry->epoch_id = entry_info->epoch_id;
+   entry->trans_id = entry_info->trans_id;
+   entry->ino = entry_info->ino;
+   entry->name_len = dentry->d_name.len;
+   memcpy_to_pmem_nocache(entry->name, dentry->d_name.name,
+   dentry->d_name.len);
+   entry->name[dentry->d_name.len] = '\0';
+   entry->mtime = cpu_to_le32(dir->i_mtime.tv_sec);
+   //entry->size = cpu_to_le64(dir->i_size);
+
+   links_count = cpu_to_le16(dir->i_nlink);
+   if (links_count == 0 && link_change == -1)
+   links_count = 0;
+   else
+   links_count += link_change;
+   entry->links_count = cpu_to_le16(links_count);
+
+   /* Update actual de_len */
+   entry->de_len = cpu_to_le16(entry_info->file_size);
+
+   nova_persist_entry(entry);
+
+   return 0;
+}
+
+static int nova_update_log_entry(struct super_block *sb, struct inode *inode,
+   void *entry, struct nova_log_entry_info *entry_info)
+{
+   enum nova_entry_type type = entry_info->type;
+
+   switch (type) {
+   case FILE_WRITE:
+   break;
+   case DIR_LOG:
+   if (entry_info->inplace)
+   nova_update_old_dentry(sb, inode, entry, entry_info);
+   else
+   nova_update_new_dentry(sb, inode, entry, entry_info);
+   break;
+   case SET_ATTR:
+   break;
+   case LINK_CHANGE:
+   break;
+   default:
+   break;
+   }
+
+   return 0;
+}
+
+static int nova_append_log_entry(struct super_block *sb,
+   struct nova_inode *pi, struct inode *inode,
+   struct nova_inode_info_header *sih,
+   struct nova_log_entry_info *entry_info)
+{
+   void *entry;
+   enum nova_entry_type type = entry_info->type;
+   struct nova_inode_update *update = entry_info->update;
+   u64 tail;
+   u64 curr_p;
+   size_t size;
+   int extended = 0;
+
+   if (type == DIR_LOG)
+   size = entry_info->file_size;
+   else
+   size = nova_get_log_entry_size(sb, type);
+
+   tail = update->tail;
+
+   curr_p = nova_get_append_head(sb, pi, sih, tail, size,
+   MAIN_LOG, 0, &extended);
+   if (curr_p == 0)
+   return -ENOSPC;
+
+   nova_dbg_verbose("%s: inode %lu attr change entry @ 0x%llx\n",
+   __func__, sih->ino, curr_p);
+
+   entry = nova_get_block(sb, curr_p);
+   /* inode is already updated with attr */
+   memset(entry, 0, size);
+   nova_update_log_entry(sb, inode, entry, entry_info);
+   nova_inc_page_num_entries(sb, curr_p);
+   update->curr_entry = curr_p;
+   update->tail = curr_p + size;
+
+   entry_info->curr_p = curr_p;
+   return 0;
+}
+
+int nova_append_dentry(struct super_block *sb, struct nova_inode *pi,
+   struct inode *dir, struct den

[RFC v2 39/83] Log operation: dentry append.

2018-03-10 Thread Andiry Xu

From: Andiry Xu <jix...@cs.ucsd.edu>

NOVA performs atomic log appending by first appending the entry
to the tail of the log, and then atomically updating the log tail pointer.

Signed-off-by: Andiry Xu <jix...@cs.ucsd.edu>
---
 fs/nova/log.c | 162 ++
 fs/nova/log.h |   4 ++
 2 files changed, 166 insertions(+)

diff --git a/fs/nova/log.c b/fs/nova/log.c
index f01b7c8..13f9597 100644
--- a/fs/nova/log.c
+++ b/fs/nova/log.c
@@ -20,6 +20,168 @@
 #include "inode.h"
 #include "log.h"
 
+static int nova_update_old_dentry(struct super_block *sb,
+   struct inode *dir, struct nova_dentry *dentry,
+   struct nova_log_entry_info *entry_info)
+{
+   unsigned short links_count;
+   int link_change = entry_info->link_change;
+   u64 addr;
+
+   dentry->epoch_id = entry_info->epoch_id;
+   dentry->trans_id = entry_info->trans_id;
+   /* Remove_dentry */
+   dentry->ino = cpu_to_le64(0);
+   dentry->invalid = 1;
+   dentry->mtime = cpu_to_le32(dir->i_mtime.tv_sec);
+
+   links_count = cpu_to_le16(dir->i_nlink);
+   if (links_count == 0 && link_change == -1)
+   links_count = 0;
+   else
+   links_count += link_change;
+   dentry->links_count = cpu_to_le16(links_count);
+
+   addr = nova_get_addr_off(NOVA_SB(sb), dentry);
+   nova_inc_page_invalid_entries(sb, addr);
+
+   nova_persist_entry(dentry);
+
+   return 0;
+}
+
+static int nova_update_new_dentry(struct super_block *sb,
+   struct inode *dir, struct nova_dentry *entry,
+   struct nova_log_entry_info *entry_info)
+{
+   struct dentry *dentry = entry_info->data;
+   unsigned short links_count;
+   int link_change = entry_info->link_change;
+
+   entry->entry_type = DIR_LOG;
+   entry->epoch_id = entry_info->epoch_id;
+   entry->trans_id = entry_info->trans_id;
+   entry->ino = entry_info->ino;
+   entry->name_len = dentry->d_name.len;
+   memcpy_to_pmem_nocache(entry->name, dentry->d_name.name,
+   dentry->d_name.len);
+   entry->name[dentry->d_name.len] = '\0';
+   entry->mtime = cpu_to_le32(dir->i_mtime.tv_sec);
+   //entry->size = cpu_to_le64(dir->i_size);
+
+   links_count = cpu_to_le16(dir->i_nlink);
+   if (links_count == 0 && link_change == -1)
+   links_count = 0;
+   else
+   links_count += link_change;
+   entry->links_count = cpu_to_le16(links_count);
+
+   /* Update actual de_len */
+   entry->de_len = cpu_to_le16(entry_info->file_size);
+
+   nova_persist_entry(entry);
+
+   return 0;
+}
+
+static int nova_update_log_entry(struct super_block *sb, struct inode *inode,
+   void *entry, struct nova_log_entry_info *entry_info)
+{
+   enum nova_entry_type type = entry_info->type;
+
+   switch (type) {
+   case FILE_WRITE:
+   break;
+   case DIR_LOG:
+   if (entry_info->inplace)
+   nova_update_old_dentry(sb, inode, entry, entry_info);
+   else
+   nova_update_new_dentry(sb, inode, entry, entry_info);
+   break;
+   case SET_ATTR:
+   break;
+   case LINK_CHANGE:
+   break;
+   default:
+   break;
+   }
+
+   return 0;
+}
+
+static int nova_append_log_entry(struct super_block *sb,
+   struct nova_inode *pi, struct inode *inode,
+   struct nova_inode_info_header *sih,
+   struct nova_log_entry_info *entry_info)
+{
+   void *entry;
+   enum nova_entry_type type = entry_info->type;
+   struct nova_inode_update *update = entry_info->update;
+   u64 tail;
+   u64 curr_p;
+   size_t size;
+   int extended = 0;
+
+   if (type == DIR_LOG)
+   size = entry_info->file_size;
+   else
+   size = nova_get_log_entry_size(sb, type);
+
+   tail = update->tail;
+
+   curr_p = nova_get_append_head(sb, pi, sih, tail, size,
+   MAIN_LOG, 0, &extended);
+   if (curr_p == 0)
+   return -ENOSPC;
+
+   nova_dbg_verbose("%s: inode %lu attr change entry @ 0x%llx\n",
+   __func__, sih->ino, curr_p);
+
+   entry = nova_get_block(sb, curr_p);
+   /* inode is already updated with attr */
+   memset(entry, 0, size);
+   nova_update_log_entry(sb, inode, entry, entry_info);
+   nova_inc_page_num_entries(sb, curr_p);
+   update->curr_entry = curr_p;
+   update->tail = curr_p + size;
+
+   entry_info->curr_p = curr_p;
+   return 0;
+}
+
+int nova_append_dentry(struct super_block *sb, struct nova_inode *pi,
+   struct inode *dir, struct dentry *dentry, u64 ino,
+   unsigned short de_len, struct nova_i

1 2 3 >

1 - 100 of 228 matches

Mail list logo