from:"\"Ryusuke Konishi\""

Re: [PATCH v2] vfs: remove unused wrapper block_page_mkwrite()

2015-10-13 Thread Ryusuke Konishi

On Tue, 13 Oct 2015 16:51:02 -0600, Ross Zwisler wrote:
> The function currently called "__block_page_mkwrite()" used to be called
> "block_page_mkwrite()" until a wrapper for this function was added by:
> 
> commit 24da4fab5a61 ("vfs: Create __block_page_mkwrite() helper passing
>   error values back")
> 
> This wrapper, the current "block_page_mkwrite()", is currently unused.
> __block_page_mkwrite() is used directly by ext4, nilfs2 and xfs.
> 
> Remove the unused wrapper, rename __block_page_mkwrite() back to
> block_page_mkwrite() and update the comment above block_page_mkwrite().
> 
> Signed-off-by: Ross Zwisler 
> Reviewed-by: Jan Kara 
> Cc: Jan Kara 
> Cc: Christoph Hellwig 
> Cc: Al Viro 

Acked-by: Ryusuke Konishi 

Thanks,
Ryusuke Konishi

> ---
>  fs/buffer.c | 24 ++--
>  fs/ext4/inode.c |  4 ++--
>  fs/nilfs2/file.c|  2 +-
>  fs/xfs/xfs_file.c   |  2 +-
>  include/linux/buffer_head.h |  2 --
>  5 files changed, 6 insertions(+), 28 deletions(-)
> 
> diff --git a/fs/buffer.c b/fs/buffer.c
> index 82283ab..e46c916 100644
> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -2420,9 +2420,9 @@ EXPORT_SYMBOL(block_commit_write);
>   * unlock the page.
>   *
>   * Direct callers of this function should protect against filesystem freezing
> - * using sb_start_write() - sb_end_write() functions.
> + * using sb_start_pagefault() - sb_end_pagefault() functions.
>   */
> -int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
> +int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
>get_block_t get_block)
>  {
>   struct page *page = vmf->page;
> @@ -2459,26 +2459,6 @@ out_unlock:
>   unlock_page(page);
>   return ret;
>  }
> -EXPORT_SYMBOL(__block_page_mkwrite);
> -
> -int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
> -get_block_t get_block)
> -{
> - int ret;
> - struct super_block *sb = file_inode(vma->vm_file)->i_sb;
> -
> - sb_start_pagefault(sb);
> -
> - /*
> -  * Update file times before taking page lock. We may end up failing the
> -  * fault so this update may be superfluous but who really cares...
> -  */
> - file_update_time(vma->vm_file);
> -
> - ret = __block_page_mkwrite(vma, vmf, get_block);
> - sb_end_pagefault(sb);
> - return block_page_mkwrite_return(ret);
> -}
>  EXPORT_SYMBOL(block_page_mkwrite);
>  
>  /*
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 612fbcf..2d1ecd2 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -5244,7 +5244,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, 
> struct vm_fault *vmf)
>   !ext4_should_journal_data(inode) &&
>   !ext4_nonda_switch(inode->i_sb)) {
>   do {
> - ret = __block_page_mkwrite(vma, vmf,
> + ret = block_page_mkwrite(vma, vmf,
>  ext4_da_get_block_prep);
>   } while (ret == -ENOSPC &&
>  ext4_should_retry_alloc(inode->i_sb, &retries));
> @@ -5291,7 +5291,7 @@ retry_alloc:
>   ret = VM_FAULT_SIGBUS;
>   goto out;
>   }
> - ret = __block_page_mkwrite(vma, vmf, get_block);
> + ret = block_page_mkwrite(vma, vmf, get_block);
>   if (!ret && ext4_should_journal_data(inode)) {
>   if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
> PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
> diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
> index 54575e3..088ba00 100644
> --- a/fs/nilfs2/file.c
> +++ b/fs/nilfs2/file.c
> @@ -109,7 +109,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, 
> struct vm_fault *vmf)
>   goto out;
>  
>   file_update_time(vma->vm_file);
> - ret = __block_page_mkwrite(vma, vmf, nilfs_get_block);
> + ret = block_page_mkwrite(vma, vmf, nilfs_get_block);
>   if (ret) {
>   nilfs_transaction_abort(inode->i_sb);
>   goto out;
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index e78feb4..f80e90f 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -1506,7 +1506,7 @@ xfs_filemap_page_mkwrite(
>   ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct,
>   xfs_end_io_dax_write);
>   } else {
> - ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks);
> + ret = blo

[PATCH 1/2] nilfs2: fix gcc unused-but-set-variable warnings

2015-10-11 Thread Ryusuke Konishi

Fix the following build warnings:

 $ make W=1
 [...]
   CC [M]  fs/nilfs2/btree.o
 fs/nilfs2/btree.c: In function 'nilfs_btree_split':
 fs/nilfs2/btree.c:923:8: warning: variable 'newptr' set but not used 
[-Wunused-but-set-variable]
   __u64 newptr;
 ^
 fs/nilfs2/btree.c:922:8: warning: variable 'newkey' set but not used 
[-Wunused-but-set-variable]
   __u64 newkey;
 ^
   CC [M]  fs/nilfs2/dat.o
 fs/nilfs2/dat.c: In function 'nilfs_dat_prepare_end':
 fs/nilfs2/dat.c:158:8: warning: variable 'start' set but not used 
[-Wunused-but-set-variable]
   __u64 start;
 ^
   CC [M]  fs/nilfs2/segment.o
 fs/nilfs2/segment.c: In function 'nilfs_segctor_do_immediate_flush':
 fs/nilfs2/segment.c:2433:6: warning: variable 'err' set but not used 
[-Wunused-but-set-variable]
   int err;
   ^
   CC [M]  fs/nilfs2/sufile.o
 fs/nilfs2/sufile.c: In function 'nilfs_sufile_alloc':
 fs/nilfs2/sufile.c:320:27: warning: variable 'ncleansegs' set but not used 
[-Wunused-but-set-variable]
   unsigned long nsegments, ncleansegs, nsus, cnt;
^
   CC [M]  fs/nilfs2/alloc.o
 fs/nilfs2/alloc.c: In function 'nilfs_palloc_prepare_alloc_entry':
 fs/nilfs2/alloc.c:478:38: warning: variable 'groups_per_desc_block' set but 
not used [-Wunused-but-set-variable]
   unsigned long n, entries_per_group, groups_per_desc_block;
   ^

Signed-off-by: Ryusuke Konishi 
---
 fs/nilfs2/alloc.c   | 3 +--
 fs/nilfs2/btree.c   | 5 -
 fs/nilfs2/dat.c | 2 --
 fs/nilfs2/segment.c | 3 +--
 fs/nilfs2/sufile.c  | 3 +--
 5 files changed, 3 insertions(+), 13 deletions(-)

diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 8df0f3b..dd6c142 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -475,7 +475,7 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
void *desc_kaddr, *bitmap_kaddr;
unsigned long group, maxgroup, ngroups;
unsigned long group_offset, maxgroup_offset;
-   unsigned long n, entries_per_group, groups_per_desc_block;
+   unsigned long n, entries_per_group;
unsigned long i, j;
int pos, ret;
 
@@ -483,7 +483,6 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
maxgroup = ngroups - 1;
group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
entries_per_group = nilfs_palloc_entries_per_group(inode);
-   groups_per_desc_block = nilfs_palloc_groups_per_desc_block(inode);
 
for (i = 0; i < ngroups; i += n) {
if (group >= ngroups) {
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 919fd5b..f609a85 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -919,8 +919,6 @@ static void nilfs_btree_split(struct nilfs_bmap *btree,
  int level, __u64 *keyp, __u64 *ptrp)
 {
struct nilfs_btree_node *node, *right;
-   __u64 newkey;
-   __u64 newptr;
int nchildren, n, move, ncblk;
 
node = nilfs_btree_get_nonroot_node(path, level);
@@ -942,9 +940,6 @@ static void nilfs_btree_split(struct nilfs_bmap *btree,
if (!buffer_dirty(path[level].bp_sib_bh))
mark_buffer_dirty(path[level].bp_sib_bh);
 
-   newkey = nilfs_btree_node_get_key(right, 0);
-   newptr = path[level].bp_newreq.bpr_ptr;
-
if (move) {
path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
nilfs_btree_node_insert(right, path[level].bp_index,
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 0d5fada..7dc23f1 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -155,7 +155,6 @@ void nilfs_dat_commit_start(struct inode *dat, struct 
nilfs_palloc_req *req,
 int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
 {
struct nilfs_dat_entry *entry;
-   __u64 start;
sector_t blocknr;
void *kaddr;
int ret;
@@ -169,7 +168,6 @@ int nilfs_dat_prepare_end(struct inode *dat, struct 
nilfs_palloc_req *req)
kaddr = kmap_atomic(req->pr_entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
 req->pr_entry_bh, kaddr);
-   start = le64_to_cpu(entry->de_start);
blocknr = le64_to_cpu(entry->de_blocknr);
kunmap_atomic(kaddr);
 
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index c6abbad9..11c06e9 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2430,7 +2430,6 @@ static void nilfs_segctor_thread_construct(struct 
nilfs_sc_info *sci, int mode)
 static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci)
 {
int mode = 0;
-   int err;
 
spin_lock(&sci->sc_state_lock);
mode = (sci->sc_flush_request & FLUSH_DAT_BIT) ?
@@ -2438,7 +2437,7 @@ static void nilfs_segctor_do_immediate_f

[PATCH 2/2] nilfs2: fix gcc uninitialized-variable warnings in powerpc build

2015-10-11 Thread Ryusuke Konishi

Some false positive warnings are reported for powerpc build.

The following warnings are reported in
 http://kisskb.ellerman.id.au/kisskb/buildresult/12519703/

   CC  fs/nilfs2/super.o
 fs/nilfs2/super.c: In function 'nilfs_resize_fs':
 fs/nilfs2/super.c:376:2: warning: 'blocknr' may be used uninitialized in this 
function [-Wuninitialized]
 fs/nilfs2/super.c:362:11: note: 'blocknr' was declared here
   CC  fs/nilfs2/recovery.o
 fs/nilfs2/recovery.c: In function 'nilfs_salvage_orphan_logs':
 fs/nilfs2/recovery.c:631:21: warning: 'sum' may be used uninitialized in this 
function [-Wuninitialized]
 fs/nilfs2/recovery.c:585:32: note: 'sum' was declared here
 fs/nilfs2/recovery.c: In function 'nilfs_search_super_root':
 fs/nilfs2/recovery.c:873:11: warning: 'sum' may be used uninitialized in this 
function [-Wuninitialized]

Another similar warning is reported in
 http://kisskb.ellerman.id.au/kisskb/buildresult/12520079/

   CC  fs/nilfs2/btree.o
 fs/nilfs2/btree.c: In function 'nilfs_btree_convert_and_insert':
 include/asm-generic/bitops/non-atomic.h:105:20: warning: 'bh' may be used 
uninitialized in this function [-Wuninitialized]
 fs/nilfs2/btree.c:1859:22: note: 'bh' was declared here

This cleans out these warnings by forcing the variables to be initialized.

Signed-off-by: Ryusuke Konishi 
Reported-by: Geert Uytterhoeven 
---
 fs/nilfs2/btree.c| 2 +-
 fs/nilfs2/recovery.c | 4 ++--
 fs/nilfs2/super.c| 5 -
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index f609a85..3a3821b 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -1851,7 +1851,7 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap 
*btree,
   __u64 key, __u64 ptr,
   const __u64 *keys, const __u64 *ptrs, int n)
 {
-   struct buffer_head *bh;
+   struct buffer_head *bh = NULL;
union nilfs_bmap_ptr_req dreq, nreq, *di, *ni;
struct nilfs_bmap_stats stats;
int ret;
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index ff00a0b..9b4f205 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -582,7 +582,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 struct nilfs_recovery_info *ri)
 {
struct buffer_head *bh_sum = NULL;
-   struct nilfs_segment_summary *sum;
+   struct nilfs_segment_summary *sum = NULL;
sector_t pseg_start;
sector_t seg_start, seg_end;  /* Starting/ending DBN of full segment */
unsigned long nsalvaged_blocks = 0;
@@ -814,7 +814,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs,
struct nilfs_recovery_info *ri)
 {
struct buffer_head *bh_sum = NULL;
-   struct nilfs_segment_summary *sum;
+   struct nilfs_segment_summary *sum = NULL;
sector_t pseg_start, pseg_end, sr_pseg_start = 0;
sector_t seg_start, seg_end; /* range of full segment (block number) */
sector_t b, end;
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index f47585b..c6b5008 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -361,7 +361,7 @@ static int nilfs_move_2nd_super(struct super_block *sb, 
loff_t sb2off)
struct nilfs_super_block *nsbp;
sector_t blocknr, newblocknr;
unsigned long offset;
-   int sb2i = -1;  /* array index of the secondary superblock */
+   int sb2i;  /* array index of the secondary superblock */
int ret = 0;
 
/* nilfs->ns_sem must be locked by the caller. */
@@ -372,6 +372,9 @@ static int nilfs_move_2nd_super(struct super_block *sb, 
loff_t sb2off)
} else if (nilfs->ns_sbh[0]->b_blocknr > nilfs->ns_first_data_block) {
sb2i = 0;
blocknr = nilfs->ns_sbh[0]->b_blocknr;
+   } else {
+   sb2i = -1;
+   blocknr = 0;
}
if (sb2i >= 0 && (u64)blocknr << nilfs->ns_blocksize_bits == sb2off)
goto out;  /* super block location is unchanged */
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 0/2] nilfs2: fix build warnings

2015-10-11 Thread Ryusuke Konishi

Hi Andrew,

Please send the following fixes to upstream:

Ryusuke Konishi (2):
  nilfs2: fix gcc unused-but-set-variable warnings
  nilfs2: fix gcc uninitialized-variable warnings in powerpc build

These prevent reported warnings in powerpc build and minor warnings
during build with "W=1".  Both were detected on the mainline.

Thanks,
Ryusuke Konishi
--

 fs/nilfs2/alloc.c| 3 +--
 fs/nilfs2/btree.c| 7 +--
 fs/nilfs2/dat.c  | 2 --
 fs/nilfs2/recovery.c | 4 ++--
 fs/nilfs2/segment.c  | 3 +--
 fs/nilfs2/sufile.c   | 3 +--
 fs/nilfs2/super.c| 5 -
 7 files changed, 10 insertions(+), 17 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 0/5] nilfs2: introduce tracepoints

2015-10-06 Thread Ryusuke Konishi

Hi Andrew,

Please queue the following changes for the next merge window:

Hitoshi Mitake (4):
  nilfs2: add a tracepoint for tracking stage transition of segment 
construction
  nilfs2: add a tracepoint for transaction events
  nilfs2: add tracepoints for analyzing sufile manipulation
  nilfs2: add tracepoints for analyzing reading and writing metadata files

Ryusuke Konishi (1):
  MAINTAINERS: nilfs2: add header file for tracing

These introduce some tracepoints to nilfs2 to help track its
behavior for bottleneck detection, debugging, etc.

Thanks,
Ryusuke Konishi
--

 MAINTAINERS   |   1 +
 fs/nilfs2/mdt.c   |   6 ++
 fs/nilfs2/segment.c   | 104 
 fs/nilfs2/segment.h   |   3 +-
 fs/nilfs2/sufile.c|   8 ++
 include/trace/events/nilfs2.h | 224 ++
 6 files changed, 324 insertions(+), 22 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 3/5] nilfs2: add tracepoints for analyzing sufile manipulation

2015-10-06 Thread Ryusuke Konishi

From: Hitoshi Mitake 

This patch adds tracepoints which would be useful for analyzing
segment usage from the perspective of high-level sufile manipulation
(check, alloc, free). sufile is an important in-place updated metadata
file, so analyzing its behavior would be useful for performance
tuning.

example of usage (a case of allocation):

$ sudo bin/tpoint nilfs2:nilfs2_segment_usage_allocated
Tracing nilfs2:nilfs2_segment_usage_allocated. Ctrl-C to end.
segctord-17800 [002] ...1 10671.867294: nilfs2_segment_usage_allocated: 
sufile = 880054f908a8 segnum = 2
segctord-17800 [002] ...1 10675.073477: nilfs2_segment_usage_allocated: 
sufile = 880054f908a8 segnum = 3

Cc: Benixon Dhas 
Cc: TK Kato 
Signed-off-by: Hitoshi Mitake 
Signed-off-by: Ryusuke Konishi 
Cc: Steven Rostedt 
---
 fs/nilfs2/sufile.c|  8 ++
 include/trace/events/nilfs2.h | 67 +++
 2 files changed, 75 insertions(+)

diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 2a869c3..7ff8f15 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -30,6 +30,8 @@
 #include "mdt.h"
 #include "sufile.h"
 
+#include 
+
 /**
  * struct nilfs_sufile_info - on-memory private data of sufile
  * @mi: on-memory private data of metadata file
@@ -358,6 +360,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
break; /* never happens */
}
}
+   trace_nilfs2_segment_usage_check(sufile, segnum, cnt);
ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1,
   &su_bh);
if (ret < 0)
@@ -388,6 +391,9 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
nilfs_mdt_mark_dirty(sufile);
brelse(su_bh);
*segnump = segnum;
+
+   trace_nilfs2_segment_usage_allocated(sufile, segnum);
+
goto out_header;
}
 
@@ -490,6 +496,8 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 
segnum,
NILFS_SUI(sufile)->ncleansegs++;
 
nilfs_mdt_mark_dirty(sufile);
+
+   trace_nilfs2_segment_usage_freed(sufile, segnum);
 }
 
 /**
diff --git a/include/trace/events/nilfs2.h b/include/trace/events/nilfs2.h
index e5649ac..1b65ba6 100644
--- a/include/trace/events/nilfs2.h
+++ b/include/trace/events/nilfs2.h
@@ -95,6 +95,73 @@ TRACE_EVENT(nilfs2_transaction_transition,
  show_transaction_state(__entry->state))
 );
 
+TRACE_EVENT(nilfs2_segment_usage_check,
+   TP_PROTO(struct inode *sufile,
+__u64 segnum,
+unsigned long cnt),
+
+   TP_ARGS(sufile, segnum, cnt),
+
+   TP_STRUCT__entry(
+   __field(struct inode *, sufile)
+   __field(__u64, segnum)
+   __field(unsigned long, cnt)
+   ),
+
+   TP_fast_assign(
+   __entry->sufile = sufile;
+   __entry->segnum = segnum;
+   __entry->cnt = cnt;
+   ),
+
+   TP_printk("sufile = %p segnum = %llu cnt = %lu",
+ __entry->sufile,
+ __entry->segnum,
+ __entry->cnt)
+);
+
+TRACE_EVENT(nilfs2_segment_usage_allocated,
+   TP_PROTO(struct inode *sufile,
+__u64 segnum),
+
+   TP_ARGS(sufile, segnum),
+
+   TP_STRUCT__entry(
+   __field(struct inode *, sufile)
+   __field(__u64, segnum)
+   ),
+
+   TP_fast_assign(
+   __entry->sufile = sufile;
+   __entry->segnum = segnum;
+   ),
+
+   TP_printk("sufile = %p segnum = %llu",
+ __entry->sufile,
+ __entry->segnum)
+);
+
+TRACE_EVENT(nilfs2_segment_usage_freed,
+   TP_PROTO(struct inode *sufile,
+__u64 segnum),
+
+   TP_ARGS(sufile, segnum),
+
+   TP_STRUCT__entry(
+   __field(struct inode *, sufile)
+   __field(__u64, segnum)
+   ),
+
+   TP_fast_assign(
+   __entry->sufile = sufile;
+   __entry->segnum = segnum;
+   ),
+
+   TP_printk("sufile = %p segnum = %llu",
+ __entry->sufile,
+ __entry->segnum)
+);
+
 #endif /* _TRACE_NILFS2_H */
 
 /* This part must be outside protection */
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 2/5] nilfs2: add a tracepoint for transaction events

2015-10-06 Thread Ryusuke Konishi

From: Hitoshi Mitake 

This patch adds a tracepoint for transaction events of nilfs. With the
tracepoint, these events can be tracked: begin, abort, commit,
trylock, lock, and unlock. Basically, these events have corresponding
functions, e.g. the begin event corresponds to nilfs_transaction_begin().
The unlock event is an exception. It corresponds to the iteration in
nilfs_transaction_lock().

Only one tracepoint is introduced: nilfs2_transaction_transition. The
above events are distinguished with a newly introduced enum. With this
tracepoint, we can analyse a critical section of segment construction.

Sample output by tpoint of perf-tools:
  cp-4457  [000] ...163.266220: nilfs2_transaction_transition: 
sb = 8802112b8800 ti = 8800bf5ccc58 count = 1 flags = 9 state = BEGIN
  cp-4457  [000] ...163.266221: nilfs2_transaction_transition: 
sb = 8802112b8800 ti = 8800bf5ccc58 count = 0 flags = 9 state = COMMIT
  cp-4457  [000] ...163.266221: nilfs2_transaction_transition: 
sb = 8802112b8800 ti = 8800bf5ccc58 count = 0 flags = 9 state = COMMIT
segctord-4371  [001] ...168.261196: nilfs2_transaction_transition: 
sb = 8802112b8800 ti = 8800b889bdf8 count = 0 flags = 10 state = TRYLOCK
segctord-4371  [001] ...168.261280: nilfs2_transaction_transition: 
sb = 8802112b8800 ti = 8800b889bdf8 count = 0 flags = 10 state = LOCK
segctord-4371  [001] ...168.261877: nilfs2_transaction_transition: 
sb = 8802112b8800 ti = 8800b889bdf8 count = 1 flags = 10 state = BEGIN
segctord-4371  [001] ...168.262116: nilfs2_transaction_transition: 
sb = 8802112b8800 ti = 8800b889bdf8 count = 0 flags = 18 state = COMMIT
segctord-4371  [001] ...168.265032: nilfs2_transaction_transition: 
sb = 8802112b8800 ti = 8800b889bdf8 count = 0 flags = 18 state = UNLOCK
segctord-4371  [001] ...1   132.376847: nilfs2_transaction_transition: 
sb = 8802112b8800 ti = 8800b889bdf8 count = 0 flags = 10 state = TRYLOCK

This patch also does trivial cleaning of comma usage in collection
stage transition event for consistent coding style.

Signed-off-by: Hitoshi Mitake 
Signed-off-by: Ryusuke Konishi 
Cc: Steven Rostedt 
---
 fs/nilfs2/segment.c   | 33 ++-
 include/trace/events/nilfs2.h | 53 +++
 2 files changed, 85 insertions(+), 1 deletion(-)

diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index ef35404..3fc4732 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -214,11 +214,18 @@ int nilfs_transaction_begin(struct super_block *sb,
 {
struct the_nilfs *nilfs;
int ret = nilfs_prepare_segment_lock(ti);
+   struct nilfs_transaction_info *trace_ti;
 
if (unlikely(ret < 0))
return ret;
-   if (ret > 0)
+   if (ret > 0) {
+   trace_ti = current->journal_info;
+
+   trace_nilfs2_transaction_transition(sb, trace_ti,
+   trace_ti->ti_count, trace_ti->ti_flags,
+   TRACE_NILFS2_TRANSACTION_BEGIN);
return 0;
+   }
 
sb_start_intwrite(sb);
 
@@ -229,6 +236,11 @@ int nilfs_transaction_begin(struct super_block *sb,
ret = -ENOSPC;
goto failed;
}
+
+   trace_ti = current->journal_info;
+   trace_nilfs2_transaction_transition(sb, trace_ti, trace_ti->ti_count,
+   trace_ti->ti_flags,
+   TRACE_NILFS2_TRANSACTION_BEGIN);
return 0;
 
  failed:
@@ -261,6 +273,8 @@ int nilfs_transaction_commit(struct super_block *sb)
ti->ti_flags |= NILFS_TI_COMMIT;
if (ti->ti_count > 0) {
ti->ti_count--;
+   trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+   ti->ti_flags, TRACE_NILFS2_TRANSACTION_COMMIT);
return 0;
}
if (nilfs->ns_writer) {
@@ -272,6 +286,9 @@ int nilfs_transaction_commit(struct super_block *sb)
nilfs_segctor_do_flush(sci, 0);
}
up_read(&nilfs->ns_segctor_sem);
+   trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+   ti->ti_flags, TRACE_NILFS2_TRANSACTION_COMMIT);
+
current->journal_info = ti->ti_save;
 
if (ti->ti_flags & NILFS_TI_SYNC)
@@ -290,10 +307,15 @@ void nilfs_transaction_abort(struct super_block *sb)
BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
if (ti->ti_count > 0) {
ti->ti_count--;
+   trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+   ti->ti_flags, TRACE_NILFS2_TRANSACTION_ABORT);
return;
}
up_read

[PATCH 4/5] nilfs2: add tracepoints for analyzing reading and writing metadata files

2015-10-06 Thread Ryusuke Konishi

From: Hitoshi Mitake 

This patch adds tracepoints for analyzing requests of reading and
writing metadata files. The tracepoints cover all in-place mdt files
(cpfile, sufile, and datfile).

Example of tracing mdt_insert_new_block():
  cp-14635 [000] ...1 30598.199309: nilfs2_mdt_insert_new_block: 
inode = 88022a8d0178 ino = 3 block = 155
  cp-14635 [000] ...1 30598.199520: nilfs2_mdt_insert_new_block: 
inode = 88022a8d0178 ino = 3 block = 5
  cp-14635 [000] ...1 30598.200828: nilfs2_mdt_insert_new_block: 
inode = 88022a8d0178 ino = 3 block = 253

Cc: TK Kato 
Signed-off-by: Hitoshi Mitake 
Signed-off-by: Ryusuke Konishi 
Cc: Steven Rostedt 
---
 fs/nilfs2/mdt.c   |  6 +
 include/trace/events/nilfs2.h | 54 +++
 2 files changed, 60 insertions(+)

diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index dee34d9..1125f40 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -33,6 +33,7 @@
 #include "page.h"
 #include "mdt.h"
 
+#include 
 
 #define NILFS_MDT_MAX_RA_BLOCKS(16 - 1)
 
@@ -68,6 +69,9 @@ nilfs_mdt_insert_new_block(struct inode *inode, unsigned long 
block,
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
nilfs_mdt_mark_dirty(inode);
+
+   trace_nilfs2_mdt_insert_new_block(inode, inode->i_ino, block);
+
return 0;
 }
 
@@ -158,6 +162,8 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long 
blkoff,
get_bh(bh);
submit_bh(mode, bh);
ret = 0;
+
+   trace_nilfs2_mdt_submit_block(inode, inode->i_ino, blkoff, mode);
  out:
get_bh(bh);
*out_bh = bh;
diff --git a/include/trace/events/nilfs2.h b/include/trace/events/nilfs2.h
index 1b65ba6..c780581 100644
--- a/include/trace/events/nilfs2.h
+++ b/include/trace/events/nilfs2.h
@@ -162,6 +162,60 @@ TRACE_EVENT(nilfs2_segment_usage_freed,
  __entry->segnum)
 );
 
+TRACE_EVENT(nilfs2_mdt_insert_new_block,
+   TP_PROTO(struct inode *inode,
+unsigned long ino,
+unsigned long block),
+
+   TP_ARGS(inode, ino, block),
+
+   TP_STRUCT__entry(
+   __field(struct inode *, inode)
+   __field(unsigned long, ino)
+   __field(unsigned long, block)
+   ),
+
+   TP_fast_assign(
+   __entry->inode = inode;
+   __entry->ino = ino;
+   __entry->block = block;
+   ),
+
+   TP_printk("inode = %p ino = %lu block = %lu",
+ __entry->inode,
+ __entry->ino,
+ __entry->block)
+);
+
+TRACE_EVENT(nilfs2_mdt_submit_block,
+   TP_PROTO(struct inode *inode,
+unsigned long ino,
+unsigned long blkoff,
+int mode),
+
+   TP_ARGS(inode, ino, blkoff, mode),
+
+   TP_STRUCT__entry(
+   __field(struct inode *, inode)
+   __field(unsigned long, ino)
+   __field(unsigned long, blkoff)
+   __field(int, mode)
+   ),
+
+   TP_fast_assign(
+   __entry->inode = inode;
+   __entry->ino = ino;
+   __entry->blkoff = blkoff;
+   __entry->mode = mode;
+   ),
+
+   TP_printk("inode = %p ino = %lu blkoff = %lu mode = %x",
+ __entry->inode,
+ __entry->ino,
+ __entry->blkoff,
+ __entry->mode)
+);
+
 #endif /* _TRACE_NILFS2_H */
 
 /* This part must be outside protection */
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 5/5] MAINTAINERS: nilfs2: add header file for tracing

2015-10-06 Thread Ryusuke Konishi

This adds the header file "include/trace/events/nilfs2.h" to the
maintainership of nilfs2 so that updates to the nilfs2 header file go
to the mailing list of nilfs2.

Signed-off-by: Ryusuke Konishi 
Cc: Hitoshi Mitake 
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 797236b..6b15b7a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7371,6 +7371,7 @@ S:Supported
 F: Documentation/filesystems/nilfs2.txt
 F: fs/nilfs2/
 F: include/linux/nilfs2_fs.h
+F: include/trace/events/nilfs2.h
 
 NINJA SCSI-3 / NINJA SCSI-32Bi (16bit/CardBus) PCMCIA SCSI HOST ADAPTER DRIVER
 M: YOKOTA Hiroshi 
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 1/5] nilfs2: add a tracepoint for tracking stage transition of segment construction

2015-10-06 Thread Ryusuke Konishi

From: Hitoshi Mitake 

This patch adds a tracepoint for tracking stage transition of block
collection in segment construction. With the tracepoint, we can
analyze the behavior of segment construction in depth. It would be
useful for bottleneck detection and debugging, etc.

The tracepoint is created with the standard trace API of linux (like
ext3, ext4, f2fs and btrfs). So we can analyze it with existing tools
easily. Of course, more detailed analysis will be possible if we can
create nilfs specific analysis tools.

Below is an example of event dump with Brendan Gregg's perf-tools
(https://github.com/brendangregg/perf-tools). Time consumption between
each stage can be obtained.

$ sudo bin/tpoint nilfs2:nilfs2_collection_stage_transition
Tracing nilfs2:nilfs2_collection_stage_transition. Ctrl-C to end.
segctord-14875 [003] ...1 28311.067794: 
nilfs2_collection_stage_transition: sci = 8800ce6de000 stage = ST_INIT
segctord-14875 [003] ...1 28311.068139: 
nilfs2_collection_stage_transition: sci = 8800ce6de000 stage = ST_GC
segctord-14875 [003] ...1 28311.068139: 
nilfs2_collection_stage_transition: sci = 8800ce6de000 stage = ST_FILE
segctord-14875 [003] ...1 28311.068486: 
nilfs2_collection_stage_transition: sci = 8800ce6de000 stage = ST_IFILE
segctord-14875 [003] ...1 28311.068540: 
nilfs2_collection_stage_transition: sci = 8800ce6de000 stage = ST_CPFILE
segctord-14875 [003] ...1 28311.068561: 
nilfs2_collection_stage_transition: sci = 8800ce6de000 stage = ST_SUFILE
segctord-14875 [003] ...1 28311.068565: 
nilfs2_collection_stage_transition: sci = 8800ce6de000 stage = ST_DAT
segctord-14875 [003] ...1 28311.068573: 
nilfs2_collection_stage_transition: sci = 8800ce6de000 stage = ST_SR
segctord-14875 [003] ...1 28311.068574: 
nilfs2_collection_stage_transition: sci = 8800ce6de000 stage = ST_DONE

For capturing transitions correctly, this patch adds wrappers for the
member scnt of nilfs_cstage. With this change, every transition of the
stage can produce a trace event in a correct manner.

Signed-off-by: Hitoshi Mitake 
Signed-off-by: Ryusuke Konishi 
Cc: Steven Rostedt 
---
 fs/nilfs2/segment.c   | 71 +++
 fs/nilfs2/segment.h   |  3 +-
 include/trace/events/nilfs2.h | 50 ++
 3 files changed, 103 insertions(+), 21 deletions(-)
 create mode 100644 include/trace/events/nilfs2.h

diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index c6abbad9..ef35404 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -77,6 +77,36 @@ enum {
NILFS_ST_DONE,
 };
 
+#define CREATE_TRACE_POINTS
+#include 
+
+/*
+ * nilfs_sc_cstage_inc(), nilfs_sc_cstage_set(), nilfs_sc_cstage_get() are
+ * wrapper functions of stage count (nilfs_sc_info->sc_stage.scnt). Users of
+ * the variable must use them because transition of stage count must involve
+ * trace events (trace_nilfs2_collection_stage_transition).
+ *
+ * nilfs_sc_cstage_get() isn't required for the above purpose because it 
doesn't
+ * produce tracepoint events. It is provided just for making the intention
+ * clear.
+ */
+static inline void nilfs_sc_cstage_inc(struct nilfs_sc_info *sci)
+{
+   sci->sc_stage.scnt++;
+   trace_nilfs2_collection_stage_transition(sci);
+}
+
+static inline void nilfs_sc_cstage_set(struct nilfs_sc_info *sci, int 
next_scnt)
+{
+   sci->sc_stage.scnt = next_scnt;
+   trace_nilfs2_collection_stage_transition(sci);
+}
+
+static inline int nilfs_sc_cstage_get(struct nilfs_sc_info *sci)
+{
+   return sci->sc_stage.scnt;
+}
+
 /* State flags of collection */
 #define NILFS_CF_NODE  0x0001  /* Collecting node blocks */
 #define NILFS_CF_IFILE_STARTED 0x0002  /* IFILE stage has started */
@@ -1062,7 +1092,7 @@ static int nilfs_segctor_collect_blocks(struct 
nilfs_sc_info *sci, int mode)
size_t ndone;
int err = 0;
 
-   switch (sci->sc_stage.scnt) {
+   switch (nilfs_sc_cstage_get(sci)) {
case NILFS_ST_INIT:
/* Pre-processes */
sci->sc_stage.flags = 0;
@@ -1071,7 +1101,7 @@ static int nilfs_segctor_collect_blocks(struct 
nilfs_sc_info *sci, int mode)
sci->sc_nblk_inc = 0;
sci->sc_curseg->sb_sum.flags = NILFS_SS_LOGBGN;
if (mode == SC_LSEG_DSYNC) {
-   sci->sc_stage.scnt = NILFS_ST_DSYNC;
+   nilfs_sc_cstage_set(sci, NILFS_ST_DSYNC);
goto dsync_mode;
}
}
@@ -1079,10 +1109,10 @@ static int nilfs_segctor_collect_blocks(struct 
nilfs_sc_info *sci, int mode)
sci->sc_stage.dirty_file_ptr = NULL;
sci->sc_stage.gc_inode_ptr = NULL;
if (mode == SC_FLUSH_DAT) {
-

Re: [PATCH] nilfs2: add tracepoints for analyzing reading and writing metadata files

2015-10-05 Thread Ryusuke Konishi

On Mon, 5 Oct 2015 19:21:43 +0900, Mitake Hitoshi wrote:
> On Sun, Oct 4, 2015 at 10:33 PM, Ryusuke Konishi
>  wrote:
>> On Sun,  4 Oct 2015 01:02:42 +0900, Mitake Hitoshi wrote:
>>> This patch adds tracepoints for analyzing requests of reading and
>>> writing metadata files. The tracepoints cover every in-place mdt file
>>> (cpfile, sufile, and datfile).
>>>
>>> Example of tracing mdt_insert_new_block():
>>>   cp-14635 [000] ...1 30598.199309: 
>>> nilfs2_mdt_insert_new_block: inode = 88022a8d0178 ino = 3 block = 155
>>>   cp-14635 [000] ...1 30598.199520: 
>>> nilfs2_mdt_insert_new_block: inode = 88022a8d0178 ino = 3 block = 5
>>>   cp-14635 [000] ...1 30598.200828: 
>>> nilfs2_mdt_insert_new_block: inode = 88022a8d0178 ino = 3 block = 253
>>>
>>> Cc: TK Kato 
>>> Signed-off-by: Hitoshi Mitake 
>>
>> Applied to the tracepoints branch.  Thanks.
> 
> Thanks, could you send the patches in the tracepoints branch to upstream?

Sure, I will.

Regards,
Ryusuke Konishi

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] nilfs2: add tracepoints for analyzing reading and writing metadata files

2015-10-04 Thread Ryusuke Konishi

On Sun,  4 Oct 2015 01:02:42 +0900, Mitake Hitoshi wrote:
> This patch adds tracepoints for analyzing requests of reading and
> writing metadata files. The tracepoints cover every in-place mdt file
> (cpfile, sufile, and datfile).
> 
> Example of tracing mdt_insert_new_block():
>   cp-14635 [000] ...1 30598.199309: nilfs2_mdt_insert_new_block: 
> inode = 88022a8d0178 ino = 3 block = 155
>   cp-14635 [000] ...1 30598.199520: nilfs2_mdt_insert_new_block: 
> inode = 88022a8d0178 ino = 3 block = 5
>   cp-14635 [000] ...1 30598.200828: nilfs2_mdt_insert_new_block: 
> inode = 88022a8d0178 ino = 3 block = 253
> 
> Cc: TK Kato 
> Signed-off-by: Hitoshi Mitake 

Applied to the tracepoints branch.  Thanks.

Ryusuke Konishi

> ---
>  fs/nilfs2/mdt.c   |  6 +
>  include/trace/events/nilfs2.h | 54 
> +++
>  2 files changed, 60 insertions(+)
> 
> diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
> index dee34d9..1125f40 100644
> --- a/fs/nilfs2/mdt.c
> +++ b/fs/nilfs2/mdt.c
> @@ -33,6 +33,7 @@
>  #include "page.h"
>  #include "mdt.h"
>  
> +#include <trace/events/nilfs2.h>
>  
>  #define NILFS_MDT_MAX_RA_BLOCKS  (16 - 1)
>  
> @@ -68,6 +69,9 @@ nilfs_mdt_insert_new_block(struct inode *inode, unsigned 
> long block,
>   set_buffer_uptodate(bh);
>   mark_buffer_dirty(bh);
>   nilfs_mdt_mark_dirty(inode);
> +
> + trace_nilfs2_mdt_insert_new_block(inode, inode->i_ino, block);
> +
>   return 0;
>  }
>  
> @@ -158,6 +162,8 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long 
> blkoff,
>   get_bh(bh);
>   submit_bh(mode, bh);
>   ret = 0;
> +
> + trace_nilfs2_mdt_submit_block(inode, inode->i_ino, blkoff, mode);
>   out:
>   get_bh(bh);
>   *out_bh = bh;
> diff --git a/include/trace/events/nilfs2.h b/include/trace/events/nilfs2.h
> index 1b65ba6..c780581 100644
> --- a/include/trace/events/nilfs2.h
> +++ b/include/trace/events/nilfs2.h
> @@ -162,6 +162,60 @@ TRACE_EVENT(nilfs2_segment_usage_freed,
> __entry->segnum)
>  );
>  
> +TRACE_EVENT(nilfs2_mdt_insert_new_block,
> + TP_PROTO(struct inode *inode,
> +  unsigned long ino,
> +  unsigned long block),
> +
> + TP_ARGS(inode, ino, block),
> +
> + TP_STRUCT__entry(
> + __field(struct inode *, inode)
> + __field(unsigned long, ino)
> + __field(unsigned long, block)
> + ),
> +
> + TP_fast_assign(
> + __entry->inode = inode;
> + __entry->ino = ino;
> + __entry->block = block;
> + ),
> +
> + TP_printk("inode = %p ino = %lu block = %lu",
> +   __entry->inode,
> +   __entry->ino,
> +   __entry->block)
> +);
> +
> +TRACE_EVENT(nilfs2_mdt_submit_block,
> + TP_PROTO(struct inode *inode,
> +  unsigned long ino,
> +  unsigned long blkoff,
> +  int mode),
> +
> + TP_ARGS(inode, ino, blkoff, mode),
> +
> + TP_STRUCT__entry(
> + __field(struct inode *, inode)
> + __field(unsigned long, ino)
> + __field(unsigned long, blkoff)
> + __field(int, mode)
> + ),
> +
> + TP_fast_assign(
> + __entry->inode = inode;
> + __entry->ino = ino;
> + __entry->blkoff = blkoff;
> + __entry->mode = mode;
> + ),
> +
> + TP_printk("inode = %p ino = %lu blkoff = %lu mode = %x",
> +   __entry->inode,
> +   __entry->ino,
> +   __entry->blkoff,
> +   __entry->mode)
> +);
> +
>  #endif /* _TRACE_NILFS2_H */
>  
>  /* This part must be outside protection */
> -- 
> 1.9.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH tracepoints] nilfs2: add tracepoints for analyzing sufile manipulation

2015-09-30 Thread Ryusuke Konishi

2015-09-29 22:37 GMT+09:00 Hitoshi Mitake :
> This patch adds tracepoints which would be useful for analyzing
> segment usage from a perspective of high level sufile manipulation
> (check, alloc, free). sufile is an important in-place updated metadata
> file, so analyzing the behavior would be useful for performance
> tuning.
>
> example of usage (a case of allocation):
>
> $ sudo bin/tpoint nilfs2:nilfs2_segment_usage_allocated
> Tracing nilfs2:nilfs2_segment_usage_allocated. Ctrl-C to end.
> segctord-17800 [002] ...1 10671.867294: 
> nilfs2_segment_usage_allocated: sufile = 880054f908a8 segnum = 2
> segctord-17800 [002] ...1 10675.073477: 
> nilfs2_segment_usage_allocated: sufile = 880054f908a8 segnum = 3
>
> Cc: Benixon Dhas 
> Cc: TK Kato 
> Signed-off-by: Hitoshi Mitake 

Applied to the tracepoints branch.

Thanks,
Ryusuke Konishi

> ---
>  fs/nilfs2/sufile.c|  8 ++
>  include/trace/events/nilfs2.h | 67 
> +++
>  2 files changed, 75 insertions(+)
>
> diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
> index 2a869c3..7ff8f15 100644
> --- a/fs/nilfs2/sufile.c
> +++ b/fs/nilfs2/sufile.c
> @@ -30,6 +30,8 @@
>  #include "mdt.h"
>  #include "sufile.h"
>
> +#include <trace/events/nilfs2.h>
> +
>  /**
>   * struct nilfs_sufile_info - on-memory private data of sufile
>   * @mi: on-memory private data of metadata file
> @@ -358,6 +360,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 
> *segnump)
> break; /* never happens */
> }
> }
> +   trace_nilfs2_segment_usage_check(sufile, segnum, cnt);
> ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1,
>&su_bh);
> if (ret < 0)
> @@ -388,6 +391,9 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 
> *segnump)
> nilfs_mdt_mark_dirty(sufile);
> brelse(su_bh);
> *segnump = segnum;
> +
> +   trace_nilfs2_segment_usage_allocated(sufile, segnum);
> +
> goto out_header;
> }
>
> @@ -490,6 +496,8 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 
> segnum,
> NILFS_SUI(sufile)->ncleansegs++;
>
> nilfs_mdt_mark_dirty(sufile);
> +
> +   trace_nilfs2_segment_usage_freed(sufile, segnum);
>  }
>
>  /**
> diff --git a/include/trace/events/nilfs2.h b/include/trace/events/nilfs2.h
> index e5649ac..1b65ba6 100644
> --- a/include/trace/events/nilfs2.h
> +++ b/include/trace/events/nilfs2.h
> @@ -95,6 +95,73 @@ TRACE_EVENT(nilfs2_transaction_transition,
>   show_transaction_state(__entry->state))
>  );
>
> +TRACE_EVENT(nilfs2_segment_usage_check,
> +   TP_PROTO(struct inode *sufile,
> +__u64 segnum,
> +unsigned long cnt),
> +
> +   TP_ARGS(sufile, segnum, cnt),
> +
> +   TP_STRUCT__entry(
> +   __field(struct inode *, sufile)
> +   __field(__u64, segnum)
> +   __field(unsigned long, cnt)
> +   ),
> +
> +   TP_fast_assign(
> +   __entry->sufile = sufile;
> +   __entry->segnum = segnum;
> +   __entry->cnt = cnt;
> +   ),
> +
> +   TP_printk("sufile = %p segnum = %llu cnt = %lu",
> + __entry->sufile,
> + __entry->segnum,
> + __entry->cnt)
> +);
> +
> +TRACE_EVENT(nilfs2_segment_usage_allocated,
> +   TP_PROTO(struct inode *sufile,
> +__u64 segnum),
> +
> +   TP_ARGS(sufile, segnum),
> +
> +   TP_STRUCT__entry(
> +   __field(struct inode *, sufile)
> +   __field(__u64, segnum)
> +   ),
> +
> +   TP_fast_assign(
> +   __entry->sufile = sufile;
> +   __entry->segnum = segnum;
> +   ),
> +
> +   TP_printk("sufile = %p segnum = %llu",
> + __entry->sufile,
> + __entry->segnum)
> +);
> +
> +TRACE_EVENT(nilfs2_segment_usage_freed,
> +   TP_PROTO(struct inode *sufile,
> +__u64 segnum),
> +
> +   TP_ARGS(sufile, segnum),
> +
> +   TP_STRUCT__entry(
> +   __field(struct inode *, sufile)
> +   __fi

Re: [PATCH 14/17] fs/nilfs2: remove unnecessary new_valid_dev check

2015-09-29 Thread Ryusuke Konishi

On 2015/09/28 23:33, Yaowei Bai wrote:
> As new_valid_dev always returns 1, the !new_valid_dev check is not
> needed; remove it.
> 
> Signed-off-by: Yaowei Bai 

Acked-by: Ryusuke Konishi 

> ---
>   fs/nilfs2/namei.c | 3 ---
>   1 file changed, 3 deletions(-)
> 
> diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
> index 37dd6b0..c9a1a49 100644
> --- a/fs/nilfs2/namei.c
> +++ b/fs/nilfs2/namei.c
> @@ -120,9 +120,6 @@ nilfs_mknod(struct inode *dir, struct dentry *dentry, 
> umode_t mode, dev_t rdev)
>   struct nilfs_transaction_info ti;
>   int err;
>   
> - if (!new_valid_dev(rdev))
> - return -EINVAL;
> -
>   err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
>   if (err)
>   return err;
> 


--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 5/7] nilfs2: get rid of nilfs_palloc_group_is_in()

2015-09-19 Thread Ryusuke Konishi

This unfolds nilfs_palloc_group_is_in() helper function into
nilfs_palloc_freev() function to simplify a range check and an index
calculation repeatedly performed in a loop of the function.

Signed-off-by: Ryusuke Konishi 
---
 fs/nilfs2/alloc.c | 28 +---
 1 file changed, 9 insertions(+), 19 deletions(-)

diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index b15daf8..5b7ee36 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -673,22 +673,6 @@ void nilfs_palloc_abort_free_entry(struct inode *inode,
 }
 
 /**
- * nilfs_palloc_group_is_in - judge if an entry is in a group
- * @inode: inode of metadata file using this allocator
- * @group: group number
- * @nr: serial number of the entry (e.g. inode number)
- */
-static int
-nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
-{
-   __u64 first, last;
-
-   first = group * nilfs_palloc_entries_per_group(inode);
-   last = first + nilfs_palloc_entries_per_group(inode) - 1;
-   return (nr >= first) && (nr <= last);
-}
-
-/**
  * nilfs_palloc_freev - deallocate a set of persistent objects
  * @inode: inode of metadata file using this allocator
  * @entry_nrs: array of entry numbers to be deallocated
@@ -701,6 +685,8 @@ int nilfs_palloc_freev(struct inode *inode, __u64 
*entry_nrs, size_t nitems)
unsigned char *bitmap;
void *desc_kaddr, *bitmap_kaddr;
unsigned long group, group_offset;
+   __u64 group_min_nr;
+   const unsigned long epg = nilfs_palloc_entries_per_group(inode);
spinlock_t *lock;
int i, j, n, ret;
 
@@ -715,6 +701,10 @@ int nilfs_palloc_freev(struct inode *inode, __u64 
*entry_nrs, size_t nitems)
brelse(desc_bh);
return ret;
}
+
+   /* Get the first entry number of the group */
+   group_min_nr = (__u64)group * epg;
+
desc_kaddr = kmap(desc_bh->b_page);
desc = nilfs_palloc_block_get_group_desc(
inode, group, desc_bh, desc_kaddr);
@@ -722,10 +712,10 @@ int nilfs_palloc_freev(struct inode *inode, __u64 
*entry_nrs, size_t nitems)
bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
lock = nilfs_mdt_bgl_lock(inode, group);
for (j = i, n = 0;
-(j < nitems) && nilfs_palloc_group_is_in(inode, group,
- entry_nrs[j]);
+j < nitems && entry_nrs[j] >= group_min_nr &&
+entry_nrs[j] < group_min_nr + epg;
 j++) {
-   nilfs_palloc_group(inode, entry_nrs[j], &group_offset);
+   group_offset = entry_nrs[j] - group_min_nr;
if (!nilfs_clear_bit_atomic(lock, group_offset,
bitmap)) {
nilfs_warning(inode->i_sb, __func__,
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 1/7] nilfs2: drop null test before destroy functions

2015-09-19 Thread Ryusuke Konishi

From: Julia Lawall 

Remove unneeded NULL test.

The semantic patch that makes this change is as follows:
(http://coccinelle.lip6.fr/)

// <smpl>
@@ expression x; @@
-if (x != NULL)
  \(kmem_cache_destroy\|mempool_destroy\|dma_pool_destroy\)(x);
// </smpl>

Signed-off-by: Julia Lawall 
Signed-off-by: Ryusuke Konishi 
---
 fs/nilfs2/super.c | 12 
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index f47585b..c69455a 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1405,14 +1405,10 @@ static void nilfs_destroy_cachep(void)
 */
rcu_barrier();
 
-   if (nilfs_inode_cachep)
-   kmem_cache_destroy(nilfs_inode_cachep);
-   if (nilfs_transaction_cachep)
-   kmem_cache_destroy(nilfs_transaction_cachep);
-   if (nilfs_segbuf_cachep)
-   kmem_cache_destroy(nilfs_segbuf_cachep);
-   if (nilfs_btree_path_cache)
-   kmem_cache_destroy(nilfs_btree_path_cache);
+   kmem_cache_destroy(nilfs_inode_cachep);
+   kmem_cache_destroy(nilfs_transaction_cachep);
+   kmem_cache_destroy(nilfs_segbuf_cachep);
+   kmem_cache_destroy(nilfs_btree_path_cache);
 }
 
 static int __init nilfs_init_cachep(void)
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 3/7] nilfs2: do not call nilfs_mdt_bgl_lock() needlessly

2015-09-19 Thread Ryusuke Konishi

In the bitmap based allocator implementation, nilfs_mdt_bgl_lock()
helper is frequently used to get a spinlock protecting a target block
group.  This reduces its usage and simplifies arguments of some
related functions by directly passing a pointer to the spinlock.

Signed-off-by: Ryusuke Konishi 
---
 fs/nilfs2/alloc.c | 84 ++-
 1 file changed, 40 insertions(+), 44 deletions(-)

diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index afe98364..ff0d62c 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -133,38 +133,34 @@ nilfs_palloc_bitmap_blkoff(const struct inode *inode, 
unsigned long group)
 
 /**
  * nilfs_palloc_group_desc_nfrees - get the number of free entries in a group
- * @inode: inode of metadata file using this allocator
- * @group: group number
  * @desc: pointer to descriptor structure for the group
+ * @lock: spin lock protecting @desc
  */
 static unsigned long
-nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
-  const struct nilfs_palloc_group_desc *desc)
+nilfs_palloc_group_desc_nfrees(const struct nilfs_palloc_group_desc *desc,
+  spinlock_t *lock)
 {
unsigned long nfree;
 
-   spin_lock(nilfs_mdt_bgl_lock(inode, group));
+   spin_lock(lock);
nfree = le32_to_cpu(desc->pg_nfrees);
-   spin_unlock(nilfs_mdt_bgl_lock(inode, group));
+   spin_unlock(lock);
return nfree;
 }
 
 /**
  * nilfs_palloc_group_desc_add_entries - adjust count of free entries
- * @inode: inode of metadata file using this allocator
- * @group: group number
  * @desc: pointer to descriptor structure for the group
+ * @lock: spin lock protecting @desc
  * @n: delta to be added
  */
 static void
-nilfs_palloc_group_desc_add_entries(struct inode *inode,
-   unsigned long group,
-   struct nilfs_palloc_group_desc *desc,
-   u32 n)
+nilfs_palloc_group_desc_add_entries(struct nilfs_palloc_group_desc *desc,
+   spinlock_t *lock, u32 n)
 {
-   spin_lock(nilfs_mdt_bgl_lock(inode, group));
+   spin_lock(lock);
le32_add_cpu(&desc->pg_nfrees, n);
-   spin_unlock(nilfs_mdt_bgl_lock(inode, group));
+   spin_unlock(lock);
 }
 
 /**
@@ -332,17 +328,15 @@ void *nilfs_palloc_block_get_entry(const struct inode 
*inode, __u64 nr,
 
 /**
  * nilfs_palloc_find_available_slot - find available slot in a group
- * @inode: inode of metadata file using this allocator
- * @group: group number
- * @target: offset number of an entry in the group (start point)
  * @bitmap: bitmap of the group
+ * @target: offset number of an entry in the group (start point)
  * @bsize: size in bits
+ * @lock: spin lock protecting @bitmap
  */
-static int nilfs_palloc_find_available_slot(struct inode *inode,
-   unsigned long group,
+static int nilfs_palloc_find_available_slot(unsigned char *bitmap,
unsigned long target,
-   unsigned char *bitmap,
-   int bsize)
+   int bsize,
+   spinlock_t *lock)
 {
int curr, pos, end, i;
 
@@ -351,12 +345,11 @@ static int nilfs_palloc_find_available_slot(struct inode 
*inode,
if (end > bsize)
end = bsize;
pos = nilfs_find_next_zero_bit(bitmap, end, target);
-   if (pos < end &&
-   !nilfs_set_bit_atomic(
-   nilfs_mdt_bgl_lock(inode, group), pos, bitmap))
+   if (pos < end && !nilfs_set_bit_atomic(lock, pos, bitmap))
return pos;
-   } else
+   } else {
end = 0;
+   }
 
for (i = 0, curr = end;
 i < bsize;
@@ -370,10 +363,8 @@ static int nilfs_palloc_find_available_slot(struct inode 
*inode,
if (end > bsize)
end = bsize;
pos = nilfs_find_next_zero_bit(bitmap, end, curr);
-   if ((pos < end) &&
-   !nilfs_set_bit_atomic(
-   nilfs_mdt_bgl_lock(inode, group), pos,
-   bitmap))
+   if (pos < end &&
+   !nilfs_set_bit_atomic(lock, pos, bitmap))
return pos;
}
}
@@ -477,6 +468,7 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
unsigned long group_offset, maxgroup_offset;
unsigned long n, entries_per_group, groups_per_desc_block;
unsigned long i, j;
+   spinlock_t *lock;
int pos, ret;

[PATCH 6/7] nilfs2: add helper functions to delete blocks from dat file

2015-09-19 Thread Ryusuke Konishi

This adds delete functions for data blocks of metadata files using
bitmap based allocator.  nilfs_palloc_delete_entry_block() deletes an
entry block (e.g. block storing dat entries), and
nilfs_palloc_delete_bitmap_block() deletes a bitmap block,
respectively.

These helpers are intended to be used in the successive change on
deallocator of block addresses ("nilfs2: free unused dat file blocks
during garbage collection").

Signed-off-by: Ryusuke Konishi 
---
 fs/nilfs2/alloc.c | 50 ++
 1 file changed, 50 insertions(+)

diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 5b7ee36..225b797 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -236,6 +236,26 @@ static int nilfs_palloc_get_block(struct inode *inode, 
unsigned long blkoff,
 }
 
 /**
+ * nilfs_palloc_delete_block - delete a block on the persistent allocator file
+ * @inode: inode of metadata file using this allocator
+ * @blkoff: block offset
+ * @prev: nilfs_bh_assoc struct of the last used buffer
+ * @lock: spin lock protecting @prev
+ */
+static int nilfs_palloc_delete_block(struct inode *inode, unsigned long blkoff,
+struct nilfs_bh_assoc *prev,
+spinlock_t *lock)
+{
+   spin_lock(lock);
+   if (prev->bh && blkoff == prev->blkoff) {
+   brelse(prev->bh);
+   prev->bh = NULL;
+   }
+   spin_unlock(lock);
+   return nilfs_mdt_delete_block(inode, blkoff);
+}
+
+/**
  * nilfs_palloc_get_desc_block - get buffer head of a group descriptor block
  * @inode: inode of metadata file using this allocator
  * @group: group number
@@ -274,6 +294,22 @@ static int nilfs_palloc_get_bitmap_block(struct inode 
*inode,
 }
 
 /**
+ * nilfs_palloc_delete_bitmap_block - delete a bitmap block
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ */
+static int nilfs_palloc_delete_bitmap_block(struct inode *inode,
+   unsigned long group)
+{
+   struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+   return nilfs_palloc_delete_block(inode,
+nilfs_palloc_bitmap_blkoff(inode,
+   group),
+&cache->prev_bitmap, &cache->lock);
+}
+
+/**
  * nilfs_palloc_get_entry_block - get buffer head of an entry block
  * @inode: inode of metadata file using this allocator
  * @nr: serial number of the entry (e.g. inode number)
@@ -292,6 +328,20 @@ int nilfs_palloc_get_entry_block(struct inode *inode, 
__u64 nr,
 }
 
 /**
+ * nilfs_palloc_delete_entry_block - delete an entry block
+ * @inode: inode of metadata file using this allocator
+ * @nr: serial number of the entry
+ */
+static int nilfs_palloc_delete_entry_block(struct inode *inode, __u64 nr)
+{
+   struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+   return nilfs_palloc_delete_block(inode,
+nilfs_palloc_entry_blkoff(inode, nr),
+&cache->prev_entry, &cache->lock);
+}
+
+/**
  * nilfs_palloc_block_get_group_desc - get kernel address of a group descriptor
  * @inode: inode of metadata file using this allocator
  * @group: group number
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 7/7] nilfs2: free unused dat file blocks during garbage collection

2015-09-19 Thread Ryusuke Konishi

As a nilfs2 volume ages, the amount of available disk space
decreases little by little due to bloat of DAT (disk address
translation) metadata file.  Even if we delete all files in a file
system and free their block addresses from the DAT file through a
garbage collection, empty DAT blocks are not freed.

This fixes the issue by extending the deallocator of block addresses
so that empty data blocks and empty bitmap blocks of DAT are deleted.

The following comparison shows the effect of this patch.  Each shows
disk amount information of a nilfs2 volume that we cleaned out by
deleting all files and running gc after having filled 90% of its
capacity.

Before:
Filesystem 1K-blocks Used Available Use% Mounted on
/dev/sda1  500105212  3022844 472072192   1% /test

After:
Filesystem 1K-blocks Used Available Use% Mounted on
/dev/sda1  500105212    16380 475078656   1% /test

Signed-off-by: Ryusuke Konishi 
---
 fs/nilfs2/alloc.c | 91 ---
 fs/nilfs2/alloc.h |  1 +
 2 files changed, 75 insertions(+), 17 deletions(-)

diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 225b797..b335a32 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -154,13 +154,17 @@ nilfs_palloc_group_desc_nfrees(const struct 
nilfs_palloc_group_desc *desc,
  * @lock: spin lock protecting @desc
  * @n: delta to be added
  */
-static void
+static u32
 nilfs_palloc_group_desc_add_entries(struct nilfs_palloc_group_desc *desc,
spinlock_t *lock, u32 n)
 {
+   u32 nfree;
+
spin_lock(lock);
le32_add_cpu(&desc->pg_nfrees, n);
+   nfree = le32_to_cpu(desc->pg_nfrees);
spin_unlock(lock);
+   return nfree;
 }
 
 /**
@@ -735,12 +739,18 @@ int nilfs_palloc_freev(struct inode *inode, __u64 
*entry_nrs, size_t nitems)
unsigned char *bitmap;
void *desc_kaddr, *bitmap_kaddr;
unsigned long group, group_offset;
-   __u64 group_min_nr;
+   __u64 group_min_nr, last_nrs[8];
const unsigned long epg = nilfs_palloc_entries_per_group(inode);
+   const unsigned epb = NILFS_MDT(inode)->mi_entries_per_block;
+   unsigned entry_start, end, pos;
spinlock_t *lock;
-   int i, j, n, ret;
+   int i, j, k, ret;
+   u32 nfree;
 
for (i = 0; i < nitems; i = j) {
+   int change_group = false;
+   int nempties = 0, n = 0;
+
group = nilfs_palloc_group(inode, entry_nrs[i], &group_offset);
ret = nilfs_palloc_get_desc_block(inode, group, 0, &desc_bh);
if (ret < 0)
@@ -755,17 +765,13 @@ int nilfs_palloc_freev(struct inode *inode, __u64 
*entry_nrs, size_t nitems)
/* Get the first entry number of the group */
group_min_nr = (__u64)group * epg;
 
-   desc_kaddr = kmap(desc_bh->b_page);
-   desc = nilfs_palloc_block_get_group_desc(
-   inode, group, desc_bh, desc_kaddr);
bitmap_kaddr = kmap(bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
lock = nilfs_mdt_bgl_lock(inode, group);
-   for (j = i, n = 0;
-j < nitems && entry_nrs[j] >= group_min_nr &&
-entry_nrs[j] < group_min_nr + epg;
-j++) {
-   group_offset = entry_nrs[j] - group_min_nr;
+
+   j = i;
+   entry_start = rounddown(group_offset, epb);
+   do {
if (!nilfs_clear_bit_atomic(lock, group_offset,
bitmap)) {
nilfs_warning(inode->i_sb, __func__,
@@ -775,18 +781,69 @@ int nilfs_palloc_freev(struct inode *inode, __u64 
*entry_nrs, size_t nitems)
} else {
n++;
}
-   }
-   nilfs_palloc_group_desc_add_entries(desc, lock, n);
+
+   j++;
+   if (j >= nitems || entry_nrs[j] < group_min_nr ||
+   entry_nrs[j] >= group_min_nr + epg) {
+   change_group = true;
+   } else {
+   group_offset = entry_nrs[j] - group_min_nr;
+   if (group_offset >= entry_start &&
+   group_offset < entry_start + epb) {
+   /* This entry is in the same block */
+   continue;
+   }
+   }
+
+   /* Test if the entry block is empty or not */
+   end = entry_start + epb;
+   pos = nilfs_find_next_bit(bitmap, end, entry_start);
+

[PATCH 4/7] nilfs2: refactor nilfs_palloc_find_available_slot()

2015-09-19 Thread Ryusuke Konishi

The current implementation of nilfs_palloc_find_available_slot()
function is overkill.  The underlying bit search routine is well
optimized, so this uses it more simply in
nilfs_palloc_find_available_slot().

Signed-off-by: Ryusuke Konishi 
---
 fs/nilfs2/alloc.c | 48 +---
 1 file changed, 21 insertions(+), 27 deletions(-)

diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index ff0d62c..b15daf8 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -335,39 +335,33 @@ void *nilfs_palloc_block_get_entry(const struct inode 
*inode, __u64 nr,
  */
 static int nilfs_palloc_find_available_slot(unsigned char *bitmap,
unsigned long target,
-   int bsize,
+   unsigned bsize,
spinlock_t *lock)
 {
-   int curr, pos, end, i;
+   int pos, end = bsize;
 
-   if (target > 0) {
-   end = (target + BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1);
-   if (end > bsize)
-   end = bsize;
-   pos = nilfs_find_next_zero_bit(bitmap, end, target);
-   if (pos < end && !nilfs_set_bit_atomic(lock, pos, bitmap))
-   return pos;
-   } else {
-   end = 0;
+   if (likely(target < bsize)) {
+   pos = target;
+   do {
+   pos = nilfs_find_next_zero_bit(bitmap, end, pos);
+   if (pos >= end)
+   break;
+   if (!nilfs_set_bit_atomic(lock, pos, bitmap))
+   return pos;
+   } while (++pos < end);
+
+   end = target;
}
 
-   for (i = 0, curr = end;
-i < bsize;
-i += BITS_PER_LONG, curr += BITS_PER_LONG) {
-   /* wrap around */
-   if (curr >= bsize)
-   curr = 0;
-   while (*((unsigned long *)bitmap + curr / BITS_PER_LONG)
-  != ~0UL) {
-   end = curr + BITS_PER_LONG;
-   if (end > bsize)
-   end = bsize;
-   pos = nilfs_find_next_zero_bit(bitmap, end, curr);
-   if (pos < end &&
-   !nilfs_set_bit_atomic(lock, pos, bitmap))
-   return pos;
-   }
+   /* wrap around */
+   for (pos = 0; pos < end; pos++) {
+   pos = nilfs_find_next_zero_bit(bitmap, end, pos);
+   if (pos >= end)
+   break;
+   if (!nilfs_set_bit_atomic(lock, pos, bitmap))
+   return pos;
}
+
return -ENOSPC;
 }
 
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 2/7] nilfs2: use nilfs_warning() in allocator implementation

2015-09-19 Thread Ryusuke Konishi

This uses nilfs_warning() to replace "printk(KERN_WARNING ...);" in
the bitmap based allocator implementation of nilfs2.  The warning
messages are modified to include the device name and the inode number
in each message.  This makes it clear which metadata file of which
device has output warnings such as "entry number <nr> already freed".

Signed-off-by: Ryusuke Konishi 
---
 fs/nilfs2/alloc.c | 20 
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 8df0f3b..afe98364 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -583,8 +583,10 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
 
if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
group_offset, bitmap))
-   printk(KERN_WARNING "%s: entry number %llu already freed\n",
-  __func__, (unsigned long long)req->pr_entry_nr);
+   nilfs_warning(inode->i_sb, __func__,
+ "entry number %llu already freed: ino=%lu\n",
+ (unsigned long long)req->pr_entry_nr,
+ (unsigned long)inode->i_ino);
else
nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
 
@@ -620,8 +622,10 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
group_offset, bitmap))
-   printk(KERN_WARNING "%s: entry number %llu already freed\n",
-  __func__, (unsigned long long)req->pr_entry_nr);
+   nilfs_warning(inode->i_sb, __func__,
+ "entry number %llu already freed: ino=%lu\n",
+ (unsigned long long)req->pr_entry_nr,
+ (unsigned long)inode->i_ino);
else
nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
 
@@ -734,10 +738,10 @@ int nilfs_palloc_freev(struct inode *inode, __u64 
*entry_nrs, size_t nitems)
if (!nilfs_clear_bit_atomic(
nilfs_mdt_bgl_lock(inode, group),
group_offset, bitmap)) {
-   printk(KERN_WARNING
-  "%s: entry number %llu already freed\n",
-  __func__,
-  (unsigned long long)entry_nrs[j]);
+   nilfs_warning(inode->i_sb, __func__,
+ "entry number %llu already freed: 
ino=%lu\n",
+ (unsigned long long)entry_nrs[j],
+ (unsigned long)inode->i_ino);
} else {
n++;
}
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 0/7] nilfs2 updates

2015-09-19 Thread Ryusuke Konishi

Hi Andrew,

Please queue the following changes for the next merge window:

Julia Lawall (1):
  nilfs2: drop null test before destroy functions

Ryusuke Konishi (6):
  nilfs2: use nilfs_warning() in allocator implementation
  nilfs2: do not call nilfs_mdt_bgl_lock() needlessly
  nilfs2: refactor nilfs_palloc_find_available_slot()
  nilfs2: get rid of nilfs_palloc_group_is_in()
  nilfs2: add helper functions to delete blocks from dat file
  nilfs2: free unused dat file blocks during garbage collection

* Brief summary

> nilfs2: drop null test before destroy functions

This removes null tests before calling kmem_cache_destroy() function.

> nilfs2: use nilfs_warning() in allocator implementation
> nilfs2: do not call nilfs_mdt_bgl_lock() needlessly
> nilfs2: refactor nilfs_palloc_find_available_slot()
> nilfs2: get rid of nilfs_palloc_group_is_in()

These are for refactoring bitmap based object allocator/deallocator
(alloc.c).

> nilfs2: add helper functions to delete blocks from dat file
> nilfs2: free unused dat file blocks during garbage collection

These resolve decline of the available disk space due to bloat of DAT
metadata file.


Thanks,
Ryusuke Konishi
--

 fs/nilfs2/alloc.c | 305 +++---
 fs/nilfs2/alloc.h |   1 +
 fs/nilfs2/super.c |  12 +--
 3 files changed, 203 insertions(+), 115 deletions(-)


--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 02/39] nilfs2: drop null test before destroy functions

2015-09-13 Thread Ryusuke Konishi

On Sun, 13 Sep 2015 14:14:55 +0200, Julia Lawall  wrote:
> Remove unneeded NULL test.
> 
> The semantic patch that makes this change is as follows:
> (http://coccinelle.lip6.fr/)
> 
> // <smpl>
> @@ expression x; @@
> -if (x != NULL)
>   \(kmem_cache_destroy\|mempool_destroy\|dma_pool_destroy\)(x);
> // </smpl>
> 
> Signed-off-by: Julia Lawall 

Looks OK.  I'll queue this in my tree.

Thanks,
Ryusuke Konishi

> 
> ---
>  fs/nilfs2/super.c |   12 
>  1 file changed, 4 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
> index f47585b..c69455a 100644
> --- a/fs/nilfs2/super.c
> +++ b/fs/nilfs2/super.c
> @@ -1405,14 +1405,10 @@ static void nilfs_destroy_cachep(void)
>*/
>   rcu_barrier();
>  
> - if (nilfs_inode_cachep)
> - kmem_cache_destroy(nilfs_inode_cachep);
> - if (nilfs_transaction_cachep)
> - kmem_cache_destroy(nilfs_transaction_cachep);
> - if (nilfs_segbuf_cachep)
> - kmem_cache_destroy(nilfs_segbuf_cachep);
> - if (nilfs_btree_path_cache)
> - kmem_cache_destroy(nilfs_btree_path_cache);
> + kmem_cache_destroy(nilfs_inode_cachep);
> + kmem_cache_destroy(nilfs_transaction_cachep);
> + kmem_cache_destroy(nilfs_segbuf_cachep);
> + kmem_cache_destroy(nilfs_btree_path_cache);
>  }
>  
>  static int __init nilfs_init_cachep(void)
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: nilfs.org pwned

2015-08-29 Thread Ryusuke Konishi

On Wed, 26 Aug 2015 11:30:49 +0100, csm...@csmith-bm.vm.bytemark.co.uk wrote:
> The website, www.nilfs.org, appears to have been defaced. I was going to post
> a link on a forum for it, but decided against it in the end due to the
> defacement.
> 
> Otherwise, NILFS generally rocks, keep up the good work :)
> 
> Christian

Please refer to nilfs.sourceforge.net instead.
We no longer host nilfs.org.

Ryusuke Konishi
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: NILFS2: double uuid

2015-06-09 Thread Ryusuke Konishi

On Tue, 9 Jun 2015 16:07:42 +0200, Karel Zak  wrote:
> On Tue, Jun 09, 2015 at 10:04:15PM +0900, Ryusuke Konishi wrote:
>> $ sudo nilfs-resize -y /dev/sdb1 1G
>> Partition size = 2146435072 bytes.
>> Shrink the filesystem size from 2146435072 bytes to 1073741824 bytes.
>> 128 segments will be truncated from segnum 127.
>> Moving 103 in-use segments.
>> progress |***|
>> Done.
>> 
>> $ sudo umount /test
>> $ sudo mount /dev/sdb1 /test
>> $ sudo LD_LIBRARY_PATH=/usr/local/lib lsblk -f
>> NAME   FSTYPE  LABEL   UUID MOUNTPOINT
>> [...]
>> sdb
>> `-sdb1  /test
>> 
>> This blank state continued until I shrank the partition or
>> re-extended the filesystem to the partition size.
>> 
>> Could you consider confining the s_dev_size test only to the
>> backup superblock ?
> 
> Hmm... why nilfs-resize does not update the size in the superblock?
> It seems like nilfs-resize bug.

nilfs-resize (to be exact, RESIZE ioctl of nilfs2) updates s_dev_size
in both superblocks.  What nilfs-resize doesn't change is the
partition size.  (It needs help of a partitioning tool)

> 
>> It seems that we don't have to drop the primary super block
>> even if s_dev_size doesn't fit to the partition size.
> 
> Yes, fixed. I have also enabled the s_dev_size check for whole-disk
> devices only to minimize number of situations when we rely on the
> s_dev_size.
> 
> Karel

Thanks again.  The updated libblkid/lsblk works flawlessly.

Regards,
Ryusuke Konishi
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: NILFS2: double uuid

2015-06-09 Thread Ryusuke Konishi

Hi,

On 2015/06/09 17:53, Karel Zak wrote:

On Tue, Jun 09, 2015 at 12:31:27AM +0900, Ryusuke Konishi wrote:

It looks like the backup super block should be dropped from candidates
if its device size (sbp->s_dev_size) doesn't match the partition size.

Yeah, fixed:
http://git.kernel.org/cgit/utils/util-linux/util-linux.git/commit/?id=00817742ce360119e079a33e12cf84118ff7c63e

Note that workaround is to not use nilfs2 on the last partition or
have a tiny gap (1 sector is enough) between last partition and the
end of the whole-disk.

Karel

Thanks for your quick work!

I tested the patch. It almost worked fine.
One issue I found is a transient state after fs-resizing.

After shrinking the file system, both superblocks dropped and
lsblk failed to detect the filesystem:

$ sudo LD_LIBRARY_PATH=/usr/local/lib lsblk -f
NAME FSTYPE LABEL UUID
MOUNTPOINT

[...]
sdb
`-sdb1 nilfs2 2d7cd130-82a0-4a3c-b8a8-4ac5a26f5703 /test

$ sudo nilfs-resize -y /dev/sdb1 1G
Partition size = 2146435072 bytes.
Shrink the filesystem size from 2146435072 bytes to 1073741824 bytes.
128 segments will be truncated from segnum 127.
Moving 103 in-use segments.
progress |***|
Done.

$ sudo umount /test
$ sudo mount /dev/sdb1 /test
$ sudo LD_LIBRARY_PATH=/usr/local/lib lsblk -f
NAME FSTYPE LABEL UUID
MOUNTPOINT

[...]
sdb
`-sdb1 /test

This blank state continued until I shrank the partition or
re-extended the filesystem to the partition size.

Could you consider confining the s_dev_size test only to the
backup superblock ?

It seems that we don't have to drop the primary super block
even if s_dev_size doesn't fit to the partition size.

Regards,
Ryusuke Konishi

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html

Re: NILFS2: double uuid

2015-06-08 Thread Ryusuke Konishi

(CCed to Karel Zak)
Hi,

I succeeded in reproducing this issue on Fedora 20, 21, 22 and Debian
jessie.  Also, I could narrow down the issue.

This turned out to be an issue of libblkid in util-linux and
introduced by the commit 5f77ce6f3269 ("libblkid: (nilfs2) check also
backup superblock"):

 * commit 1a38ad5c3271a59c7e51580242a2fbd3b0f16495 --> OK

$ sudo LD_LIBRARY_PATH=/usr/local/lib lsblk --version
lsblk from util-linux 2.24.153-1a38
$ sudo LD_LIBRARY_PATH=/usr/local/lib lsblk -f
NAME   FSTYPE  LABEL   UUID MOUNTPOINT
[...]
sdb
`-sdb1 nilfs2  c6cd2c9c-0291-4f9f-be9b-10ff8e2acbe6 /test

 * commit 5f77ce6f32692b473ffcec4c6f63dbd38cd5eeda  --> NG

$ sudo LD_LIBRARY_PATH=/usr/local/lib lsblk --version
lsblk from util-linux 2.24.154-5f77c
$ sudo LD_LIBRARY_PATH=/usr/local/lib lsblk -f
NAME   FSTYPE  LABEL   UUID MOUNTPOINT
[...]
sdb    nilfs2  c6cd2c9c-0291-4f9f-be9b-10ff8e2acbe6
`-sdb1 nilfs2  c6cd2c9c-0291-4f9f-be9b-10ff8e2acbe6 /test

Here, the backup super block of /dev/sdb1 got detected also for
/dev/sdb by the commit 5f77ce6f3269.

This change has been applied between v2.24 and v2.24.1 of util-linux,
and not yet fixed in the mainline.

It causes the duplicate uuid and leads the UUID mount written in the
fstab file to mount the device itself (i.e. /dev/sdb in this example).
Thus the mount failure happens.

It looks like the backup super block should be dropped from candidates
if its device size (sbp->s_dev_size) doesn't match the partition size.

Regards,
Ryusuke Konishi

On Mon, 08 Jun 2015 19:31:51 +0900, Ryusuke Konishi wrote:
> Hi,
> 
> On 2015/06/08 19:08, Heinz Diehl wrote:
>> On 08.06.2015, Heinz Diehl wrote:
>>
>> To be more precise, here's what works and what don't, in detail
>> (and after a fresh install of Arch):
>>
>> The USB memory is xfs formatted and works fine:
>>
>> [root@alarmpi /]# lsblk -f
>> NAME FSTYPE LABEL UUID MOUNTPOINT
>> sda
>> `-sda1  xfs  ff17dda9-fcae-42e7-a438-9087de58902e
>> mmcblk0
>> |-mmcblk0p1 vfat EA5B-4477/boot
>> `-mmcblk0p2 ext4 c4ddc925-15ab-4465-ac78-967a845e98d5 /
>>
>>
>> Now, it's nilfs2 formatted:
>>
>> [root@alarmpi /]# mkfs.nilfs2 /dev/sda1
>> WARNING: Device /dev/sda1 appears to contain an existing xfs
>> superblock.
>> WARNING: All data will be lost after format!
>>
>> DO YOU REALLY WANT TO FORMAT DEVICE /dev/sda1?
>>
>> Continue? [y/N] y
>> mkfs.nilfs2 (nilfs-utils 2.2.3)
>> Start writing file system initial data to the device
>> Blocksize:4096  Device:/dev/sda1  Device Size:32026656768
>> File system initialization succeeded !!
>>
>> After that, all seems to be ok. lsblk shown no double uuid:
>>
>> [root@alarmpi /]# lsblk -f
>> NAME FSTYPE LABEL UUID MOUNTPOINT
>> sda
>> `-sda1  nilfs2   98da384c-392e-4551-98c0-d076524f5d8b
>> mmcblk0
>> |-mmcblk0p1 vfat EA5B-4477/boot
>> `-mmcblk0p2 ext4 c4ddc925-15ab-4465-ac78-967a845e98d5 /
>> [root@alarmpi /]#
>>
>>
>> Now the USB drive gets manually mounted, all is ok:
>>
>> [root@alarmpi /]# mount /dev/sda1 /USBDRIVE
>> [root@alarmpi /]# lsblk -f
>> NAME FSTYPE LABEL UUID MOUNTPOINT
>> sda
>> `-sda1 nilfs2 98da384c-392e-4551-98c0-d076524f5d8b /USBDRIVE
>> mmcblk0
>> |-mmcblk0p1 vfat EA5B-4477/boot
>> `-mmcblk0p2 ext4 c4ddc925-15ab-4465-ac78-967a845e98d5 /
>>
>>
>> Now, the newly formatted drive is registered in fstab to be
>> automatically mounted on boot:
>>
>> UUID=ff17dda9-fcae-42e7-a438-9087de58902e /USBDRIVE nilfs2 defaults 0
>> 0
>>
>> After rebooting the machine, nothing is mounted, and lsblk shows the
>> double uuid:
>>
>> [root@alarmpi /]# lsblk -f
>> NAME FSTYPE LABEL UUID MOUNTPOINT
>> sda  98da384c-392e-4551-98c0-d076524f5d8b
>> `-sda1  nilfs2   98da384c-392e-4551-98c0-d076524f5d8b
>> mmcblk0
>> |-mmcblk0p1 vfat EA5B-4477/boot
>> `-mmcblk0p2 ext4 c4ddc925-15ab-4465-ac78-967a845e98d5 /
>>
>> The logs say:
>>
>> Jun 08 11:23:47 alarmpi mount: mount.nilfs2: Error while mounting
>> /dev/sda on /USBDRIVE: Device or resource busy
>> Jun 08 11:23:47 alarmpi systemd: Failed to mount /USBDRIVE.
>>
>> Here it becomes clear what happens: the system wants to mount /dev/sda
>> rather than /dev/sda1, and thus fails.
>>
>> Out o

Re: NILFS2: double uuid

2015-06-08 Thread Ryusuke Konishi


Hi,

On 2015/06/08 19:08, Heinz Diehl wrote:

On 08.06.2015, Heinz Diehl wrote:

To be more precise, here's what works and what don't, in detail
(and after a fresh install of Arch):

The USB memory is xfs formatted and works fine:

[root@alarmpi /]# lsblk -f
NAME        FSTYPE LABEL UUID                                 MOUNTPOINT
sda
`-sda1      xfs          ff17dda9-fcae-42e7-a438-9087de58902e
mmcblk0
|-mmcblk0p1 vfat         EA5B-4477                            /boot
`-mmcblk0p2 ext4         c4ddc925-15ab-4465-ac78-967a845e98d5 /


Now, it's nilfs2 formatted:

[root@alarmpi /]# mkfs.nilfs2 /dev/sda1
WARNING: Device /dev/sda1 appears to contain an existing xfs superblock.
WARNING: All data will be lost after format!

DO YOU REALLY WANT TO FORMAT DEVICE /dev/sda1?

Continue? [y/N] y
mkfs.nilfs2 (nilfs-utils 2.2.3)
Start writing file system initial data to the device
Blocksize:4096  Device:/dev/sda1  Device Size:32026656768
File system initialization succeeded !!

After that, all seems to be ok. lsblk shown no double uuid:

[root@alarmpi /]# lsblk -f
NAMEFSTYPE LABEL UUID MOUNTPOINT
sda
`-sda1  nilfs2   98da384c-392e-4551-98c0-d076524f5d8b
mmcblk0
|-mmcblk0p1 vfat EA5B-4477/boot
`-mmcblk0p2 ext4 c4ddc925-15ab-4465-ac78-967a845e98d5 /
[root@alarmpi /]#


Now the USB drive gets manually mounted, all is ok:

[root@alarmpi /]# mount /dev/sda1 /USBDRIVE
[root@alarmpi /]# lsblk -f
NAMEFSTYPE LABEL UUID MOUNTPOINT
sda
`-sda1  nilfs2   98da384c-392e-4551-98c0-d076524f5d8b /USBDRIVE
mmcblk0
|-mmcblk0p1 vfat EA5B-4477/boot
`-mmcblk0p2 ext4 c4ddc925-15ab-4465-ac78-967a845e98d5 /


Now, the newly formatted drive is registered in fstab to be
automatically mounted on boot:

UUID=ff17dda9-fcae-42e7-a438-9087de58902e   /USBDRIVE   nilfs2  
defaults0   0

After rebooting the machine, nothing is mounted, and lsblk shows the
double uuid:

[root@alarmpi /]# lsblk -f
NAMEFSTYPE LABEL UUID MOUNTPOINT
sda  98da384c-392e-4551-98c0-d076524f5d8b
`-sda1  nilfs2   98da384c-392e-4551-98c0-d076524f5d8b
mmcblk0
|-mmcblk0p1 vfat EA5B-4477/boot
`-mmcblk0p2 ext4 c4ddc925-15ab-4465-ac78-967a845e98d5 /

The logs say:

Jun 08 11:23:47 alarmpi mount: mount.nilfs2: Error while mounting /dev/sda on 
/USBDRIVE: Device or resource busy
Jun 08 11:23:47 alarmpi systemd: Failed to mount /USBDRIVE.

Here it becomes clear what happens: the system wants to mount /dev/sda
rather than /dev/sda1, and thus fails.

Out of curiosity, I tried both xfs, ext4 and btrfs, and all of them
just work.


I've tested the same steps as you wrote above (first created an
xfs partition, overrode it with a nilfs2 partition, wrote a similar
entry to fstab, and reboot),  but didn't reproduce the issue.

On my CentOS 7 environment, lsblk and default mount are perfectly
working.

So, it may be a version dependent issue of util-linux.
I will try to reproduce and narrow down the issue with newer util-linux
packages.

Thanks,
Ryusuke Konishi

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: NILFS2: double uuid

2015-06-08 Thread Ryusuke Konishi

(CCed to linux-nilfs@vger.kernel.org)
Hi Heinz,

On 2015/06/08 15:43, Heinz Diehl wrote:

Hi,

a nilfs2 formatted disk fails to mount via fstab due to double uuid's.
See lsblk output below. The logs indicate that the system attempts to
mount /dev/sdb rather than /dev/sdb1, which of course fails. In
addition, /dev/sdb should not have any uuid at all. Don't know why
that happens.

The phenomenon is easily reproducible: format a partition with nilfs2,
register it with the proper uuid in fstab and reboot. Tried both with
USB memory and real HDD.

[root@keera ~]# lsblk -f
NAME   FSTYPE LABEL UUID                                 MOUNTPOINT
sdb                 ff17dda9-fcae-42e7-a438-9087de58902e
`-sdb1 xfs          ff17dda9-fcae-42e7-a438-9087de58902e

Thanks,
  Heinz

On 2015/06/08 15:49, Heinz Diehl wrote:
> On 08.06.2015, Heinz Diehl wrote:
>
>> [root@keera ~]# lsblk -f
>> NAMEFSTYPE LABEL UUID MOUNTPOINT
>> sdb  ff17dda9-fcae-42e7-a438-9087de58902e
>> `-sdb1  xfs  ff17dda9-fcae-42e7-a438-9087de58902e
>
> Copy error: replace xfs with nilfs2. Sorry!

I couldn't reproduce the issue (in a CentOS 7 environment).

Could you tell us the version information of distro,
lsblk, libblkid, nilfs-utils, and kernel you are using ?

The following is an example of mine:

$ lsblk -f
NAME   FSTYPE LABEL  UUID MOUNTPOINT
sda
└─sda1 nilfs2        9dcd01c0-2bc8-41bf-a400-8ad8755aac6a
$ lsblk --version
lsblk from util-linux 2.23.2

$ lscp -V
lscp (nilfs-utils 2.2.3)

$ rpm -q libblkid util-linux
libblkid-2.23.2-22.el7_1.x86_64
util-linux-2.23.2-22.el7_1.x86_64

$ uname -r
4.1.0-rc7

Regards,
Ryusuke Konishi

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v2 7/9] nilfs2: ensure that all dirty blocks are written out

2015-05-31 Thread Ryusuke Konishi

On Sun, 10 May 2015 13:04:18 +0200, Andreas Rohner wrote:
> On 2015-05-09 14:17, Ryusuke Konishi wrote:
>> On Sun,  3 May 2015 12:05:20 +0200, Andreas Rohner wrote:
[...]
>> 
>> Uum. This still looks to have potential for leak of dirty block
>> collection between DAT and SUFILE since this retry is limited by
>> the fixed retry count.
>> 
>> How about adding function temporarily turning off the live block
>> tracking and using it after this propagation loop until log write
>> finishes ?
>> 
>> It would reduce the accuracy of live block count, but is it enough ?
>> How do you think ?  We have to eliminate the possibility of the leak
>> because it can cause file system corruption.  Every checkpoint must be
>> self-contained.
> 
> How exactly could it lead to file system corruption? Maybe I miss
> something important here, but it seems to me, that no corruption is
> possible.
> 
> The nilfs_sufile_flush_cache_node() function only reads in already
> existing blocks. No new blocks are created. If I mark those blocks
> dirty, the btree is not changed at all. If I do not call
> nilfs_bmap_propagate(), then the btree stays unchanged and there are no
> dangling pointers. The resulting checkpoint should be self-contained.

Good point.  As for btree, it looks like no inconsistency issue arises
since nilfs_sufile_flush_cache_node() never inserts new blocks as you
pointed out.  Even though we must also take care of inconsistency between
the sufile header and sufile data blocks, and the block count in the inode as
well, fortunately these look to be ok, too.

However, I still think we should not carry over dirty blocks to the
next segment construction, both to avoid extra checkpoint creation and
to simplify things.

From this viewpoint, I also prefer that nilfs_sufile_flush_cache() and
nilfs_sufile_flush_cache_node() are changed a bit so that they will
skip adjusting su_nlive_blks and su_nlive_lastmod if the sufile block
that includes the segment usage is not marked dirty and only_mark == 0
as well as turning off live block counting temporarily after the
sufile/DAT propagation loop.

> 
> The only problem would be, that I could lose some nlive_blks updates.
> 

Regards,
Ryusuke Konishi
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v2 9/9] nilfs2: prevent starvation of segments protected by snapshots

2015-05-31 Thread Ryusuke Konishi

On Sun, 31 May 2015 20:13:44 +0200, Andreas Rohner wrote:
> On 2015-05-31 18:45, Ryusuke Konishi wrote:
>> On Fri, 22 May 2015 20:10:05 +0200, Andreas Rohner wrote:
>>> On 2015-05-20 16:43, Ryusuke Konishi wrote:
>>>> On Sun,  3 May 2015 12:05:22 +0200, Andreas Rohner wrote:
[...]
>>>>  3. The ratio of the threshold "max_segblks" is hard coded to 50%
>>>> of blocks_per_segment.  It is not clear if the ratio is good
>>>> (versatile).
>>>
>>> The interval and percentage could be set in /etc/nilfs_cleanerd.conf.
>>>
>>> I chose 50% kind of arbitrarily. My intent was to encourage the GC to
>>> check the segment again in the future. I guess anything between 25% and
>>> 75% would also work.
>> 
>> Sound reasonable.
>> 
>> By the way, I am thinking we should move cleanerd into kernel as soon
>> as we can.  It's not only inefficient due to a large amount of data
>> exchange between kernel and user-land, but also is hindering changes
>> like we are trying.  We have to care compatibility unnecessarily due
>> to the early design mistake (i.e. the separation of gc to user-land).
> 
> I am a bit confused. Is it OK if I implement this functionality in
> nilfs_cleanerd for this patch set, or would it be better to implement it
> with a workqueue in the kernel, like you've suggested before?
> 
> If you intend to move nilfs_cleanerd into the kernel anyway, then the
> latter would make more sense to me. Which implementation do you prefer
> for this patch set?

If nilfs_cleanerd will remain in userland, then the userland
implementation looks better.  But, yes, if we will move the cleaner
into kernel, then the kernel implementation looks better because we
may be able to avoid unnecessary API change.  It's a dilemma.

Do you have any good idea to reduce or hide overhead of the
calibration (i.e. traversal rewrite of sufile) in regard to the kernel
implementation ?
I'm inclined to leave that in kernel for now.

Regards,
Ryusuke Konishi

> 
> Regards,
> Andreas Rohner
> 
>> Regards,
>> Ryusuke Konishi
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
>> the body of a message to majord...@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v2 9/9] nilfs2: prevent starvation of segments protected by snapshots

2015-05-31 Thread Ryusuke Konishi

On Fri, 22 May 2015 20:10:05 +0200, Andreas Rohner wrote:
> On 2015-05-20 16:43, Ryusuke Konishi wrote:
>> On Sun,  3 May 2015 12:05:22 +0200, Andreas Rohner wrote:
>>> It doesn't really matter if the number of reclaimable blocks for a
>>> segment is inaccurate, as long as the overall performance is better than
>>> the simple timestamp algorithm and starvation is prevented.
>>>
>>> The following steps will lead to starvation of a segment:
>>>
>>> 1. The segment is written
>>> 2. A snapshot is created
>>> 3. The files in the segment are deleted and the number of live
>>>blocks for the segment is decremented to a very low value
>>> 4. The GC tries to free the segment, but there are no reclaimable
>>>blocks, because they are all protected by the snapshot. To prevent an
>>>infinite loop the GC has to adjust the number of live blocks to the
>>>correct value.
>>> 5. The snapshot is converted to a checkpoint and the blocks in the
>>>segment are now reclaimable.
>>> 6. The GC will never attempt to clean the segment again, because it
>>>looks as if it had a high number of live blocks.
>>>
>>> To prevent this, the already existing padding field of the SUFILE entry
>>> is used to track the number of snapshot blocks in the segment. This
>>> number is only set by the GC, since it collects the necessary
>>> information anyway. So there is no need, to track which block belongs to
>>> which segment. In step 4 of the list above the GC will set the new field
>>> su_nsnapshot_blks. In step 5 all entries in the SUFILE are checked and
>>> entries with a big su_nsnapshot_blks field get their su_nlive_blks field
>>> reduced.
>>>
>>> Signed-off-by: Andreas Rohner 
>> 
>> I still don't know whether this workaround is the way we should take
>> or not.  This patch has several drawbacks:
>> 
>>  1. It introduces overheads to every "chcp cp" operation
>> due to traversal rewrite of sufile.
>> If the ratio of snapshot protected blocks is high, then
>> this overheads will be big.
>> 
>>  2. The traversal rewrite of sufile will causes many sufile blocks will be
>> written out.   If most blocks are protected by a snapshot,
>> more than 4MB of sufile blocks will be written per 1TB capacity.
>> 
>> Even though this rewrite may not happen for contiguous "chcp cp"
>> operations, it still has potential for creating sufile write blocks
>> if the application of nilfs manipulates snapshots frequently.
> 
> I could also implement this functionality in nilfs_cleanerd in
> userspace. Every time a "chcp cp" happens some kind of permanent flag
> like "snapshot_was_recently_deleted" is set at an appropriate location.
> The flag could be returned with GET_SUSTAT ioctl(). Then nilfs_cleanerd
> would, at certain intervals and if the flag is set, check all segments
> with GET_SUINFO ioctl() and set the ones that have potentially invalid
> values with SET_SUINFO ioctl(). After that it would clear the
> "snapshot_was_recently_deleted" flag. What do you think about this idea?

Sorry for my late reply.

I think moving the functionality to cleanerd and notifying some sort
of information to userland through ioctl for that, is a good idea
except that I feel the ioctl should be GET_CPSTAT instead of
GET_SUINFO because it's checkpoint/snapshot related information.

I think the parameter that should be added is a set of statistics
information including the number of deleted snapshots since the file
system was mounted last (1).  The counter (1) can serve as the
"snapshot_was_recently_deleted" flag if it monotonically increases.
Although we could use the timestamp of the last snapshot deletion,
it is less preferable than the counter (1) because the system
clock may be set back, and it also has an issue related to precision.

Note that we must add GET_CPSTAT_V2 (or GET_SUSTAT_V2) and the
corresponding structure (i.e. nilfs_cpstat_v2, or so) since ioctl
codes depend on the size of argument data and it will be changed in
both ioctls; unfortunately, neither GET_CPSTAT nor GET_SUSTAT ioctl is
expandable.  Some ioctls like EVIOCGKEYCODE_V2 will be a reference for
this issue.

> 
> If the policy is "timestamp" the GC would of course skip this scan,
> because it is unnecessary.
> 
>>  3. The ratio of the threshold "max_segblks" is hard coded to 50%
>> of blocks_per_segment.  It is not clear if the ratio is good
>> (versatile).
> 
> The interval and percentage could be set in /etc/nilfs_clean

[PATCH 0/1] NILFS2: support NFSv2 export

2015-05-26 Thread Ryusuke Konishi

Hi Andrew,

please queue the following patch for the next merge window.  It fixes
an NFSv2 related issue reported in:

[1] http://marc.info/?l=linux-fsdevel&m=143104630128997
"[PATCH 0/3] make BTRFS, UDF, NILFS2 work with NFSv2."

Thanks,
Ryusuke Konishi
--
NeilBrown (1):
  NILFS2: support NFSv2 export

 fs/nilfs2/namei.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)


--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 1/1] NILFS2: support NFSv2 export

2015-05-26 Thread Ryusuke Konishi

From: NeilBrown 

The "fh_len" passed to ->fh_to_* is not guaranteed to be the same as
that returned by encode_fh - it may be larger.

With NFSv2, the filehandle is fixed length, so it may appear longer
than expected and be zero-padded.

So we must test that fh_len is at least some value, not exactly equal
to it.

Signed-off-by: NeilBrown 
Signed-off-by: Ryusuke Konishi 
---
 fs/nilfs2/namei.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 2218083..37dd6b0 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -496,8 +496,7 @@ static struct dentry *nilfs_fh_to_dentry(struct super_block 
*sb, struct fid *fh,
 {
struct nilfs_fid *fid = (struct nilfs_fid *)fh;
 
-   if ((fh_len != NILFS_FID_SIZE_NON_CONNECTABLE &&
-fh_len != NILFS_FID_SIZE_CONNECTABLE) ||
+   if (fh_len < NILFS_FID_SIZE_NON_CONNECTABLE ||
(fh_type != FILEID_NILFS_WITH_PARENT &&
 fh_type != FILEID_NILFS_WITHOUT_PARENT))
return NULL;
@@ -510,7 +509,7 @@ static struct dentry *nilfs_fh_to_parent(struct super_block 
*sb, struct fid *fh,
 {
struct nilfs_fid *fid = (struct nilfs_fid *)fh;
 
-   if (fh_len != NILFS_FID_SIZE_CONNECTABLE ||
+   if (fh_len < NILFS_FID_SIZE_CONNECTABLE ||
fh_type != FILEID_NILFS_WITH_PARENT)
return NULL;
 
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 2/9 linux-next] nilfs2: remove dir_pages() declaration

2015-05-24 Thread Ryusuke Konishi

On Sun, 24 May 2015 17:19:42 +0200, Fabian Frederick  wrote:
> dir_pages() is now declared in pagemap.h
> 
> Signed-off-by: Fabian Frederick 
> ---
>  fs/nilfs2/dir.c | 5 -
>  1 file changed, 5 deletions(-)
> 
> diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
> index 0ee0bed..6b8b92b 100644
> --- a/fs/nilfs2/dir.c
> +++ b/fs/nilfs2/dir.c
> @@ -61,11 +61,6 @@ static inline void nilfs_put_page(struct page *page)
>   page_cache_release(page);
>  }
>  
> -static inline unsigned long dir_pages(struct inode *inode)
> -{
> - return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
> -}
> -

Can you include this and similar changes in the first patch
"pagemap.h: declare dir_pages()" ?

The first patch transiently breaks the build because it inserts a
duplicate definition of the dir_pages() inline function until it gets
removed from each file system by the successive patches.

This series looks non-divisible except the patch of ufs.

Regards,
Ryusuke Konishi

>  /*
>   * Return the offset into page `page_nr' of the last valid
>   * byte in that page, plus one.
> -- 
> 2.4.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v2 9/9] nilfs2: prevent starvation of segments protected by snapshots

2015-05-20 Thread Ryusuke Konishi

On Wed, 20 May 2015 23:43:35 +0900 (JST), Ryusuke Konishi wrote:
> On Sun,  3 May 2015 12:05:22 +0200, Andreas Rohner wrote:
>> It doesn't really matter if the number of reclaimable blocks for a
>> segment is inaccurate, as long as the overall performance is better than
>> the simple timestamp algorithm and starvation is prevented.
>> 
>> The following steps will lead to starvation of a segment:
>> 
>> 1. The segment is written
>> 2. A snapshot is created
>> 3. The files in the segment are deleted and the number of live
>>blocks for the segment is decremented to a very low value
>> 4. The GC tries to free the segment, but there are no reclaimable
>>blocks, because they are all protected by the snapshot. To prevent an
>>infinite loop the GC has to adjust the number of live blocks to the
>>correct value.
>> 5. The snapshot is converted to a checkpoint and the blocks in the
>>segment are now reclaimable.
>> 6. The GC will never attempt to clean the segment again, because it
>>looks as if it had a high number of live blocks.
>> 
>> To prevent this, the already existing padding field of the SUFILE entry
>> is used to track the number of snapshot blocks in the segment. This
>> number is only set by the GC, since it collects the necessary
>> information anyway. So there is no need, to track which block belongs to
>> which segment. In step 4 of the list above the GC will set the new field
>> su_nsnapshot_blks. In step 5 all entries in the SUFILE are checked and
>> entries with a big su_nsnapshot_blks field get their su_nlive_blks field
>> reduced.
>> 
>> Signed-off-by: Andreas Rohner 
> 
> I still don't know whether this workaround is the way we should take
> or not.  This patch has several drawbacks:
> 
>  1. It introduces overhead to every "chcp cp" operation
> due to traversal rewrite of sufile.
> If the ratio of snapshot protected blocks is high, then
> this overhead will be big.
> 
>  2. The traversal rewrite of sufile will cause many sufile blocks to be
> written out.   If most blocks are protected by a snapshot,
> more than 4MB of sufile blocks will be written per 1TB capacity.
> 
> Even though this rewrite may not happen for contiguous "chcp cp"
> operations, it still has potential for creating sufile write blocks
> if the application of nilfs manipulates snapshots frequently.
> 
>  3. The ratio of the threshold "max_segblks" is hard coded to 50%
> of blocks_per_segment.  It is not clear if the ratio is good
> (versatile).
> 
> I will add comments inline below.
> 
>> ---
>>  fs/nilfs2/ioctl.c  | 50 +++-
>>  fs/nilfs2/sufile.c | 85 
>> ++
>>  fs/nilfs2/sufile.h |  3 ++
>>  3 files changed, 137 insertions(+), 1 deletion(-)
>> 
>> diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
>> index 40bf74a..431725f 100644
>> --- a/fs/nilfs2/ioctl.c
>> +++ b/fs/nilfs2/ioctl.c
>> @@ -200,6 +200,49 @@ static int nilfs_ioctl_getversion(struct inode *inode, 
>> void __user *argp)
>>  }
>>  
>>  /**
>> + * nilfs_ioctl_fix_starving_segs - fix potentially starving segments
>> + * @nilfs: nilfs object
>> + * @inode: inode object
>> + *
>> + * Description: Scans for segments, which are potentially starving and
>> + * reduces the number of live blocks to less than half of the maximum
>> + * number of blocks in a segment. This requires a scan of the whole SUFILE,
>> + * which can take a long time on certain devices and under certain 
>> conditions.
>> + * To avoid blocking other file system operations for too long the SUFILE is
>> + * scanned in steps of NILFS_SUFILE_STARVING_SEGS_STEP. After each step the
>> + * locks are released and cond_resched() is called.
>> + *
>> + * Return Value: On success, 0 is returned and on error, one of the
>> + * following negative error codes is returned.
>> + *
>> + * %-EIO - I/O error.
>> + *
>> + * %-ENOMEM - Insufficient amount of memory available.
>> + */
> 
>> +static int nilfs_ioctl_fix_starving_segs(struct the_nilfs *nilfs,
>> + struct inode *inode) {
> 
> This "inode" argument is meaningless for this routine.
> Consider passing "sb" instead.
> 
> I feel odd for the function name "fix starving segs".  It looks to
> give a workaround rather than solve the root problem of gc in nilfs.
> It looks like what this patch is doing, is "

Re: [PATCH v2 9/9] nilfs2: prevent starvation of segments protected by snapshots

2015-05-20 Thread Ryusuke Konishi

 + nilfs_transaction_begin(inode->i_sb, &ti, 0);
> +
> + ret = nilfs_sufile_fix_starving_segs(nilfs->ns_sufile, i,
> + NILFS_SUFILE_STARVING_SEGS_STEP);
> + if (unlikely(ret < 0)) {
> + nilfs_transaction_abort(inode->i_sb);
> + break;
> + }
> +
> + nilfs_transaction_commit(inode->i_sb); /* never fails */
> + cond_resched();
> + }
> +
> + return ret;
> +}
> +
> +/**
>   * nilfs_ioctl_change_cpmode - change checkpoint mode (checkpoint/snapshot)
>   * @inode: inode object
>   * @filp: file object
> @@ -224,7 +267,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, 
> struct file *filp,
>   struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
>   struct nilfs_transaction_info ti;
>   struct nilfs_cpmode cpmode;
> - int ret;
> + int ret, is_snapshot;
>  
>   if (!capable(CAP_SYS_ADMIN))
>   return -EPERM;
> @@ -240,6 +283,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, 
> struct file *filp,
>   mutex_lock(&nilfs->ns_snapshot_mount_mutex);
>  
>   nilfs_transaction_begin(inode->i_sb, &ti, 0);
> + is_snapshot = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile, cpmode.cm_cno);
>   ret = nilfs_cpfile_change_cpmode(
>   nilfs->ns_cpfile, cpmode.cm_cno, cpmode.cm_mode);
>   if (unlikely(ret < 0))
> @@ -248,6 +292,10 @@ static int nilfs_ioctl_change_cpmode(struct inode 
> *inode, struct file *filp,
>   nilfs_transaction_commit(inode->i_sb); /* never fails */
>  
>   mutex_unlock(&nilfs->ns_snapshot_mount_mutex);
> +

> + if (is_snapshot > 0 && cpmode.cm_mode == NILFS_CHECKPOINT &&
> + nilfs_feature_track_live_blks(nilfs))
> + ret = nilfs_ioctl_fix_starving_segs(nilfs, inode);

Should we use this return value ?
This doesn't relate to the success and failure of "chcp" operation.

nilfs_ioctl_fix_starving_segs() is called every time "chcp cp" is
called.  I prefer to delay this extra work with a workqueue and to
skip starting a new work if the previous work is still running.

>  out:
>   mnt_drop_write_file(filp);
>   return ret;
> diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
> index 9cd8820d..47e2c05 100644
> --- a/fs/nilfs2/sufile.c
> +++ b/fs/nilfs2/sufile.c
> @@ -1215,6 +1215,91 @@ out_sem:
>  }
>  
>  /**
> + * nilfs_sufile_fix_starving_segs - fix potentially starving segments
> + * @sufile: inode of segment usage file
> + * @segnum: segnum to start
> + * @nsegs: number of segments to check
> + *
> + * Description: Scans for segments, which are potentially starving and
> + * reduces the number of live blocks to less than half of the maximum
> + * number of blocks in a segment. This way the segment is more likely to be
> + * chosen by the GC. A segment is marked as potentially starving, if more
> + * than half of the blocks it contains are protected by snapshots.
> + *
> + * Return Value: On success, 0 is returned and on error, one of the
> + * following negative error codes is returned.
> + *
> + * %-EIO - I/O error.
> + *
> + * %-ENOMEM - Insufficient amount of memory available.
> + */
> +int nilfs_sufile_fix_starving_segs(struct inode *sufile, __u64 segnum,
> +__u64 nsegs)
> +{
> + struct buffer_head *su_bh;
> + struct nilfs_segment_usage *su;
> + size_t n, i, susz = NILFS_MDT(sufile)->mi_entry_size;
> + struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
> + void *kaddr;
> + unsigned long maxnsegs, segusages_per_block;
> + __u32 max_segblks = nilfs->ns_blocks_per_segment >> 1;
> + int ret = 0, blkdirty, dirty = 0;
> +
> + down_write(&NILFS_MDT(sufile)->mi_sem);
> +

> + maxnsegs = nilfs_sufile_get_nsegments(sufile);
> + segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile);
> + nsegs += segnum;
> + if (nsegs > maxnsegs)
> + nsegs = maxnsegs;
> +
> + while (segnum < nsegs) {

This local variable "nsegs" is used as an (exclusive) end segment number.
It's confusing.   You should define "end" variable separately.
It can be simply calculated by:

end = min_t(__u64, segnum + nsegs, nilfs_sufile_get_nsegments(sufile));

("maxnsegs" can be removed.)

Note that the evaluation of each argument will never be done twice in
min_t() macro since min_t() temporarily stores the evaluation results
to hidden local variables and uses them for comparison.

Regards,
Ryusuke Konishi

Re: [PATCH 2/3] NILFS2: support NFSv2 export

2015-05-10 Thread Ryusuke Konishi

On Fri, 08 May 2015 10:16:23 +1000, NeilBrown  wrote:
> The "fh_len" passed to ->fh_to_* is not guaranteed to be the same as
> that returned by encode_fh - it may be larger.
> 
> With NFSv2, the filehandle is fixed length, so it may appear longer
> than expected and be zero-padded.
> 
> So we must test that fh_len is at least some value, not exactly equal
> to it.
> 
> Signed-off-by: NeilBrown 
> ---
>  fs/nilfs2/namei.c |6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
> index 22180836ec22..b65fb79d16fd 100644
> --- a/fs/nilfs2/namei.c
> +++ b/fs/nilfs2/namei.c
> @@ -496,8 +496,8 @@ static struct dentry *nilfs_fh_to_dentry(struct 
> super_block *sb, struct fid *fh,
>  {
>   struct nilfs_fid *fid = (struct nilfs_fid *)fh;
>  
> - if ((fh_len != NILFS_FID_SIZE_NON_CONNECTABLE &&
> -  fh_len != NILFS_FID_SIZE_CONNECTABLE) ||

> + if ((fh_len < NILFS_FID_SIZE_NON_CONNECTABLE &&
> +  fh_len < NILFS_FID_SIZE_CONNECTABLE) ||
>   (fh_type != FILEID_NILFS_WITH_PARENT &&
>fh_type != FILEID_NILFS_WITHOUT_PARENT))
>   return NULL;

A bit weird.  "fh_len < NILFS_FID_SIZE_CONNECTABLE" implies "fh_len <
NILFS_FID_SIZE_NON_CONNECTABLE".

How about the following fix ?

if ((fh_type != FILEID_NILFS_WITH_PARENT ||
 fh_len < NILFS_FID_SIZE_CONNECTABLE) &&
(fh_type != FILEID_NILFS_WITHOUT_PARENT ||
 fh_len < NILFS_FID_SIZE_NON_CONNECTABLE))
return NULL;

Regards,
Ryusuke Konishi

> @@ -510,7 +510,7 @@ static struct dentry *nilfs_fh_to_parent(struct 
> super_block *sb, struct fid *fh,
>  {
>   struct nilfs_fid *fid = (struct nilfs_fid *)fh;
>  
> - if (fh_len != NILFS_FID_SIZE_CONNECTABLE ||
> + if (fh_len < NILFS_FID_SIZE_CONNECTABLE ||
>   fh_type != FILEID_NILFS_WITH_PARENT)
>   return NULL;
>  
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v2 4/9] nilfs2: add kmem_cache for SUFILE cache nodes

2015-05-09 Thread Ryusuke Konishi

On Sat, 09 May 2015 21:10:21 +0200, Andreas Rohner wrote:
> On 2015-05-09 04:41, Ryusuke Konishi wrote:
>> On Sun,  3 May 2015 12:05:17 +0200, Andreas Rohner wrote:
>>> +static void nilfs_sufile_cache_node_init_once(void *obj)
>>> +{
>>> +   memset(obj, 0, sizeof(struct nilfs_sufile_cache_node));
>>> +}
>>> +
>> 
>> Note that nilfs_sufile_cache_node_init_once() is only called when each
>> cache entry is allocated first time.  It doesn't ensure each cache
>> entry is clean when it will be allocated with kmem_cache_alloc()
>> the second time and afterwards.
> 
> I kind of assumed it would be called for every object returned by
> kmem_cache_alloc(). In that case I have to do the initialization in
> nilfs_sufile_alloc_cache_node() and remove this function.
> 
> Regards,
> Andreas Rohner

You can use kmem_cache_zalloc() instead in that case.

Regards,
Ryusuke Konishi
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v2 1/9] nilfs2: copy file system feature flags to the nilfs object

2015-05-08 Thread Ryusuke Konishi

On Sun,  3 May 2015 12:05:14 +0200, Andreas Rohner wrote:
> This patch adds three new attributes to the nilfs object, which contain
> a copy of the feature flags from the super block. This can be used, to
> efficiently test whether file system feature flags are set or not.
> 
> Signed-off-by: Andreas Rohner 
> ---
>  fs/nilfs2/the_nilfs.c | 4 
>  fs/nilfs2/the_nilfs.h | 8 
>  2 files changed, 12 insertions(+)
> 
> diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
> index 69bd801..606fdfc 100644
> --- a/fs/nilfs2/the_nilfs.c
> +++ b/fs/nilfs2/the_nilfs.c
> @@ -630,6 +630,10 @@ int init_nilfs(struct the_nilfs *nilfs, struct 
> super_block *sb, char *data)
>   get_random_bytes(&nilfs->ns_next_generation,
>sizeof(nilfs->ns_next_generation));
>  
> + nilfs->ns_feature_compat = le64_to_cpu(sbp->s_feature_compat);
> + nilfs->ns_feature_compat_ro = le64_to_cpu(sbp->s_feature_compat_ro);
> + nilfs->ns_feature_incompat = le64_to_cpu(sbp->s_feature_incompat);

Consider moving these initializations to just before calling
nilfs_check_feature_compatibility().

It uses compat flags, and I'd like to unfold the function using these
internal variables sometime.

> +
>   err = nilfs_store_disk_layout(nilfs, sbp);
>   if (err)
>   goto failed_sbh;
> diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
> index 23778d3..12cd91d 100644
> --- a/fs/nilfs2/the_nilfs.h
> +++ b/fs/nilfs2/the_nilfs.h
> @@ -101,6 +101,9 @@ enum {
>   * @ns_dev_kobj: /sys/fs//
>   * @ns_dev_kobj_unregister: completion state
>   * @ns_dev_subgroups:  subgroups pointer
> + * @ns_feature_compat: Compatible feature set
> + * @ns_feature_compat_ro: Read-only compatible feature set
> + * @ns_feature_incompat: Incompatible feature set
>   */
>  struct the_nilfs {
>   unsigned long   ns_flags;
> @@ -201,6 +204,11 @@ struct the_nilfs {
>   struct kobject ns_dev_kobj;
>   struct completion ns_dev_kobj_unregister;
>   struct nilfs_sysfs_dev_subgroups *ns_dev_subgroups;
> +
> + /* Features */
> + __u64   ns_feature_compat;
> + __u64   ns_feature_compat_ro;
> + __u64   ns_feature_incompat;
>  };
>  
>  #define THE_NILFS_FNS(bit, name) \
> -- 
> 2.3.7
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] nilfs2: fix sanity check of btree level in nilfs_btree_root_broken()

2015-04-29 Thread Ryusuke Konishi

The range check for b-tree level parameter in nilfs_btree_root_broken()
is wrong; it accepts the case of "level == NILFS_BTREE_LEVEL_MAX" even
though the level is limited to values in the range of 0 to
(NILFS_BTREE_LEVEL_MAX - 1).

Since the level parameter is read from storage device and used to index
nilfs_btree_path array whose element count is NILFS_BTREE_LEVEL_MAX, it
can cause memory overrun during btree operations if the boundary value
is set to the level parameter on device.

This fixes the broken sanity check and adds a comment to clarify that
the upper bound NILFS_BTREE_LEVEL_MAX is exclusive.

Signed-off-by: Ryusuke Konishi 
Cc: 
---
 fs/nilfs2/btree.c | 2 +-
 include/linux/nilfs2_fs.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 059f371..919fd5b 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -388,7 +388,7 @@ static int nilfs_btree_root_broken(const struct 
nilfs_btree_node *node,
nchildren = nilfs_btree_node_get_nchildren(node);
 
if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN ||
-level > NILFS_BTREE_LEVEL_MAX ||
+level >= NILFS_BTREE_LEVEL_MAX ||
 nchildren < 0 ||
 nchildren > NILFS_BTREE_ROOT_NCHILDREN_MAX)) {
pr_crit("NILFS: bad btree root (inode number=%lu): level = %d, 
flags = 0x%x, nchildren = %d\n",
diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h
index ff3fea3..9abb763 100644
--- a/include/linux/nilfs2_fs.h
+++ b/include/linux/nilfs2_fs.h
@@ -460,7 +460,7 @@ struct nilfs_btree_node {
 /* level */
 #define NILFS_BTREE_LEVEL_DATA  0
 #define NILFS_BTREE_LEVEL_NODE_MIN  (NILFS_BTREE_LEVEL_DATA + 1)
-#define NILFS_BTREE_LEVEL_MAX   14
+#define NILFS_BTREE_LEVEL_MAX   14 /* Max level (exclusive) */
 
 /**
  * struct nilfs_palloc_group_desc - block group descriptor
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 0/2] nilfs2: fix issues with nilfs_set_inode_flags()

2015-04-08 Thread Ryusuke Konishi

Hi Andrew,

Please queue the following changes for the next merge window:

Ryusuke Konishi (2):
  nilfs2: put out gfp mask manipulation from nilfs_set_inode_flags()
  nilfs2: use inode_set_flags() in nilfs_set_inode_flags()

These fix issues related to nilfs_set_inode_flags() function.

Thanks,
Ryusuke Konishi
--

 fs/nilfs2/inode.c | 19 ++-
 1 file changed, 10 insertions(+), 9 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 1/2] nilfs2: put out gfp mask manipulation from nilfs_set_inode_flags()

2015-04-08 Thread Ryusuke Konishi

nilfs_set_inode_flags() function adjusts gfp-mask of inode->i_mapping
as well as i_flags, however, this coupling of operations is not
appropriate.

For instance, nilfs_ioctl_setflags(), one of three callers of
nilfs_set_inode_flags(), doesn't need to reinitialize the gfp-mask at
all.  In addition, nilfs_new_inode(), another caller of
nilfs_set_inode_flags(), doesn't either because it has already
initialized the gfp-mask.

Only __nilfs_read_inode(), the remaining caller, needs it.  So, this
moves the gfp mask manipulation to __nilfs_read_inode() from
nilfs_set_inode_flags().

Signed-off-by: Ryusuke Konishi 
---
 fs/nilfs2/inode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index cf9e489..0c28ccb 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -456,8 +456,6 @@ void nilfs_set_inode_flags(struct inode *inode)
inode->i_flags |= S_NOATIME;
if (flags & FS_DIRSYNC_FL)
inode->i_flags |= S_DIRSYNC;
-   mapping_set_gfp_mask(inode->i_mapping,
-mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
 }
 
 int nilfs_read_inode_common(struct inode *inode,
@@ -542,6 +540,8 @@ static int __nilfs_read_inode(struct super_block *sb,
brelse(bh);
up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
nilfs_set_inode_flags(inode);
+   mapping_set_gfp_mask(inode->i_mapping,
+mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
return 0;
 
  failed_unmap:
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 2/2] nilfs2: use inode_set_flags() in nilfs_set_inode_flags()

2015-04-08 Thread Ryusuke Konishi

Use inode_set_flags() to atomically set i_flags instead of clearing
out the S_IMMUTABLE, S_APPEND, etc. flags and then setting them from
the FS_IMMUTABLE_FL, FS_APPEND_FL flags to avoid a race where an
immutable file has the immutable flag cleared for a brief window of
time.

This is a similar fix to commit 5f16f3225b06 ("ext4: atomically set
inode->i_flags in ext4_set_inode_flags()").

Signed-off-by: Ryusuke Konishi 
Cc: "Theodore Ts'o" 
---
 fs/nilfs2/inode.c | 15 ---
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 0c28ccb..72c7fbf 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -443,19 +443,20 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t 
mode)
 void nilfs_set_inode_flags(struct inode *inode)
 {
unsigned int flags = NILFS_I(inode)->i_flags;
+   unsigned int new_fl = 0;
 
-   inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME |
-   S_DIRSYNC);
if (flags & FS_SYNC_FL)
-   inode->i_flags |= S_SYNC;
+   new_fl |= S_SYNC;
if (flags & FS_APPEND_FL)
-   inode->i_flags |= S_APPEND;
+   new_fl |= S_APPEND;
if (flags & FS_IMMUTABLE_FL)
-   inode->i_flags |= S_IMMUTABLE;
+   new_fl |= S_IMMUTABLE;
if (flags & FS_NOATIME_FL)
-   inode->i_flags |= S_NOATIME;
+   new_fl |= S_NOATIME;
if (flags & FS_DIRSYNC_FL)
-   inode->i_flags |= S_DIRSYNC;
+   new_fl |= S_DIRSYNC;
+   inode_set_flags(inode, new_fl, S_SYNC | S_APPEND | S_IMMUTABLE |
+   S_NOATIME | S_DIRSYNC);
 }
 
 int nilfs_read_inode_common(struct inode *inode,
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] lib/nilfs: add missing initialization of nilfs->n_opts

2015-04-03 Thread Ryusuke Konishi

Options of lib/nilfs are stored in the n_opts member of the nilfs object.
This variable is not initialized in the current implementation of the
nilfs_open API.  Thus, it can cause indeterminate behavior.

This issue is originally pointed out by Andreas Rohner in his rfc
patch titled "nilfs-utils: add support for tracking live blocks".

This fixes it separately.

Signed-off-by: Andreas Rohner 
Signed-off-by: Ryusuke Konishi 
---
 lib/nilfs.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/nilfs.c b/lib/nilfs.c
index 30db654..1d18ffc 100644
--- a/lib/nilfs.c
+++ b/lib/nilfs.c
@@ -381,6 +381,7 @@ struct nilfs *nilfs_open(const char *dev, const char *dir, 
int flags)
nilfs->n_iocfd = -1;
nilfs->n_dev = NULL;
nilfs->n_ioc = NULL;
+   nilfs->n_opts = 0;
nilfs->n_mincno = NILFS_CNO_MIN;
memset(nilfs->n_sems, 0, sizeof(nilfs->n_sems));
 
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] nilfs2: fix gcc warning at nilfs_checkpoint_is_mounted()

2015-04-02 Thread Ryusuke Konishi

Fix the following build warning:

 fs/nilfs2/super.c: In function 'nilfs_checkpoint_is_mounted':
 fs/nilfs2/super.c:1023:10: warning: comparison of unsigned expression < 0 is 
always false [-Wtype-limits]
   if (cno < 0 || cno > nilfs->ns_cno)
   ^

This warning indicates that the comparison "cno < 0" is useless
because variable "cno" has an unsigned integer type "__u64".

Reported-by: David Binderman 
Signed-off-by: Ryusuke Konishi 
---
 fs/nilfs2/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 5bc2a1c..c1725f20 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1020,7 +1020,7 @@ int nilfs_checkpoint_is_mounted(struct super_block *sb, 
__u64 cno)
struct dentry *dentry;
int ret;
 
-   if (cno < 0 || cno > nilfs->ns_cno)
+   if (cno > nilfs->ns_cno)
return false;
 
if (cno >= nilfs_last_cno(nilfs))
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 5/7] nilfs2: add bmap function to seek a valid key

2015-03-25 Thread Ryusuke Konishi

Hi Andrew,

On Thu, 12 Mar 2015 23:38:12 +0900, Ryusuke Konishi wrote:
> Add a new bmap function, nilfs_bmap_seek_key(), which seeks a valid
> entry and returns its key starting from a given key.  This function
> can be used to skip hole blocks efficiently.
> 
> Signed-off-by: Ryusuke Konishi 
> ---

> +static int nilfs_btree_seek_key(const struct nilfs_bmap *btree, __u64 start,
> + __u64 *keyp)
> +{
> + struct nilfs_btree_path *path;
> + const int minlevel = NILFS_BTREE_LEVEL_NODE_MIN;
> + int ret;
> +

> + if (start > NILFS_BTREE_KEY_MAX)
> + return -ENOENT;

Could you apply the following amendment ?

I've got a report from Dan Carpenter that this range check causes a
warning:

 fs/nilfs2/btree.c:1611 nilfs_btree_seek_key() warn: impossible condition 
'(start > (~0)) => (0-u64max > u64max)'

A range check was needed there in a wip patch, but we no longer need
it.

Thanks,
Ryusuke Konishi
--
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 841d177..059f371 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -1608,9 +1608,6 @@ static int nilfs_btree_seek_key(const struct nilfs_bmap 
*btree, __u64 start,
const int minlevel = NILFS_BTREE_LEVEL_NODE_MIN;
int ret;
 
-   if (start > NILFS_BTREE_KEY_MAX)
-   return -ENOENT;
-
path = nilfs_btree_alloc_path();
if (!path)
return -ENOMEM;
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 0/9] nilfs2: implementation of cost-benefit GC policy

2015-03-14 Thread Ryusuke Konishi

On Sat, 14 Mar 2015 13:24:25 +0100, Andreas Rohner wrote:
> Hi Ryusuke,
> 
> Thank you very much for your detailed review and feedback. I agree with
> all of your points and I will start working on a rewrite immediately.
> 
> On 2015-03-12 13:54, Ryusuke Konishi wrote:
>> Hi Andreas,
>> 
>> On Tue, 10 Mar 2015 21:37:50 +0100, Andreas Rohner wrote:
>>> Hi Ryusuke,
>>>
>>> Thanks for your thorough review.
>>>
>>> On 2015-03-10 06:21, Ryusuke Konishi wrote:
>>>> Hi Andreas,
>>>>
>>>> I looked through whole kernel patches and a part of util patches.
>>>> Overall comments are as follows:
>>>>
>>>> [Algorithm]
>>>> As for algorithm, it looks about OK except for the starvation
>>>> countermeasure.  The starvation countermeasure looks adhoc/hacky, but
>>>> it's good that it doesn't change kernel/userland interface; we may be
>>>> able to replace it with better ways in a future or in a revised
>>>> version of this patchset.
>>>>
>>>> (1) Drawback of the starvation countermeasure
>>>> The patch 9/9 looks to make the execution time of chcp operation
>>>> worse since it will scan through sufile to modify live block
>>>> counters.  How much does it prolong the execution time ?
>>>
>>> I'll do some tests, but I haven't noticed any significant performance
>>> drop. The GC basically does the same thing, every time it selects
>>> segments to reclaim.
>> 
>> GC is performed in background by an independent process.  What I
>> care about is that NILFS_IOCTL_CHANGE_CPMODE ioctl is called from
>> command line interface or application.  They differ in this meaning.
>> 
>> Was a worst case scenario considered in the test ?
>> 
>> For example:
>> 1. Fill a TB class drive with data file(s), and make a snapshot on it.
>> 2. Run one pass GC to update snapshot block counts.
>> 3. And do "chcp cp"
>> 
>> If we don't observe noticeable delay on this class of drive, then I
>> think we can put the problem off.
> 
> Yesterday I did a worst case test as you suggested. I used an old 1 TB
> hard drive I had lying around. This was my setup:
> 
> 1. Write a 850GB file
> 2. Create a snapshot
> 3. Delete the file
> 4. Let GC run through all segments
> 5. Verify with lssu that the GC has updated all SUFILE entries
> 6. Drop the page cache
> 7. chcp cp
> 
> The following results are with the page cache dropped immediately before
> each call:
> 
> 1. chcp ss
> real  0m1.337s
> user  0m0.017s
> sys   0m0.030s
> 
> 2. chcp cp
> real  0m6.377s
> user  0m0.023s
> sys   0m0.053s
> 
> The following results are without the drop of the page cache:
> 
> 1. chcp ss
> real  0m0.137s
> user  0m0.010s
> sys   0m0.000s
> 
> 2. chcp cp
> real  0m0.016s
> user  0m0.010s
> sys   0m0.007s
> 
> There are 119233 segments in my test. Each SUFILE entry uses 32 bytes.
> So the worst case for 1 TB with 8 MB segments would be 3.57 MB of random
> reads and one 3.57 MB continuous write. You only get 6.377s because my
> hard drive is so slow. You wouldn't notice any difference on a modern
> SSD. Furthermore the SUFILE is also scanned by the segment allocation
> algorithm and the GC, so it is very likely already in the page cache.

6.377s is too long because nilfs_sufile_fix_starving_segs() locks
sufile mi_sem, and even lengthens lock period of the following locks:

 - cpfile mi_sem (held at nilfs_cpfile_clear_snapshot()).
 - transaction lock (held at nilfs_ioctl_change_cpmode()).
 - ns_snapshot_mount_mutex (held at nilfs_ioctl_change_cpmode()).

leading to freeze of all write operations, lssu, lscp, cleanerd, and
snapshot mount, etc.

It is preferable for the function to be moved outside of them and to
release/reacquire transaction lock and sufile mi_sem regularly in some
way.

Regards,
Ryusuke Konishi
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 9/9] nilfs2: prevent starvation of segments protected by snapshots

2015-03-14 Thread Ryusuke Konishi


One more comment.

On Sat, 14 Mar 2015 12:51:09 +0900 (JST), Ryusuke Konishi wrote:
> On Tue, 24 Feb 2015 20:01:44 +0100, Andreas Rohner wrote:
>> @@ -1050,6 +1069,85 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, 
>> void *buf,
>>  }
>>  
>>  /**
>> + * nilfs_sufile_fix_starving_segs - fix potentially starving segments
>> + * @sufile: inode of segment usage file
>> + *
>> + * Description: Scans for segments, which are potentially starving and
>> + * reduces the number of live blocks to less than half of the maximum
>> + * number of blocks in a segment. This way the segment is more likely to be
>> + * chosen by the GC. A segment is marked as potentially starving, if more
>> + * than half of the blocks it contains are protected by snapshots.
>> + *
>> + * Return Value: On success, 0 is returned and on error, one of the
>> + * following negative error codes is returned.
>> + *
>> + * %-EIO - I/O error.
>> + *
>> + * %-ENOMEM - Insufficient amount of memory available.
>> + */
>> +int nilfs_sufile_fix_starving_segs(struct inode *sufile)
>> +{
>> +struct buffer_head *su_bh;
>> +struct nilfs_segment_usage *su;
>> +size_t n, i, susz = NILFS_MDT(sufile)->mi_entry_size;
>> +struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
>> +void *kaddr;
>> +unsigned long nsegs, segusages_per_block;
>> +__u32 max_segblks = nilfs->ns_blocks_per_segment / 2;
>> +__u64 segnum = 0;
>> +int ret = 0, blkdirty, dirty = 0;
>> +
>> +down_write(&NILFS_MDT(sufile)->mi_sem);
>> +
>> +segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile);
>> +nsegs = nilfs_sufile_get_nsegments(sufile);
>> +
>> +while (segnum < nsegs) {
>> +n = nilfs_sufile_segment_usages_in_block(sufile, segnum,
>> + nsegs - 1);
>> +
>> +ret = nilfs_sufile_get_segment_usage_block(sufile, segnum,
>> +   0, &su_bh);
>> +if (ret < 0) {
>> +if (ret != -ENOENT)
>> +goto out;
>> +/* hole */
>> +segnum += n;
>> +continue;
>> +}
>> +
>> +kaddr = kmap_atomic(su_bh->b_page);
>> +su = nilfs_sufile_block_get_segment_usage(sufile, segnum,
>> +  su_bh, kaddr);
>> +blkdirty = 0;
>> +for (i = 0; i < n; ++i, ++segnum, su = (void *)su + susz) {
>> +if (le32_to_cpu(su->su_nsnapshot_blks) <= max_segblks)
>> +continue;
>> +
>> +if (su->su_nlive_blks <= max_segblks)
>> +continue;
>> +
>> +su->su_nlive_blks = max_segblks;
>> +blkdirty = 1;
>> +}
>> +
>> +kunmap_atomic(kaddr);
>> +if (blkdirty) {
>> +mark_buffer_dirty(su_bh);
>> +dirty = 1;
>> +}
>> +put_bh(su_bh);

Insert cond_resched() here to mitigate latency issue (mainly for the
environment in which voluntary preemption is turned off).

Regards,
Ryusuke Konishi

>> +}
>> +
>> +out:
>> +if (dirty)
>> +nilfs_mdt_mark_dirty(sufile);
>> +
>> +up_write(&NILFS_MDT(sufile)->mi_sem);
>> +return ret;
>> +}
>> +
>> +/**
>>   * nilfs_sufile_trim_fs() - trim ioctl handle function
>>   * @sufile: inode of segment usage file
>>   * @range: fstrim_range structure
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 9/9] nilfs2: prevent starvation of segments protected by snapshots

2015-03-14 Thread Ryusuke Konishi

On Sat, 14 Mar 2015 13:36:35 +0100, Andreas Rohner wrote:
> On 2015-03-14 04:51, Ryusuke Konishi wrote:
>> On Tue, 24 Feb 2015 20:01:44 +0100, Andreas Rohner wrote:
>>> diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h
>>> index 6ffdc09..a3c7593 100644
>>> --- a/include/linux/nilfs2_fs.h
>>> +++ b/include/linux/nilfs2_fs.h
>>> @@ -222,11 +222,13 @@ struct nilfs_super_block {
>>>   */
>>>  #define NILFS_FEATURE_COMPAT_SUFILE_EXTENSION  (1ULL << 0)
>>>  #define NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS   (1ULL << 1)
>>> +#define NILFS_FEATURE_COMPAT_TRACK_SNAPSHOTS   (1ULL << 2)
>>>  
>>>  #define NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT(1ULL << 0)
>>>  
>>>  #define NILFS_FEATURE_COMPAT_SUPP  (NILFS_FEATURE_COMPAT_SUFILE_EXTENSION \
>>> -   | NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS)
>>> +   | NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS \
>>> +   | NILFS_FEATURE_COMPAT_TRACK_SNAPSHOTS)
>>>  #define NILFS_FEATURE_COMPAT_RO_SUPP   
>>> NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT
>>>  #define NILFS_FEATURE_INCOMPAT_SUPP0ULL
>>>  
>> 
>> You don't have to add three compat flags just for this one patchset.
>> Please unify it.
>> 
>> #define NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS (1ULL << 0)
>> 
>> looks to be enough.
> 
> I could merge the TRACK_LIVE_BLKS and TRACK_SNAPSHOTS flag, but I would
> suggest to at least leave the SUFILE_EXTENSION flag (maybe with a
> different name). The SUFILE_EXTENSION flag has to be set at mkfs time
> and it cannot be set or removed later, because you cannot change the on
> disk format later. I actually set SUFILE_EXTENSION by default in mkfs,
> because it is not harmful and it gives the user the option to switch the
> other flags on later.

I see, it sounds reasonable.

Regards,
Ryusuke Konishi
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 3/6] nilfs-utils: add support for tracking live blocks

2015-03-13 Thread Ryusuke Konishi

>  
> -#define NILFS_FEATURE_COMPAT_SUPPNILFS_FEATURE_COMPAT_SUFILE_EXTENSION
> +#define NILFS_FEATURE_COMPAT_SUPP(NILFS_FEATURE_COMPAT_SUFILE_EXTENSION \
> + | NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS)
>  #define NILFS_FEATURE_COMPAT_RO_SUPP NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT
>  #define NILFS_FEATURE_INCOMPAT_SUPP  0ULL
>  
> diff --git a/lib/feature.c b/lib/feature.c
> index d954cda..ebe8c3f 100644
> --- a/lib/feature.c
> +++ b/lib/feature.c
> @@ -57,6 +57,8 @@ static const struct nilfs_feature features[] = {
>   /* Compat features */
>   { NILFS_FEATURE_TYPE_COMPAT,
> NILFS_FEATURE_COMPAT_SUFILE_EXTENSION, "sufile_ext" },
> + { NILFS_FEATURE_TYPE_COMPAT,
> +   NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS, "track_live_blks" },
>   /* Read-only compat features */
>   { NILFS_FEATURE_TYPE_COMPAT_RO,
> NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT, "block_count" },
> diff --git a/lib/nilfs.c b/lib/nilfs.c
> index 30db654..2067fc0 100644
> --- a/lib/nilfs.c
> +++ b/lib/nilfs.c
> @@ -290,34 +290,6 @@ int nilfs_opt_test_mmap(struct nilfs *nilfs)
>   return !!(nilfs->n_opts & NILFS_OPT_MMAP);
>  }
>  
> -/**
> - * nilfs_opt_set_set_suinfo - set set_suinfo option
> - * @nilfs: nilfs object
> - */
> -int nilfs_opt_set_set_suinfo(struct nilfs *nilfs)
> -{
> - nilfs->n_opts |= NILFS_OPT_SET_SUINFO;
> - return 0;
> -}
> -
> -/**
> - * nilfs_opt_clear_set_suinfo - clear set_suinfo option
> - * @nilfs: nilfs object
> - */
> -void nilfs_opt_clear_set_suinfo(struct nilfs *nilfs)
> -{
> - nilfs->n_opts &= ~NILFS_OPT_SET_SUINFO;
> -}
> -
> -/**
> - * nilfs_opt_test_set_suinfo - test whether set_suinfo option is set or not
> - * @nilfs: nilfs object
> - */
> -int nilfs_opt_test_set_suinfo(struct nilfs *nilfs)
> -{
> - return !!(nilfs->n_opts & NILFS_OPT_SET_SUINFO);
> -}
> -
>  static int nilfs_open_sem(struct nilfs *nilfs)
>  {
>   char semnambuf[NAME_MAX - 4];
> @@ -382,6 +354,7 @@ struct nilfs *nilfs_open(const char *dev, const char 
> *dir, int flags)
>   nilfs->n_dev = NULL;
>   nilfs->n_ioc = NULL;
>   nilfs->n_mincno = NILFS_CNO_MIN;
> + nilfs->n_opts = 0;

Please fix this as a separate patch.  This is a leak bug even though
it doesn't really matter.

Regards,
Ryusuke Konishi

>   memset(nilfs->n_sems, 0, sizeof(nilfs->n_sems));
>  
>   if (flags & NILFS_OPEN_RAW) {
> @@ -405,6 +378,9 @@ struct nilfs *nilfs_open(const char *dev, const char 
> *dir, int flags)
>   errno = ENOTSUP;
>   goto out_fd;
>   }
> +
> + if (nilfs_feature_track_live_blks(nilfs))
> + nilfs_opt_set_track_live_blks(nilfs);
>   }
>  
>   if (flags &
> diff --git a/man/mkfs.nilfs2.8 b/man/mkfs.nilfs2.8
> index 6c9a644..2431ac9 100644
> --- a/man/mkfs.nilfs2.8
> +++ b/man/mkfs.nilfs2.8
> @@ -176,6 +176,12 @@ cannot be disabled, because it changes the ondisk 
> format. Nevertheless it
>  is fully compatible with older versions of the file system. This feature
>  is on by default, because it is fully backwards compatible and can only
>  be set at file system creation time.
> +.TP
> +.B track_live_blks
> +Enables the tracking of live blocks, which might improve the effectiveness of
> +garbage collection, but entails a small runtime overhead. It is important to
> +note, that this feature depends on sufile_ext, which can only be set
> +at file system creation time.
>  .RE
>  .TP
>  .B \-q
> diff --git a/sbin/mkfs/mkfs.c b/sbin/mkfs/mkfs.c
> index 3985262..680311c 100644
> --- a/sbin/mkfs/mkfs.c
> +++ b/sbin/mkfs/mkfs.c
> @@ -1082,7 +1082,8 @@ static inline void check_ctime(time_t ctime)
>  
>  static const __u64 ok_features[NILFS_MAX_FEATURE_TYPES] = {
>   /* Compat */
> - NILFS_FEATURE_COMPAT_SUFILE_EXTENSION,
> + NILFS_FEATURE_COMPAT_SUFILE_EXTENSION |
> + NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS,
>   /* Read-only compat */
>   NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT,
>   /* Incompat */
> diff --git a/sbin/nilfs-tune/nilfs-tune.c b/sbin/nilfs-tune/nilfs-tune.c
> index 60f1d39..7889310 100644
> --- a/sbin/nilfs-tune/nilfs-tune.c
> +++ b/sbin/nilfs-tune/nilfs-tune.c
> @@ -84,7 +84,7 @@ static void nilfs_tune_usage(void)
>  
>  static const __u64 ok_features[NILFS_MAX_FEATURE_TYPES] = {
>   /* Compat */
> - 0,
> + NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS,
>   /* Read-only compat */
>   NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT,
>   /* Incompat */
> @@ -93,7 +93,7 @@ static const __u64 ok_features[NILFS_MAX_FEATURE_TYPES] = {
>  
>  static const __u64 clear_ok_features[NILFS_MAX_FEATURE_TYPES] = {
>   /* Compat */
> - 0,
> + NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS,
>   /* Read-only compat */
>   NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT,
>   /* Incompat */
> -- 
> 2.3.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 4/9] nilfs2: add function to modify su_nlive_blks

2015-03-13 Thread Ryusuke Konishi

of segment usage file
> + * @mc: modification cache
> + * @segnum: segment number
> + * @value: signed value (can be positive and negative)
> + *
> + * Description: nilfs_sufile_mod_nlive_blks() adds @value to the 
> su_nlive_blks
> + * field of the segment usage entry for @segnum. If @mc is not NULL it first
> + * accumulates all modifications in the cache and flushes it if it is full.
> + * Otherwise the change is applied directly.
> + *
> + * Return Value: On success, zero is returned.  On error, one of the
> + * following negative error codes is returned.
> + *
> + * %-EIO - I/O error.
> + *
> + * %-ENOMEM - Insufficient amount of memory available.
> + *
> + * %-ENOENT - Given segment usage is in hole block
> + *
> + * %-EINVAL - Invalid segment usage number
> + */
> +int nilfs_sufile_mod_nlive_blks(struct inode *sufile,
> + struct nilfs_sufile_mod_cache *mc,
> + __u64 segnum, __s64 value)
> +{
> + int ret;
> +
> + if (!value || !nilfs_sufile_ext_supported(sufile))
> + return 0;
> +
> + if (!mc)
> + return nilfs_sufile_mc_update(sufile, segnum, value,
> + nilfs_sufile_do_flush_nlive_blks);
> +
> + if (!nilfs_sufile_mc_add(mc, segnum, value))
> + return 0;
> +
> + ret = nilfs_sufile_flush_nlive_blks(sufile, mc);
> +
> + nilfs_sufile_mc_reset(mc, segnum, value);
> +
> + return ret;
> +}
> +
> +/**
>   * nilfs_sufile_read - read or get sufile inode
>   * @sb: super block instance
>   * @susize: size of a segment usage entry
> diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
> index d56498b..ae3c52a 100644
> --- a/fs/nilfs2/sufile.h
> +++ b/fs/nilfs2/sufile.h
> @@ -195,4 +195,9 @@ static inline void nilfs_sufile_mc_destroy(struct 
> nilfs_sufile_mod_cache *mc)
>   }
>  }
>  
> +int nilfs_sufile_flush_nlive_blks(struct inode *,
> +   struct nilfs_sufile_mod_cache *);
> +int nilfs_sufile_mod_nlive_blks(struct inode *, struct 
> nilfs_sufile_mod_cache *,
> + __u64, __s64);
> +

Please add variable names to arguments of new declarations.
(You don't have to add variable names to unrelated declarations)

Regards,
Ryusuke Konishi

>  #endif   /* _NILFS_SUFILE_H */
> -- 
> 2.3.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 3/9] nilfs2: extend SUFILE on-disk format to enable counting of live blocks

2015-03-13 Thread Ryusuke Konishi

On Tue, 24 Feb 2015 20:01:38 +0100, Andreas Rohner wrote:
> *buf,
>   int cleansi, cleansu, dirtysi, dirtysu;
>   long ncleaned = 0, ndirtied = 0;
>   int ret = 0;
> + bool sup_ext = (supsz >= NILFS_EXT_SUINFO_UPDATE_SIZE);
> + bool su_ext = nilfs_sufile_ext_supported(sufile);
>  
>   if (unlikely(nsup == 0))
>   return ret;
> @@ -926,6 +949,9 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, 
> void *buf,
>   (~0UL << __NR_NILFS_SUINFO_UPDATE_FIELDS))
>   || (nilfs_suinfo_update_nblocks(sup) &&
>   sup->sup_sui.sui_nblocks >
> + nilfs->ns_blocks_per_segment)
> + || (nilfs_suinfo_update_nlive_blks(sup) && sup_ext &&
> + sup->sup_sui.sui_nlive_blks >
>   nilfs->ns_blocks_per_segment))
>   return -EINVAL;
>   }
> @@ -953,6 +979,14 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, 
> void *buf,
>   if (nilfs_suinfo_update_nblocks(sup))
>   su->su_nblocks = cpu_to_le32(sup->sup_sui.sui_nblocks);
>  
> + if (nilfs_suinfo_update_nlive_blks(sup) && sup_ext && su_ext)
> + su->su_nlive_blks =
> + cpu_to_le32(sup->sup_sui.sui_nlive_blks);
> +
> + if (nilfs_suinfo_update_nlive_lastmod(sup) && sup_ext && su_ext)
> + su->su_nlive_lastmod =
> + cpu_to_le64(sup->sup_sui.sui_nlive_lastmod);
> +
>   if (nilfs_suinfo_update_flags(sup)) {
>   /*
>* Active flag is a virtual flag projected by running
> diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
> index c446325..d56498b 100644
> --- a/fs/nilfs2/sufile.h
> +++ b/fs/nilfs2/sufile.h
> @@ -28,6 +28,11 @@
>  #include 
>  #include "mdt.h"
>  
> +static inline int
> +nilfs_sufile_ext_supported(const struct inode *sufile)
> +{
> + return NILFS_MDT(sufile)->mi_entry_size >= NILFS_EXT_SEGMENT_USAGE_SIZE;
> +}
>  
>  static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile)
>  {
> diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h
> index ff3fea3..5d83c55 100644
> --- a/include/linux/nilfs2_fs.h
> +++ b/include/linux/nilfs2_fs.h
> @@ -220,9 +220,11 @@ struct nilfs_super_block {
>   * If there is a bit set in the incompatible feature set that the kernel
>   * doesn't know about, it should refuse to mount the filesystem.
>   */
> -#define NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT  0x0001ULL
> +#define NILFS_FEATURE_COMPAT_SUFILE_EXTENSION(1ULL << 0)

This feature name is not good.  sufile can be extended more in the future.
You should name it based on the meaning of this particular extension.

As I mentioned in another patch, I think this could be unified to the
TRACK_LIVE_BLKS feature that a later patch adds since the live block
counting of this patchset inherently depends on the extension of
sufile.

>  
> -#define NILFS_FEATURE_COMPAT_SUPP0ULL
> +#define NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT  (1ULL << 0)
> +

Regards,
Ryusuke Konishi
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 9/9] nilfs2: prevent starvation of segments protected by snapshots

2015-03-13 Thread Ryusuke Konishi

 +++ b/fs/nilfs2/sufile.h
> @@ -45,7 +45,8 @@ int nilfs_sufile_set_alloc_range(struct inode *sufile, 
> __u64 start, __u64 end);
>  int nilfs_sufile_alloc(struct inode *, __u64 *);
>  int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum);
>  int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
> -unsigned long nblocks, time_t modtime);
> +unsigned long nblocks, __u32 nsnapshot_blks,
> +time_t modtime);
>  int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *);
>  ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned,
>   size_t);
> @@ -72,6 +73,7 @@ int nilfs_sufile_resize(struct inode *sufile, __u64 
> newnsegs);
>  int nilfs_sufile_read(struct super_block *sb, size_t susize,
> struct nilfs_inode *raw_inode, struct inode **inodep);
>  int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range);
> +int nilfs_sufile_fix_starving_segs(struct inode *);
>  
>  /**
>   * nilfs_sufile_scrap - make a segment garbage
> diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
> index 87cab10..3d495f1 100644
> --- a/fs/nilfs2/the_nilfs.h
> +++ b/fs/nilfs2/the_nilfs.h
> @@ -409,4 +409,11 @@ static inline int nilfs_feature_track_live_blks(struct 
> the_nilfs *nilfs)
>       NILFS_FEATURE_COMPAT_SUFILE_EXTENSION);
>  }
>  
> +static inline int nilfs_feature_track_snapshots(struct the_nilfs *nilfs)
> +{
> + return (nilfs->ns_feature_compat &
> + NILFS_FEATURE_COMPAT_TRACK_SNAPSHOTS) &&
> + nilfs_feature_track_live_blks(nilfs);
> +}
> +
>  #endif /* _THE_NILFS_H */
> diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h
> index 6ffdc09..a3c7593 100644
> --- a/include/linux/nilfs2_fs.h
> +++ b/include/linux/nilfs2_fs.h
> @@ -222,11 +222,13 @@ struct nilfs_super_block {
>   */
>  #define NILFS_FEATURE_COMPAT_SUFILE_EXTENSION(1ULL << 0)
>  #define NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS (1ULL << 1)
> +#define NILFS_FEATURE_COMPAT_TRACK_SNAPSHOTS (1ULL << 2)
>  
>  #define NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT  (1ULL << 0)
>  
>  #define NILFS_FEATURE_COMPAT_SUPP(NILFS_FEATURE_COMPAT_SUFILE_EXTENSION \
> - | NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS)
> + | NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS \
> + | NILFS_FEATURE_COMPAT_TRACK_SNAPSHOTS)
>  #define NILFS_FEATURE_COMPAT_RO_SUPP NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT
>  #define NILFS_FEATURE_INCOMPAT_SUPP  0ULL
>  

You don't have to add three compat flags just for this one patchset.
Please unify it.

#define NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS(1ULL << 0)

looks to be enough.

Regards,
Ryusuke Konishi


> @@ -630,7 +632,7 @@ struct nilfs_segment_usage {
>   __le32 su_nblocks;
>   __le32 su_flags;
>   __le32 su_nlive_blks;
> - __le32 su_pad;
> + __le32 su_nsnapshot_blks;
>   __le64 su_nlive_lastmod;
>  };
>  
> @@ -682,7 +684,7 @@ nilfs_segment_usage_set_clean(struct nilfs_segment_usage 
> *su, size_t susz)
>   su->su_flags = cpu_to_le32(0);
>   if (susz >= NILFS_EXT_SEGMENT_USAGE_SIZE) {
>   su->su_nlive_blks = cpu_to_le32(0);
> - su->su_pad = cpu_to_le32(0);
> + su->su_nsnapshot_blks = cpu_to_le32(0);
>   su->su_nlive_lastmod = cpu_to_le64(0);
>   }
>  }
> @@ -723,7 +725,7 @@ struct nilfs_suinfo {
>   __u32 sui_nblocks;
>   __u32 sui_flags;
>   __u32 sui_nlive_blks;
> - __u32 sui_pad;
> + __u32 sui_nsnapshot_blks;
>   __u64 sui_nlive_lastmod;
>  };
>  
> @@ -770,6 +772,7 @@ enum {
>   NILFS_SUINFO_UPDATE_FLAGS,
>   NILFS_SUINFO_UPDATE_NLIVE_BLKS,
>   NILFS_SUINFO_UPDATE_NLIVE_LASTMOD,
> + NILFS_SUINFO_UPDATE_NSNAPSHOT_BLKS,
>   __NR_NILFS_SUINFO_UPDATE_FIELDS,
>  };
>  
> @@ -794,6 +797,7 @@ NILFS_SUINFO_UPDATE_FNS(LASTMOD, lastmod)
>  NILFS_SUINFO_UPDATE_FNS(NBLOCKS, nblocks)
>  NILFS_SUINFO_UPDATE_FNS(FLAGS, flags)
>  NILFS_SUINFO_UPDATE_FNS(NLIVE_BLKS, nlive_blks)
> +NILFS_SUINFO_UPDATE_FNS(NSNAPSHOT_BLKS, nsnapshot_blks)
>  NILFS_SUINFO_UPDATE_FNS(NLIVE_LASTMOD, nlive_lastmod)
>  
>  #define NILFS_MIN_SUINFO_UPDATE_SIZE \
> -- 
> 2.3.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 5/9] nilfs2: add simple tracking of block deletions and updates

2015-03-13 Thread Ryusuke Konishi

e, NULL,
> + segbuf->sb_segnum,
> + segbuf->sb_nlive_blks_added);
>   }
>  }
>  
> -static void nilfs_cancel_segusage(struct list_head *logs, struct inode 
> *sufile)
> +static void nilfs_cancel_segusage(struct list_head *logs,
> +   struct the_nilfs *nilfs)
>  {
>   struct nilfs_segment_buffer *segbuf;
> + struct inode *sufile = nilfs->ns_sufile;
>   int ret;
>  
>   segbuf = NILFS_FIRST_SEGBUF(logs);
> @@ -1394,6 +1405,12 @@ static void nilfs_cancel_segusage(struct list_head 
> *logs, struct inode *sufile)
>segbuf->sb_fseg_start, 0);
>   WARN_ON(ret); /* always succeed because the segusage is dirty */
>  
> + if (nilfs_feature_track_live_blks(nilfs))
> + nilfs_sufile_mod_nlive_blks(sufile, NULL, segbuf->sb_segnum,
> + -((__s64)segbuf->sb_nlive_blks_added));
> +
> + segbuf->sb_nlive_blks_added = 0;
> +
>   list_for_each_entry_continue(segbuf, logs, sb_list) {
>   ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum,
>0, 0);
> @@ -1729,7 +1746,7 @@ static void nilfs_segctor_abort_construction(struct 
> nilfs_sc_info *sci,
>   nilfs_abort_logs(&logs, ret ? : err);
>  
>   list_splice_tail_init(&sci->sc_segbufs, &logs);
> - nilfs_cancel_segusage(&logs, nilfs->ns_sufile);
> + nilfs_cancel_segusage(&logs, nilfs);
>   nilfs_free_incomplete_logs(&logs, nilfs);
>  
>   if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
> @@ -1995,7 +2012,7 @@ static int nilfs_segctor_do_construct(struct 
> nilfs_sc_info *sci, int mode)
>  
>   nilfs_segctor_fill_in_super_root(sci, nilfs);
>   }
> - nilfs_segctor_update_segusage(sci, nilfs->ns_sufile);
> + nilfs_segctor_update_segusage(sci, nilfs);
>  
>   /* Write partial segments */
>   nilfs_segctor_prepare_write(sci);

Please separate changes below.

> diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
> index 69bd801..606fdfc 100644
> --- a/fs/nilfs2/the_nilfs.c
> +++ b/fs/nilfs2/the_nilfs.c
> @@ -630,6 +630,10 @@ int init_nilfs(struct the_nilfs *nilfs, struct 
> super_block *sb, char *data)
>   get_random_bytes(&nilfs->ns_next_generation,
>sizeof(nilfs->ns_next_generation));
>  
> + nilfs->ns_feature_compat = le64_to_cpu(sbp->s_feature_compat);
> + nilfs->ns_feature_compat_ro = le64_to_cpu(sbp->s_feature_compat_ro);
> + nilfs->ns_feature_incompat = le64_to_cpu(sbp->s_feature_incompat);
> +
>   err = nilfs_store_disk_layout(nilfs, sbp);
>   if (err)
>   goto failed_sbh;
> diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
> index 23778d3..87cab10 100644
> --- a/fs/nilfs2/the_nilfs.h
> +++ b/fs/nilfs2/the_nilfs.h
> @@ -101,6 +101,9 @@ enum {
>   * @ns_dev_kobj: /sys/fs//
>   * @ns_dev_kobj_unregister: completion state
>   * @ns_dev_subgroups:  subgroups pointer
> + * @ns_feature_compat: Compatible feature set
> + * @ns_feature_compat_ro: Read-only compatible feature set
> + * @ns_feature_incompat: Incompatible feature set
>   */
>  struct the_nilfs {
>   unsigned long   ns_flags;
> @@ -201,6 +204,11 @@ struct the_nilfs {
>   struct kobject ns_dev_kobj;
>   struct completion ns_dev_kobj_unregister;
>   struct nilfs_sysfs_dev_subgroups *ns_dev_subgroups;
> +
> + /* Features */
> + __u64   ns_feature_compat;
> + __u64   ns_feature_compat_ro;
> + __u64   ns_feature_incompat;
>  };
>  
>  #define THE_NILFS_FNS(bit, name) \
> @@ -393,4 +401,12 @@ static inline int nilfs_flush_device(struct the_nilfs 
> *nilfs)
>   return err;
>  }
>  
> +static inline int nilfs_feature_track_live_blks(struct the_nilfs *nilfs)
> +{
> + return (nilfs->ns_feature_compat &
> + NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS) &&
> + (nilfs->ns_feature_compat &
> + NILFS_FEATURE_COMPAT_SUFILE_EXTENSION);
> +}
> +

This should be written as below:

static inline int nilfs_feature_track_live_blks(struct the_nilfs *nilfs)
{
const __u64 required_bits = NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS |
NILFS_FEATURE_COMPAT_SUFILE_EXTENSION;

return ((nilfs->ns_feature_compat & required_bits) == required_bits);
}

Alternatively, you can drop the track flag at mount time if the
NILFS_FEATURE_COMPAT_SUFILE_EXTENSION flag is not set or if
nilfs_sufile_ext_supported(sufile) returns false.

Regards,
Ryusuke Konishi

>  #endif /* _THE_NILFS_H */
> diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h
> index 5d83c55..6ccb2ad 100644
> --- a/include/linux/nilfs2_fs.h
> +++ b/include/linux/nilfs2_fs.h
> @@ -221,10 +221,12 @@ struct nilfs_super_block {
>   * doesn't know about, it should refuse to mount the filesystem.
>   */
>  #define NILFS_FEATURE_COMPAT_SUFILE_EXTENSION(1ULL << 0)
> +#define NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS (1ULL << 1)
>  
>  #define NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT  (1ULL << 0)
>  
> -#define NILFS_FEATURE_COMPAT_SUPPNILFS_FEATURE_COMPAT_SUFILE_EXTENSION
> +#define NILFS_FEATURE_COMPAT_SUPP(NILFS_FEATURE_COMPAT_SUFILE_EXTENSION \
> + | NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS)
>  #define NILFS_FEATURE_COMPAT_RO_SUPP NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT
>  #define NILFS_FEATURE_INCOMPAT_SUPP  0ULL
>  
> -- 
> 2.3.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 7/9] nilfs2: add additional flags for nilfs_vdesc

2015-03-13 Thread Ryusuke Konishi

ge.h
> @@ -36,13 +36,17 @@ enum {
>   BH_NILFS_Volatile,
>   BH_NILFS_Checked,
>   BH_NILFS_Redirected,
> + BH_NILFS_Snapshot,
> + BH_NILFS_Protection_Period,
>  };
>  
>  BUFFER_FNS(NILFS_Node, nilfs_node)   /* nilfs node buffers */
>  BUFFER_FNS(NILFS_Volatile, nilfs_volatile)
>  BUFFER_FNS(NILFS_Checked, nilfs_checked) /* buffer is verified */
>  BUFFER_FNS(NILFS_Redirected, nilfs_redirected)   /* redirected to a copy 
> */
> -

> +BUFFER_FNS(NILFS_Snapshot, nilfs_snapshot)   /* belongs to a snapshot */
> +BUFFER_FNS(NILFS_Protection_Period, nilfs_protection_period) /* protected by
> + protection period */
>  

I propose the alternative names "snapshot_protected" and
"period_protected" (or "time_protected"), respectively, to clarify
the meaning of the flags.

>  int __nilfs_clear_page_dirty(struct page *);
>  
> diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h
> index 6ccb2ad..6ffdc09 100644
> --- a/include/linux/nilfs2_fs.h
> +++ b/include/linux/nilfs2_fs.h
> @@ -900,7 +900,7 @@ struct nilfs_vinfo {
>   * @vd_blocknr: disk block number
>   * @vd_offset: logical block offset inside a file
>   * @vd_flags: flags (data or node block)
> - * @vd_pad: padding
> + * @vd_blk_flags: additional flags
>   */
>  struct nilfs_vdesc {
>   __u64 vd_ino;
> @@ -910,9 +910,63 @@ struct nilfs_vdesc {
>   __u64 vd_blocknr;
>   __u64 vd_offset;
>   __u32 vd_flags;
> - __u32 vd_pad;
> + /*
> +  * vd_blk_flags needed because vd_flags doesn't support
> +  * bit-flags because of backwards compatibility
> +  */
> + __u32 vd_blk_flags;
>  };
>  

> +/* vdesc flags */
> +enum {
> + NILFS_VDESC_DATA,
> + NILFS_VDESC_NODE,
> +
> + /* ... */
> +};
> +enum {
> + NILFS_VDESC_SNAPSHOT,
> + NILFS_VDESC_PROTECTION_PERIOD,
> +
> + /* ... */
> +
> + __NR_NILFS_VDESC_FIELDS,
> +};
> +
> +#define NILFS_VDESC_FNS(flag, name)  \
> +static inline void           \
> +nilfs_vdesc_set_##name(struct nilfs_vdesc *vdesc)\
> +{\
> + vdesc->vd_flags = NILFS_VDESC_##flag;   \
> +}\
> +static inline int\
> +nilfs_vdesc_##name(const struct nilfs_vdesc *vdesc)  \
> +{\
> + return vdesc->vd_flags == NILFS_VDESC_##flag;   \
> +}
> +

Do not add definitions for vd_flags; leave vd_flags as it is, and
simplify your patch.

Regards,
Ryusuke Konishi

> +#define NILFS_VDESC_FNS2(flag, name) \
> +static inline void   \
> +nilfs_vdesc_set_##name(struct nilfs_vdesc *vdesc)\
> +{\
> + vdesc->vd_blk_flags |= (1UL << NILFS_VDESC_##flag); \
> +}\
> +static inline void   \
> +nilfs_vdesc_clear_##name(struct nilfs_vdesc *vdesc)  \
> +{\
> + vdesc->vd_blk_flags &= ~(1UL << NILFS_VDESC_##flag);\
> +}\
> +static inline int\
> +nilfs_vdesc_##name(const struct nilfs_vdesc *vdesc)  \
> +{\
> + return !!(vdesc->vd_blk_flags & (1UL << NILFS_VDESC_##flag));   \
> +}
> +
> +NILFS_VDESC_FNS(DATA, data)
> +NILFS_VDESC_FNS(NODE, node)
> +NILFS_VDESC_FNS2(SNAPSHOT, snapshot)
> +NILFS_VDESC_FNS2(PROTECTION_PERIOD, protection_period)
> +
>  /**
>   * struct nilfs_bdesc - descriptor of disk block number
>   * @bd_ino: inode number
> -- 
> 2.3.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 8/9] nilfs2: improve accuracy and correct for invalid GC values

2015-03-13 Thread Ryusuke Konishi

lks_gc() is called if the inode to
> + * which @bh belongs is a GC-Inode. In that case it is not necessary to
> + * decrement the previous segment, because at the end of the GC process it
> + * will be freed anyway. It is however necessary to check again if the blocks
> + * are alive here, because the last check was in userspace without the proper
> + * locking. Additionally the blocks protected by the protection period should
> + * be considered reclaimable. It is assumed, that @bh->b_blocknr contains
> + * a virtual block number, which is only true if @bh is part of a GC-Inode.
> + */
> +static void nilfs_segctor_dec_nlive_blks_gc(struct inode *dat,
> + struct nilfs_segment_buffer *segbuf,
> + struct buffer_head *bh) {
> + bool isreclaimable = buffer_nilfs_protection_period(bh) ||
> + !nilfs_dat_is_live(dat, bh->b_blocknr, NULL);
> +
> + if (!buffer_nilfs_snapshot(bh) && isreclaimable)
> + segbuf->sb_nlive_blks_diff--;
> +}
> +
> +/**
> + * nilfs_segctor_dec_nlive_blks_nogc - dec. nlive_blks of segment
> + * @nilfs: the nilfs object
> + * @mc: modification cache
> + * @sb: currtent segment buffer
> + * @blocknr: current block number
> + *
> + * Description: Gets the segment number of the segment @blocknr belongs to
> + * and decrements the su_nlive_blks field of the corresponding segment usage
> + * entry.
> + */
> +static void nilfs_segctor_dec_nlive_blks_nogc(struct the_nilfs *nilfs,
> +   struct nilfs_sufile_mod_cache *mc,
> +   struct nilfs_segment_buffer *sb,
> +   sector_t blocknr)
> +{
> + __u64 segnum = nilfs_get_segnum_of_block(nilfs, blocknr);
> +
> + if (segnum >= nilfs->ns_nsegments)
> + return;
> +
> + if (segnum == sb->sb_segnum)
> + sb->sb_nlive_blks_diff--;
> + else
> + nilfs_sufile_mod_nlive_blks(nilfs->ns_sufile, mc, segnum, -1);
> +}

As I mentioned before, sufile shouldn't be changed (more precisely,
newly marked dirty) after the collection phase of sufile.  This looks
like it violates that rule.

Regards,
Ryusuke Konishi

> +
> +/**
> + * nilfs_segctor_dec_nlive_blks - dec. nlive_blks of previous segment
> + * @nilfs: the nilfs object
> + * @mc: modification cache
> + * @sb: currtent segment buffer
> + * @bh: current buffer head
> + * @ino: current inode number
> + * @gc_inode: true if current inode is a GC-Inode
> + *
> + * Description: Handles GC-Inodes and normal inodes differently. For normal
> + * inodes @bh->b_blocknr contains the location where the block was read in. 
> If
> + * the block is updated, the old version of it is considered reclaimable and 
> so
> + * the su_nlive_blks field of the segment usage information of the old 
> segment
> + * needs to be decremented. Only the DATFILE and SUFILE are decremented here,
> + * because normal files and other meta data files can be better decremented 
> in
> + * nilfs_dat_commit_end().
> + */
> +static void nilfs_segctor_dec_nlive_blks(struct the_nilfs *nilfs,
> +  struct nilfs_sufile_mod_cache *mc,
> +  struct nilfs_segment_buffer *sb,
> +  struct buffer_head *bh,
> +  ino_t ino,
> +  bool gc_inode)
> +{
> + bool isnode = buffer_nilfs_node(bh);
> +
> + if (gc_inode)
> + nilfs_segctor_dec_nlive_blks_gc(nilfs->ns_dat, sb, bh);
> + else if (ino == NILFS_DAT_INO || (ino == NILFS_SUFILE_INO && !isnode))
> + nilfs_segctor_dec_nlive_blks_nogc(nilfs, mc, sb, bh->b_blocknr);
> +}
> +
>  static int
>  nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
>struct nilfs_segment_buffer *segbuf,
>int mode)
>  {
> + struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
>   struct inode *inode = NULL;
> + struct nilfs_inode_info *ii;
>   sector_t blocknr;
>   unsigned long nfinfo = segbuf->sb_sum.nfinfo;
>   unsigned long nblocks = 0, ndatablk = 0;
> @@ -1512,7 +1599,9 @@ nilfs_segctor_update_payload_blocknr(struct 
> nilfs_sc_info *sci,
>   union nilfs_binfo binfo;
>   struct buffer_head *bh, *bh_org;
>   ino_t ino = 0;
> - int err = 0;
> + int err = 0, gc_inode = 0, track_live_blks;
> +
> + track_live_blks = nilfs_feature_track_live_blks(nilfs);
>  
>   if (!nfinfo)
&g

Re: [PATCH 6/9] nilfs2: use modification cache to improve performance

2015-03-13 Thread Ryusuke Konishi

gt;   unsigned long from)
>  {
> + struct the_nilfs *nilfs = ii->vfs_inode.i_sb->s_fs_info;
> + struct nilfs_sufile_mod_cache mc, *mcp = NULL;
>   unsigned long b;
>   int ret;
>  
>   if (!test_bit(NILFS_I_BMAP, &ii->i_state))
>   return;
> +
> + if (nilfs_feature_track_live_blks(nilfs) &&
> + !nilfs_sufile_mc_init(&mc, NILFS_SUFILE_MC_SIZE_DEFAULT))
> + mcp = &mc;
> +
>  repeat:
>   ret = nilfs_bmap_last_key(ii->i_bmap, &b);
>   if (ret == -ENOENT)
> - return;
> + goto out_free;
>   else if (ret < 0)
>   goto failed;
>  
>   if (b < from)
> - return;
> + goto out_free;
>  
>   b -= min_t(unsigned long, NILFS_MAX_TRUNCATE_BLOCKS, b - from);
> - ret = nilfs_bmap_truncate(ii->i_bmap, b);
> + ret = nilfs_bmap_truncate_with_mc(ii->i_bmap, mcp, b);
>   nilfs_relax_pressure_in_lock(ii->vfs_inode.i_sb);
>   if (!ret || (ret == -ENOMEM &&
> -  nilfs_bmap_truncate(ii->i_bmap, b) == 0))
> +  nilfs_bmap_truncate_with_mc(ii->i_bmap, mcp, b) == 0))
>   goto repeat;
>  
> +out_free:
> + nilfs_sufile_flush_nlive_blks(nilfs->ns_sufile, mcp);
> + nilfs_sufile_mc_destroy(mcp);
> + return;
>  failed:
> + nilfs_sufile_flush_nlive_blks(nilfs->ns_sufile, mcp);
> + nilfs_sufile_mc_destroy(mcp);
>   nilfs_warning(ii->vfs_inode.i_sb, __func__,
> "failed to truncate bmap (ino=%lu, err=%d)",
> ii->vfs_inode.i_ino, ret);
> diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
> index 6059f53..dc0070c 100644
> --- a/fs/nilfs2/segment.c
> +++ b/fs/nilfs2/segment.c
> @@ -511,7 +511,8 @@ static int nilfs_collect_file_data(struct nilfs_sc_info 
> *sci,
>  {
>   int err;
>  
> - err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
> + err = nilfs_bmap_propagate_with_mc(NILFS_I(inode)->i_bmap,
> +sci->sc_mc, bh);
>   if (err < 0)
>   return err;
>  
> @@ -526,7 +527,8 @@ static int nilfs_collect_file_node(struct nilfs_sc_info 
> *sci,
>  struct buffer_head *bh,
>  struct inode *inode)
>  {
> - return nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
> + return nilfs_bmap_propagate_with_mc(NILFS_I(inode)->i_bmap,
> + sci->sc_mc, bh);
>  }
>  
>  static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci,
> @@ -1386,7 +1388,7 @@ static void nilfs_segctor_update_segusage(struct 
> nilfs_sc_info *sci,
>   segbuf->sb_nlive_blks_added = segbuf->sb_sum.nfileblk;
>  
>   if (nilfs_feature_track_live_blks(nilfs))
> - nilfs_sufile_mod_nlive_blks(sufile, NULL,
> + nilfs_sufile_mod_nlive_blks(sufile, sci->sc_mc,
>   segbuf->sb_segnum,
>   segbuf->sb_nlive_blks_added);
>   }
> @@ -2014,6 +2016,9 @@ static int nilfs_segctor_do_construct(struct 
> nilfs_sc_info *sci, int mode)
>   }
>   nilfs_segctor_update_segusage(sci, nilfs);
>  
> + nilfs_sufile_flush_nlive_blks(nilfs->ns_sufile,
> +   sci->sc_mc);
> +
>   /* Write partial segments */
>   nilfs_segctor_prepare_write(sci);
>  
> @@ -2603,6 +2608,7 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct 
> super_block *sb,
>  {
>   struct the_nilfs *nilfs = sb->s_fs_info;
>   struct nilfs_sc_info *sci;
> + int ret;
>  
>   sci = kzalloc(sizeof(*sci), GFP_KERNEL);
>   if (!sci)
> @@ -2633,6 +2639,18 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct 
> super_block *sb,
>   sci->sc_interval = HZ * nilfs->ns_interval;
>   if (nilfs->ns_watermark)
>   sci->sc_watermark = nilfs->ns_watermark;
> +
> + if (nilfs_feature_track_live_blks(nilfs)) {
> + sci->sc_mc = kmalloc(sizeof(*(sci->sc_mc)), GFP_KERNEL);
> + if (sci->sc_mc) {
> + ret = nilfs_sufile_mc_init(sci->sc_mc,
> +NILFS_SUFILE_MC_SIZE_EXT);
> + if (ret) {
> +         kfree(sci->sc_mc);
> + sci->sc_mc = NULL;
> + }
> +

Re: [PATCH 2/9] nilfs2: add simple cache for modifications to SUFILE

2015-03-13 Thread Ryusuke Konishi

On Tue, 24 Feb 2015 20:01:37 +0100, Andreas Rohner wrote:
> This patch adds a simple, small cache that can be used to accumulate
> modifications to SUFILE entries. This is for example useful for
> keeping track of reclaimable blocks, because most of the
> modifications consist of small increments or decrements. By adding
> these up and temporarily storing them in a small cache, the
> performance can be improved. Additionally lock contention is
> reduced.
> 
> Signed-off-by: Andreas Rohner 
> ---
>  fs/nilfs2/sufile.c | 178 
> +
>  fs/nilfs2/sufile.h |  44 +
>  2 files changed, 222 insertions(+)
> 
> diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
> index 1e8cac6..a369c30 100644
> --- a/fs/nilfs2/sufile.c
> +++ b/fs/nilfs2/sufile.c
> @@ -1168,6 +1168,184 @@ out_sem:
>  }
>  
>  /**
> + * nilfs_sufile_mc_init - inits segusg modification cache
> + * @mc: modification cache
> + * @capacity: maximum capacity of the mod cache
> + *
> + * Description: Allocates memory for an array of nilfs_sufile_mod structures
> + * according to @capacity. This memory must be freed with
> + * nilfs_sufile_mc_destroy().
> + *
> + * Return Value: On success, 0 is returned. On error, one of the following
> + * negative error codes is returned.
> + *
> + * %-ENOMEM - Insufficient amount of memory available.
> + *
> + * %-EINVAL - Invalid capacity.
> + */
> +int nilfs_sufile_mc_init(struct nilfs_sufile_mod_cache *mc, size_t capacity)
> +{
> + mc->mc_capacity = capacity;
> + if (!capacity)
> + return -EINVAL;
> +
> + mc->mc_mods = kmalloc(capacity * sizeof(struct nilfs_sufile_mod),
> +   GFP_KERNEL);

GFP_NOFS must be used instead of GFP_KERNEL to avoid initiating other
filesystem operations.

The abbreviation "mc" is not a good choice, because it is already used
as the abbreviation of "minimum clean" in userland.

> + if (!mc->mc_mods)
> + return -ENOMEM;
> +
> + mc->mc_size = 0;
> +
> + return 0;
> +}
> +
> +/**
> + * nilfs_sufile_mc_add - add signed value to segusg modification cache
> + * @mc: modification cache
> + * @segnum: segment number
> + * @value: signed value (can be positive and negative)
> + *
> + * Description: nilfs_sufile_mc_add() tries to add a pair of @segnum and
> + * @value to the modification cache. If the cache already contains a
> + * segment number equal to @segnum, then @value is simply added to the
> + * existing value. This way thousands of small modifications can be
> + * accumulated into one value. If @segnum cannot be found and the
> + * capacity allows it, a new element is added to the cache. If the
> + * capacity is reached an error value is returned.
> + *
> + * Return Value: On success, 0 is returned. On error, one of the following
> + * negative error codes is returned.
> + *
> + * %-ENOSPC - The mod cache has reached its capacity and must be flushed.
> + */
> +static inline int nilfs_sufile_mc_add(struct nilfs_sufile_mod_cache *mc,
> +   __u64 segnum, __s64 value)
> +{
> + struct nilfs_sufile_mod *mods = mc->mc_mods;
> + int i;
> +
> + for (i = 0; i < mc->mc_size; ++i, ++mods) {
> + if (mods->m_segnum == segnum) {
> + mods->m_value += value;
> + return 0;
> + }
> + }
> +
> + if (mc->mc_size < mc->mc_capacity) {
> + mods->m_segnum = segnum;
> + mods->m_value = value;
> + mc->mc_size++;
> + return 0;
> + }
> +
> + return -ENOSPC;
> +}
> +
> +/**
> + * nilfs_sufile_mc_clear - set mc_size to 0
> + * @mc: modification cache
> + *
> + * Description: nilfs_sufile_mc_clear() sets mc_size to 0, which enables
> + * nilfs_sufile_mc_add() to overwrite the elements in @mc.
> + */
> +static inline void nilfs_sufile_mc_clear(struct nilfs_sufile_mod_cache *mc)
> +{
> + mc->mc_size = 0;
> +}
> +
> +/**
> + * nilfs_sufile_mc_reset - clear cache and add one element
> + * @mc: modification cache
> + * @segnum: segment number
> + * @value: signed value (can be positive and negative)
> + *
> + * Description: Clears the modification cache in @mc and adds a new pair of
> + * @segnum and @value to it at the same time.
> + */
> +static inline void nilfs_sufile_mc_reset(struct nilfs_sufile_mod_cache *mc,
> +      __u64 segnum, __s64 value)
> +{
> + struct nilfs_sufile_mod *mods = mc->mc_mods;
> +
> + mods->m_segnum = segnum;
> + mods->m

[PATCH 1/7] nilfs2: do not use async write flag for segment summary buffers

2015-03-12 Thread Ryusuke Konishi

The async write flag is introduced to nilfs2 in the commit
7f42ec394156 ("nilfs2: fix issue with race condition of competition
between segments for dirty blocks"), but the flag only makes sense for
data buffers and btree node buffers.  It is not needed for segment
summary buffers.

This gets rid of the latter uses as part of the refactoring of atomic
bit operations on the buffer state bitmap.

Signed-off-by: Ryusuke Konishi 
Cc: Vyacheslav Dubeyko 
---
 fs/nilfs2/segment.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 0c3f303..c9a4e60 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1588,7 +1588,6 @@ static void nilfs_segctor_prepare_write(struct 
nilfs_sc_info *sci)
 
list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
b_assoc_buffers) {
-   set_buffer_async_write(bh);
if (bh->b_page != bd_page) {
if (bd_page) {
lock_page(bd_page);
@@ -1688,7 +1687,6 @@ static void nilfs_abort_logs(struct list_head *logs, int 
err)
list_for_each_entry(segbuf, logs, sb_list) {
list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
b_assoc_buffers) {
-   clear_buffer_async_write(bh);
if (bh->b_page != bd_page) {
if (bd_page)
end_page_writeback(bd_page);
@@ -1768,7 +1766,6 @@ static void nilfs_segctor_complete_write(struct 
nilfs_sc_info *sci)
b_assoc_buffers) {
set_buffer_uptodate(bh);
clear_buffer_dirty(bh);
-   clear_buffer_async_write(bh);
if (bh->b_page != bd_page) {
if (bd_page)
end_page_writeback(bd_page);
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 7/7] nilfs2: improve execution time of NILFS_IOCTL_GET_CPINFO ioctl

2015-03-12 Thread Ryusuke Konishi

The older a filesystem gets, the slower the lscp command becomes.  This
is because the nilfs_cpfile_do_get_cpinfo() function encounters more
hole blocks as the start offset of valid checkpoint numbers gets bigger.

This reduces the overhead by skipping hole blocks efficiently with
the nilfs_mdt_find_block() helper.

A measurement result of this patch is as follows:

Before:
$ time lscp
     CNO        DATE     TIME  MODE  FLG  BLKCNT   ICNT
 5769303  2015-02-22 19:31:33   cp-  108  1
 5769304  2015-02-22 19:38:54   cp-  108  1

real    0m0.182s
user    0m0.003s
sys     0m0.180s

After:
$ time lscp
     CNO        DATE     TIME  MODE  FLG  BLKCNT   ICNT
 5769303  2015-02-22 19:31:33   cp-  108  1
 5769304  2015-02-22 19:38:54   cp-  108  1

real    0m0.003s
user    0m0.001s
sys     0m0.002s

Signed-off-by: Ryusuke Konishi 
---
 fs/nilfs2/cpfile.c | 58 --
 1 file changed, 52 insertions(+), 6 deletions(-)

diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 0d58075..b6596ca 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -53,6 +53,13 @@ nilfs_cpfile_get_offset(const struct inode *cpfile, __u64 
cno)
return do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile));
 }
 
+static __u64 nilfs_cpfile_first_checkpoint_in_block(const struct inode *cpfile,
+   unsigned long blkoff)
+{
+   return (__u64)nilfs_cpfile_checkpoints_per_block(cpfile) * blkoff
+   + 1 - NILFS_MDT(cpfile)->mi_first_entry_offset;
+}
+
 static unsigned long
 nilfs_cpfile_checkpoints_in_block(const struct inode *cpfile,
  __u64 curr,
@@ -146,6 +153,44 @@ static inline int nilfs_cpfile_get_checkpoint_block(struct 
inode *cpfile,
   create, nilfs_cpfile_block_init, bhp);
 }
 
+/**
+ * nilfs_cpfile_find_checkpoint_block - find and get a buffer on cpfile
+ * @cpfile: inode of cpfile
+ * @start_cno: start checkpoint number (inclusive)
+ * @end_cno: end checkpoint number (inclusive)
+ * @cnop: place to store the next checkpoint number
+ * @bhp: place to store a pointer to buffer_head struct
+ *
+ * Return Value: On success, it returns 0. On error, the following negative
+ * error code is returned.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ *
+ * %-EIO - I/O error
+ *
+ * %-ENOENT - no block exists in the range.
+ */
+static int nilfs_cpfile_find_checkpoint_block(struct inode *cpfile,
+ __u64 start_cno, __u64 end_cno,
+ __u64 *cnop,
+ struct buffer_head **bhp)
+{
+   unsigned long start, end, blkoff;
+   int ret;
+
+   if (unlikely(start_cno > end_cno))
+   return -ENOENT;
+
+   start = nilfs_cpfile_get_blkoff(cpfile, start_cno);
+   end = nilfs_cpfile_get_blkoff(cpfile, end_cno);
+
+   ret = nilfs_mdt_find_block(cpfile, start, end, &blkoff, bhp);
+   if (!ret)
+   *cnop = (blkoff == start) ? start_cno :
+   nilfs_cpfile_first_checkpoint_in_block(cpfile, blkoff);
+   return ret;
+}
+
 static inline int nilfs_cpfile_delete_checkpoint_block(struct inode *cpfile,
   __u64 cno)
 {
@@ -403,14 +448,15 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode 
*cpfile, __u64 *cnop,
return -ENOENT; /* checkpoint number 0 is invalid */
down_read(&NILFS_MDT(cpfile)->mi_sem);
 
-   for (n = 0; cno < cur_cno && n < nci; cno += ncps) {
-   ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno);
-   ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh);
+   for (n = 0; n < nci; cno += ncps) {
+   ret = nilfs_cpfile_find_checkpoint_block(
+   cpfile, cno, cur_cno - 1, &cno, &bh);
if (ret < 0) {
-   if (ret != -ENOENT)
-   goto out;
-   continue; /* skip hole */
+   if (likely(ret == -ENOENT))
+   break;
+   goto out;
}
+   ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno);
 
kaddr = kmap_atomic(bh->b_page);
cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 0/7] nilfs2 updates

2015-03-12 Thread Ryusuke Konishi

Hi Andrew,

Please queue the following changes for the next merge window:

Ryusuke Konishi (7):
  nilfs2: do not use async write flag for segment summary buffers
  nilfs2: use set_mask_bits() for operations on buffer state bitmap
  nilfs2: use bgl_lock_ptr()
  nilfs2: unify type of key arguments in bmap interface
  nilfs2: add bmap function to seek a valid key
  nilfs2: add helper to find existent block on metadata file
  nilfs2: improve execution time of NILFS_IOCTL_GET_CPINFO ioctl

* Brief summary

> nilfs2: do not use async write flag for segment summary buffers
> nilfs2: use set_mask_bits() for operations on buffer state bitmap

These reduce the number of atomic bit operations against the b_state
bitmap by utilizing the set_mask_bits() common helper, or by removing
unnecessary bit operations.

> nilfs2: use bgl_lock_ptr()

This is a cleanup patch, which makes use of bgl_lock_ptr() common
helper for simplicity.

> nilfs2: unify type of key arguments in bmap interface
> nilfs2: add bmap function to seek a valid key
> nilfs2: add helper to find existent block on metadata file
> nilfs2: improve execution time of NILFS_IOCTL_GET_CPINFO ioctl

These improve execution time of the ioctl for checkpoint listing,
which gets worse as the file system ages.

Example:
  [The current implementation]
  $ time lscp
       CNO        DATE     TIME  MODE  FLG  BLKCNT   ICNT
   5769303  2015-02-22 19:31:33   cp-  108  1
   5769304  2015-02-22 19:38:54   cp-  108  1

  real    0m0.182s
  user    0m0.003s
  sys     0m0.180s

  [With the patchset]
  $ time lscp
       CNO        DATE     TIME  MODE  FLG  BLKCNT   ICNT
   5769303  2015-02-22 19:31:33   cp-  108  1
   5769304  2015-02-22 19:38:54   cp-  108  1

  real    0m0.003s
  user    0m0.001s
  sys     0m0.002s

Thanks,
Ryusuke Konishi
--

 fs/nilfs2/alloc.c   |  5 ++--
 fs/nilfs2/bmap.c| 48 +-
 fs/nilfs2/bmap.h| 13 +++
 fs/nilfs2/btree.c   | 66 +
 fs/nilfs2/cpfile.c  | 58 +-
 fs/nilfs2/direct.c  | 17 ++
 fs/nilfs2/inode.c   |  6 ++---
 fs/nilfs2/mdt.c | 54 +++
 fs/nilfs2/mdt.h | 10 ++--
 fs/nilfs2/page.c| 24 ---
 fs/nilfs2/segment.c | 17 +++---
 11 files changed, 266 insertions(+), 52 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 4/7] nilfs2: unify type of key arguments in bmap interface

2015-03-12 Thread Ryusuke Konishi

The type of key arguments in block mapping interface varies depending
on function.  For instance, nilfs_bmap_lookup_at_level() takes "__u64"
for its key argument whereas nilfs_bmap_lookup() takes "unsigned
long".

This changes them all to "__u64" to eliminate the variation.

Signed-off-by: Ryusuke Konishi 
---
 fs/nilfs2/alloc.c |  5 +++--
 fs/nilfs2/bmap.c  | 17 ++---
 fs/nilfs2/bmap.h  |  8 
 fs/nilfs2/inode.c |  6 +++---
 4 files changed, 16 insertions(+), 20 deletions(-)

diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 741fd02..8df0f3b 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -405,13 +405,14 @@ nilfs_palloc_rest_groups_in_desc_block(const struct inode 
*inode,
 static int nilfs_palloc_count_desc_blocks(struct inode *inode,
unsigned long *desc_blocks)
 {
-   unsigned long blknum;
+   __u64 blknum;
int ret;
 
ret = nilfs_bmap_last_key(NILFS_I(inode)->i_bmap, &blknum);
if (likely(!ret))
*desc_blocks = DIV_ROUND_UP(
-   blknum, NILFS_MDT(inode)->mi_blocks_per_desc_block);
+   (unsigned long)blknum,
+   NILFS_MDT(inode)->mi_blocks_per_desc_block);
return ret;
 }
 
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index aadbd0b..c82f436 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -152,9 +152,7 @@ static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, 
__u64 key, __u64 ptr)
  *
  * %-EEXIST - A record associated with @key already exist.
  */
-int nilfs_bmap_insert(struct nilfs_bmap *bmap,
- unsigned long key,
- unsigned long rec)
+int nilfs_bmap_insert(struct nilfs_bmap *bmap, __u64 key, unsigned long rec)
 {
int ret;
 
@@ -191,19 +189,16 @@ static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, 
__u64 key)
return bmap->b_ops->bop_delete(bmap, key);
 }
 
-int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key)
+int nilfs_bmap_last_key(struct nilfs_bmap *bmap, __u64 *keyp)
 {
-   __u64 lastkey;
int ret;
 
down_read(&bmap->b_sem);
-   ret = bmap->b_ops->bop_last_key(bmap, &lastkey);
+   ret = bmap->b_ops->bop_last_key(bmap, keyp);
up_read(&bmap->b_sem);
 
if (ret < 0)
ret = nilfs_bmap_convert_error(bmap, __func__, ret);
-   else
-   *key = lastkey;
return ret;
 }
 
@@ -224,7 +219,7 @@ int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned 
long *key)
  *
  * %-ENOENT - A record associated with @key does not exist.
  */
-int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key)
+int nilfs_bmap_delete(struct nilfs_bmap *bmap, __u64 key)
 {
int ret;
 
@@ -235,7 +230,7 @@ int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned 
long key)
return nilfs_bmap_convert_error(bmap, __func__, ret);
 }
 
-static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key)
+static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, __u64 key)
 {
__u64 lastkey;
int ret;
@@ -276,7 +271,7 @@ static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, 
unsigned long key)
  *
  * %-ENOMEM - Insufficient amount of memory available.
  */
-int nilfs_bmap_truncate(struct nilfs_bmap *bmap, unsigned long key)
+int nilfs_bmap_truncate(struct nilfs_bmap *bmap, __u64 key)
 {
int ret;
 
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index b89e680..9230d33 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -153,10 +153,10 @@ int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
 int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
 void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *);
 int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned);
-int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long);
-int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long);
-int nilfs_bmap_last_key(struct nilfs_bmap *, unsigned long *);
-int nilfs_bmap_truncate(struct nilfs_bmap *, unsigned long);
+int nilfs_bmap_insert(struct nilfs_bmap *bmap, __u64 key, unsigned long rec);
+int nilfs_bmap_delete(struct nilfs_bmap *bmap, __u64 key);
+int nilfs_bmap_last_key(struct nilfs_bmap *bmap, __u64 *keyp);
+int nilfs_bmap_truncate(struct nilfs_bmap *bmap, __u64 key);
 void nilfs_bmap_clear(struct nilfs_bmap *);
 int nilfs_bmap_propagate(struct nilfs_bmap *, struct buffer_head *);
 void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *, struct list_head *);
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 8b59695..cf9e489 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -106,7 +106,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
err = nilfs_transaction_begin(inode->i_sb, &ti, 1);
if (unlikely(err))

[PATCH 2/7] nilfs2: use set_mask_bits() for operations on buffer state bitmap

2015-03-12 Thread Ryusuke Konishi

nilfs_forget_buffer(), nilfs_clear_dirty_page(), and
nilfs_segctor_complete_write() are using a bunch of atomic bit
operations against buffer state bitmap.

This reduces the number of those operations by utilizing the set_mask_bits() macro.

Signed-off-by: Ryusuke Konishi 
---
 fs/nilfs2/page.c| 24 ++--
 fs/nilfs2/segment.c | 14 --
 2 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 700ecbc..45d650a 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -89,18 +89,16 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
 void nilfs_forget_buffer(struct buffer_head *bh)
 {
struct page *page = bh->b_page;
+   const unsigned long clear_bits =
+   (1 << BH_Uptodate | 1 << BH_Dirty | 1 << BH_Mapped |
+1 << BH_Async_Write | 1 << BH_NILFS_Volatile |
+1 << BH_NILFS_Checked | 1 << BH_NILFS_Redirected);
 
lock_buffer(bh);
-   clear_buffer_nilfs_volatile(bh);
-   clear_buffer_nilfs_checked(bh);
-   clear_buffer_nilfs_redirected(bh);
-   clear_buffer_async_write(bh);
-   clear_buffer_dirty(bh);
+   set_mask_bits(&bh->b_state, clear_bits, 0);
if (nilfs_page_buffers_clean(page))
__nilfs_clear_page_dirty(page);
 
-   clear_buffer_uptodate(bh);
-   clear_buffer_mapped(bh);
bh->b_blocknr = -1;
ClearPageUptodate(page);
ClearPageMappedToDisk(page);
@@ -421,6 +419,10 @@ void nilfs_clear_dirty_page(struct page *page, bool silent)
 
if (page_has_buffers(page)) {
struct buffer_head *bh, *head;
+   const unsigned long clear_bits =
+   (1 << BH_Uptodate | 1 << BH_Dirty | 1 << BH_Mapped |
+1 << BH_Async_Write | 1 << BH_NILFS_Volatile |
+1 << BH_NILFS_Checked | 1 << BH_NILFS_Redirected);
 
bh = head = page_buffers(page);
do {
@@ -430,13 +432,7 @@ void nilfs_clear_dirty_page(struct page *page, bool silent)
"discard block %llu, size %zu",
(u64)bh->b_blocknr, bh->b_size);
}
-   clear_buffer_async_write(bh);
-   clear_buffer_dirty(bh);
-   clear_buffer_nilfs_volatile(bh);
-   clear_buffer_nilfs_checked(bh);
-   clear_buffer_nilfs_redirected(bh);
-   clear_buffer_uptodate(bh);
-   clear_buffer_mapped(bh);
+   set_mask_bits(&bh->b_state, clear_bits, 0);
unlock_buffer(bh);
} while (bh = bh->b_this_page, bh != head);
}
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index c9a4e60..c6abbad9 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -1785,12 +1786,13 @@ static void nilfs_segctor_complete_write(struct 
nilfs_sc_info *sci)
 */
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
-   set_buffer_uptodate(bh);
-   clear_buffer_dirty(bh);
-   clear_buffer_async_write(bh);
-   clear_buffer_delay(bh);
-   clear_buffer_nilfs_volatile(bh);
-   clear_buffer_nilfs_redirected(bh);
+   const unsigned long set_bits = (1 << BH_Uptodate);
+   const unsigned long clear_bits =
+   (1 << BH_Dirty | 1 << BH_Async_Write |
+1 << BH_Delay | 1 << BH_NILFS_Volatile |
+1 << BH_NILFS_Redirected);
+
+   set_mask_bits(&bh->b_state, clear_bits, set_bits);
if (bh == segbuf->sb_super_root) {
if (bh->b_page != bd_page) {
end_page_writeback(bd_page);
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 5/7] nilfs2: add bmap function to seek a valid key

2015-03-12 Thread Ryusuke Konishi

Add a new bmap function, nilfs_bmap_seek_key(), which seeks a valid
entry and returns its key starting from a given key.  This function
can be used to skip hole blocks efficiently.

Signed-off-by: Ryusuke Konishi 
---
 fs/nilfs2/bmap.c   | 31 +
 fs/nilfs2/bmap.h   |  5 -
 fs/nilfs2/btree.c  | 66 ++
 fs/nilfs2/direct.c | 17 ++
 4 files changed, 118 insertions(+), 1 deletion(-)

diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index c82f436..27f75bc 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -189,6 +189,37 @@ static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, 
__u64 key)
return bmap->b_ops->bop_delete(bmap, key);
 }
 
+/**
+ * nilfs_bmap_seek_key - seek a valid entry and return its key
+ * @bmap: bmap struct
+ * @start: start key number
+ * @keyp: place to store valid key
+ *
+ * Description: nilfs_bmap_seek_key() seeks a valid key on @bmap
+ * starting from @start, and stores it to @keyp if found.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - No valid entry was found
+ */
+int nilfs_bmap_seek_key(struct nilfs_bmap *bmap, __u64 start, __u64 *keyp)
+{
+   int ret;
+
+   down_read(&bmap->b_sem);
+   ret = bmap->b_ops->bop_seek_key(bmap, start, keyp);
+   up_read(&bmap->b_sem);
+
+   if (ret < 0)
+   ret = nilfs_bmap_convert_error(bmap, __func__, ret);
+   return ret;
+}
+
 int nilfs_bmap_last_key(struct nilfs_bmap *bmap, __u64 *keyp)
 {
int ret;
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index 9230d33..bfa817c 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -76,8 +76,10 @@ struct nilfs_bmap_operations {
  union nilfs_binfo *);
int (*bop_mark)(struct nilfs_bmap *, __u64, int);
 
-   /* The following functions are internal use only. */
+   int (*bop_seek_key)(const struct nilfs_bmap *, __u64, __u64 *);
int (*bop_last_key)(const struct nilfs_bmap *, __u64 *);
+
+   /* The following functions are internal use only. */
int (*bop_check_insert)(const struct nilfs_bmap *, __u64);
int (*bop_check_delete)(struct nilfs_bmap *, __u64);
int (*bop_gather_data)(struct nilfs_bmap *, __u64 *, __u64 *, int);
@@ -155,6 +157,7 @@ void nilfs_bmap_write(struct nilfs_bmap *, struct 
nilfs_inode *);
 int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned);
 int nilfs_bmap_insert(struct nilfs_bmap *bmap, __u64 key, unsigned long rec);
 int nilfs_bmap_delete(struct nilfs_bmap *bmap, __u64 key);
+int nilfs_bmap_seek_key(struct nilfs_bmap *bmap, __u64 start, __u64 *keyp);
 int nilfs_bmap_last_key(struct nilfs_bmap *bmap, __u64 *keyp);
 int nilfs_bmap_truncate(struct nilfs_bmap *bmap, __u64 key);
 void nilfs_bmap_clear(struct nilfs_bmap *);
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index ecdbae1..841d177 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -633,6 +633,44 @@ static int nilfs_btree_do_lookup_last(const struct 
nilfs_bmap *btree,
return 0;
 }
 
+/**
+ * nilfs_btree_get_next_key - get next valid key from btree path array
+ * @btree: bmap struct of btree
+ * @path: array of nilfs_btree_path struct
+ * @minlevel: start level
+ * @nextkey: place to store the next valid key
+ *
+ * Return Value: If a next key was found, 0 is returned. Otherwise,
+ * -ENOENT is returned.
+ */
+static int nilfs_btree_get_next_key(const struct nilfs_bmap *btree,
+   const struct nilfs_btree_path *path,
+   int minlevel, __u64 *nextkey)
+{
+   struct nilfs_btree_node *node;
+   int maxlevel = nilfs_btree_height(btree) - 1;
+   int index, next_adj, level;
+
+   /* Next index is already set to bp_index for leaf nodes. */
+   next_adj = 0;
+   for (level = minlevel; level <= maxlevel; level++) {
+   if (level == maxlevel)
+   node = nilfs_btree_get_root(btree);
+   else
+   node = nilfs_btree_get_nonroot_node(path, level);
+
+   index = path[level].bp_index + next_adj;
+   if (index < nilfs_btree_node_get_nchildren(node)) {
+   /* Next key is in this node */
+   *nextkey = nilfs_btree_node_get_key(node, index);
+   return 0;
+   }
+   /* For non-leaf nodes, next index is stored at bp_index + 1. */
+   next_adj = 1;
+   }
+   return -ENOENT;
+}
+
 static int nilfs_btree_lookup(const struct nilfs_bmap *btree,
  __u64 key, int level, __u64 *ptrp)
 {
@@ -1563,6 +1601,30 @@ out:
return ret;
 }
 
+static int nilfs_btree_seek_key(const st

[PATCH 6/7] nilfs2: add helper to find existent block on metadata file

2015-03-12 Thread Ryusuke Konishi

Add a new metadata file function, nilfs_mdt_find_block(), which finds
an existent block on a metadata file in a given range of blocks.  This
function skips continuous hole blocks efficiently by using
nilfs_bmap_seek_key().

Signed-off-by: Ryusuke Konishi 
---
 fs/nilfs2/mdt.c | 54 ++
 fs/nilfs2/mdt.h |  3 +++
 2 files changed, 57 insertions(+)

diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 892cf5f..dee34d9 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -261,6 +261,60 @@ int nilfs_mdt_get_block(struct inode *inode, unsigned long 
blkoff, int create,
 }
 
 /**
+ * nilfs_mdt_find_block - find and get a buffer on meta data file.
+ * @inode: inode of the meta data file
+ * @start: start block offset (inclusive)
+ * @end: end block offset (inclusive)
+ * @blkoff: block offset
+ * @out_bh: place to store a pointer to buffer_head struct
+ *
+ * nilfs_mdt_find_block() looks up an existing block in range of
+ * [@start, @end] and stores pointer to a buffer head of the block to
+ * @out_bh, and block offset to @blkoff, respectively.  @out_bh and
+ * @blkoff are substituted only when zero is returned.
+ *
+ * Return Value: On success, it returns 0. On error, the following negative
+ * error code is returned.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ *
+ * %-EIO - I/O error
+ *
+ * %-ENOENT - no block was found in the range
+ */
+int nilfs_mdt_find_block(struct inode *inode, unsigned long start,
+unsigned long end, unsigned long *blkoff,
+struct buffer_head **out_bh)
+{
+   __u64 next;
+   int ret;
+
+   if (unlikely(start > end))
+   return -ENOENT;
+
+   ret = nilfs_mdt_read_block(inode, start, true, out_bh);
+   if (!ret) {
+   *blkoff = start;
+   goto out;
+   }
+   if (unlikely(ret != -ENOENT || start == ULONG_MAX))
+   goto out;
+
+   ret = nilfs_bmap_seek_key(NILFS_I(inode)->i_bmap, start + 1, &next);
+   if (!ret) {
+   if (next <= end) {
+   ret = nilfs_mdt_read_block(inode, next, true, out_bh);
+   if (!ret)
+   *blkoff = next;
+   } else {
+   ret = -ENOENT;
+   }
+   }
+out:
+   return ret;
+}
+
+/**
  * nilfs_mdt_delete_block - make a hole on the meta data file.
  * @inode: inode of the meta data file
  * @block: block offset
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index a294ea3..fe529a8 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -78,6 +78,9 @@ int nilfs_mdt_get_block(struct inode *, unsigned long, int,
void (*init_block)(struct inode *,
   struct buffer_head *, void *),
struct buffer_head **);
+int nilfs_mdt_find_block(struct inode *inode, unsigned long start,
+unsigned long end, unsigned long *blkoff,
+struct buffer_head **out_bh);
 int nilfs_mdt_delete_block(struct inode *, unsigned long);
 int nilfs_mdt_forget_block(struct inode *, unsigned long);
 int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 3/7] nilfs2: use bgl_lock_ptr()

2015-03-12 Thread Ryusuke Konishi

Simplify nilfs_mdt_bgl_lock() by utilizing bgl_lock_ptr() helper in
<linux/blockgroup_lock.h>.

Signed-off-by: Ryusuke Konishi 
---
 fs/nilfs2/mdt.h | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index ab172e8..a294ea3 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -111,7 +111,10 @@ static inline __u64 nilfs_mdt_cno(struct inode *inode)
return ((struct the_nilfs *)inode->i_sb->s_fs_info)->ns_cno;
 }
 
-#define nilfs_mdt_bgl_lock(inode, bg) \
-   (&NILFS_MDT(inode)->mi_bgl->locks[(bg) & (NR_BG_LOCKS-1)].lock)
+static inline spinlock_t *
+nilfs_mdt_bgl_lock(struct inode *inode, unsigned int block_group)
+{
+   return bgl_lock_ptr(NILFS_MDT(inode)->mi_bgl, block_group);
+}
 
 #endif /* _NILFS_MDT_H */
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 0/9] nilfs2: implementation of cost-benefit GC policy

2015-03-12 Thread Ryusuke Konishi

Hi Andreas,

On Tue, 10 Mar 2015 21:37:50 +0100, Andreas Rohner wrote:
> Hi Ryusuke,
> 
> Thanks for your thorough review.
> 
> On 2015-03-10 06:21, Ryusuke Konishi wrote:
>> Hi Andreas,
>> 
>> I looked through whole kernel patches and a part of util patches.
>> Overall comments are as follows:
>> 
>> [Algorithm]
>> As for algorithm, it looks about OK except for the starvation
>> countermeasure.  The starvation countermeasure looks ad hoc/hacky, but
>> it's good that it doesn't change kernel/userland interface; we may be
>> able to replace it with better ways in a future or in a revised
>> version of this patchset.
>> 
>> (1) Drawback of the starvation countermeasure
>> The patch 9/9 looks to make the execution time of chcp operation
>> worse since it will scan through sufile to modify live block
>> counters.  How much does it prolong the execution time ?
> 
> I'll do some tests, but I haven't noticed any significant performance
> drop. The GC basically does the same thing, every time it selects
> segments to reclaim.

GC is performed in background by an independent process.  What I
care about is that the NILFS_IOCTL_CHANGE_CPMODE ioctl is called from
a command line interface or application.  They differ in this respect.

Was a worst-case scenario considered in the test?

For example:
1. Fill a TB class drive with data file(s), and make a snapshot on it.
2. Run one pass GC to update snapshot block counts.
3. And do "chcp cp"

If we don't observe noticeable delay on this class of drive, then I
think we can put the problem off.

>> In a use case of nilfs, many snapshots are created and they are
>> automatically changed back to plain checkpoints because old
>> snapshots are thinned out over time.  The patch 9/9 may impact on
>> such usage.
>>
>> (2) Compatibility
>> What will happen in the following case:
>> 1. Create a file system, use it with the new module, and
>>create snapshots.
>> 2. Mount it with an old module, and release snapshot with "chcp cp"
>> 3. Mount it with the new module, and cleanerd runs gc with
>>cost benefit or greedy policy.
> 
> Some segments could be subject to starvation. But it would probably only
> affect a small number of segments and it could be fixed by "chcp ss
> ; chcp cp ".

Ok, let's treat this as a restriction for now.
If you come up with any good idea, please propose.

>> (3) Durability against unexpected power failures (just a note)
>> The current patchset looks not to cause starvation issue even when
>> unexpected power failure occurs during or after executing "chcp
>> cp" because nilfs_ioctl_change_cpmode() do changes in a
>> transactional way with nilfs_transaction_begin/commit.
>> We should always consider this kind of situation to keep consistency.
>> 
>> [Coding Style]
>> (4) This patchset has several coding style issues. Please fix them and
>> re-check with the latest checkpatch script (script/checkpatch.pl).
> 
> I'll fix that. Sorry.
> 
>> patch 2:
>> WARNING: Prefer kmalloc_array over kmalloc with multiply
>> #85: FILE: fs/nilfs2/sufile.c:1192:
>> +mc->mc_mods = kmalloc(capacity * sizeof(struct nilfs_sufile_mod),
>> 
>> patch 5,6:
>> WARNING: 'aquired' may be misspelled - perhaps 'acquired'?
>> #60: 
>> the same semaphore has to be aquired. So if the DAT-Entry belongs to
>> 
>> WARNING: 'aquired' may be misspelled - perhaps 'acquired'?
>> #46: 
>> be aquired, which blocks the entire SUFILE and effectively turns
>> 
>> WARNING: 'aquired' may be misspelled - perhaps 'acquired'?
>> #53: 
>> afore mentioned lock only needs to be aquired, if the cache is full
>> 
>> (5) sub_sizeof macro:
>> The same definition exists as offsetofend() in vfio.h,
>> and a patch to move it to stddef.h is now proposed.
>> 
>> Please use the same name, and redefine it only if it's not
>> defined:
>> 
>> #ifndef offsetofend
>> #define offsetofend(TYPE, MEMBER) \
>> (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))
>> #endif
> 
> Ok I'll change that.
> 
>> [Implementation]
>> (6) b_blocknr
>> Please do not use bh->b_blocknr to store disk block number.  This
>> field is used to keep virtual block number except for DAT files.
>> It is only replaced to an actual block number during calling
>> submit_bh().  Keep this policy.
&

Re: [PATCH 1/9] nilfs2: refactor nilfs_sufile_updatev()

2015-03-12 Thread Ryusuke Konishi

m)
>  {
> - return nilfs_sufile_update(sufile, segnum, 0, nilfs_sufile_do_free);
> + return nilfs_sufile_update(sufile, &segnum, 0, 0,
> +(void *)nilfs_sufile_do_free);
>  }

ditto

>  /**
> @@ -98,8 +100,8 @@ static inline int nilfs_sufile_free(struct inode *sufile, 
> __u64 segnum)
>  static inline int nilfs_sufile_freev(struct inode *sufile, __u64 *segnumv,
>size_t nsegs, size_t *ndone)
>  {
> - return nilfs_sufile_updatev(sufile, segnumv, nsegs, 0, ndone,
> - nilfs_sufile_do_free);
> + return nilfs_sufile_updatev(sufile, segnumv, sizeof(__u64), 0, nsegs,
> + 0, ndone, (void *)nilfs_sufile_do_free);
>  }

ditto

>  /**
> @@ -116,8 +118,9 @@ static inline int nilfs_sufile_cancel_freev(struct inode 
> *sufile,
>   __u64 *segnumv, size_t nsegs,
>   size_t *ndone)
>  {
> - return nilfs_sufile_updatev(sufile, segnumv, nsegs, 0, ndone,
> - nilfs_sufile_do_cancel_free);
> + return nilfs_sufile_updatev(sufile, segnumv, sizeof(__u64), 0, nsegs,
> + 0, ndone,
> + (void *)nilfs_sufile_do_cancel_free);
>  }

ditto

>  /**
> @@ -139,8 +142,8 @@ static inline int nilfs_sufile_cancel_freev(struct inode 
> *sufile,
>   */
>  static inline int nilfs_sufile_set_error(struct inode *sufile, __u64 segnum)
>  {
> - return nilfs_sufile_update(sufile, segnum, 0,
> -nilfs_sufile_do_set_error);
> + return nilfs_sufile_update(sufile, &segnum, 0, 0,
> +(void *)nilfs_sufile_do_set_error);
>  }
>  
>  #endif   /* _NILFS_SUFILE_H */

ditto


Regards,
Ryusuke Konishi

> -- 
> 2.3.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 0/9] nilfs2: implementation of cost-benefit GC policy

2015-03-09 Thread Ryusuke Konishi

t in turn
triggers sufile.

This also helps to simplify nilfs_dat_commit_end(), to which the
patchset added two arguments for the dead block counting.
I mean, "dead" argument and "count_blocks" argument can be unified by
    changing meaning of the "dead" argument.


I will add detail comments for patches tonight or another day.

Regards,
Ryusuke Konishi

On Wed, 25 Feb 2015 09:18:04 +0900 (JST), Ryusuke Konishi wrote:
> Hi Andreas,
> 
> Thank you for posting this proposal!
> 
> I would like to have time to review this series through, but please
> wait for several days. (This week I'm quite busy until weekend)
> 
> Thanks,
> Ryusuke Konishi
> 
> On Tue, 24 Feb 2015 20:01:35 +0100, Andreas Rohner wrote:
>> Hi everyone!
>> 
>> One of the biggest performance problems of NILFS is its
>> inefficient Timestamp GC policy. This patch set introduces two new GC
>> policies, namely Cost-Benefit and Greedy.
>> 
>> The Cost-Benefit policy is nothing new. It has been around for a long
>> time with log-structured file systems [1]. But it relies on accurate
>> information, about the number of live blocks in a segment. NILFS
>> currently does not provide the necessary information. So this patch set
>> extends the entries in the SUFILE to include a counter for the number of
>> live blocks. This counter is decremented whenever a file is deleted or
>> overwritten.
>> 
>> Except for some tricky parts, the counting of live blocks is quite
>> trivial. The problem is snapshots. At any time, a checkpoint can be
>> turned into a snapshot or vice versa. So blocks that are reclaimable at
>> one point in time, are protected by a snapshot a moment later.
>> 
>> This patch set does not try to track snapshots at all. Instead it uses a
>> heuristic approach to prevent the worst case scenario. The performance
>> is still significantly better than timestamp for my benchmarks.
>> 
>> The worst case scenario is, the following:
>> 
>> 1. Segment 1 is written
>> 2. Snapshot is created
>> 3. GC tries to reclaim Segment 1, but all blocks are protected
>>by the Snapshot. The GC has to set the number of live blocks
>>to maximum to avoid reclaiming this Segment again in the near future.
>> 4. Snapshot is deleted
>> 5. Segment 1 is reclaimable, but its counter is so high, that the GC
>>will never try to reclaim it again.
>> 
>> To prevent this kind of starvation I use another field in the SUFILE
>> entry, to store the number of blocks that are protected by a snapshot.
>> This value is just a heuristic and it is usually set to 0. Only if the
>> GC reclaims a segment, it is written to the SUFILE entry. The GC has to
>> check for snapshots anyway, so we get this information for free. By
>> storing this information in the SUFILE we can avoid starvation in the
>> following way:
>> 
>> 1. Segment 1 is written
>> 2. Snapshot is created
>> 3. GC tries to reclaim Segment 1, but all blocks are protected
>>by the Snapshot. The GC has to set the number of live blocks
>>to maximum to avoid reclaiming this Segment again in the near future.
>> 4. GC sets the number of snapshot blocks in Segment 1 in the SUFILE
>>entry
>> 5. Snapshot is deleted
>> 6. On Snapshot deletion we walk through every entry in the SUFILE and
>>reduce the number of live blocks to half, if the number of snapshot
>>blocks is bigger than half of the maximum.
>> 7. Segment 1 is reclaimable and the number of live blocks entry is at
>>half the maximum. The GC will try to reclaim this segment as soon as
>>there are no other better choices.
>> 
>> BENCHMARKS:
>> ---
>> 
>> My benchmark is quite simple. It consists of a process, that replays
>> real NFS traces at a faster speed. It thereby creates relatively
>> realistic patterns of file creation and deletions. At the same time
>> multiple snapshots are created and deleted in parallel. I use a 100GB
>> partition of a Samsung SSD:
>> 
>> WITH SNAPSHOTS EVERY 5 MINUTES:
>> 
>> Execution time   Wear (Data written to disk)
>> Timestamp:  100% 100%
>> Cost-Benefit:   80%  43%
>> 
>> NO SNAPSHOTS:
>> -
>> Execution time   Wear (Data written to disk)
>> Timestamp:  100% 100%
>> Cost-Benefit:   70%  45%
>> 
&

Re: [systemd-devel] nilfs-cleanerd startup on boot

2015-03-03 Thread Ryusuke Konishi

On 2015/03/04 10:11, dennis.mur...@wipro.com wrote:

-Original Message-
From: linux-nilfs-ow...@vger.kernel.org [mailto:linux-nilfs-
ow...@vger.kernel.org] On Behalf Of Ryusuke Konishi
Sent: Tuesday, March 03, 2015 6:18 PM
To: Dennis Murata (WT01 - ENU); lenn...@poettering.net
Cc: systemd-de...@lists.freedesktop.org; linux-nilfs@vger.kernel.org
Subject: Re: [systemd-devel] nilfs-cleanerd startup on boot

Hi

On 2015/03/04 0:44, dennis.mur...@wipro.com wrote:

I had mis-typed the address for the nilfs mail group

-Original Message-
From: Lennart Poettering [mailto:lenn...@poettering.net]
Sent: Saturday, February 28, 2015 12:34 PM
To: Dennis Murata (WT01 - ENU)
Cc: systemd-de...@lists.freedesktop.org; linus-ni...@vger.kernel.org
Subject: Re: [systemd-devel] nilfs-cleanerd startup on boot

On Fri, 27.02.15 18:31, dennis.mur...@wipro.com
(dennis.mur...@wipro.com) wrote:

I have a fedora 21 system that where I mount an nilfs2 file system.
I use a simple /etc/modules-load.d/nilfs.conf file to load the
kernel module and have an entry in the fstab.

Creating the modules-load.d snippet should not be necessary, as the
kernel should autoload the kernel module for it when it is first required.

I did not find this to be the case for fedora 21.

  > Without creating the file to load the module, any attempt I made to mount

the file system would get a unknown filetype error.  Does this point at  >

adding this module to the initrd file?

Is "nilfs2.ko" installed in your environment?

I had to add the kernel-modules-extra package that you list below.  This is 
probably why I also had to add the file in modules-load.d to get this module 
loaded

Try "modinfo nilfs2"

Older fedora needed kernel-modules-extra package. [1]

[1] http://nilfs.sourceforge.net/en/pkg_fedora.html

The file system mounts on boot as it should, but the nilfs-cleanerd
program does not startup.  If I umount /nilfs then mount /nilfs the
nilfs-cleanerd program starts as it should to cleanup the checkpoints.

How is that daemon supposed to be started? Is it forked off /bin/mount?

Does systemd use a different mount program at boot?

It uses /bin/mount for mounting normal file systems.

nilfs_cleanerd is invoked through /sbin/mount.nilfs2 helper. [2] The helper is
called from /sbin/mount if it exists.

What is confusing to me, is an umount then a mount will start the 
nilfs_cleanerd process so it does exist on the system.  I had expected it to be 
started as soon as the file system was mounted the first time.

Curious.  Is /sbin/mount.nilfs2 called in the first mount?

If mount.nilfs2 is unavailable at the early stage, the next best way
is to use rw-remount (i.e. mount -t nilfs2 -o rw,remount 
).  It would also run cleanerd instead of umount && mount.

Regards,
Ryusuke Konishi

/sbin/mount.nilfs2 is included in nilfs-utils package.

nilfs_cleanerd is just a user-land process, so it can be manually invoked if you
have root privilege. [3]

# /sbin/nilfs_cleanerd  

But, in this case, you need to kill nilfs_cleanerd manually before umount.  So, 
I
recommend running cleanerd through mount.nilfs2.

The above explanation may not suit for the recent fedora since nilfs-utils is 
not
yet tuned to systemd environment.

[2] http://nilfs.sourceforge.net/en/man8/mount.nilfs2.8.html
[3] http://nilfs.sourceforge.net/en/man8/nilfs_cleanerd.8.html

Regards,
Ryusuke Konishi

Is there something else that should be included other than the
nilfs.conf file?  I have just started using a system with systemd as
the init so please forgive my ignorance.

I have no idea about nilfs really, and we had no reports about any
problems with it before.

I wanted to look at the performance of nilfs and f2fs.

  > This is my first try at using these file systems

Lennart

--
Lennart Poettering, Red Hat

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs"
in the body of a message to majord...@vger.kernel.org More majordomo
info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in the 
body
of a message to majord...@vger.kernel.org More majordomo info at
http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [systemd-devel] nilfs-cleanerd startup on boot

2015-03-03 Thread Ryusuke Konishi

Hi

On 2015/03/04 0:44, dennis.mur...@wipro.com wrote:

I had mis-typed the address for the nilfs mail group

-Original Message-
From: Lennart Poettering [mailto:lenn...@poettering.net]
Sent: Saturday, February 28, 2015 12:34 PM
To: Dennis Murata (WT01 - ENU)
Cc: systemd-de...@lists.freedesktop.org; linus-ni...@vger.kernel.org
Subject: Re: [systemd-devel] nilfs-cleanerd startup on boot

On Fri, 27.02.15 18:31, dennis.mur...@wipro.com
(dennis.mur...@wipro.com) wrote:

I have a fedora 21 system that where I mount an nilfs2 file system.
I use a simple /etc/modules-load.d/nilfs.conf file to load the kernel
module and have an entry in the fstab.

Creating the modules-load.d snippet should not be necessary, as the kernel
should autoload the kernel module for it when it is first required.

I did not find this to be the case for fedora 21.

> Without creating the file to load the module, any attempt I made to mount
> the file system would get a unknown filetype error.  Does this point at
> adding this module to the initrd file?

Is "nilfs2.ko" installed in your environment?

Try "modinfo nilfs2"

Older fedora needed kernel-modules-extra package. [1]

[1] http://nilfs.sourceforge.net/en/pkg_fedora.html

The file system mounts on boot as it should, but the nilfs-cleanerd
program does not startup.  If I umount /nilfs then mount /nilfs the
nilfs-cleanerd program starts as it should to cleanup the checkpoints.

How is that daemon supposed to be started? Is it forked off /bin/mount?

Does systemd use a different mount program at boot?

It uses /bin/mount for mounting normal file systems.

nilfs_cleanerd is invoked through /sbin/mount.nilfs2 helper. [2]
The helper is called from /sbin/mount if it exists.

/sbin/mount.nilfs2 is included in nilfs-utils package.

nilfs_cleanerd is just a user-land process, so it can be
manually invoked if you have root privilege. [3]

  # /sbin/nilfs_cleanerd  

But, in this case, you need to kill nilfs_cleanerd
manually before umount.  So, I recommend running cleanerd
through mount.nilfs2.

The above explanation may not suit for the recent fedora
since nilfs-utils is not yet tuned to systemd environment.

[2] http://nilfs.sourceforge.net/en/man8/mount.nilfs2.8.html
[3] http://nilfs.sourceforge.net/en/man8/nilfs_cleanerd.8.html

Regards,
Ryusuke Konishi

Is there something else that should be included other than the
nilfs.conf file?  I have just started using a system with systemd as
the init so please forgive my ignorance.

I have no idea about nilfs really, and we had no reports about any problems
with it before.

I wanted to look at the performance of nilfs and f2fs.

> This is my first try at using these file systems

Lennart

--
Lennart Poettering, Red Hat

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 1/1] nilfs2: fix deadlock of segment constructor during recovery

2015-03-03 Thread Ryusuke Konishi

According to a report from Yuxuan Shui, nilfs2 in kernel 3.19 got
stuck during recovery at mount time.  The code path that caused the
deadlock was as follows:

  nilfs_fill_super()
load_nilfs()
  nilfs_salvage_orphan_logs()
* Do roll-forwarding, attach segment constructor for recovery,
  and kick it.

nilfs_segctor_thread()
  nilfs_segctor_thread_construct()
   * A lock is held with nilfs_transaction_lock()
 nilfs_segctor_do_construct()
   nilfs_segctor_drop_written_files()
 iput()
   iput_final()
 write_inode_now()
   writeback_single_inode()
 __writeback_single_inode()
   do_writepages()
 nilfs_writepage()
   nilfs_construct_dsync_segment()
 nilfs_transaction_lock() --> deadlock

This can happen if commit 7ef3ff2fea8b ("nilfs2: fix deadlock of
segment constructor over I_SYNC flag") is applied and roll-forward
recovery was performed at mount time.  The roll-forward recovery can
happen if datasync write is done and the file system crashes
immediately after that.  For instance, we can reproduce the issue with
the following steps:

 < nilfs2 is mounted on /nilfs (device: /dev/sdb1) >
 # dd if=/dev/zero of=/nilfs/test bs=4k count=1 && sync
 # dd if=/dev/zero of=/nilfs/test conv=notrunc oflag=dsync bs=4k
 count=1 && reboot -nfh
 < the system will immediately reboot >
 # mount -t nilfs2 /dev/sdb1 /nilfs

The deadlock occurs because iput() can run segment constructor through
writeback_single_inode() if MS_ACTIVE flag is not set on sb->s_flags.
The above commit changed segment constructor so that it calls iput()
asynchronously for inodes with i_nlink == 0, but that change was
imperfect.

This fixes another deadlock by deferring iput() in segment
constructor even for the case that mount is not finished, that is, for
the case that MS_ACTIVE flag is not set.

Reported-by: Yuxuan Shui 
Signed-off-by: Ryusuke Konishi 
Cc: Al Viro 
Tested-by: Ryusuke Konishi 
Cc: sta...@vger.kernel.org
---
 fs/nilfs2/segment.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 469086b..0c3f303 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1907,6 +1907,7 @@ static void nilfs_segctor_drop_written_files(struct 
nilfs_sc_info *sci,
 struct the_nilfs *nilfs)
 {
struct nilfs_inode_info *ii, *n;
+   int during_mount = !(sci->sc_super->s_flags & MS_ACTIVE);
int defer_iput = false;
 
spin_lock(&nilfs->ns_inode_lock);
@@ -1919,10 +1920,10 @@ static void nilfs_segctor_drop_written_files(struct 
nilfs_sc_info *sci,
brelse(ii->i_bh);
ii->i_bh = NULL;
list_del_init(&ii->i_dirty);
-   if (!ii->vfs_inode.i_nlink) {
+   if (!ii->vfs_inode.i_nlink || during_mount) {
/*
-* Defer calling iput() to avoid a deadlock
-* over I_SYNC flag for inodes with i_nlink == 0
+* Defer calling iput() to avoid deadlocks if
+* i_nlink == 0 or mount is not yet finished.
 */
list_add_tail(&ii->i_dirty, &sci->sc_iput_queue);
defer_iput = true;
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 0/1] nilfs2: fix deadlock of segment constructor during recovery

2015-03-03 Thread Ryusuke Konishi

Hi Andrew,

Please send the following bug fix to upstream.  It fixes another
deadlock issue of nilfs2 segment constructor, which was recently
reported.

Thanks,
Ryusuke Konishi
--
Ryusuke Konishi (1):
  nilfs2: fix deadlock of segment constructor during recovery

 fs/nilfs2/segment.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Making Nilfs ZAC Compliant

2015-02-27 Thread Ryusuke Konishi

Hi,
On Thu, 26 Feb 2015 19:54:48 +0000, Benixon Dhas wrote:
> Hi All,
> 
> We are trying to make Nilfs work with a SMR Device which adheres to
> Zoned ATA Commands(ZAC) Specification.  One of the restrictions in
> the specification is reading an unwritten part of the Zone(Segment
> in Nilfs) will cause a read error.
> 
> We observe that Nilfs does not write a complete physical segment(we
> use 256MB segment) always. After digging in the source a while we
> figured that this is due to the fact that Nilfs requires a certain
> number of minimum blocks for constructing a partial segment
> (NILFS_PSEG_MIN_BLOCKS), which currently is 2.  So we see some
> segments where the last block (in our case a block is 4k) is not
> being written to.

For recovery and GC, NILFS needs to insert one or more header blocks
before writing payload blocks.  Inevitably, the minimum size of a
partial segment becomes 2.

> When some utilities like garbage collector and dump segment reads
> (May not be an exhaustive list) a segment it tries to read the
> entire physical segment. This causes read errors in the kernel and
> hence retries for the last unwritten block in certain segments.

The recovery function of NILFS also needs to read entire physical
segment.  It never reads unwritten blocks if the file system was
cleanly unmounted, however, this is not the case for unclean shutdown
or panic.

Worse yet, if it gets an EIO from the underlying block layer, the
recovery will fail and the mount system call will abort.

> In an attempt to solve this problem we were trying to figure out if
> we can write some dummy data to the remaining unutilized blocks in
> the segment. But we are not sure what would be the best way to do
> this.
> 
> Another solution we had in mind was to figure out all places where
> segments are read, and modify it to prevent it from reading
> unwritten blocks. But we feel this might be more complex solution
> and might impact performance more.

Looks like sufile is available for this purpose.  It is maintaining
how many blocks are written for each segment.  You can see it in the
NBLOCKS field of the output of lssu command.

One restriction is that this metadata file (sufile) is unavailable
until mount system call succeeds.  The recovery code cannot use it.

> Please advise us on the best way to solve the problem. Also what
> would be architecturally a best place to fix the problem.

Writing dummy data to the dead space for SMR devices looks better to
me because it's simpler and the performance penalty seems not so high.

But what will happen if an unexpected power failure hits the device?
Does that cause the file system to read unwritten blocks?

If so, it seems that we need a translation layer to hide these issues,
or a new error code or a new mechanism to make it possible for file
systems to know about and handle them.

Regards,
Ryusuke Konishi
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 0/9] nilfs2: implementation of cost-benefit GC policy

2015-02-24 Thread Ryusuke Konishi

Hi Andreas,

Thank you for posting this proposal!

I would like to take the time to review this series thoroughly, so
please wait a few days.  (I'm quite busy this week until the weekend.)

Thanks,
Ryusuke Konishi

On Tue, 24 Feb 2015 20:01:35 +0100, Andreas Rohner wrote:
> Hi everyone!
> 
> One of the biggest performance problems of NILFS is its
> inefficient Timestamp GC policy. This patch set introduces two new GC
> policies, namely Cost-Benefit and Greedy.
> 
> The Cost-Benefit policy is nothing new. It has been around for a long
> time with log-structured file systems [1]. But it relies on accurate
> information, about the number of live blocks in a segment. NILFS
> currently does not provide the necessary information. So this patch set
> extends the entries in the SUFILE to include a counter for the number of
> live blocks. This counter is decremented whenever a file is deleted or
> overwritten.
> 
> Except for some tricky parts, the counting of live blocks is quite
> trivial. The problem is snapshots. At any time, a checkpoint can be
> turned into a snapshot or vice versa. So blocks that are reclaimable at
> one point in time, are protected by a snapshot a moment later.
> 
> This patch set does not try to track snapshots at all. Instead it uses a
> heuristic approach to prevent the worst case scenario. The performance
> is still significantly better than timestamp for my benchmarks.
> 
> The worst case scenario is the following:
> 
> 1. Segment 1 is written
> 2. Snapshot is created
> 3. GC tries to reclaim Segment 1, but all blocks are protected
>by the Snapshot. The GC has to set the number of live blocks
>to maximum to avoid reclaiming this Segment again in the near future.
> 4. Snapshot is deleted
> 5. Segment 1 is reclaimable, but its counter is so high, that the GC
>will never try to reclaim it again.
> 
> To prevent this kind of starvation I use another field in the SUFILE
> entry, to store the number of blocks that are protected by a snapshot.
> This value is just a heuristic and it is usually set to 0. Only if the
> GC reclaims a segment, it is written to the SUFILE entry. The GC has to
> check for snapshots anyway, so we get this information for free. By
> storing this information in the SUFILE we can avoid starvation in the
> following way:
> 
> 1. Segment 1 is written
> 2. Snapshot is created
> 3. GC tries to reclaim Segment 1, but all blocks are protected
>by the Snapshot. The GC has to set the number of live blocks
>to maximum to avoid reclaiming this Segment again in the near future.
> 4. GC sets the number of snapshot blocks in Segment 1 in the SUFILE
>entry
> 5. Snapshot is deleted
> 6. On Snapshot deletion we walk through every entry in the SUFILE and
>reduce the number of live blocks to half, if the number of snapshot
>blocks is bigger than half of the maximum.
> 7. Segment 1 is reclaimable and the number of live blocks entry is at
>half the maximum. The GC will try to reclaim this segment as soon as
>there are no other better choices.
> 
> BENCHMARKS:
> ---
> 
> My benchmark is quite simple. It consists of a process, that replays
> real NFS traces at a faster speed. It thereby creates relatively
> realistic patterns of file creation and deletions. At the same time
> multiple snapshots are created and deleted in parallel. I use a 100GB
> partition of a Samsung SSD:
> 
> WITH SNAPSHOTS EVERY 5 MINUTES:
> 
> Execution time   Wear (Data written to disk)
> Timestamp:  100% 100%
> Cost-Benefit:   80%  43%
> 
> NO SNAPSHOTS:
> -
> Execution time   Wear (Data written to disk)
> Timestamp:  100% 100%
> Cost-Benefit:   70%  45%
> 
> I plan on adding more benchmark results soon.
> 
> Best regards,
> Andreas Rohner
> 
> [1] Mendel Rosenblum and John K. Ousterhout. The design and implementa-
> tion of a log-structured file system. ACM Trans. Comput. Syst.,
> 10(1):26–52, February 1992.
> 
> Andreas Rohner (9):
>   nilfs2: refactor nilfs_sufile_updatev()
>   nilfs2: add simple cache for modifications to SUFILE
>   nilfs2: extend SUFILE on-disk format to enable counting of live blocks
>   nilfs2: add function to modify su_nlive_blks
>   nilfs2: add simple tracking of block deletions and updates
>   nilfs2: use modification cache to improve performance
>   nilfs2: add additional flags for nilfs_vdesc
>   nilfs2: improve accuracy and correct for invalid GC values
>   nilfs2: prevent starvation of segments prot

[PATCH 3/4] nilfs2: add helper to find existent block on metadata file

2015-02-22 Thread Ryusuke Konishi

Add a new metadata file function, nilfs_mdt_find_block(), which finds
an existent block on a metadata file in a given range of blocks.  This
function skips continuous hole blocks efficiently by using
nilfs_bmap_seek_key().

Signed-off-by: Ryusuke Konishi 
---
 fs/nilfs2/mdt.c | 54 ++
 fs/nilfs2/mdt.h |  3 +++
 2 files changed, 57 insertions(+)

diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index c4dcd1d..23bedab 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -261,6 +261,60 @@ int nilfs_mdt_get_block(struct inode *inode, unsigned long 
blkoff, int create,
 }
 
 /**
+ * nilfs_mdt_find_block - find and get a buffer on meta data file.
+ * @inode: inode of the meta data file
+ * @start: start block offset (inclusive)
+ * @end: end block offset (inclusive)
+ * @blkoff: block offset
+ * @out_bh: place to store a pointer to buffer_head struct
+ *
+ * nilfs_mdt_find_block() looks up an existing block in range of
+ * [@start, @end] and stores pointer to a buffer head of the block to
+ * @out_bh, and block offset to @blkoff, respectively.  @out_bh and
+ * @blkoff are substituted only when zero is returned.
+ *
+ * Return Value: On success, it returns 0. On error, the following negative
+ * error code is returned.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ *
+ * %-EIO - I/O error
+ *
+ * %-ENOENT - no block was found in the range
+ */
+int nilfs_mdt_find_block(struct inode *inode, unsigned long start,
+unsigned long end, unsigned long *blkoff,
+struct buffer_head **out_bh)
+{
+   __u64 next;
+   int ret;
+
+   if (unlikely(start > end))
+   return -ENOENT;
+
+   ret = nilfs_mdt_read_block(inode, start, true, out_bh);
+   if (!ret) {
+   *blkoff = start;
+   goto out;
+   }
+   if (unlikely(ret != -ENOENT || start == ULONG_MAX))
+   goto out;
+
+   ret = nilfs_bmap_seek_key(NILFS_I(inode)->i_bmap, start + 1, &next);
+   if (!ret) {
+   if (next <= end) {
+   ret = nilfs_mdt_read_block(inode, next, true, out_bh);
+   if (!ret)
+   *blkoff = next;
+   } else {
+   ret = -ENOENT;
+   }
+   }
+out:
+   return ret;
+}
+
+/**
  * nilfs_mdt_delete_block - make a hole on the meta data file.
  * @inode: inode of the meta data file
  * @block: block offset
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index a294ea3..fe529a8 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -78,6 +78,9 @@ int nilfs_mdt_get_block(struct inode *, unsigned long, int,
void (*init_block)(struct inode *,
   struct buffer_head *, void *),
struct buffer_head **);
+int nilfs_mdt_find_block(struct inode *inode, unsigned long start,
+unsigned long end, unsigned long *blkoff,
+struct buffer_head **out_bh);
 int nilfs_mdt_delete_block(struct inode *, unsigned long);
 int nilfs_mdt_forget_block(struct inode *, unsigned long);
 int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 1/4] nilfs2: unify type of key arguments in bmap interface

2015-02-22 Thread Ryusuke Konishi

The type of key arguments in block mapping interface varies depending
on function.  For instance, nilfs_bmap_lookup_at_level() takes "__u64"
for its key argument whereas nilfs_bmap_lookup() takes "unsigned
long".

This unifies them to "__u64" to eliminate the variation.

Signed-off-by: Ryusuke Konishi 
---
 fs/nilfs2/alloc.c |  5 +++--
 fs/nilfs2/bmap.c  | 17 ++---
 fs/nilfs2/bmap.h  |  8 
 fs/nilfs2/inode.c |  6 +++---
 4 files changed, 16 insertions(+), 20 deletions(-)

diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 741fd02..8df0f3b 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -405,13 +405,14 @@ nilfs_palloc_rest_groups_in_desc_block(const struct inode 
*inode,
 static int nilfs_palloc_count_desc_blocks(struct inode *inode,
unsigned long *desc_blocks)
 {
-   unsigned long blknum;
+   __u64 blknum;
int ret;
 
ret = nilfs_bmap_last_key(NILFS_I(inode)->i_bmap, &blknum);
if (likely(!ret))
*desc_blocks = DIV_ROUND_UP(
-   blknum, NILFS_MDT(inode)->mi_blocks_per_desc_block);
+   (unsigned long)blknum,
+   NILFS_MDT(inode)->mi_blocks_per_desc_block);
return ret;
 }
 
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index aadbd0b..c82f436 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -152,9 +152,7 @@ static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, 
__u64 key, __u64 ptr)
  *
  * %-EEXIST - A record associated with @key already exist.
  */
-int nilfs_bmap_insert(struct nilfs_bmap *bmap,
- unsigned long key,
- unsigned long rec)
+int nilfs_bmap_insert(struct nilfs_bmap *bmap, __u64 key, unsigned long rec)
 {
int ret;
 
@@ -191,19 +189,16 @@ static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, 
__u64 key)
return bmap->b_ops->bop_delete(bmap, key);
 }
 
-int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key)
+int nilfs_bmap_last_key(struct nilfs_bmap *bmap, __u64 *keyp)
 {
-   __u64 lastkey;
int ret;
 
down_read(&bmap->b_sem);
-   ret = bmap->b_ops->bop_last_key(bmap, &lastkey);
+   ret = bmap->b_ops->bop_last_key(bmap, keyp);
up_read(&bmap->b_sem);
 
if (ret < 0)
ret = nilfs_bmap_convert_error(bmap, __func__, ret);
-   else
-   *key = lastkey;
return ret;
 }
 
@@ -224,7 +219,7 @@ int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned 
long *key)
  *
  * %-ENOENT - A record associated with @key does not exist.
  */
-int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key)
+int nilfs_bmap_delete(struct nilfs_bmap *bmap, __u64 key)
 {
int ret;
 
@@ -235,7 +230,7 @@ int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned 
long key)
return nilfs_bmap_convert_error(bmap, __func__, ret);
 }
 
-static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key)
+static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, __u64 key)
 {
__u64 lastkey;
int ret;
@@ -276,7 +271,7 @@ static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, 
unsigned long key)
  *
  * %-ENOMEM - Insufficient amount of memory available.
  */
-int nilfs_bmap_truncate(struct nilfs_bmap *bmap, unsigned long key)
+int nilfs_bmap_truncate(struct nilfs_bmap *bmap, __u64 key)
 {
int ret;
 
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index b89e680..9230d33 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -153,10 +153,10 @@ int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
 int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
 void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *);
 int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned);
-int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long);
-int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long);
-int nilfs_bmap_last_key(struct nilfs_bmap *, unsigned long *);
-int nilfs_bmap_truncate(struct nilfs_bmap *, unsigned long);
+int nilfs_bmap_insert(struct nilfs_bmap *bmap, __u64 key, unsigned long rec);
+int nilfs_bmap_delete(struct nilfs_bmap *bmap, __u64 key);
+int nilfs_bmap_last_key(struct nilfs_bmap *bmap, __u64 *keyp);
+int nilfs_bmap_truncate(struct nilfs_bmap *bmap, __u64 key);
 void nilfs_bmap_clear(struct nilfs_bmap *);
 int nilfs_bmap_propagate(struct nilfs_bmap *, struct buffer_head *);
 void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *, struct list_head *);
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 8b59695..cf9e489 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -106,7 +106,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
err = nilfs_transaction_begin(inode->i_sb, &ti, 1);
if (unlikely(err))

[PATCH 4/4] nilfs2: improve execution time of NILFS_IOCTL_GET_CPINFO ioctl

2015-02-22 Thread Ryusuke Konishi

The older a filesystem gets, the slower lscp command becomes.  This is
because nilfs_cpfile_do_get_cpinfo() function meets more hole blocks
as the start offset of valid checkpoint numbers gets bigger.

This reduces the overhead by skipping hole blocks efficiently with
nilfs_mdt_find_block() helper.

A measurement result of this patch is as follows:

Before:
$ time lscp
     CNO        DATE     TIME  MODE  FLG  BLKCNT   ICNT
 5769303  2015-02-22 19:31:33    cp    -     108      1
 5769304  2015-02-22 19:38:54    cp    -     108      1

real    0m0.182s
user    0m0.003s
sys     0m0.180s

After:
$ time lscp
     CNO        DATE     TIME  MODE  FLG  BLKCNT   ICNT
 5769303  2015-02-22 19:31:33    cp    -     108      1
 5769304  2015-02-22 19:38:54    cp    -     108      1

real    0m0.003s
user    0m0.001s
sys     0m0.002s

Signed-off-by: Ryusuke Konishi 
---
 fs/nilfs2/cpfile.c | 58 --
 1 file changed, 52 insertions(+), 6 deletions(-)

diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 0d58075..b6596ca 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -53,6 +53,13 @@ nilfs_cpfile_get_offset(const struct inode *cpfile, __u64 
cno)
return do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile));
 }
 
+static __u64 nilfs_cpfile_first_checkpoint_in_block(const struct inode *cpfile,
+   unsigned long blkoff)
+{
+   return (__u64)nilfs_cpfile_checkpoints_per_block(cpfile) * blkoff
+   + 1 - NILFS_MDT(cpfile)->mi_first_entry_offset;
+}
+
 static unsigned long
 nilfs_cpfile_checkpoints_in_block(const struct inode *cpfile,
  __u64 curr,
@@ -146,6 +153,44 @@ static inline int nilfs_cpfile_get_checkpoint_block(struct 
inode *cpfile,
   create, nilfs_cpfile_block_init, bhp);
 }
 
+/**
+ * nilfs_cpfile_find_checkpoint_block - find and get a buffer on cpfile
+ * @cpfile: inode of cpfile
+ * @start_cno: start checkpoint number (inclusive)
+ * @end_cno: end checkpoint number (inclusive)
+ * @cnop: place to store the next checkpoint number
+ * @bhp: place to store a pointer to buffer_head struct
+ *
+ * Return Value: On success, it returns 0. On error, the following negative
+ * error code is returned.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ *
+ * %-EIO - I/O error
+ *
+ * %-ENOENT - no block exists in the range.
+ */
+static int nilfs_cpfile_find_checkpoint_block(struct inode *cpfile,
+ __u64 start_cno, __u64 end_cno,
+ __u64 *cnop,
+ struct buffer_head **bhp)
+{
+   unsigned long start, end, blkoff;
+   int ret;
+
+   if (unlikely(start_cno > end_cno))
+   return -ENOENT;
+
+   start = nilfs_cpfile_get_blkoff(cpfile, start_cno);
+   end = nilfs_cpfile_get_blkoff(cpfile, end_cno);
+
+   ret = nilfs_mdt_find_block(cpfile, start, end, &blkoff, bhp);
+   if (!ret)
+   *cnop = (blkoff == start) ? start_cno :
+   nilfs_cpfile_first_checkpoint_in_block(cpfile, blkoff);
+   return ret;
+}
+
 static inline int nilfs_cpfile_delete_checkpoint_block(struct inode *cpfile,
   __u64 cno)
 {
@@ -403,14 +448,15 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode 
*cpfile, __u64 *cnop,
return -ENOENT; /* checkpoint number 0 is invalid */
down_read(&NILFS_MDT(cpfile)->mi_sem);
 
-   for (n = 0; cno < cur_cno && n < nci; cno += ncps) {
-   ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno);
-   ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh);
+   for (n = 0; n < nci; cno += ncps) {
+   ret = nilfs_cpfile_find_checkpoint_block(
+   cpfile, cno, cur_cno - 1, &cno, &bh);
if (ret < 0) {
-   if (ret != -ENOENT)
-   goto out;
-   continue; /* skip hole */
+   if (likely(ret == -ENOENT))
+   break;
+   goto out;
}
+   ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno);
 
kaddr = kmap_atomic(bh->b_page);
cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 0/4] nilfs2: shorten execution time of lscp command

2015-02-22 Thread Ryusuke Konishi

The older a filesystem gets, the slower lscp command becomes.  This is
because nilfs_cpfile_do_get_cpinfo() function meets more hole blocks
as the start offset of valid checkpoint numbers gets bigger.

This series introduces some helper functions which help to skip hole
blocks efficiently, and reduces the overhead with them.

A measurement result of this series is as follows:

Before:
$ time lscp
     CNO        DATE     TIME  MODE  FLG  BLKCNT   ICNT
 5769303  2015-02-22 19:31:33    cp    -     108      1
 5769304  2015-02-22 19:38:54    cp    -     108      1

real    0m0.182s
user    0m0.003s
sys     0m0.180s

After:
$ time lscp
     CNO        DATE     TIME  MODE  FLG  BLKCNT   ICNT
 5769303  2015-02-22 19:31:33    cp    -     108      1
 5769304  2015-02-22 19:38:54    cp    -     108      1

real    0m0.003s
user    0m0.001s
sys     0m0.002s


Thanks,
Ryusuke Konishi
--
Ryusuke Konishi (4):
  nilfs2: unify type of key arguments in bmap interface
  nilfs2: add bmap function to seek a valid key
  nilfs2: add helper to find existent block on metadata file
  nilfs2: improve execution time of NILFS_IOCTL_GET_CPINFO ioctl

 fs/nilfs2/alloc.c  |  5 +++--
 fs/nilfs2/bmap.c   | 48 ++-
 fs/nilfs2/bmap.h   | 13 ++-
 fs/nilfs2/btree.c  | 66 ++
 fs/nilfs2/cpfile.c | 58 ++-
 fs/nilfs2/direct.c | 17 ++
 fs/nilfs2/inode.c  |  6 ++---
 fs/nilfs2/mdt.c| 54 
 fs/nilfs2/mdt.h|  3 +++
 9 files changed, 243 insertions(+), 27 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 2/4] nilfs2: add bmap function to seek a valid key

2015-02-22 Thread Ryusuke Konishi

Add a new bmap function, nilfs_bmap_seek_key(), which seeks a valid
entry and returns its key starting from a given key.  This function
can be used to skip hole blocks efficiently.

Signed-off-by: Ryusuke Konishi 
---
 fs/nilfs2/bmap.c   | 31 +
 fs/nilfs2/bmap.h   |  5 -
 fs/nilfs2/btree.c  | 66 ++
 fs/nilfs2/direct.c | 17 ++
 4 files changed, 118 insertions(+), 1 deletion(-)

diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index c82f436..27f75bc 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -189,6 +189,37 @@ static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, 
__u64 key)
return bmap->b_ops->bop_delete(bmap, key);
 }
 
+/**
+ * nilfs_bmap_seek_key - seek a valid entry and return its key
+ * @bmap: bmap struct
+ * @start: start key number
+ * @keyp: place to store valid key
+ *
+ * Description: nilfs_bmap_seek_key() seeks a valid key on @bmap
+ * starting from @start, and stores it to @keyp if found.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - No valid entry was found
+ */
+int nilfs_bmap_seek_key(struct nilfs_bmap *bmap, __u64 start, __u64 *keyp)
+{
+   int ret;
+
+   down_read(&bmap->b_sem);
+   ret = bmap->b_ops->bop_seek_key(bmap, start, keyp);
+   up_read(&bmap->b_sem);
+
+   if (ret < 0)
+   ret = nilfs_bmap_convert_error(bmap, __func__, ret);
+   return ret;
+}
+
 int nilfs_bmap_last_key(struct nilfs_bmap *bmap, __u64 *keyp)
 {
int ret;
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index 9230d33..bfa817c 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -76,8 +76,10 @@ struct nilfs_bmap_operations {
  union nilfs_binfo *);
int (*bop_mark)(struct nilfs_bmap *, __u64, int);
 
-   /* The following functions are internal use only. */
+   int (*bop_seek_key)(const struct nilfs_bmap *, __u64, __u64 *);
int (*bop_last_key)(const struct nilfs_bmap *, __u64 *);
+
+   /* The following functions are internal use only. */
int (*bop_check_insert)(const struct nilfs_bmap *, __u64);
int (*bop_check_delete)(struct nilfs_bmap *, __u64);
int (*bop_gather_data)(struct nilfs_bmap *, __u64 *, __u64 *, int);
@@ -155,6 +157,7 @@ void nilfs_bmap_write(struct nilfs_bmap *, struct 
nilfs_inode *);
 int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned);
 int nilfs_bmap_insert(struct nilfs_bmap *bmap, __u64 key, unsigned long rec);
 int nilfs_bmap_delete(struct nilfs_bmap *bmap, __u64 key);
+int nilfs_bmap_seek_key(struct nilfs_bmap *bmap, __u64 start, __u64 *keyp);
 int nilfs_bmap_last_key(struct nilfs_bmap *bmap, __u64 *keyp);
 int nilfs_bmap_truncate(struct nilfs_bmap *bmap, __u64 key);
 void nilfs_bmap_clear(struct nilfs_bmap *);
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index ecdbae1..841d177 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -633,6 +633,44 @@ static int nilfs_btree_do_lookup_last(const struct 
nilfs_bmap *btree,
return 0;
 }
 
+/**
+ * nilfs_btree_get_next_key - get next valid key from btree path array
+ * @btree: bmap struct of btree
+ * @path: array of nilfs_btree_path struct
+ * @minlevel: start level
+ * @nextkey: place to store the next valid key
+ *
+ * Return Value: If a next key was found, 0 is returned. Otherwise,
+ * -ENOENT is returned.
+ */
+static int nilfs_btree_get_next_key(const struct nilfs_bmap *btree,
+   const struct nilfs_btree_path *path,
+   int minlevel, __u64 *nextkey)
+{
+   struct nilfs_btree_node *node;
+   int maxlevel = nilfs_btree_height(btree) - 1;
+   int index, next_adj, level;
+
+   /* Next index is already set to bp_index for leaf nodes. */
+   next_adj = 0;
+   for (level = minlevel; level <= maxlevel; level++) {
+   if (level == maxlevel)
+   node = nilfs_btree_get_root(btree);
+   else
+   node = nilfs_btree_get_nonroot_node(path, level);
+
+   index = path[level].bp_index + next_adj;
+   if (index < nilfs_btree_node_get_nchildren(node)) {
+   /* Next key is in this node */
+   *nextkey = nilfs_btree_node_get_key(node, index);
+   return 0;
+   }
+   /* For non-leaf nodes, next index is stored at bp_index + 1. */
+   next_adj = 1;
+   }
+   return -ENOENT;
+}
+
 static int nilfs_btree_lookup(const struct nilfs_bmap *btree,
  __u64 key, int level, __u64 *ptrp)
 {
@@ -1563,6 +1601,30 @@ out:
return ret;
 }
 
+static int nilfs_btree_seek_key(const st

Re: [PATCH 1/1] nilfs2: fix potential memory overrun on inode

2015-02-20 Thread Ryusuke Konishi

On Fri, 20 Feb 2015 18:00:55 -0800, Andrew Morton wrote:
> On Sat, 21 Feb 2015 10:13:28 +0900 (JST) Ryusuke Konishi 
>  wrote:
> 
>> I've got a warning from 0day kernel testing backend:
>> 
>> fs/nilfs2/btree.c: In function 'nilfs_btree_root_broken':
>> >> fs/nilfs2/btree.c:394:3: warning: format '%lu' expects argument of type 
>> >> 'long unsigned int', but argument 2 has type 'ino_t' [-Wformat=]
>>pr_crit("NILFS: bad btree root (inode number=%lu): level = %d, 
>> flags = 0x%x, nchildren = %d\n",
>>^
>> 
>> This is output for s390 arch since ino_t doesn't mean "unsigned long"
>> in s390.
> 
> alpha uses uint for ino_t as well.
> 
> It seems a bit pointless - neither arch uses ino_t in ./arch/ code.  I
> suspect both could switch to ulong, which would make the world a
> slightly better place.

I entirely agree.

Regards,
Ryusuke Konishi
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 1/1] nilfs2: fix potential memory overrun on inode

2015-02-20 Thread Ryusuke Konishi

On Sat, 21 Feb 2015 09:22:08 +0900 (JST), Ryusuke Konishi wrote:
> On Fri, 20 Feb 2015 13:58:42 -0800, Andrew Morton wrote:
>> On Fri, 20 Feb 2015 22:46:35 +0900 Ryusuke Konishi 
>>  wrote:
>> 
>>> Each inode of nilfs2 stores a root node of a b-tree, and it turned out
>>> to have a memory overrun issue:
>>> 
>>> Each b-tree node of nilfs2 stores a set of key-value pairs and the
>>> number of them (in "bn_nchildren" member of nilfs_btree_node struct),
>>> as well as a few other "bn_*" members.
>>> 
>>> Since the value of "bn_nchildren" is used for operations on the
>>> key-values within the b-tree node, it can cause memory access overrun
>>> if a large number is incorrectly set to "bn_nchildren".
>>> 
>>> For instance, nilfs_btree_node_lookup() function determines the range
>>> of binary search with it, and too large "bn_nchildren" leads
>>> nilfs_btree_node_get_key() in that function to overrun.
>>> 
>>> As for intermediate b-tree nodes, this is prevented by a sanity check
>>> performed when each node is read from a drive, however, no sanity
>>> check has been done for root nodes stored in inodes.
>>> 
>>> This patch fixes the issue by adding missing sanity check against
>>> b-tree root nodes so that it's called when on-memory inodes are read
>>> from ifile, inode metadata file.
>> 
>> How would one trigger this overrun?  Mount an fs with a deliberately
>> corrupted/inconsistent fs image?
> 
> Yes, this can be triggered by mounting an fs with a corrupted image
> deliberately or by chance.
> 
>> Memory overrun sounds nasty so I'm thinking we add cc:stable to this
>> one.  OK?
> 
> Agreed.

Could you apply the following amendment ?

I've got a warning from 0day kernel testing backend:

fs/nilfs2/btree.c: In function 'nilfs_btree_root_broken':
>> fs/nilfs2/btree.c:394:3: warning: format '%lu' expects argument of type 
>> 'long unsigned int', but argument 2 has type 'ino_t' [-Wformat=]
   pr_crit("NILFS: bad btree root (inode number=%lu): level = %d, 
flags = 0x%x, nchildren = %d\n",
   ^

This warning is emitted for the s390 architecture because ino_t is not
"unsigned long" on s390.

Thanks,
Ryusuke Konishi
--
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index c645d7c..ecdbae1 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -378,7 +378,7 @@ static int nilfs_btree_node_broken(const struct 
nilfs_btree_node *node,
  * Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned.
  */
 static int nilfs_btree_root_broken(const struct nilfs_btree_node *node,
-  ino_t ino)
+  unsigned long ino)
 {
int level, flags, nchildren;
int ret = 0;
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 1/1] nilfs2: fix potential memory overrun on inode

2015-02-20 Thread Ryusuke Konishi

On Fri, 20 Feb 2015 13:58:42 -0800, Andrew Morton wrote:
> On Fri, 20 Feb 2015 22:46:35 +0900 Ryusuke Konishi 
>  wrote:
> 
>> Each inode of nilfs2 stores a root node of a b-tree, and it turned out
>> to have a memory overrun issue:
>> 
>> Each b-tree node of nilfs2 stores a set of key-value pairs and the
>> number of them (in "bn_nchildren" member of nilfs_btree_node struct),
>> as well as a few other "bn_*" members.
>> 
>> Since the value of "bn_nchildren" is used for operations on the
>> key-values within the b-tree node, it can cause memory access overrun
>> if a large number is incorrectly set to "bn_nchildren".
>> 
>> For instance, nilfs_btree_node_lookup() function determines the range
>> of binary search with it, and too large "bn_nchildren" leads
>> nilfs_btree_node_get_key() in that function to overrun.
>> 
>> As for intermediate b-tree nodes, this is prevented by a sanity check
>> performed when each node is read from a drive, however, no sanity
>> check has been done for root nodes stored in inodes.
>> 
>> This patch fixes the issue by adding missing sanity check against
>> b-tree root nodes so that it's called when on-memory inodes are read
>> from ifile, inode metadata file.
> 
> How would one trigger this overrun?  Mount an fs with a deliberately
> corrupted/inconsistent fs image?

Yes, this can be triggered by mounting an fs with a corrupted image
deliberately or by chance.

> Memory overrun sounds nasty so I'm thinking we add cc:stable to this
> one.  OK?

Agreed.

Ryusuke Konishi
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 1/1] nilfs2: fix potential memory overrun on inode

2015-02-20 Thread Ryusuke Konishi

Each inode of nilfs2 stores a root node of a b-tree, and it turned out
to have a memory overrun issue:

Each b-tree node of nilfs2 stores a set of key-value pairs and the
number of them (in "bn_nchildren" member of nilfs_btree_node struct),
as well as a few other "bn_*" members.

Since the value of "bn_nchildren" is used for operations on the
key-values within the b-tree node, it can cause memory access overrun
if a large number is incorrectly set to "bn_nchildren".

For instance, nilfs_btree_node_lookup() function determines the range
of binary search with it, and too large "bn_nchildren" leads
nilfs_btree_node_get_key() in that function to overrun.

As for intermediate b-tree nodes, this is prevented by a sanity check
performed when each node is read from a drive, however, no sanity
check has been done for root nodes stored in inodes.

This patch fixes the issue by adding missing sanity check against
b-tree root nodes so that it's called when on-memory inodes are read
from ifile, inode metadata file.

Signed-off-by: Ryusuke Konishi 
---
 fs/nilfs2/btree.c | 47 ---
 1 file changed, 44 insertions(+), 3 deletions(-)

diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index b2e3ff3..c645d7c 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -31,6 +31,8 @@
 #include "alloc.h"
 #include "dat.h"
 
+static void __nilfs_btree_init(struct nilfs_bmap *bmap);
+
 static struct nilfs_btree_path *nilfs_btree_alloc_path(void)
 {
struct nilfs_btree_path *path;
@@ -368,6 +370,34 @@ static int nilfs_btree_node_broken(const struct 
nilfs_btree_node *node,
return ret;
 }
 
+/**
+ * nilfs_btree_root_broken - verify consistency of btree root node
+ * @node: btree root node to be examined
+ * @ino: inode number
+ *
+ * Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned.
+ */
+static int nilfs_btree_root_broken(const struct nilfs_btree_node *node,
+  ino_t ino)
+{
+   int level, flags, nchildren;
+   int ret = 0;
+
+   level = nilfs_btree_node_get_level(node);
+   flags = nilfs_btree_node_get_flags(node);
+   nchildren = nilfs_btree_node_get_nchildren(node);
+
+   if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN ||
+level > NILFS_BTREE_LEVEL_MAX ||
+nchildren < 0 ||
+nchildren > NILFS_BTREE_ROOT_NCHILDREN_MAX)) {
+   pr_crit("NILFS: bad btree root (inode number=%lu): level = %d, 
flags = 0x%x, nchildren = %d\n",
+   ino, level, flags, nchildren);
+   ret = 1;
+   }
+   return ret;
+}
+
 int nilfs_btree_broken_node_block(struct buffer_head *bh)
 {
int ret;
@@ -1713,7 +1743,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap 
*btree,
 
/* convert and insert */
dat = NILFS_BMAP_USE_VBN(btree) ? nilfs_bmap_get_dat(btree) : NULL;
-   nilfs_btree_init(btree);
+   __nilfs_btree_init(btree);
if (nreq != NULL) {
nilfs_bmap_commit_alloc_ptr(btree, dreq, dat);
nilfs_bmap_commit_alloc_ptr(btree, nreq, dat);
@@ -2294,12 +2324,23 @@ static const struct nilfs_bmap_operations 
nilfs_btree_ops_gc = {
.bop_gather_data=   NULL,
 };
 
-int nilfs_btree_init(struct nilfs_bmap *bmap)
+static void __nilfs_btree_init(struct nilfs_bmap *bmap)
 {
bmap->b_ops = &nilfs_btree_ops;
bmap->b_nchildren_per_block =
NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap));
-   return 0;
+}
+
+int nilfs_btree_init(struct nilfs_bmap *bmap)
+{
+   int ret = 0;
+
+   __nilfs_btree_init(bmap);
+
+   if (nilfs_btree_root_broken(nilfs_btree_get_root(bmap),
+   bmap->b_inode->i_ino))
+   ret = -EIO;
+   return ret;
 }
 
 void nilfs_btree_init_gc(struct nilfs_bmap *bmap)
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 0/1] nilfs2: fix potential memory overrun on inode

2015-02-20 Thread Ryusuke Konishi

Hi Andrew,

Please queue the following patch as a bug fix.  It fixes a memory
overrun issue that I recently found in the b-tree implementation of
nilfs2.

Thanks,
Ryusuke Konishi
--
Ryusuke Konishi (1):
  nilfs2: fix potential memory overrun on inode

 fs/nilfs2/btree.c | 47 ---
 1 file changed, 44 insertions(+), 3 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[ANNOUNCE] nilfs-utils 2.2.3 release

2015-02-15 Thread Ryusuke Konishi

nilfs-utils 2.2.3 was released on:

 http://nilfs.sourceforge.net/en/download.html

This release fixes a potential leak in the nilfs library and an issue
where the lscp command takes too long to output the checkpoint list in
the reverse direction.

Changes from nilfs-utils-2.2.2 are as follows:

Ryusuke Konishi (5):
  lib/nilfs.c: fix potential leak at nilfs_open()
  nilfs-utils: get rid of my_free()
  nilfs-utils: get rid of null checks before calling free()
  lscp: accelerate backward checkpoint listing
  nilfs-utils: v2.2.3 release


Ryusuke Konishi
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] lscp: accelerate backward checkpoint listing

2015-02-14 Thread Ryusuke Konishi

If the minimum checkpoint number among valid checkpoints is fairly
large, the "lscp -r" command takes a very long time:

 $ lscp -r
      CNO        DATE     TIME  MODE  FLG      BLKCNT       ICNT
  6541269  2015-02-11 18:38:30   cp    -          435          2
  6541268  2015-02-11 18:38:25   cp    -          484         51
  

This is because it tries to find younger checkpoints by tracking back
through the checkpoint list with a constant step size.

This fixes the issue by lengthening or shortening the step size
depending on whether the backward search found a younger checkpoint or
not.

This patch also inserts a dummy nilfs_get_cpinfo() call before
starting the backward search to make successive nilfs_get_cpinfo()
calls much faster.

Signed-off-by: Ryusuke Konishi 
---
 bin/lscp.c | 96 --
 1 file changed, 87 insertions(+), 9 deletions(-)

diff --git a/bin/lscp.c b/bin/lscp.c
index c855def..023be5c 100644
--- a/bin/lscp.c
+++ b/bin/lscp.c
@@ -84,6 +84,12 @@ static const struct option long_option[] = {
 #define LSCP_NCPINFO   512
 #define LSCP_MINDELTA  64  /* Minimum delta for reverse direction */
 
+enum lscp_state {
+   LSCP_INIT_ST,   /* Initial state */
+   LSCP_NORMAL_ST, /* Normal state */
+   LSCP_ACCEL_ST,  /* Accelerate state */
+   LSCP_DECEL_ST,  /* Decelerate state */
+};
 
 static __u64 param_index;
 static __u64 param_lines;
@@ -176,35 +182,107 @@ static int lscp_backward_cpinfo(struct nilfs *nilfs,
struct nilfs_cpinfo *cpi;
nilfs_cno_t sidx; /* start index (inclusive) */
nilfs_cno_t eidx; /* end index (exclusive) */
-   __u64 rest, delta;
+   nilfs_cno_t prev_head = 0;
+   __u64 rest, delta, v;
+   int state = LSCP_INIT_ST;
ssize_t n;
 
rest = param_lines && param_lines < cpstat->cs_ncps ? param_lines :
cpstat->cs_ncps;
+   if (!rest)
+   goto out;
eidx = param_index && param_index < cpstat->cs_cno ? param_index + 1 :
cpstat->cs_cno;
 
-   for ( ; rest > 0 && eidx > NILFS_CNO_MIN; eidx = sidx) {
-   delta = min_t(__u64, LSCP_NCPINFO,
- max_t(__u64, rest, LSCP_MINDELTA));
-   sidx = (eidx >= NILFS_CNO_MIN + delta) ? eidx - delta :
-   NILFS_CNO_MIN;
+recalc_delta:
+   delta = min_t(__u64, LSCP_NCPINFO, max_t(__u64, rest, LSCP_MINDELTA));
+   v = delta;
 
-   n = lscp_get_cpinfo(nilfs, sidx, NILFS_CHECKPOINT, eidx - sidx);
+   while (eidx > NILFS_CNO_MIN) {
+   if (eidx < NILFS_CNO_MIN + v || state == LSCP_INIT_ST)
+   sidx = NILFS_CNO_MIN;
+   else
+   sidx = eidx - v;
+
+   n = lscp_get_cpinfo(nilfs, sidx, NILFS_CHECKPOINT,
+   state == LSCP_NORMAL_ST ? eidx - sidx : 1);
if (n < 0)
return n;
if (!n)
break;
 
-   for (cpi = cpinfos + n - 1; cpi >= cpinfos && rest > 0; cpi--) {
+   if (state == LSCP_INIT_ST) {
+   /*
+* This state makes succesive
+* nilfs_get_cpinfo() calls much faster by
+* setting minimum checkpoint number in nilfs
+* struct.
+*/
+   if (cpinfos[0].ci_cno >= eidx)
+   goto out; /* out of range */
+   state = LSCP_NORMAL_ST;
+   continue;
+   } else if (cpinfos[0].ci_cno == prev_head) {
+   /* No younger checkpoint was found */
+
+   if (sidx == NILFS_CNO_MIN)
+   break;
+
+   /* go further back */
+   switch (state) {
+   case LSCP_NORMAL_ST:
+   state = LSCP_ACCEL_ST;
+   /* fall through */
+   case LSCP_ACCEL_ST:
+   if ((v << 1) > v)
+   v <<= 1;
+   break;
+   case LSCP_DECEL_ST:
+   state = LSCP_NORMAL_ST;
+   v = delta;
+   break;
+   }
+   eidx = sidx;
+   continue;
+   } else {
+   switch (state) {
+   case LSCP_ACCEL_ST:
+   case LSCP_DECEL_ST:
+   if (cpinfos[n - 1].ci_cno + 1 < prev_head) {
+

[PATCH] nilfs2: use bgl_lock_ptr()

2015-02-08 Thread Ryusuke Konishi

Simplify nilfs_mdt_bgl_lock() by utilizing the bgl_lock_ptr() helper in
<linux/blockgroup_lock.h>.

Signed-off-by: Ryusuke Konishi 
---
 fs/nilfs2/mdt.h | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index ab172e8..a294ea3 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -111,7 +111,10 @@ static inline __u64 nilfs_mdt_cno(struct inode *inode)
return ((struct the_nilfs *)inode->i_sb->s_fs_info)->ns_cno;
 }
 
-#define nilfs_mdt_bgl_lock(inode, bg) \
-   (&NILFS_MDT(inode)->mi_bgl->locks[(bg) & (NR_BG_LOCKS-1)].lock)
+static inline spinlock_t *
+nilfs_mdt_bgl_lock(struct inode *inode, unsigned int block_group)
+{
+   return bgl_lock_ptr(NILFS_MDT(inode)->mi_bgl, block_group);
+}
 
 #endif /* _NILFS_MDT_H */
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 1/2] nilfs2: do not use async write flag for segment summary buffers

2015-02-08 Thread Ryusuke Konishi

The async write flag is introduced to nilfs2 in the commit 7f42ec3941
"nilfs2: fix issue with race condition of competition between segments
for dirty blocks", but the flag only makes sense for data buffers and
btree node buffers.  It is not needed for segment summary buffers.

This gets rid of the latter uses to prepare for refactoring of atomic
bit operations on buffer state bitmap.

Signed-off-by: Ryusuke Konishi 
Cc: Vyacheslav Dubeyko 
---
 fs/nilfs2/segment.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 469086b..566cad8 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1588,7 +1588,6 @@ static void nilfs_segctor_prepare_write(struct 
nilfs_sc_info *sci)
 
list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
b_assoc_buffers) {
-   set_buffer_async_write(bh);
if (bh->b_page != bd_page) {
if (bd_page) {
lock_page(bd_page);
@@ -1688,7 +1687,6 @@ static void nilfs_abort_logs(struct list_head *logs, int 
err)
list_for_each_entry(segbuf, logs, sb_list) {
list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
b_assoc_buffers) {
-   clear_buffer_async_write(bh);
if (bh->b_page != bd_page) {
if (bd_page)
end_page_writeback(bd_page);
@@ -1768,7 +1766,6 @@ static void nilfs_segctor_complete_write(struct 
nilfs_sc_info *sci)
b_assoc_buffers) {
set_buffer_uptodate(bh);
clear_buffer_dirty(bh);
-   clear_buffer_async_write(bh);
if (bh->b_page != bd_page) {
if (bd_page)
end_page_writeback(bd_page);
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 0/2] nilfs2: simplify atomic bit operations on buffer state bitmap

2015-02-08 Thread Ryusuke Konishi

This series reduces the number of atomic bit operations for buffer
state bitmap in nilfs2.

Ryusuke Konishi
--
Ryusuke Konishi (2):
  nilfs2: do not use async write flag for segment summary buffers
  nilfs2: use set_mask_bits() for operations on buffer state bitmap

 fs/nilfs2/page.c| 22 ++
 fs/nilfs2/segment.c | 16 
 2 files changed, 18 insertions(+), 20 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 2/2] nilfs2: use set_mask_bits() for operations on buffer state bitmap

2015-02-08 Thread Ryusuke Konishi

nilfs_forget_buffer(), nilfs_clear_dirty_page(), and
nilfs_segctor_complete_write() are using a bunch of atomic bit
operations against buffer state bitmap.

This reduces the number of them by utilizing the set_mask_bits() macro.
The "BH_Dirty" bit is excluded from this aggregation since a
Test-and-Set bit operation is used on that bit and it's not clear
whether the replacement is safe.

Signed-off-by: Ryusuke Konishi 
---
 fs/nilfs2/page.c| 22 ++
 fs/nilfs2/segment.c | 13 -
 2 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index da27664..9d6c8b9 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -89,18 +89,17 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
 void nilfs_forget_buffer(struct buffer_head *bh)
 {
struct page *page = bh->b_page;
+   const unsigned long clear_bits =
+   (1 << BH_Uptodate | 1 << BH_Mapped | 1 << BH_Async_Write |
+1 << BH_NILFS_Volatile | 1 << BH_NILFS_Checked |
+1 << BH_NILFS_Redirected);
 
lock_buffer(bh);
-   clear_buffer_nilfs_volatile(bh);
-   clear_buffer_nilfs_checked(bh);
-   clear_buffer_nilfs_redirected(bh);
-   clear_buffer_async_write(bh);
clear_buffer_dirty(bh);
if (nilfs_page_buffers_clean(page))
__nilfs_clear_page_dirty(page);
+   set_mask_bits(&bh->b_state, clear_bits, 0);
 
-   clear_buffer_uptodate(bh);
-   clear_buffer_mapped(bh);
bh->b_blocknr = -1;
ClearPageUptodate(page);
ClearPageMappedToDisk(page);
@@ -421,6 +420,10 @@ void nilfs_clear_dirty_page(struct page *page, bool silent)
 
if (page_has_buffers(page)) {
struct buffer_head *bh, *head;
+   const unsigned long clear_bits =
+   (1 << BH_Uptodate | 1 << BH_Mapped |
+1 << BH_Async_Write | 1 << BH_NILFS_Volatile |
+1 << BH_NILFS_Checked | 1 << BH_NILFS_Redirected);
 
bh = head = page_buffers(page);
do {
@@ -430,13 +433,8 @@ void nilfs_clear_dirty_page(struct page *page, bool silent)
"discard block %llu, size %zu",
(u64)bh->b_blocknr, bh->b_size);
}
-   clear_buffer_async_write(bh);
clear_buffer_dirty(bh);
-   clear_buffer_nilfs_volatile(bh);
-   clear_buffer_nilfs_checked(bh);
-   clear_buffer_nilfs_redirected(bh);
-   clear_buffer_uptodate(bh);
-   clear_buffer_mapped(bh);
+   set_mask_bits(&bh->b_state, clear_bits, 0);
unlock_buffer(bh);
} while (bh = bh->b_this_page, bh != head);
}
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 566cad8..e93b562 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -1785,12 +1786,14 @@ static void nilfs_segctor_complete_write(struct 
nilfs_sc_info *sci)
 */
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
-   set_buffer_uptodate(bh);
+   const unsigned long set_bits = (1 << BH_Uptodate);
+   const unsigned long clear_bits =
+   (1 << BH_Async_Write | 1 << BH_Delay |
+1 << BH_NILFS_Volatile |
+1 << BH_NILFS_Redirected);
+
clear_buffer_dirty(bh);
-   clear_buffer_async_write(bh);
-   clear_buffer_delay(bh);
-   clear_buffer_nilfs_volatile(bh);
-   clear_buffer_nilfs_redirected(bh);
+   set_mask_bits(&bh->b_state, clear_bits, set_bits);
if (bh == segbuf->sb_super_root) {
if (bh->b_page != bd_page) {
end_page_writeback(bd_page);
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] nilfs2: fix deadlock of segment constructor over I_SYNC flag

2015-02-04 Thread Ryusuke Konishi

Nilfs2 eventually hangs in a stress test with fsstress program.
This issue was caused by the following deadlock over I_SYNC flag
between nilfs_segctor_thread() and writeback_sb_inodes():

  nilfs_segctor_thread()
nilfs_segctor_thread_construct()
  nilfs_segctor_unlock()
nilfs_dispose_list()
  iput()
iput_final()
  evict()
inode_wait_for_writeback()  * wait for I_SYNC flag

  writeback_sb_inodes()
 * set I_SYNC flag on inode->i_state
__writeback_single_inode()
  do_writepages()
nilfs_writepages()
  nilfs_construct_dsync_segment()
nilfs_segctor_sync()
   * wait for completion of segment constructor
inode_sync_complete()
   * clear I_SYNC flag after __writeback_single_inode() completed

writeback_sb_inodes() calls do_writepages() for dirty inodes after
setting I_SYNC flag on inode->i_state.  do_writepages() calls
nilfs_writepages(), which can run segment constructor and wait for its
completion.  On the other hand, segment constructor calls iput(),
which can call evict() and wait for the I_SYNC flag on
inode_wait_for_writeback().

Since segment constructor doesn't know when I_SYNC will be set, it
cannot know whether iput() will block or not unless inode->i_nlink has
a non-zero count.  We can prevent evict() from being called in iput()
by implementing sop->drop_inode(), but it's not preferable to leave
inodes with i_nlink == 0 for long periods because it even defers file
truncation and inode deallocation.  So, this instead resolves the
deadlock by calling iput() asynchronously with a workqueue for inodes
with i_nlink == 0.

Signed-off-by: Ryusuke Konishi 
Cc: Al Viro 
Tested-by: Ryusuke Konishi 
Cc: sta...@vger.kernel.org
---
 fs/nilfs2/nilfs.h   |  2 --
 fs/nilfs2/segment.c | 44 +++-
 fs/nilfs2/segment.h |  5 +
 3 files changed, 44 insertions(+), 7 deletions(-)

diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 91093cd..3857040 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -141,7 +141,6 @@ enum {
  * @ti_save: Backup of journal_info field of task_struct
  * @ti_flags: Flags
  * @ti_count: Nest level
- * @ti_garbage:List of inode to be put when releasing semaphore
  */
 struct nilfs_transaction_info {
u32 ti_magic;
@@ -150,7 +149,6 @@ struct nilfs_transaction_info {
   one of other filesystems has a bug. */
unsigned short  ti_flags;
unsigned short  ti_count;
-   struct list_headti_garbage;
 };
 
 /* ti_magic */
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 7ef18fc..469086b 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -305,7 +305,6 @@ static void nilfs_transaction_lock(struct super_block *sb,
ti->ti_count = 0;
ti->ti_save = cur_ti;
ti->ti_magic = NILFS_TI_MAGIC;
-   INIT_LIST_HEAD(&ti->ti_garbage);
current->journal_info = ti;
 
for (;;) {
@@ -332,8 +331,6 @@ static void nilfs_transaction_unlock(struct super_block *sb)
 
up_write(&nilfs->ns_segctor_sem);
current->journal_info = ti->ti_save;
-   if (!list_empty(&ti->ti_garbage))
-   nilfs_dispose_list(nilfs, &ti->ti_garbage, 0);
 }
 
 static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
@@ -746,6 +743,15 @@ static void nilfs_dispose_list(struct the_nilfs *nilfs,
}
 }
 
+static void nilfs_iput_work_func(struct work_struct *work)
+{
+   struct nilfs_sc_info *sci = container_of(work, struct nilfs_sc_info,
+sc_iput_work);
+   struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
+
+   nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 0);
+}
+
 static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs,
 struct nilfs_root *root)
 {
@@ -1900,8 +1906,8 @@ static int nilfs_segctor_collect_dirty_files(struct 
nilfs_sc_info *sci,
 static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
 struct the_nilfs *nilfs)
 {
-   struct nilfs_transaction_info *ti = current->journal_info;
struct nilfs_inode_info *ii, *n;
+   int defer_iput = false;
 
spin_lock(&nilfs->ns_inode_lock);
list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) {
@@ -1912,9 +1918,24 @@ static void nilfs_segctor_drop_written_files(struct 
nilfs_sc_info *sci,
clear_bit(NILFS_I_BUSY, &ii->i_state);
brelse(ii->i_bh);
ii->i_bh = NULL;
-   list_move_tail(&ii->i_dirty, &ti->ti_garbage);
+   list_del_init(&ii->i_dirty);
+   if (!ii->vfs_inode.i_nlink) {
+   /*
+

[PATCH 0/1] fix deadlock of segment constructor over I_SYNC flag

2015-02-04 Thread Ryusuke Konishi

Hi Andrew,

Please queue the following patch for the next merge window.  It fixes
a deadlock issue found in nilfs2.

Thanks,
Ryusuke Konishi
--
Ryusuke Konishi (1):
  nilfs2: fix deadlock of segment constructor over I_SYNC flag

 fs/nilfs2/nilfs.h   |  2 --
 fs/nilfs2/segment.c | 44 +++-
 fs/nilfs2/segment.h |  5 +
 3 files changed, 44 insertions(+), 7 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v2 3/3] nilfs-utils: get rid of null checks before calling free()

2015-01-18 Thread Ryusuke Konishi

Remove unnecessary null checks before calling free() function.

Signed-off-by: Ryusuke Konishi 
---
 lib/cleaner_ctl.c |  6 ++
 lib/gc.c  |  3 +--
 lib/nilfs.c   | 19 +++
 lib/realpath.c|  9 +++--
 4 files changed, 13 insertions(+), 24 deletions(-)

diff --git a/lib/cleaner_ctl.c b/lib/cleaner_ctl.c
index fa41ac1..9c458a4 100644
--- a/lib/cleaner_ctl.c
+++ b/lib/cleaner_ctl.c
@@ -226,8 +226,7 @@ static int nilfs_cleaner_find_fs(struct nilfs_cleaner 
*cleaner,
   sizeof(canonical))) {
mdev = canonical;
}
-   if (last_match_dev)
-   free(last_match_dev);
+   free(last_match_dev);
last_match_dev = strdup(mdev);
if (!last_match_dev)
goto error;
@@ -238,8 +237,7 @@ static int nilfs_cleaner_find_fs(struct nilfs_cleaner 
*cleaner,
   sizeof(canonical))) {
mdir = canonical;
}
-   if (last_match_dir)
-   free(last_match_dir);
+   free(last_match_dir);
last_match_dir = strdup(mdir);
if (!last_match_dir)
goto error;
diff --git a/lib/gc.c b/lib/gc.c
index 54d0b66..48c295a 100644
--- a/lib/gc.c
+++ b/lib/gc.c
@@ -508,8 +508,7 @@ static int nilfs_toss_vdescs(struct nilfs *nilfs,
}
ret = 0;
  out:
-   if (ss != NULL)
-   free(ss);
+   free(ss);
return ret;
 }
 
diff --git a/lib/nilfs.c b/lib/nilfs.c
index 52ddee9..30db654 100644
--- a/lib/nilfs.c
+++ b/lib/nilfs.c
@@ -435,13 +435,10 @@ out_fd:
close(nilfs->n_devfd);
if (nilfs->n_iocfd >= 0)
close(nilfs->n_iocfd);
-   if (nilfs->n_dev != NULL)
-   free(nilfs->n_dev);
-   if (nilfs->n_ioc != NULL)
-   free(nilfs->n_ioc);
-   if (nilfs->n_sb != NULL)
-   free(nilfs->n_sb);
 
+   free(nilfs->n_dev);
+   free(nilfs->n_ioc);
+   free(nilfs->n_sb);
free(nilfs);
return NULL;
 }
@@ -458,12 +455,10 @@ void nilfs_close(struct nilfs *nilfs)
close(nilfs->n_devfd);
if (nilfs->n_iocfd >= 0)
close(nilfs->n_iocfd);
-   if (nilfs->n_dev != NULL)
-   free(nilfs->n_dev);
-   if (nilfs->n_ioc != NULL)
-   free(nilfs->n_ioc);
-   if (nilfs->n_sb != NULL)
-   free(nilfs->n_sb);
+
+   free(nilfs->n_dev);
+   free(nilfs->n_ioc);
+   free(nilfs->n_sb);
free(nilfs);
 }
 
diff --git a/lib/realpath.c b/lib/realpath.c
index 3f01b87..691360b 100644
--- a/lib/realpath.c
+++ b/lib/realpath.c
@@ -133,8 +133,7 @@ myrealpath(const char *path, char *resolved_path, int 
maxreslth) {
 
/* Insert symlink contents into path. */
m = strlen(path);
-   if (buf)
-   free(buf);
+   free(buf);
buf = malloc(m + n + 1);
if (!buf) {
errno = ENOMEM;
@@ -153,12 +152,10 @@ myrealpath(const char *path, char *resolved_path, int 
maxreslth) {
/* Make sure it's null terminated. */
*npath = '\0';
 
-   if (buf)
-   free(buf);
+   free(buf);
return resolved_path;
 
  err:
-   if (buf)
-   free(buf);
+   free(buf);
return NULL;
 }
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v2 1/3] lib/nilfs.c: fix potential leak at nilfs_open()

2015-01-18 Thread Ryusuke Konishi

nilfs_open() can exit without closing nilfs->n_devfd and freeing
nilfs->n_dev and nilfs->n_sb if it first initializes a nilfs object in
the code path for NILFS_OPEN_RAW mode and then escapes through
out_nilfs label.  This fixes the leak issue.

Signed-off-by: Ryusuke Konishi 
---
 lib/nilfs.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/lib/nilfs.c b/lib/nilfs.c
index 65bf7d5..52ddee9 100644
--- a/lib/nilfs.c
+++ b/lib/nilfs.c
@@ -411,9 +411,9 @@ struct nilfs *nilfs_open(const char *dev, const char *dir, 
int flags)
(NILFS_OPEN_RDONLY | NILFS_OPEN_WRONLY | NILFS_OPEN_RDWR)) {
if (nilfs_find_fs(nilfs, dev, dir, MNTOPT_RW) < 0) {
if (!(flags & NILFS_OPEN_RDONLY))
-   goto out_nilfs;
+   goto out_fd;
if (nilfs_find_fs(nilfs, dev, dir, MNTOPT_RO) < 0)
-   goto out_nilfs;
+   goto out_fd;
}
nilfs->n_iocfd = open(nilfs->n_ioc, O_RDONLY);
if (nilfs->n_iocfd < 0)
@@ -442,7 +442,6 @@ out_fd:
if (nilfs->n_sb != NULL)
free(nilfs->n_sb);
 
-out_nilfs:
free(nilfs);
return NULL;
 }
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v2 2/3] nilfs-utils: get rid of my_free()

2015-01-18 Thread Ryusuke Konishi

Remove my_free wrapper functions used in fstab.c, mount.nilfs2.c and
umount.nilfs2.c.  They are just doing an unnecessary null check before
calling free() and eliminable since free(NULL) is just ignored.

Signed-off-by: Ryusuke Konishi 
---
 sbin/mount/fstab.c | 20 +++-
 sbin/mount/mount.nilfs2.c  | 38 +++---
 sbin/mount/mount_mntent.h  |  8 
 sbin/mount/umount.nilfs2.c | 15 ---
 4 files changed, 30 insertions(+), 51 deletions(-)

diff --git a/sbin/mount/fstab.c b/sbin/mount/fstab.c
index b0addbe..656a39b 100644
--- a/sbin/mount/fstab.c
+++ b/sbin/mount/fstab.c
@@ -124,19 +124,13 @@ fstab_head() {
return &fstab;
 }
 
-static void
-my_free(const void *s) {
-   if (s)
-   free((void *) s);
-}
-
-static void
-my_free_mc(struct mntentchn *mc) {
+static void my_free_mc(struct mntentchn *mc)
+{
if (mc) {
-   my_free(mc->m.mnt_fsname);
-   my_free(mc->m.mnt_dir);
-   my_free(mc->m.mnt_type);
-   my_free(mc->m.mnt_opts);
+   free(mc->m.mnt_fsname);
+   free(mc->m.mnt_dir);
+   free(mc->m.mnt_type);
+   free(mc->m.mnt_opts);
free(mc);
}
 }
@@ -574,7 +568,7 @@ void update_mtab(const char *dir, struct my_mntent *instead)
}
} else {
/* Replace option strings. (changed for nilfs2) */
-   my_free(mc->m.mnt_opts);
+   free(mc->m.mnt_opts);
mc->m.mnt_opts = xstrdup(instead->mnt_opts);
}
} else if (instead) {
diff --git a/sbin/mount/mount.nilfs2.c b/sbin/mount/mount.nilfs2.c
index e9cb25e..e3fc727 100644
--- a/sbin/mount/mount.nilfs2.c
+++ b/sbin/mount/mount.nilfs2.c
@@ -24,7 +24,6 @@
  * The following functions are extracted from util-linux-2.12r/mount.c:
  *  - print_one()
  *  - update_mtab_entry()
- *  - my_free()
  */
 
 #ifdef HAVE_CONFIG_H
@@ -172,13 +171,6 @@ static void handle_signal(int sig)
}
 }
 
-static inline void my_free(const void *ptr)
-{
-   /* free(NULL) is ignored; the check below is just to be sure */
-   if (ptr)
-   free((void *)ptr);
-}
-
 static int device_is_readonly(const char *device, int *ro)
 {
int fd, res;
@@ -255,7 +247,7 @@ static struct mntentchn *find_rw_mount(const char *device)
break;
mc = getmntdevbackward(fsname, mc);
}
-   my_free(fsname);
+   free(fsname);
return mc;
 }
 
@@ -275,8 +267,8 @@ static int mounted(const char *spec, const char *node)
}
mc = getmntdirbackward(dir, mc);
}
-   my_free(fsname);
-   my_free(dir);
+   free(fsname);
+   free(dir);
return ret;
 }
 
@@ -335,10 +327,10 @@ update_mtab_entry(const char *spec, const char *node, 
const char *type,
my_endmntent(mfp);
unlock_mtab();
}
-   my_free(mnt.mnt_fsname);
-   my_free(mnt.mnt_dir);
-   my_free(mnt.mnt_type);
-   my_free(mnt.mnt_opts);
+   free(mnt.mnt_fsname);
+   free(mnt.mnt_dir);
+   free(mnt.mnt_type);
+   free(mnt.mnt_opts);
 }
 
 enum remount_type {
@@ -349,7 +341,7 @@ enum remount_type {
 
 static int check_remount_dir(struct mntentchn *mc, const char *mntdir)
 {
-   const char *dir = canonicalize(mntdir);
+   char *dir = canonicalize(mntdir);
int res = 0;
 
if (strcmp(dir, mc->m.mnt_dir) != 0) {
@@ -357,7 +349,7 @@ static int check_remount_dir(struct mntentchn *mc, const 
char *mntdir)
  progname, mntdir);
res = -1;
}
-   my_free(dir);
+   free(dir);
return res;
 }
 
@@ -514,7 +506,7 @@ do_mount_one(struct nilfs_mount_info *mi, const struct 
mount_options *mo)
} else
printf(_("%s not restarted\n"), NILFS_CLEANERD_NAME);
  out:
-   my_free(exopts);
+   free(exopts);
return res;
 }
 
@@ -542,14 +534,14 @@ static void update_mount_state(struct nilfs_mount_info 
*mi,
if (!check_mtab())
return;
 
-   my_free(mi->optstr);
+   free(mi->optstr);
exopts = fix_extra_opts_string(mo->extra_opts, pid, pp);
mi->optstr = fix_opts_string(((mo->flags & ~MS_NOMTAB) | MS_NETDEV),
 exopts, NULL);
 
update_mtab_entry(mi->device, mi->mntdir, fstype, mi->optstr, 0, 0,
  !mi->mounted);
-   my_free(exopts);
+   free(exopts);
 }
 
 static int mount_one(char *device, char *mntdir,
@@ -591,7 +583,7 @@ static int mount_one(char *device, char *mntdir,
 
err = 0;
  failed:
-   my_free(mi.optstr);
+   free(mi.optstr);
return err;
 }
 
@@ -655,7 +647,7 @@ int main(int argc

[PATCH v2 0/3] nilfs-utils: fix leak at nilfs_open and remove null checks before free()

2015-01-18 Thread Ryusuke Konishi

This series fixes potential leak at nilfs_open and removes unnecessary
null checks before calling free().

Changes from v1:
 * fix warnings at sbin/mount/{fstab.c,mount.nilfs2.c,umount.nilfs2.c}

Ryusuke Konishi
--
Ryusuke Konishi (3):
  lib/nilfs.c: fix potential leak at nilfs_open()
  nilfs-utils: get rid of my_free()
  nilfs-utils: get rid of null checks before calling free()

 lib/cleaner_ctl.c  |  6 ++
 lib/gc.c   |  3 +--
 lib/nilfs.c| 26 ++
 lib/realpath.c |  9 +++--
 sbin/mount/fstab.c | 20 +++-
 sbin/mount/mount.nilfs2.c  | 38 +++---
 sbin/mount/mount_mntent.h  |  8 
 sbin/mount/umount.nilfs2.c | 15 ---
 8 files changed, 46 insertions(+), 79 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 3/3] nilfs-utils: get rid of null checks before calling free()

2015-01-18 Thread Ryusuke Konishi

Remove unnecessary null checks before calling free() function.

Signed-off-by: Ryusuke Konishi 
---
 lib/cleaner_ctl.c |  6 ++
 lib/gc.c  |  3 +--
 lib/nilfs.c   | 19 +++
 lib/realpath.c|  9 +++--
 4 files changed, 13 insertions(+), 24 deletions(-)

diff --git a/lib/cleaner_ctl.c b/lib/cleaner_ctl.c
index fa41ac1..9c458a4 100644
--- a/lib/cleaner_ctl.c
+++ b/lib/cleaner_ctl.c
@@ -226,8 +226,7 @@ static int nilfs_cleaner_find_fs(struct nilfs_cleaner 
*cleaner,
   sizeof(canonical))) {
mdev = canonical;
}
-   if (last_match_dev)
-   free(last_match_dev);
+   free(last_match_dev);
last_match_dev = strdup(mdev);
if (!last_match_dev)
goto error;
@@ -238,8 +237,7 @@ static int nilfs_cleaner_find_fs(struct nilfs_cleaner 
*cleaner,
   sizeof(canonical))) {
mdir = canonical;
}
-   if (last_match_dir)
-   free(last_match_dir);
+   free(last_match_dir);
last_match_dir = strdup(mdir);
if (!last_match_dir)
goto error;
diff --git a/lib/gc.c b/lib/gc.c
index 54d0b66..48c295a 100644
--- a/lib/gc.c
+++ b/lib/gc.c
@@ -508,8 +508,7 @@ static int nilfs_toss_vdescs(struct nilfs *nilfs,
}
ret = 0;
  out:
-   if (ss != NULL)
-   free(ss);
+   free(ss);
return ret;
 }
 
diff --git a/lib/nilfs.c b/lib/nilfs.c
index 52ddee9..30db654 100644
--- a/lib/nilfs.c
+++ b/lib/nilfs.c
@@ -435,13 +435,10 @@ out_fd:
close(nilfs->n_devfd);
if (nilfs->n_iocfd >= 0)
close(nilfs->n_iocfd);
-   if (nilfs->n_dev != NULL)
-   free(nilfs->n_dev);
-   if (nilfs->n_ioc != NULL)
-   free(nilfs->n_ioc);
-   if (nilfs->n_sb != NULL)
-   free(nilfs->n_sb);
 
+   free(nilfs->n_dev);
+   free(nilfs->n_ioc);
+   free(nilfs->n_sb);
free(nilfs);
return NULL;
 }
@@ -458,12 +455,10 @@ void nilfs_close(struct nilfs *nilfs)
close(nilfs->n_devfd);
if (nilfs->n_iocfd >= 0)
close(nilfs->n_iocfd);
-   if (nilfs->n_dev != NULL)
-   free(nilfs->n_dev);
-   if (nilfs->n_ioc != NULL)
-   free(nilfs->n_ioc);
-   if (nilfs->n_sb != NULL)
-   free(nilfs->n_sb);
+
+   free(nilfs->n_dev);
+   free(nilfs->n_ioc);
+   free(nilfs->n_sb);
free(nilfs);
 }
 
diff --git a/lib/realpath.c b/lib/realpath.c
index 3f01b87..691360b 100644
--- a/lib/realpath.c
+++ b/lib/realpath.c
@@ -133,8 +133,7 @@ myrealpath(const char *path, char *resolved_path, int 
maxreslth) {
 
/* Insert symlink contents into path. */
m = strlen(path);
-   if (buf)
-   free(buf);
+   free(buf);
buf = malloc(m + n + 1);
if (!buf) {
errno = ENOMEM;
@@ -153,12 +152,10 @@ myrealpath(const char *path, char *resolved_path, int 
maxreslth) {
/* Make sure it's null terminated. */
*npath = '\0';
 
-   if (buf)
-   free(buf);
+   free(buf);
return resolved_path;
 
  err:
-   if (buf)
-   free(buf);
+   free(buf);
return NULL;
 }
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 1/3] lib/nilfs.c: fix potential leak at nilfs_open()

2015-01-18 Thread Ryusuke Konishi

nilfs_open() can exit without closing nilfs->n_devfd and freeing
nilfs->n_dev and nilfs->n_sb if it first initializes a nilfs object in
the code path for NILFS_OPEN_RAW mode and then escapes through the
out_nilfs label.  This fixes the leak by jumping to the out_fd label
instead and removing the now-unused out_nilfs label.

Signed-off-by: Ryusuke Konishi 
---
 lib/nilfs.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/lib/nilfs.c b/lib/nilfs.c
index 65bf7d5..52ddee9 100644
--- a/lib/nilfs.c
+++ b/lib/nilfs.c
@@ -411,9 +411,9 @@ struct nilfs *nilfs_open(const char *dev, const char *dir, 
int flags)
(NILFS_OPEN_RDONLY | NILFS_OPEN_WRONLY | NILFS_OPEN_RDWR)) {
if (nilfs_find_fs(nilfs, dev, dir, MNTOPT_RW) < 0) {
if (!(flags & NILFS_OPEN_RDONLY))
-   goto out_nilfs;
+   goto out_fd;
if (nilfs_find_fs(nilfs, dev, dir, MNTOPT_RO) < 0)
-   goto out_nilfs;
+   goto out_fd;
}
nilfs->n_iocfd = open(nilfs->n_ioc, O_RDONLY);
if (nilfs->n_iocfd < 0)
@@ -442,7 +442,6 @@ out_fd:
if (nilfs->n_sb != NULL)
free(nilfs->n_sb);
 
-out_nilfs:
free(nilfs);
return NULL;
 }
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

1 2 3 4 5 6 7 8 9 10 >

1 - 100 of 1034 matches

Mail list logo