On Tue, 24 Feb 2015 20:01:43 +0100, Andreas Rohner wrote:
> This patch improves the accuracy of the su_nlive_blks segment
> usage field by also counting the blocks of the DAT-File. A block in
> the DAT-File is considered reclaimable as soon as it is overwritten.
> There is no need to consider protection periods, snapshots or
> checkpoints. So whenever a block is overwritten during segment
> construction, the segment usage information of the segment at the
> previous location of the block is decremented. To get the previous
> location of the block the b_blocknr field of the buffer_head
> structure is used.
>
> SUFILE blocks are counted in a similar way, but if the GC reads a
> block into a GC inode, that already is in the cache, then there are
> two versions of the block. If this happens both versions will be
> counted, which can lead to small seemingly random incorrect values.
> But it is better to accept these small inaccuracies than to not
> count the SUFILE at all. These inaccuracies do not occur for the
> DAT-File, because it does not need a GC inode.
>
> Additionally the blocks that belong to a GC inode are rechecked if
> they are reclaimable. If so the corresponding counter is
> decremented. The blocks were already checked in userspace, but
> without the proper locking. It is furthermore possible, that blocks
> become reclaimable during the cleaning process. For example by
> deleting checkpoints. To improve the performance of these extra
> checks, flags from userspace are used to determine reclaimability.
> If a block belongs to a snapshot it cannot be reclaimable and if
> it is within the protection period it must be counted as
> reclaimable.
>
> Signed-off-by: Andreas Rohner <[email protected]>
> ---
> fs/nilfs2/dat.c | 70 ++++++++++++++++++++++++++++++++++++
> fs/nilfs2/dat.h | 1 +
> fs/nilfs2/inode.c | 2 ++
> fs/nilfs2/segbuf.c | 4 +++
> fs/nilfs2/segbuf.h | 1 +
> fs/nilfs2/segment.c | 101
> ++++++++++++++++++++++++++++++++++++++++++++++++++--
> 6 files changed, 177 insertions(+), 2 deletions(-)
>
> diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
> index d2c8f7e..63d079c 100644
> --- a/fs/nilfs2/dat.c
> +++ b/fs/nilfs2/dat.c
> @@ -35,6 +35,17 @@
> #define NILFS_CNO_MAX (~(__u64)0)
>
> /**
> + * nilfs_dat_entry_is_alive - check if @entry is alive
> + * @entry: DAT-Entry
> + *
> + * Description: Simple check if @entry is alive in the current checkpoint.
> + */
> +static inline int nilfs_dat_entry_is_live(struct nilfs_dat_entry *entry)
> +{
> + return entry->de_end == cpu_to_le64(NILFS_CNO_MAX);
> +}
> +
Do not use "inline" directive in *.c files. Compiler aggressively
does it. "noinline" directive should be used instead for functions
that we want to prevent from being inlined.
> +/**
> * struct nilfs_dat_info - on-memory private data of DAT file
> * @mi: on-memory private data of metadata file
> * @palloc_cache: persistent object allocator cache of DAT file
> @@ -391,6 +402,65 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr,
> sector_t blocknr)
> }
>
> /**
> + * nilfs_dat_is_live - checks if the virtual block number is alive
> + * @dat: DAT file inode
> + * @vblocknr: virtual block number
> + * @errp: pointer to return code if error occurred
> + *
> + * Description: nilfs_dat_is_live() looks up the DAT-Entry for
> + * @vblocknr and determines if the corresponding block is alive in the
> current
> + * checkpoint or not. This check ignores snapshots and protection periods.
> + *
> + * Return Value: 1 if vblocknr is alive and 0 otherwise. On error, 0 is
> + * returned and @errp is set to one of the following negative error codes.
> + *
> + * %-EIO - I/O error.
> + *
> + * %-ENOMEM - Insufficient amount of memory available.
> + *
> + * %-ENOENT - A block number associated with @vblocknr does not exist.
> + */
> +int nilfs_dat_is_live(struct inode *dat, __u64 vblocknr, int *errp)
> +{
> + struct buffer_head *entry_bh, *bh;
> + struct nilfs_dat_entry *entry;
> + sector_t blocknr;
> + void *kaddr;
> + int ret = 0, err;
> +
> + err = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh);
> + if (err < 0)
> + goto out;
> +
> + if (!nilfs_doing_gc() && buffer_nilfs_redirected(entry_bh)) {
> + bh = nilfs_mdt_get_frozen_buffer(dat, entry_bh);
> + if (bh) {
> + WARN_ON(!buffer_uptodate(bh));
> + put_bh(entry_bh);
> + entry_bh = bh;
> + }
> + }
> +
> + kaddr = kmap_atomic(entry_bh->b_page);
> + entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
> + blocknr = le64_to_cpu(entry->de_blocknr);
> + if (blocknr == 0) {
> + err = -ENOENT;
> + goto out_unmap;
> + }
> +
> + ret = nilfs_dat_entry_is_live(entry);
> +
> +out_unmap:
> + kunmap_atomic(kaddr);
> + put_bh(entry_bh);
> +out:
> + if (errp)
> + *errp = err;
Remove errp argument by returning it as the return value. Rather, the
true/false result should be returned via an argument if you'd like to
avoid mixing these two.
> + return ret;
> +}
> +
> +/**
> * nilfs_dat_translate - translate a virtual block number to a block number
> * @dat: DAT file inode
> * @vblocknr: virtual block number
> diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
> index d196f09..3cbddd6 100644
> --- a/fs/nilfs2/dat.h
> +++ b/fs/nilfs2/dat.h
> @@ -32,6 +32,7 @@ struct nilfs_palloc_req;
> struct nilfs_sufile_mod_cache;
>
> int nilfs_dat_translate(struct inode *, __u64, sector_t *);
> +int nilfs_dat_is_live(struct inode *, __u64, int *);
>
> int nilfs_dat_prepare_alloc(struct inode *, struct nilfs_palloc_req *);
> void nilfs_dat_commit_alloc(struct inode *, struct nilfs_palloc_req *);
> diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
> index 7f6d056..5412a76 100644
> --- a/fs/nilfs2/inode.c
> +++ b/fs/nilfs2/inode.c
> @@ -90,6 +90,8 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
> int err = 0, ret;
> unsigned maxblocks = bh_result->b_size >> inode->i_blkbits;
>
> + bh_result->b_blocknr = 0;
> +
Please do not add this in this big patch as I mentioned before.
> down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
> ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks);
> up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
> diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
> index 7a6e9cd..bbd807b 100644
> --- a/fs/nilfs2/segbuf.c
> +++ b/fs/nilfs2/segbuf.c
> @@ -58,6 +58,7 @@ struct nilfs_segment_buffer *nilfs_segbuf_new(struct
> super_block *sb)
> INIT_LIST_HEAD(&segbuf->sb_payload_buffers);
> segbuf->sb_super_root = NULL;
> segbuf->sb_nlive_blks_added = 0;
> + segbuf->sb_nlive_blks_diff = 0;
>
> init_completion(&segbuf->sb_bio_event);
> atomic_set(&segbuf->sb_err, 0);
> @@ -451,6 +452,9 @@ static int nilfs_segbuf_submit_bh(struct
> nilfs_segment_buffer *segbuf,
>
> len = bio_add_page(wi->bio, bh->b_page, bh->b_size, bh_offset(bh));
> if (len == bh->b_size) {
> + lock_buffer(bh);
> + map_bh(bh, segbuf->sb_super, wi->blocknr + wi->end);
> + unlock_buffer(bh);
> wi->end++;
> return 0;
> }
ditto. Stop this.
> diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
> index d04da26..4e994f7 100644
> --- a/fs/nilfs2/segbuf.h
> +++ b/fs/nilfs2/segbuf.h
> @@ -84,6 +84,7 @@ struct nilfs_segment_buffer {
> sector_t sb_pseg_start;
> unsigned sb_rest_blocks;
> __u32 sb_nlive_blks_added;
> + __s64 sb_nlive_blks_diff;
sb_nlive_blks_diff is always decremented. It looks better
to alter this to
__u32 sb_nlive_blks_deducted;
and increment it. (The term "diff" is ambiguous. Maybe,
it should be "deducted" or so.)
>
> /* Buffers */
> struct list_head sb_segsum_buffers;
> diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
> index dc0070c..16c7c36 100644
> --- a/fs/nilfs2/segment.c
> +++ b/fs/nilfs2/segment.c
> @@ -1385,7 +1385,8 @@ static void nilfs_segctor_update_segusage(struct
> nilfs_sc_info *sci,
> WARN_ON(ret); /* always succeed because the segusage is dirty */
>
> /* should always be positive */
> - segbuf->sb_nlive_blks_added = segbuf->sb_sum.nfileblk;
> + segbuf->sb_nlive_blks_added = segbuf->sb_nlive_blks_diff +
> + segbuf->sb_sum.nfileblk;
>
> if (nilfs_feature_track_live_blks(nilfs))
> nilfs_sufile_mod_nlive_blks(sufile, sci->sc_mc,
> @@ -1497,12 +1498,98 @@ static void nilfs_list_replace_buffer(struct
> buffer_head *old_bh,
> /* The caller must release old_bh */
> }
>
> +/**
> + * nilfs_segctor_dec_nlive_blks_gc - dec. nlive_blks for blocks of GC-Inodes
> + * @dat: dat inode
> + * @segbuf: currtent segment buffer
> + * @bh: current buffer head
> + *
> + * Description: nilfs_segctor_dec_nlive_blks_gc() is called if the inode to
> + * which @bh belongs is a GC-Inode. In that case it is not necessary to
> + * decrement the previous segment, because at the end of the GC process it
> + * will be freed anyway. It is however necessary to check again if the blocks
> + * are alive here, because the last check was in userspace without the proper
> + * locking. Additionally the blocks protected by the protection period should
> + * be considered reclaimable. It is assumed, that @bh->b_blocknr contains
> + * a virtual block number, which is only true if @bh is part of a GC-Inode.
> + */
> +static void nilfs_segctor_dec_nlive_blks_gc(struct inode *dat,
> + struct nilfs_segment_buffer *segbuf,
> + struct buffer_head *bh) {
> + bool isreclaimable = buffer_nilfs_protection_period(bh) ||
> + !nilfs_dat_is_live(dat, bh->b_blocknr, NULL);
> +
> + if (!buffer_nilfs_snapshot(bh) && isreclaimable)
> + segbuf->sb_nlive_blks_diff--;
> +}
> +
> +/**
> + * nilfs_segctor_dec_nlive_blks_nogc - dec. nlive_blks of segment
> + * @nilfs: the nilfs object
> + * @mc: modification cache
> + * @sb: currtent segment buffer
> + * @blocknr: current block number
> + *
> + * Description: Gets the segment number of the segment @blocknr belongs to
> + * and decrements the su_nlive_blks field of the corresponding segment usage
> + * entry.
> + */
> +static void nilfs_segctor_dec_nlive_blks_nogc(struct the_nilfs *nilfs,
> + struct nilfs_sufile_mod_cache *mc,
> + struct nilfs_segment_buffer *sb,
> + sector_t blocknr)
> +{
> + __u64 segnum = nilfs_get_segnum_of_block(nilfs, blocknr);
> +
> + if (segnum >= nilfs->ns_nsegments)
> + return;
> +
> + if (segnum == sb->sb_segnum)
> + sb->sb_nlive_blks_diff--;
> + else
> + nilfs_sufile_mod_nlive_blks(nilfs->ns_sufile, mc, segnum, -1);
> +}
As I mentioned before, sufile shouldn't be changed (in precise, newly
marked dirty) after the collection phase of sufile. This looks to be
violating it.
Regards,
Ryusuke Konishi
> +
> +/**
> + * nilfs_segctor_dec_nlive_blks - dec. nlive_blks of previous segment
> + * @nilfs: the nilfs object
> + * @mc: modification cache
> + * @sb: currtent segment buffer
> + * @bh: current buffer head
> + * @ino: current inode number
> + * @gc_inode: true if current inode is a GC-Inode
> + *
> + * Description: Handles GC-Inodes and normal inodes differently. For normal
> + * inodes @bh->b_blocknr contains the location where the block was read in.
> If
> + * the block is updated, the old version of it is considered reclaimable and
> so
> + * the su_nlive_blks field of the segment usage information of the old
> segment
> + * needs to be decremented. Only the DATFILE and SUFILE are decremented here,
> + * because normal files and other meta data files can be better decremented
> in
> + * nilfs_dat_commit_end().
> + */
> +static void nilfs_segctor_dec_nlive_blks(struct the_nilfs *nilfs,
> + struct nilfs_sufile_mod_cache *mc,
> + struct nilfs_segment_buffer *sb,
> + struct buffer_head *bh,
> + ino_t ino,
> + bool gc_inode)
> +{
> + bool isnode = buffer_nilfs_node(bh);
> +
> + if (gc_inode)
> + nilfs_segctor_dec_nlive_blks_gc(nilfs->ns_dat, sb, bh);
> + else if (ino == NILFS_DAT_INO || (ino == NILFS_SUFILE_INO && !isnode))
> + nilfs_segctor_dec_nlive_blks_nogc(nilfs, mc, sb, bh->b_blocknr);
> +}
> +
> static int
> nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
> struct nilfs_segment_buffer *segbuf,
> int mode)
> {
> + struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
> struct inode *inode = NULL;
> + struct nilfs_inode_info *ii;
> sector_t blocknr;
> unsigned long nfinfo = segbuf->sb_sum.nfinfo;
> unsigned long nblocks = 0, ndatablk = 0;
> @@ -1512,7 +1599,9 @@ nilfs_segctor_update_payload_blocknr(struct
> nilfs_sc_info *sci,
> union nilfs_binfo binfo;
> struct buffer_head *bh, *bh_org;
> ino_t ino = 0;
> - int err = 0;
> + int err = 0, gc_inode = 0, track_live_blks;
> +
> + track_live_blks = nilfs_feature_track_live_blks(nilfs);
>
> if (!nfinfo)
> goto out;
> @@ -1533,6 +1622,9 @@ nilfs_segctor_update_payload_blocknr(struct
> nilfs_sc_info *sci,
>
> inode = bh->b_page->mapping->host;
>
> + ii = NILFS_I(inode);
> + gc_inode = test_bit(NILFS_I_GCINODE, &ii->i_state);
> +
> if (mode == SC_LSEG_DSYNC)
> sc_op = &nilfs_sc_dsync_ops;
> else if (ino == NILFS_DAT_INO)
> @@ -1540,6 +1632,11 @@ nilfs_segctor_update_payload_blocknr(struct
> nilfs_sc_info *sci,
> else /* file blocks */
> sc_op = &nilfs_sc_file_ops;
> }
> +
> + if (track_live_blks)
> + nilfs_segctor_dec_nlive_blks(nilfs, sci->sc_mc, segbuf,
> + bh, ino, gc_inode);
> +
> bh_org = bh;
> get_bh(bh_org);
> err = nilfs_bmap_assign(NILFS_I(inode)->i_bmap, &bh, blocknr,
> --
> 2.3.0
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html