On 8/13/25 12:04, wangzijie wrote: > Sometimes I suffered the nat_tree_lock contention between > f2fs_write_checkpoint > and f2fs_get_node_info. Commit a9419b6("f2fs: do not bother checkpoint by > f2fs_get_node_info") also mentioned that situation. > > My idea is, when flush nat entries, we can use some structures to record nat > pages we may read, and readahead them before hold nat_tree_lock. Before > impletement code, I did some survey and found a submittion in community. > > Subject: f2fs: use bucket sort to avoid tree lookup and list sort when nat > flushing > Link: > https://lore.kernel.org/linux-f2fs-devel/20170520122435.17574-2-houpengy...@huawei.com/ > This patch aims to improve nat entry set sort by using bucket. > I steal that structure and readahead nat pages contain nat entry set which > cannot be moved > to journal according to dirty nat entry set bucket. > > By doing this, I think there are two benefits to reducing nat_tree_lock hold > time when > when flush nat entries.
Zijie, Can you please figure out some numbers for this patch? something like checkpoint latency or average or extreme time to grab nat_tree_lock...? > 1. avoid nat set tree lookup and sort > 2. readahead nat pages before holding nat_tree_lock It may cause performance regression if it races w/ drop_caches? Thanks, > > Signed-off-by: wangzijie <wangzij...@honor.com> > --- > fs/f2fs/f2fs.h | 1 + > fs/f2fs/node.c | 70 ++++++++++++++++++++++++-------------------------- > fs/f2fs/node.h | 2 +- > 3 files changed, 35 insertions(+), 38 deletions(-) > > diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h > index 46be75605..b27cc059f 100644 > --- a/fs/f2fs/f2fs.h > +++ b/fs/f2fs/f2fs.h > @@ -975,6 +975,7 @@ struct f2fs_nm_info { > struct radix_tree_root nat_set_root;/* root of the nat set cache */ > struct f2fs_rwsem nat_tree_lock; /* protect nat entry tree */ > struct list_head nat_entries; /* cached nat entry list (clean) */ > + struct list_head nat_dirty_set[NAT_ENTRY_PER_BLOCK + 1]; /* > store dirty nat set */ > spinlock_t nat_list_lock; /* protect clean nat entry list */ > unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */ > unsigned int nat_blocks; /* # of nat blocks */ > diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c > index 27743b93e..87c975ee8 100644 > --- a/fs/f2fs/node.c > +++ b/fs/f2fs/node.c > @@ -244,6 +244,12 @@ static void __del_from_nat_cache(struct f2fs_nm_info > *nm_i, struct nat_entry *e) > __free_nat_entry(e); > } > > +static void __relocate_nat_entry_set(struct f2fs_nm_info *nm_i, > + struct nat_entry_set > *set) > +{ > + list_move_tail(&set->set_list, &nm_i->nat_dirty_set[set->entry_cnt]); > +} > + > static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i, > struct nat_entry *ne) > { > @@ -260,6 +266,7 @@ static struct nat_entry_set *__grab_nat_entry_set(struct > f2fs_nm_info *nm_i, > head->set = set; > head->entry_cnt = 0; > f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head); > + __relocate_nat_entry_set(nm_i, head); > } > return head; > } > @@ -279,8 +286,10 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info > *nm_i, > * 2. update old block address to new one; > */ > if (!new_ne && (get_nat_flag(ne, IS_PREALLOC) || > - !get_nat_flag(ne, IS_DIRTY))) > + !get_nat_flag(ne, IS_DIRTY))) { > head->entry_cnt++; > + __relocate_nat_entry_set(nm_i, head); > + } > > set_nat_flag(ne, IS_PREALLOC, new_ne); > > @@ -309,6 +318,7 @@ static void __clear_nat_cache_dirty(struct f2fs_nm_info > *nm_i, > > set_nat_flag(ne, IS_DIRTY, false); > set->entry_cnt--; > + __relocate_nat_entry_set(nm_i, set); > nm_i->nat_cnt[DIRTY_NAT]--; > nm_i->nat_cnt[RECLAIMABLE_NAT]++; > } > @@ -2976,24 +2986,6 @@ static void remove_nats_in_journal(struct f2fs_sb_info > *sbi) > up_write(&curseg->journal_rwsem); > } > > -static void __adjust_nat_entry_set(struct nat_entry_set *nes, > - struct list_head *head, int max) > -{ > - struct nat_entry_set *cur; > - > - if (nes->entry_cnt >= max) > - goto add_out; > - > - list_for_each_entry(cur, head, set_list) { > - if (cur->entry_cnt >= nes->entry_cnt) { > - list_add(&nes->set_list, cur->set_list.prev); > - return; > - } > - } > -add_out: > - list_add_tail(&nes->set_list, head); > -} > - > static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, > const struct f2fs_nat_block *nat_blk) > { > @@ -3095,6 +3087,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info > *sbi, > > /* Allow dirty nats by node block allocation in write_begin */ > if (!set->entry_cnt) { > + list_del(&set->set_list); > radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); > kmem_cache_free(nat_entry_set_slab, set); > } > @@ -3109,11 +3102,8 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, > struct cp_control *cpc) > struct f2fs_nm_info *nm_i = NM_I(sbi); > struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); > struct f2fs_journal *journal = curseg->journal; > - struct nat_entry_set *setvec[NAT_VEC_SIZE]; > struct nat_entry_set *set, *tmp; > - unsigned int found; > - nid_t set_idx = 0; > - LIST_HEAD(sets); > + int i; > int err = 0; > > /* > @@ -3129,6 +3119,16 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, > struct cp_control *cpc) > if (!nm_i->nat_cnt[DIRTY_NAT]) > return 0; > > + /* readahead sets which cannot be moved to journal */ > + if (!__has_cursum_space(journal, nm_i->nat_cnt[DIRTY_NAT], > NAT_JOURNAL)) { > + for (i = MAX_NAT_JENTRIES(journal); i <= NAT_ENTRY_PER_BLOCK; > i++) { > + list_for_each_entry_safe(set, tmp, > &nm_i->nat_dirty_set[i], set_list) { > + f2fs_ra_meta_pages(sbi, set->set, 1, > + META_NAT, true); > + } > + } > + } > + > f2fs_down_write(&nm_i->nat_tree_lock); > > /* > @@ -3141,21 +3141,13 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, > struct cp_control *cpc) > nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL)) > remove_nats_in_journal(sbi); > > - while ((found = __gang_lookup_nat_set(nm_i, > - set_idx, NAT_VEC_SIZE, setvec))) { > - unsigned idx; > - > - set_idx = setvec[found - 1]->set + 1; > - for (idx = 0; idx < found; idx++) > - __adjust_nat_entry_set(setvec[idx], &sets, > - MAX_NAT_JENTRIES(journal)); > - } > - > /* flush dirty nats in nat entry set */ > - list_for_each_entry_safe(set, tmp, &sets, set_list) { > - err = __flush_nat_entry_set(sbi, set, cpc); > - if (err) > - break; > + for (i = 0; i <= NAT_ENTRY_PER_BLOCK; i++) { > + list_for_each_entry_safe(set, tmp, &nm_i->nat_dirty_set[i], > set_list) { > + err = __flush_nat_entry_set(sbi, set, cpc); > + if (err) > + break; > + } > } > > f2fs_up_write(&nm_i->nat_tree_lock); > @@ -3249,6 +3241,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) > struct f2fs_nm_info *nm_i = NM_I(sbi); > unsigned char *version_bitmap; > unsigned int nat_segs; > + int i; > int err; > > nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr); > @@ -3275,6 +3268,9 @@ static int init_node_manager(struct f2fs_sb_info *sbi) > INIT_LIST_HEAD(&nm_i->nat_entries); > spin_lock_init(&nm_i->nat_list_lock); > > + for (i = 0; i <= NAT_ENTRY_PER_BLOCK; i++) > + INIT_LIST_HEAD(&nm_i->nat_dirty_set[i]); > + > mutex_init(&nm_i->build_lock); > spin_lock_init(&nm_i->nid_list_lock); > init_f2fs_rwsem(&nm_i->nat_tree_lock); > diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h > index 030390543..d805d4ce7 100644 > --- a/fs/f2fs/node.h > +++ b/fs/f2fs/node.h > @@ -158,7 +158,7 @@ enum mem_type { > }; > > struct nat_entry_set { > - struct list_head set_list; /* link with other nat sets */ > + struct list_head set_list; /* link with nat sets which have same > entry_cnt */ > struct list_head entry_list; /* link with dirty nat entries */ > nid_t set; /* set number*/ > unsigned int entry_cnt; /* the # of nat entries in set */ _______________________________________________ Linux-f2fs-devel mailing list Linux-f2fs-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel