> -----Original Message----- > From: Chao Yu <yuch...@huawei.com> > Sent: Friday, July 17, 2020 12:58 PM > To: Aravind Ramesh <aravind.ram...@wdc.com>; jaeg...@kernel.org; linux-f2fs- > de...@lists.sourceforge.net > Cc: Shinichiro Kawasaki <shinichiro.kawas...@wdc.com>; Matias Bjorling > <matias.bjorl...@wdc.com> > Subject: Re: [PATCH 1/2] mkfs.f2fs: zns zone-capacity support. > > On 2020/7/2 23:54, Aravind Ramesh wrote: > > NVM Express Zoned Namespace (ZNS) devices can have zone-capacity(zc) > > less than the zone-size. ZNS defines a per zone capacity which can be > > equal or less than the zone-size. Zone-capacity is the number of > > usable blocks in the zone. In such cases, the filesystem should not > > write/read beyond the zone-capacity. Update the super block with the > > usable number of blocks and free segment count in the ZNS device > > zones, if zone-capacity is less than zone-size. Set reserved segment > > count and overprovision ratio based on the usable segments in the zone. 
> > > > Signed-off-by: Aravind Ramesh <aravind.ram...@wdc.com> > > Signed-off-by: Shin'ichiro Kawasaki <shinichiro.kawas...@wdc.com> > > --- > > configure.ac | 4 ++++ > > include/f2fs_fs.h | 40 +++++++++++++++++++++++++++++++---- > > lib/libf2fs_io.c | 1 + > > lib/libf2fs_zoned.c | 51 > > +++++++++++++++++++++++++++++++++++++++++++-- > > mkfs/f2fs_format.c | 26 ++++++++++++++++++----- > > 5 files changed, 111 insertions(+), 11 deletions(-) > > > > diff --git a/configure.ac b/configure.ac index 9ac0c24..e9acd1a 100644 > > --- a/configure.ac > > +++ b/configure.ac > > @@ -213,6 +213,10 @@ AC_CONFIG_FILES([ > > tools/f2fs_io/Makefile > > ]) > > > > +AC_CHECK_MEMBER([struct blk_zone.capacity], > > + [AC_DEFINE(HAVE_BLK_ZONE_REP_V2, [1], [report zones includes > zone capacity])], > > + [], [[#include <linux/blkzoned.h>]]) > > + > > # export library version info for mkfs/libf2fs_format_la > > AC_SUBST(FMT_CURRENT, 6) AC_SUBST(FMT_REVISION, 0) diff --git > > a/include/f2fs_fs.h b/include/f2fs_fs.h index 709bfd8..40165ed 100644 > > --- a/include/f2fs_fs.h > > +++ b/include/f2fs_fs.h > > @@ -332,6 +332,7 @@ struct device_info { > > u_int32_t nr_zones; > > u_int32_t nr_rnd_zones; > > size_t zone_blocks; > > + size_t *zone_cap_blocks; > > }; > > > > typedef struct { > > @@ -1324,13 +1325,42 @@ blk_zone_cond_str(struct blk_zone *blkz) > > return "Unknown-cond"; > > } > > > > -#define blk_zone_empty(z) (blk_zone_cond(z) == BLK_ZONE_COND_EMPTY) > > +/* > > + * Handle kernel zone capacity support */ #ifndef > > +HAVE_BLK_ZONE_REP_V2 > > +#define BLK_ZONE_REP_CAPACITY (1 << 0) > > +struct blk_zone_v2 { > > + __u64 start; /* Zone start sector */ > > + __u64 len; /* Zone length in number of sectors */ > > + __u64 wp; /* Zone write pointer position */ > > + __u8 type; /* Zone type */ > > + __u8 cond; /* Zone condition */ > > + __u8 non_seq; /* Non-sequential write resources active */ > > + __u8 reset; /* Reset write pointer recommended */ > > + __u8 resv[4]; > > + __u64 capacity; /* 
Zone capacity in number of sectors */ > > + __u8 reserved[24]; > > +}; > > +#define blk_zone blk_zone_v2 > > > > +struct blk_zone_report_v2 { > > + __u64 sector; > > + __u32 nr_zones; > > + __u32 flags; > > +struct blk_zone zones[0]; > > +};
[snip...] > > @@ -1352,6 +1383,7 @@ static inline double > > get_best_overprovision(struct f2fs_super_block *sb) { > > double reserved, ovp, candidate, end, diff, space; > > double max_ovp = 0, max_space = 0; > > + u_int32_t usable_main_segs = f2fs_get_usable_segments(sb); > > > > if (get_sb(segment_count_main) < 256) { > > candidate = 10; > > @@ -1365,9 +1397,9 @@ static inline double > > get_best_overprovision(struct f2fs_super_block *sb) > > > > for (; candidate <= end; candidate += diff) { > > reserved = (2 * (100 / candidate + 1) + 6) * > > - get_sb(segs_per_sec); > > - ovp = (get_sb(segment_count_main) - reserved) * candidate / 100; > > - space = get_sb(segment_count_main) - reserved - ovp; > > + (usable_main_segs / get_sb(section_count)); > > It looks segs_per_sec becomes an average value in whole zns device, I'm not > sure > whether calculating with divide round down way is safe enough to reserve > space, as > it may cut several segments, which may be needed during foreground GC, so I > suggest to use DIV_ROUND_UP() here to avoid boundary issue. Yes, it tries to reflect the usable segs_per_sec rather than using a value based on zone-size, which could reserve more than the intended amount of segments. I do see a round_down(x, y) in the f2fs-tools code, but no round_up(). Does this look ok? 
DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) (from kernel.h) > > > + ovp = (usable_main_segs - reserved) * candidate / 100; > > + space = usable_main_segs - reserved - ovp; > > if (max_space < space) { > > max_space = space; > > max_ovp = candidate; > > diff --git a/lib/libf2fs_io.c b/lib/libf2fs_io.c index > > 1f597a9..138285d 100644 > > --- a/lib/libf2fs_io.c > > +++ b/lib/libf2fs_io.c > > @@ -784,6 +784,7 @@ int f2fs_finalize_device(void) > > break; > > } > > free(c.devices[i].path); > > + free(c.devices[i].zone_cap_blocks); > > } > > close(c.kd); > > > > diff --git a/lib/libf2fs_zoned.c b/lib/libf2fs_zoned.c index > > efc687c..f98fcdb 100644 > > --- a/lib/libf2fs_zoned.c > > +++ b/lib/libf2fs_zoned.c > > @@ -291,6 +291,13 @@ int f2fs_check_zones(int j) > > return -ENOMEM; > > } > > > > + dev->zone_cap_blocks = malloc(dev->nr_zones * sizeof(size_t)); > > + if (!dev->zone_cap_blocks) { > > + ERR_MSG("No memory for zone capacity list.\n"); > > + return -ENOMEM; > > + } > > + memset(dev->zone_cap_blocks, 0, (dev->nr_zones * sizeof(size_t))); > > + > > dev->nr_rnd_zones = 0; > > sector = 0; > > total_sectors = (dev->total_sectors * c.sector_size) >> 9; @@ > > -335,10 +342,15 @@ int f2fs_check_zones(int j) > > blk_zone_cond_str(blkz), > > blk_zone_sector(blkz), > > blk_zone_length(blkz)); > > + dev->zone_cap_blocks[n] = > > + blk_zone_length(blkz) >> > > + (F2FS_BLKSIZE_BITS - SECTOR_SHIFT); > > } else { > > DBG(2, > > - "Zone %05u: type 0x%x (%s), cond 0x%x (%s), > need_reset %d, " > > - "non_seq %d, sector %llu, %llu sectors, wp > > sector > %llu\n", > > + "Zone %05u: type 0x%x (%s), cond 0x%x (%s)," > > + " need_reset %d, non_seq %d, sector %llu," > > + " %llu sectors, capacity %llu," > > + " wp sector %llu\n", > > n, > > blk_zone_type(blkz), > > blk_zone_type_str(blkz), > > @@ -348,7 +360,11 @@ int f2fs_check_zones(int j) > > blk_zone_non_seq(blkz), > > blk_zone_sector(blkz), > > blk_zone_length(blkz), > > + blk_zone_capacity(blkz, rep->flags), > > 
blk_zone_wp_sector(blkz)); > > + dev->zone_cap_blocks[n] = > > + blk_zone_capacity(blkz, rep->flags) >> > > + (F2FS_BLKSIZE_BITS - SECTOR_SHIFT); > > } > > > > sector = blk_zone_sector(blkz) + blk_zone_length(blkz); > @@ -473,6 > > +489,33 @@ out: > > return ret; > > } > > > > +uint32_t f2fs_get_usable_segments(struct f2fs_super_block *sb) { > > +#ifdef HAVE_BLK_ZONE_REP_V2 > > + int i, j; > > + uint32_t usable_segs = 0, zone_segs; > > + for (i = 0; i < c.ndevs; i++) { > > + if (c.devices[i].zoned_model != F2FS_ZONED_HM) { > > + usable_segs += c.devices[i].total_segments; > > + continue; > > + } > > + for (j = 0; j < c.devices[i].nr_zones; j++) { > > + zone_segs = c.devices[i].zone_cap_blocks[j] >> > > + get_sb(log_blocks_per_seg); > > + if (c.devices[i].zone_cap_blocks[j] % > > + > DEFAULT_BLOCKS_PER_SEGMENT) > > + usable_segs += zone_segs + 1; > > + else > > + usable_segs += zone_segs; > > + } > > + } > > + usable_segs -= (get_sb(main_blkaddr) - get_sb(segment0_blkaddr)) >> > > + get_sb(log_blocks_per_seg); > > + return usable_segs; > > +#endif > > + return get_sb(segment_count_main); > > +} > > + > > #else > > > > int f2fs_report_zone(int i, u_int64_t UNUSED(sector), void > > *UNUSED(blkzone)) @@ -527,5 +570,9 @@ int f2fs_reset_zones(int i) > > return -1; > > } > > > > +uint32_t f2fs_get_usable_segments(struct f2fs_super_block *sb) { > > + return get_sb(segment_count_main); > > +} > > #endif > > > > diff --git a/mkfs/f2fs_format.c b/mkfs/f2fs_format.c index > > 4999cac..74a81c8 100644 > > --- a/mkfs/f2fs_format.c > > +++ b/mkfs/f2fs_format.c > > @@ -425,13 +425,19 @@ static int f2fs_prepare_super_block(void) > > > > set_sb(segment_count_main, get_sb(section_count) * c.segs_per_sec); > > > > - /* Let's determine the best reserved and overprovisioned space */ > > + /* > > + * Let's determine the best reserved and overprovisioned space. 
> > + * For Zoned device, if zone capacity less than zone size, the segments > > + * starting after the zone capacity are unusable in each zone. So get > > + * overprovision ratio and reserved seg count based on avg usable > > + * segs_per_sec. > > + */ > > if (c.overprovision == 0) > > c.overprovision = get_best_overprovision(sb); > > > > c.reserved_segments = > > - (2 * (100 / c.overprovision + 1) + NR_CURSEG_TYPE) > > - * c.segs_per_sec; > > + (2 * (100 / c.overprovision + 1) + NR_CURSEG_TYPE) * > > + (f2fs_get_usable_segments(sb) / get_sb(section_count)); > > Ditto, DIV_ROUND_UP() Ok > > > > > if (c.overprovision == 0 || c.total_segments < F2FS_MIN_SEGMENTS || > > (c.devices[0].total_sectors * > > @@ -672,19 +678,29 @@ static int f2fs_write_check_point_pack(void) > > set_cp(valid_block_count, 2 + c.quota_inum + c.quota_dnum + > > c.lpf_inum + c.lpf_dnum); > > set_cp(rsvd_segment_count, c.reserved_segments); > > - set_cp(overprov_segment_count, (get_sb(segment_count_main) - > > + > > + /* > > + * For zoned devices, if zone capacity less than zone size, get > > + * overprovision segment count based on usable segments in the device. > > + */ > > + set_cp(overprov_segment_count, (f2fs_get_usable_segments(sb) - > > get_cp(rsvd_segment_count)) * > > c.overprovision / 100); > > set_cp(overprov_segment_count, get_cp(overprov_segment_count) + > > get_cp(rsvd_segment_count)); > > > > + if (f2fs_get_usable_segments(sb) < (get_cp(rsvd_segment_count) + > > equal is not allowed as well? 
You are right, will make it "<=". Thanks for the feedback, Aravind > > > + get_cp(overprov_segment_count))) { > > + MSG(0, "\tError: Not enough segments to create F2FS Volume\n"); > > + goto free_nat_bits; > > + } > > MSG(0, "Info: Overprovision ratio = %.3lf%%\n", c.overprovision); > > MSG(0, "Info: Overprovision segments = %u (GC reserved = %u)\n", > > get_cp(overprov_segment_count), > > c.reserved_segments); > > > > /* main segments - reserved segments - (node + data segments) */ > > - set_cp(free_segment_count, get_sb(segment_count_main) - 6); > > + set_cp(free_segment_count, f2fs_get_usable_segments(sb) - 6); > > set_cp(user_block_count, ((get_cp(free_segment_count) + 6 - > > get_cp(overprov_segment_count)) * c.blks_per_seg)); > > /* cp page (2), data summaries (1), node summaries (3) */ > > _______________________________________________ Linux-f2fs-devel mailing list Linux-f2fs-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel