On Wed, Jan 28, 2026 at 04:26:20PM +0100, Christoph Hellwig wrote: > Currently all reads of the fsverity hashes are kicked off from the data > I/O completion handler, leading to needlessly dependent I/O. This is > worked around a bit by performing readahead on the level 0 nodes, but > still fairly ineffective. > > Switch to a model where the ->read_folio and ->readahead methods instead > kick off explicit readahead of the fsverity hashes so they are usually > available at I/O completion time. > > For 64k sequential reads on my test VM this improves read performance > from 2.4GB/s - 2.6GB/s to 3.5GB/s - 3.9GB/s. The improvements for > random reads are likely to be even bigger. > > Signed-off-by: Christoph Hellwig <[email protected]> > Acked-by: David Sterba <[email protected]> [btrfs]
Thanks for updating the kerneldoc and fixing the 'pgoff_t long' thing from the last round; Reviewed-by: "Darrick J. Wong" <[email protected]> --D > --- > fs/btrfs/verity.c | 4 +-- > fs/ext4/readpage.c | 7 ++++ > fs/ext4/verity.c | 13 +++++-- > fs/f2fs/data.c | 7 ++++ > fs/f2fs/verity.c | 13 +++++-- > fs/verity/pagecache.c | 39 ++++++++++++++------ > fs/verity/read_metadata.c | 17 ++++++--- > fs/verity/verify.c | 76 +++++++++++++++++++++++++-------------- > include/linux/fsverity.h | 29 ++++++++++----- > 9 files changed, 146 insertions(+), 59 deletions(-) > > diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c > index e7643c22a6bf..c152bef71e8b 100644 > --- a/fs/btrfs/verity.c > +++ b/fs/btrfs/verity.c > @@ -697,7 +697,6 @@ int btrfs_get_verity_descriptor(struct inode *inode, void > *buf, size_t buf_size) > * > * @inode: inode to read a merkle tree page for > * @index: page index relative to the start of the merkle tree > - * @num_ra_pages: number of pages to readahead. Optional, we ignore it > * > * The Merkle tree is stored in the filesystem btree, but its pages are > cached > * with a logical position past EOF in the inode's mapping. > @@ -705,8 +704,7 @@ int btrfs_get_verity_descriptor(struct inode *inode, void > *buf, size_t buf_size) > * Returns the page we read, or an ERR_PTR on error. 
> */ > static struct page *btrfs_read_merkle_tree_page(struct inode *inode, > - pgoff_t index, > - unsigned long num_ra_pages) > + pgoff_t index) > { > struct folio *folio; > u64 off = (u64)index << PAGE_SHIFT; > diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c > index 267594ef0b2c..e99072c8a619 100644 > --- a/fs/ext4/readpage.c > +++ b/fs/ext4/readpage.c > @@ -227,6 +227,7 @@ int ext4_mpage_readpages(struct inode *inode, > unsigned relative_block = 0; > struct ext4_map_blocks map; > unsigned int nr_pages, folio_pages; > + bool first_folio = true; > > map.m_pblk = 0; > map.m_lblk = 0; > @@ -242,6 +243,12 @@ int ext4_mpage_readpages(struct inode *inode, > if (rac) > folio = readahead_folio(rac); > > + if (first_folio) { > + if (ext4_need_verity(inode, folio->index)) > + fsverity_readahead(folio, nr_pages); > + first_folio = false; > + } > + > folio_pages = folio_nr_pages(folio); > prefetchw(&folio->flags); > > diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c > index a071860ad36a..54ae4d4a176c 100644 > --- a/fs/ext4/verity.c > +++ b/fs/ext4/verity.c > @@ -358,11 +358,17 @@ static int ext4_get_verity_descriptor(struct inode > *inode, void *buf, > } > > static struct page *ext4_read_merkle_tree_page(struct inode *inode, > - pgoff_t index, > - unsigned long num_ra_pages) > + pgoff_t index) > { > index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT; > - return generic_read_merkle_tree_page(inode, index, num_ra_pages); > + return generic_read_merkle_tree_page(inode, index); > +} > + > +static void ext4_readahead_merkle_tree(struct inode *inode, pgoff_t index, > + unsigned long nr_pages) > +{ > + index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT; > + generic_readahead_merkle_tree(inode, index, nr_pages); > } > > static int ext4_write_merkle_tree_block(struct file *file, const void *buf, > @@ -380,5 +386,6 @@ const struct fsverity_operations ext4_verityops = { > .end_enable_verity = ext4_end_enable_verity, > .get_verity_descriptor = ext4_get_verity_descriptor, > 
.read_merkle_tree_page = ext4_read_merkle_tree_page, > + .readahead_merkle_tree = ext4_readahead_merkle_tree, > .write_merkle_tree_block = ext4_write_merkle_tree_block, > }; > diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c > index c30e69392a62..49bdc7e771f2 100644 > --- a/fs/f2fs/data.c > +++ b/fs/f2fs/data.c > @@ -2359,6 +2359,7 @@ static int f2fs_mpage_readpages(struct inode *inode, > unsigned nr_pages = rac ? readahead_count(rac) : 1; > unsigned max_nr_pages = nr_pages; > int ret = 0; > + bool first_folio = true; > > #ifdef CONFIG_F2FS_FS_COMPRESSION > if (f2fs_compressed_file(inode)) { > @@ -2383,6 +2384,12 @@ static int f2fs_mpage_readpages(struct inode *inode, > prefetchw(&folio->flags); > } > > + if (first_folio) { > + if (f2fs_need_verity(inode, folio->index)) > + fsverity_readahead(folio, nr_pages); > + first_folio = false; > + } > + > #ifdef CONFIG_F2FS_FS_COMPRESSION > index = folio->index; > > diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c > index d37e584423af..628e8eafa96a 100644 > --- a/fs/f2fs/verity.c > +++ b/fs/f2fs/verity.c > @@ -256,11 +256,17 @@ static int f2fs_get_verity_descriptor(struct inode > *inode, void *buf, > } > > static struct page *f2fs_read_merkle_tree_page(struct inode *inode, > - pgoff_t index, > - unsigned long num_ra_pages) > + pgoff_t index) > { > index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT; > - return generic_read_merkle_tree_page(inode, index, num_ra_pages); > + return generic_read_merkle_tree_page(inode, index); > +} > + > +static void f2fs_readahead_merkle_tree(struct inode *inode, pgoff_t index, > + unsigned long nr_pages) > +{ > + index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT; > + generic_readahead_merkle_tree(inode, index, nr_pages); > } > > static int f2fs_write_merkle_tree_block(struct file *file, const void *buf, > @@ -278,5 +284,6 @@ const struct fsverity_operations f2fs_verityops = { > .end_enable_verity = f2fs_end_enable_verity, > .get_verity_descriptor = f2fs_get_verity_descriptor, > 
.read_merkle_tree_page = f2fs_read_merkle_tree_page, > + .readahead_merkle_tree = f2fs_readahead_merkle_tree, > .write_merkle_tree_block = f2fs_write_merkle_tree_block, > }; > diff --git a/fs/verity/pagecache.c b/fs/verity/pagecache.c > index eae419d8d091..196072bbe284 100644 > --- a/fs/verity/pagecache.c > +++ b/fs/verity/pagecache.c > @@ -16,8 +16,30 @@ > * to ->read_merkle_tree_page to the actual index where the Merkle tree is > * stored in the page cache for @inode. > */ > -struct page *generic_read_merkle_tree_page(struct inode *inode, pgoff_t > index, > - unsigned long num_ra_pages) > +struct page *generic_read_merkle_tree_page(struct inode *inode, pgoff_t > index) > +{ > + struct folio *folio; > + > + folio = read_mapping_folio(inode->i_mapping, index, NULL); > + if (IS_ERR(folio)) > + return ERR_CAST(folio); > + return folio_file_page(folio, index); > +} > +EXPORT_SYMBOL_GPL(generic_read_merkle_tree_page); > + > +/** > + * generic_readahead_merkle_tree() - generic ->readahead_merkle_tree helper > + * @inode: inode containing the Merkle tree > + * @index: 0-based index of the first Merkle tree page to read ahead in the > + * inode > + * @nr_pages: the number of Merkle tree pages that should be read > ahead > + * > + * The caller needs to adjust @index from the Merkle-tree relative index > passed > + * to ->read_merkle_tree_page to the actual index where the Merkle tree is > + * stored in the page cache for @inode. 
> + */ > +void generic_readahead_merkle_tree(struct inode *inode, pgoff_t index, > + unsigned long nr_pages) > { > struct folio *folio; > > @@ -26,14 +48,9 @@ struct page *generic_read_merkle_tree_page(struct inode > *inode, pgoff_t index, > (!IS_ERR(folio) && !folio_test_uptodate(folio))) { > DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index); > > - if (!IS_ERR(folio)) > - folio_put(folio); > - else if (num_ra_pages > 1) > - page_cache_ra_unbounded(&ractl, num_ra_pages, 0); > - folio = read_mapping_folio(inode->i_mapping, index, NULL); > - if (IS_ERR(folio)) > - return ERR_CAST(folio); > + page_cache_ra_unbounded(&ractl, nr_pages, 0); > } > - return folio_file_page(folio, index); > + if (!IS_ERR(folio)) > + folio_put(folio); > } > -EXPORT_SYMBOL_GPL(generic_read_merkle_tree_page); > +EXPORT_SYMBOL_GPL(generic_readahead_merkle_tree); > diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c > index cba5d6af4e04..81b82e9ddb1d 100644 > --- a/fs/verity/read_metadata.c > +++ b/fs/verity/read_metadata.c > @@ -28,24 +28,31 @@ static int fsverity_read_merkle_tree(struct inode *inode, > if (offset >= end_offset) > return 0; > offs_in_page = offset_in_page(offset); > + index = offset >> PAGE_SHIFT; > last_index = (end_offset - 1) >> PAGE_SHIFT; > > + /* > + * Kick off readahead for the range we are going to read to ensure a > + * single large sequential read instead of lots of small ones. > + */ > + if (inode->i_sb->s_vop->readahead_merkle_tree) { > + inode->i_sb->s_vop->readahead_merkle_tree(inode, index, > + last_index - index + 1); > + } > + > /* > * Iterate through each Merkle tree page in the requested range and copy > * the requested portion to userspace. Note that the Merkle tree block > * size isn't important here, as we are returning a byte stream; i.e., > * we can just work with pages even if the tree block size != PAGE_SIZE. 
> */ > - for (index = offset >> PAGE_SHIFT; index <= last_index; index++) { > - unsigned long num_ra_pages = > - min_t(unsigned long, last_index - index + 1, > - inode->i_sb->s_bdi->io_pages); > + for (; index <= last_index; index++) { > unsigned int bytes_to_copy = min_t(u64, end_offset - offset, > PAGE_SIZE - offs_in_page); > struct page *page; > const void *virt; > > - page = vops->read_merkle_tree_page(inode, index, num_ra_pages); > + page = vops->read_merkle_tree_page(inode, index); > if (IS_ERR(page)) { > err = PTR_ERR(page); > fsverity_err(inode, > diff --git a/fs/verity/verify.c b/fs/verity/verify.c > index 86067c8b40cf..f5bea750b427 100644 > --- a/fs/verity/verify.c > +++ b/fs/verity/verify.c > @@ -9,6 +9,7 @@ > > #include <linux/bio.h> > #include <linux/export.h> > +#include <linux/pagemap.h> > > #define FS_VERITY_MAX_PENDING_BLOCKS 2 > > @@ -21,7 +22,6 @@ struct fsverity_pending_block { > struct fsverity_verification_context { > struct inode *inode; > struct fsverity_info *vi; > - unsigned long max_ra_pages; > > /* > * This is the queue of data blocks that are pending verification. When > @@ -37,6 +37,49 @@ struct fsverity_verification_context { > > static struct workqueue_struct *fsverity_read_workqueue; > > +/** > + * fsverity_readahead() - kick off readahead on fsverity hashes > + * @folio: first file data folio that is being read > + * @nr_pages: number of file data pages to be read > + * > + * Start readahead on the fsverity hashes that are needed to verify the file > + * data in the range from folio->index to folio->index + nr_pages. > + * > + * To be called from the file systems' ->read_folio and ->readahead methods > to > + * ensure that the hashes are already cached on completion of the file data > + * read if possible. 
> + */ > +void fsverity_readahead(struct folio *folio, unsigned long nr_pages) > +{ > + struct inode *inode = folio->mapping->host; > + const struct fsverity_info *vi = *fsverity_info_addr(inode); > + const struct merkle_tree_params *params = &vi->tree_params; > + u64 start_hidx = (u64)folio->index << params->log_blocks_per_page; > + u64 end_hidx = (((u64)folio->index + nr_pages) << > + params->log_blocks_per_page) - 1; > + int level; > + > + if (!inode->i_sb->s_vop->readahead_merkle_tree) > + return; > + > + for (level = 0; level < params->num_levels; level++) { > + unsigned long level_start = params->level_start[level]; > + unsigned long next_start_hidx = start_hidx >> params->log_arity; > + unsigned long next_end_hidx = end_hidx >> params->log_arity; > + pgoff_t start_idx = (level_start + next_start_hidx) >> > + params->log_blocks_per_page; > + pgoff_t end_idx = (level_start + next_end_hidx) >> > + params->log_blocks_per_page; > + > + inode->i_sb->s_vop->readahead_merkle_tree(inode, start_idx, > + end_idx - start_idx + 1); > + > + start_hidx = next_start_hidx; > + end_hidx = next_end_hidx; > + } > +} > +EXPORT_SYMBOL_GPL(fsverity_readahead); > + > /* > * Returns true if the hash block with index @hblock_idx in the tree, > located in > * @hpage, has already been verified. > @@ -114,8 +157,7 @@ static bool is_hash_block_verified(struct fsverity_info > *vi, struct page *hpage, > * Return: %true if the data block is valid, else %false. > */ > static bool verify_data_block(struct inode *inode, struct fsverity_info *vi, > - const struct fsverity_pending_block *dblock, > - unsigned long max_ra_pages) > + const struct fsverity_pending_block *dblock) > { > const u64 data_pos = dblock->pos; > const struct merkle_tree_params *params = &vi->tree_params; > @@ -200,8 +242,7 @@ static bool verify_data_block(struct inode *inode, struct > fsverity_info *vi, > (params->block_size - 1); > > hpage = inode->i_sb->s_vop->read_merkle_tree_page(inode, > - hpage_idx, level == 0 ? 
min(max_ra_pages, > - params->tree_pages - hpage_idx) : 0); > + hpage_idx); > if (IS_ERR(hpage)) { > fsverity_err(inode, > "Error %ld reading Merkle tree page %lu", > @@ -272,14 +313,12 @@ static bool verify_data_block(struct inode *inode, > struct fsverity_info *vi, > > static void > fsverity_init_verification_context(struct fsverity_verification_context *ctx, > - struct inode *inode, > - unsigned long max_ra_pages) > + struct inode *inode) > { > struct fsverity_info *vi = *fsverity_info_addr(inode); > > ctx->inode = inode; > ctx->vi = vi; > - ctx->max_ra_pages = max_ra_pages; > ctx->num_pending = 0; > if (vi->tree_params.hash_alg->algo_id == HASH_ALGO_SHA256 && > sha256_finup_2x_is_optimized()) > @@ -322,8 +361,7 @@ fsverity_verify_pending_blocks(struct > fsverity_verification_context *ctx) > } > > for (i = 0; i < ctx->num_pending; i++) { > - if (!verify_data_block(ctx->inode, vi, &ctx->pending_blocks[i], > - ctx->max_ra_pages)) > + if (!verify_data_block(ctx->inode, vi, &ctx->pending_blocks[i])) > return false; > } > fsverity_clear_pending_blocks(ctx); > @@ -373,7 +411,7 @@ bool fsverity_verify_blocks(struct folio *folio, size_t > len, size_t offset) > { > struct fsverity_verification_context ctx; > > - fsverity_init_verification_context(&ctx, folio->mapping->host, 0); > + fsverity_init_verification_context(&ctx, folio->mapping->host); > > if (fsverity_add_data_blocks(&ctx, folio, len, offset) && > fsverity_verify_pending_blocks(&ctx)) > @@ -403,22 +441,8 @@ void fsverity_verify_bio(struct bio *bio) > struct inode *inode = bio_first_folio_all(bio)->mapping->host; > struct fsverity_verification_context ctx; > struct folio_iter fi; > - unsigned long max_ra_pages = 0; > - > - if (bio->bi_opf & REQ_RAHEAD) { > - /* > - * If this bio is for data readahead, then we also do readahead > - * of the first (largest) level of the Merkle tree. 
Namely, > - * when a Merkle tree page is read, we also try to piggy-back on > - * some additional pages -- up to 1/4 the number of data pages. > - * > - * This improves sequential read performance, as it greatly > - * reduces the number of I/O requests made to the Merkle tree. > - */ > - max_ra_pages = bio->bi_iter.bi_size >> (PAGE_SHIFT + 2); > - } > > - fsverity_init_verification_context(&ctx, inode, max_ra_pages); > + fsverity_init_verification_context(&ctx, inode); > > bio_for_each_folio_all(fi, bio) { > if (!fsverity_add_data_blocks(&ctx, fi.folio, fi.length, > diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h > index 121703625cc8..bade511cf3aa 100644 > --- a/include/linux/fsverity.h > +++ b/include/linux/fsverity.h > @@ -97,10 +97,6 @@ struct fsverity_operations { > * > * @inode: the inode > * @index: 0-based index of the page within the Merkle tree > - * @num_ra_pages: The number of Merkle tree pages that should be > - * prefetched starting at @index if the page at @index > - * isn't already cached. Implementations may ignore this > - * argument; it's only a performance optimization. > * > * This can be called at any time on an open verity file. It may be > * called by multiple processes concurrently, even with the same page. > @@ -110,8 +106,23 @@ struct fsverity_operations { > * Return: the page on success, ERR_PTR() on failure > */ > struct page *(*read_merkle_tree_page)(struct inode *inode, > - pgoff_t index, > - unsigned long num_ra_pages); > + pgoff_t index); > + > + /** > + * Perform readahead of a Merkle tree for the given inode. > + * > + * @inode: the inode > + * @index: 0-based index of the first page within the Merkle tree > + * @nr_pages: number of pages to be read ahead. > + * > + * This can be called at any time on an open verity file. It may be > + * called by multiple processes concurrently, even with the same range. 
> + * > + * Optional method so that ->read_merkle_tree_page preferably finds > + * cached data instead of issuing dependent I/O. > + */ > + void (*readahead_merkle_tree)(struct inode *inode, pgoff_t index, > + unsigned long nr_pages); > > /** > * Write a Merkle tree block to the given inode. > @@ -308,8 +319,10 @@ static inline int fsverity_file_open(struct inode > *inode, struct file *filp) > } > > void fsverity_cleanup_inode(struct inode *inode); > +void fsverity_readahead(struct folio *folio, unsigned long nr_pages); > > -struct page *generic_read_merkle_tree_page(struct inode *inode, pgoff_t > index, > - unsigned long num_ra_pages); > +struct page *generic_read_merkle_tree_page(struct inode *inode, pgoff_t > index); > +void generic_readahead_merkle_tree(struct inode *inode, pgoff_t index, > + unsigned long nr_pages); > > #endif /* _LINUX_FSVERITY_H */ > -- > 2.47.3 > > _______________________________________________ Linux-f2fs-devel mailing list [email protected] https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
