----- Original Message ----- | Hi, | | Were we not intending to make this turn itself off in cases where it | produces no benefit? I thought the plan was to track the state via | the | file descriptor in order to avoid reading ahead the same blocks over | and | over again too, | | Steve.
Hi Steve, Based on our discussion, I revised the patch to implement a scheme whereby we don't duplicate read-ahead requests. However, I decided against using the file descriptor in favor of a simple bitmap that keeps track of where we've done read-ahead. So now instead of a hash table in the inode, I have a structure that contains two pointers: (1) hash table as before, and (2) the bitmap to keep track of read-ahead requests. I'm very happy with the results. I've got a file system with 500,000 files in a single directory. On my RHEL5 system, when I perform the command: "time ls -fR /mnt/gfs2 > /dev/null" I get these times: Stock pre-release 5.8: 0m34.481s Previously posted patch: 0m18.811s With this patch: 0m9.843s which is more than three times faster, and nearly twice as fast as the version I previously posted. I haven't tried my "1000x1000" test on it yet. The upstream version of the patch is given below. There is one noticeable difference between RHEL5 and upstream: I changed kmalloc to vmalloc. Not sure your feelings on that. Also, I have a feeling you're going to ask me to remove the #define MAX_RA_BLOCKS and hard-code it to 32. What can I say? Regards, Bob Peterson Red Hat File Systems -- commit 19ec5ffb5eaf915944861f8ebd4c065818a238c5 Author: Bob Peterson <rpete...@redhat.com> Date: Thu Oct 6 10:34:05 2011 -0500 GFS2: Add readahead to sequential directory traversal This patch adds read-ahead capability to GFS2's directory hash table management. It greatly improves performance for some directory operations. For example: In one of my file systems that has 1000 directories, each of which has 1000 files, time to execute a recursive ls (time ls -fR /mnt/gfs2 > /dev/null) was reduced from 2m2.814s on a stock kernel to 0m45.938s. 
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 2045d70..1c89ae0 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -329,48 +329,58 @@ fail: * gfs2_dir_get_hash_table - Get pointer to the dir hash table * @ip: The inode in question * - * Returns: The hash table or an error + * Returns: an error code */ -static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip) +static int gfs2_dir_get_hash_table(struct gfs2_inode *ip) { struct inode *inode = &ip->i_inode; int ret; - u32 hsize; + u32 hsize, rasize; __be64 *hc; + u8 *ra; BUG_ON(!(ip->i_diskflags & GFS2_DIF_EXHASH)); - hc = ip->i_hash_cache; - if (hc) - return hc; + if (ip->i_hash_cache.h_ht) + return 0; hsize = 1 << ip->i_depth; + rasize = hsize / 8; /* bit array */ hsize *= sizeof(__be64); if (hsize != i_size_read(&ip->i_inode)) { gfs2_consist_inode(ip); - return ERR_PTR(-EIO); + return -EIO; } - hc = kmalloc(hsize, GFP_NOFS); - ret = -ENOMEM; + hc = vmalloc(hsize); if (hc == NULL) - return ERR_PTR(-ENOMEM); + return -ENOMEM; + + ra = vzalloc(rasize); + if (ra == NULL) { + vfree(hc); + return -ENOMEM; + } ret = gfs2_dir_read_data(ip, hc, hsize); if (ret < 0) { - kfree(hc); - return ERR_PTR(ret); + vfree(hc); + vfree(ra); + return ret; } spin_lock(&inode->i_lock); - if (ip->i_hash_cache) - kfree(hc); - else - ip->i_hash_cache = hc; + if (ip->i_hash_cache.h_ht) { + vfree(hc); + vfree(ra); + } else { + ip->i_hash_cache.h_ht = hc; + ip->i_hash_cache.h_ra = ra; + } spin_unlock(&inode->i_lock); - return ip->i_hash_cache; + return 0; } /** @@ -381,9 +391,12 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip) */ void gfs2_dir_hash_inval(struct gfs2_inode *ip) { - __be64 *hc = ip->i_hash_cache; - ip->i_hash_cache = NULL; - kfree(hc); + __be64 *hc = ip->i_hash_cache.h_ht; + u8 *h_ra = ip->i_hash_cache.h_ra; + ip->i_hash_cache.h_ht = NULL; + ip->i_hash_cache.h_ra = NULL; + vfree(hc); + vfree(h_ra); } static inline int gfs2_dirent_sentinel(const struct gfs2_dirent *dent) @@ -730,14 +743,15 @@ static int 
get_leaf(struct gfs2_inode *dip, u64 leaf_no, * Returns: 0 on success, error code otherwise */ -static int get_leaf_nr(struct gfs2_inode *dip, u32 index, - u64 *leaf_out) +static int get_leaf_nr(struct gfs2_inode *dip, u32 index, u64 *leaf_out) { + int error; __be64 *hash; - hash = gfs2_dir_get_hash_table(dip); - if (IS_ERR(hash)) - return PTR_ERR(hash); + error = gfs2_dir_get_hash_table(dip); + if (error) + return error; + hash = dip->i_hash_cache.h_ht; *leaf_out = be64_to_cpu(*(hash + index)); return 0; } @@ -1097,27 +1111,35 @@ fail_brelse: static int dir_double_exhash(struct gfs2_inode *dip) { struct buffer_head *dibh; - u32 hsize; + u32 hsize, rasize; u32 hsize_bytes; - __be64 *hc; - __be64 *hc2, *h; + __be64 *hc, *hc2, *h; + u8 *ra; int x; - int error = 0; + int error; hsize = 1 << dip->i_depth; + rasize = hsize / 8; /* bit array */ hsize_bytes = hsize * sizeof(__be64); - hc = gfs2_dir_get_hash_table(dip); - if (IS_ERR(hc)) - return PTR_ERR(hc); + error = gfs2_dir_get_hash_table(dip); + if (error) + return error; - h = hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS); - if (!hc2) + hc = dip->i_hash_cache.h_ht; + h = hc2 = vmalloc(hsize_bytes * 2); + if (h == NULL) return -ENOMEM; + ra = vzalloc(rasize); + if (ra == NULL) { + error = -ENOMEM; + goto out_vfree; + } + error = gfs2_meta_inode_buffer(dip, &dibh); if (error) - goto out_kfree; + goto out_rafree; for (x = 0; x < hsize; x++) { *h++ = *hc; @@ -1130,7 +1152,8 @@ static int dir_double_exhash(struct gfs2_inode *dip) goto fail; gfs2_dir_hash_inval(dip); - dip->i_hash_cache = hc2; + dip->i_hash_cache.h_ht = hc2; + dip->i_hash_cache.h_ra = ra; dip->i_depth++; gfs2_dinode_out(dip, dibh->b_data); brelse(dibh); @@ -1142,8 +1165,11 @@ fail: i_size_write(&dip->i_inode, hsize_bytes); gfs2_dinode_out(dip, dibh->b_data); brelse(dibh); -out_kfree: - kfree(hc2); + +out_rafree: + vfree(ra); +out_vfree: + vfree(hc2); return error; } @@ -1376,6 +1402,53 @@ out: return error; } +#define MAX_RA_BLOCKS 32 + +/* gfs2_dir_readahead - 
Issue read-ahead requests for leaf blocks. + * + * Note: we can't calculate each index like dir_e_read can because we don't + * have the leaf, and therefore we don't have the depth, and therefore we + * don't have the length. So we have to just read enough ahead to make up + * for the loss of information. */ +static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_glock *gl = ip->i_gl; + struct buffer_head *bh; + u64 blocknr = 0, last; + unsigned count = 0; + int ra_byte, ra_bit; + + while (index < hsize) { + ra_byte = index / 8; + ra_bit = index % 8; + last = blocknr; + blocknr = be64_to_cpu(ip->i_hash_cache.h_ht[index++]); + if (blocknr == last) + continue; + count++; + if (count > MAX_RA_BLOCKS) + break; + + /* figure out if we've already requested this block */ + if ((ip->i_hash_cache.h_ra[ra_byte] >> ra_bit) & 0x01) + continue; + + ip->i_hash_cache.h_ra[ra_byte] |= (1 << ra_bit); + bh = gfs2_getbuf(gl, blocknr, 1); + if (trylock_buffer(bh)) { + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + brelse(bh); + continue; + } + bh->b_end_io = end_buffer_read_sync; + submit_bh(READA | REQ_META, bh); + continue; + } + brelse(bh); + } +} /** * dir_e_read - Reads the entries from a directory into a filldir buffer @@ -1391,20 +1464,23 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, filldir_t filldir) { struct gfs2_inode *dip = GFS2_I(inode); - u32 hsize, len = 0; + u32 hsize, len; u32 hash, index; __be64 *lp; int copied = 0; - int error = 0; + int error; unsigned depth = 0; hsize = 1 << dip->i_depth; hash = gfs2_dir_offset2hash(*offset); index = hash >> (32 - dip->i_depth); - lp = gfs2_dir_get_hash_table(dip); - if (IS_ERR(lp)) - return PTR_ERR(lp); + error = gfs2_dir_get_hash_table(dip); + if (error) + return error; + + lp = dip->i_hash_cache.h_ht; + gfs2_dir_readahead(inode, hsize, index); while (index < hsize) { error = gfs2_dir_read_leaf(inode, offset, opaque, 
filldir, @@ -1925,14 +2001,13 @@ int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip) u32 index = 0, next_index; __be64 *lp; u64 leaf_no; - int error = 0, last; + int error, last; hsize = 1 << dip->i_depth; - - lp = gfs2_dir_get_hash_table(dip); - if (IS_ERR(lp)) - return PTR_ERR(lp); - + error = gfs2_dir_get_hash_table(dip); + if (error) + return error; + lp = dip->i_hash_cache.h_ht; while (index < hsize) { leaf_no = be64_to_cpu(lp[index]); if (leaf_no) { diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 892ac37..731d763 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -271,6 +271,11 @@ enum { }; +struct gfs2_hash_table { + __be64 *h_ht; /* hash table data */ + u8 *h_ra; /* array indicating whether hash table block was read */ +}; + struct gfs2_inode { struct inode i_inode; u64 i_no_addr; @@ -285,7 +290,7 @@ struct gfs2_inode { u64 i_goal; /* goal block for allocations */ struct rw_semaphore i_rw_mutex; struct list_head i_trunc_list; - __be64 *i_hash_cache; + struct gfs2_hash_table i_hash_cache; u32 i_entries; u32 i_diskflags; u8 i_height; diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 8a139ff..a834b88 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -41,7 +41,8 @@ static void gfs2_init_inode_once(void *foo) init_rwsem(&ip->i_rw_mutex); INIT_LIST_HEAD(&ip->i_trunc_list); ip->i_alloc = NULL; - ip->i_hash_cache = NULL; + ip->i_hash_cache.h_ht = NULL; + ip->i_hash_cache.h_ra = NULL; } static void gfs2_init_glock_once(void *foo)