----- Original Message ----- | Hi, | | Were we not intending to make this turn itself off in cases where it | produces no benefit? I thought the plan was to track the state via | the | file descriptor in order to avoid reading ahead the same blocks over | and | over again too, | | Steve.
Hi Steve, Based on our discussion, I revised the patch to implement a scheme whereby we don't duplicate read-ahead requests. However, I decided against using the file descriptor in favor of a simple bitmap that keeps track of where we've done read-ahead. So now instead of a hash table in the inode, I have a structure that contains two pointers: (1) hash table as before, and (2) the bitmap to keep track of read-ahead requests. I'm very happy with the results. I've got a file system with 500,000 files in a single directory. On my RHEL5 system, when I perform the command: "time ls -fR /mnt/gfs2 > /dev/null" I get these times: Stock pre-release 5.8: 0m34.481s Previously posted patch: 0m18.811s With this patch: 0m9.843s which is more than three times faster, and nearly twice as fast as the version I previously posted. I haven't tried my "1000x1000" test on it yet. The upstream version of the patch is given below. There is one noticeable difference between RHEL5 and upstream: I changed kmalloc to vmalloc. Not sure your feelings on that. Also, I have a feeling you're going to ask me to remove the #define MAX_RA_BLOCKS and hard-code it to 32. What can I say? Regards, Bob Peterson Red Hat File Systems -- commit 19ec5ffb5eaf915944861f8ebd4c065818a238c5 Author: Bob Peterson <rpete...@redhat.com> Date: Thu Oct 6 10:34:05 2011 -0500 GFS2: Add readahead to sequential directory traversal This patch adds read-ahead capability to GFS2's directory hash table management. It greatly improves performance for some directory operations. For example: In one of my file systems that has 1000 directories, each of which has 1000 files, time to execute a recursive ls (time ls -fR /mnt/gfs2 > /dev/null) was reduced from 2m2.814s on a stock kernel to 0m45.938s. 
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 2045d70..1c89ae0 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -329,48 +329,58 @@ fail: * gfs2_dir_get_hash_table - Get pointer to the dir hash table * @ip: The inode in question * - * Returns: The hash table or an error + * Returns: an error code */ -static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip) +static int gfs2_dir_get_hash_table(struct gfs2_inode *ip) { struct inode *inode = &ip->i_inode; int ret; - u32 hsize; + u32 hsize, rasize; __be64 *hc; + u8 *ra; BUG_ON(!(ip->i_diskflags & GFS2_DIF_EXHASH)); - hc = ip->i_hash_cache; - if (hc) - return hc; + if (ip->i_hash_cache.h_ht) + return 0; hsize = 1 << ip->i_depth; + rasize = hsize / 8; /* bit array */ hsize *= sizeof(__be64); if (hsize != i_size_read(&ip->i_inode)) { gfs2_consist_inode(ip); - return ERR_PTR(-EIO); + return -EIO; } - hc = kmalloc(hsize, GFP_NOFS); - ret = -ENOMEM; + hc = vmalloc(hsize); if (hc == NULL) - return ERR_PTR(-ENOMEM); + return -ENOMEM; + + ra = vzalloc(rasize); + if (ra == NULL) { + vfree(hc); + return -ENOMEM; + } ret = gfs2_dir_read_data(ip, hc, hsize); if (ret < 0) { - kfree(hc); - return ERR_PTR(ret); + vfree(hc); + vfree(ra); + return ret; } spin_lock(&inode->i_lock); - if (ip->i_hash_cache) - kfree(hc); - else - ip->i_hash_cache = hc; + if (ip->i_hash_cache.h_ht) { + vfree(hc); + vfree(ra); + } else { + ip->i_hash_cache.h_ht = hc; + ip->i_hash_cache.h_ra = ra; + } spin_unlock(&inode->i_lock); - return ip->i_hash_cache; + return 0; } /** @@ -381,9 +391,12 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip) */ void gfs2_dir_hash_inval(struct gfs2_inode *ip) { - __be64 *hc = ip->i_hash_cache; - ip->i_hash_cache = NULL; - kfree(hc); + __be64 *hc = ip->i_hash_cache.h_ht; + u8 *h_ra = ip->i_hash_cache.h_ra; + ip->i_hash_cache.h_ht = NULL; + ip->i_hash_cache.h_ra = NULL; + vfree(hc); + vfree(h_ra); } static inline int gfs2_dirent_sentinel(const struct gfs2_dirent *dent) @@ -730,14 +743,15 @@ static int 
get_leaf(struct gfs2_inode *dip, u64 leaf_no, * Returns: 0 on success, error code otherwise */ -static int get_leaf_nr(struct gfs2_inode *dip, u32 index, - u64 *leaf_out) +static int get_leaf_nr(struct gfs2_inode *dip, u32 index, u64 *leaf_out) { + int error; __be64 *hash; - hash = gfs2_dir_get_hash_table(dip); - if (IS_ERR(hash)) - return PTR_ERR(hash); + error = gfs2_dir_get_hash_table(dip); + if (error) + return error; + hash = dip->i_hash_cache.h_ht; *leaf_out = be64_to_cpu(*(hash + index)); return 0; } @@ -1097,27 +1111,35 @@ fail_brelse: static int dir_double_exhash(struct gfs2_inode *dip) { struct buffer_head *dibh; - u32 hsize; + u32 hsize, rasize; u32 hsize_bytes; - __be64 *hc; - __be64 *hc2, *h; + __be64 *hc, *hc2, *h; + u8 *ra; int x; - int error = 0; + int error; hsize = 1 << dip->i_depth; + rasize = hsize / 8; /* bit array */ hsize_bytes = hsize * sizeof(__be64); - hc = gfs2_dir_get_hash_table(dip); - if (IS_ERR(hc)) - return PTR_ERR(hc); + error = gfs2_dir_get_hash_table(dip); + if (error) + return error; - h = hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS); - if (!hc2) + hc = dip->i_hash_cache.h_ht; + h = hc2 = vmalloc(hsize_bytes * 2); + if (h == NULL) return -ENOMEM; + ra = vzalloc(rasize); + if (ra == NULL) { + error = -ENOMEM; + goto out_vfree; + } + error = gfs2_meta_inode_buffer(dip, &dibh); if (error) - goto out_kfree; + goto out_rafree; for (x = 0; x < hsize; x++) { *h++ = *hc; @@ -1130,7 +1152,8 @@ static int dir_double_exhash(struct gfs2_inode *dip) goto fail; gfs2_dir_hash_inval(dip); - dip->i_hash_cache = hc2; + dip->i_hash_cache.h_ht = hc2; + dip->i_hash_cache.h_ra = ra; dip->i_depth++; gfs2_dinode_out(dip, dibh->b_data); brelse(dibh); @@ -1142,8 +1165,11 @@ fail: i_size_write(&dip->i_inode, hsize_bytes); gfs2_dinode_out(dip, dibh->b_data); brelse(dibh); -out_kfree: - kfree(hc2); + +out_rafree: + vfree(ra); +out_vfree: + vfree(hc2); return error; } @@ -1376,6 +1402,53 @@ out: return error; } +#define MAX_RA_BLOCKS 32 + +/* gfs2_dir_readahead - 
Issue read-ahead requests for leaf blocks. + * + * Note: we can't calculate each index like dir_e_read can because we don't + * have the leaf, and therefore we don't have the depth, and therefore we + * don't have the length. So we have to just read enough ahead to make up + * for the loss of information. */ +static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_glock *gl = ip->i_gl; + struct buffer_head *bh; + u64 blocknr = 0, last; + unsigned count = 0; + int ra_byte, ra_bit; + + while (index < hsize) { + ra_byte = index / 8; + ra_bit = index % 8; + last = blocknr; + blocknr = be64_to_cpu(ip->i_hash_cache.h_ht[index++]); + if (blocknr == last) + continue; + count++; + if (count > MAX_RA_BLOCKS) + break; + + /* figure out if we've already requested this block */ + if ((ip->i_hash_cache.h_ra[ra_byte] >> ra_bit) & 0x01) + continue; + + ip->i_hash_cache.h_ra[ra_byte] |= (1 << ra_bit); + bh = gfs2_getbuf(gl, blocknr, 1); + if (trylock_buffer(bh)) { + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + brelse(bh); + continue; + } + bh->b_end_io = end_buffer_read_sync; + submit_bh(READA | REQ_META, bh); + continue; + } + brelse(bh); + } +} /** * dir_e_read - Reads the entries from a directory into a filldir buffer @@ -1391,20 +1464,23 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, filldir_t filldir) { struct gfs2_inode *dip = GFS2_I(inode); - u32 hsize, len = 0; + u32 hsize, len; u32 hash, index; __be64 *lp; int copied = 0; - int error = 0; + int error; unsigned depth = 0; hsize = 1 << dip->i_depth; hash = gfs2_dir_offset2hash(*offset); index = hash >> (32 - dip->i_depth); - lp = gfs2_dir_get_hash_table(dip); - if (IS_ERR(lp)) - return PTR_ERR(lp); + error = gfs2_dir_get_hash_table(dip); + if (error) + return error; + + lp = dip->i_hash_cache.h_ht; + gfs2_dir_readahead(inode, hsize, index); while (index < hsize) { error = gfs2_dir_read_leaf(inode, offset, opaque, 
filldir, @@ -1925,14 +2001,13 @@ int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip) u32 index = 0, next_index; __be64 *lp; u64 leaf_no; - int error = 0, last; + int error, last; hsize = 1 << dip->i_depth; - - lp = gfs2_dir_get_hash_table(dip); - if (IS_ERR(lp)) - return PTR_ERR(lp); - + error = gfs2_dir_get_hash_table(dip); + if (error) + return error; + lp = dip->i_hash_cache.h_ht; while (index < hsize) { leaf_no = be64_to_cpu(lp[index]); if (leaf_no) { diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 892ac37..731d763 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -271,6 +271,11 @@ enum { }; +struct gfs2_hash_table { + __be64 *h_ht; /* hash table data */ + u8 *h_ra; /* array indicating whether hash table block was read */ +}; + struct gfs2_inode { struct inode i_inode; u64 i_no_addr; @@ -285,7 +290,7 @@ struct gfs2_inode { u64 i_goal; /* goal block for allocations */ struct rw_semaphore i_rw_mutex; struct list_head i_trunc_list; - __be64 *i_hash_cache; + struct gfs2_hash_table i_hash_cache; u32 i_entries; u32 i_diskflags; u8 i_height; diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 8a139ff..a834b88 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -41,7 +41,8 @@ static void gfs2_init_inode_once(void *foo) init_rwsem(&ip->i_rw_mutex); INIT_LIST_HEAD(&ip->i_trunc_list); ip->i_alloc = NULL; - ip->i_hash_cache = NULL; + ip->i_hash_cache.h_ht = NULL; + ip->i_hash_cache.h_ra = NULL; } static void gfs2_init_glock_once(void *foo)