Hi,

This patch implements a new cookie encoding scheme for encoding, decoding
and filling directory entries. This new scheme prevents duplicate cookies,
so it eliminates the need to sort the entries to prevent duplicates.
Thus, it simplies the code greatly.

Regards,

Bob Peterson
Red Hat File Systems

Signed-off-by: Bob Peterson <rpete...@redhat.com> 
---
 fs/gfs2/dir.c | 165 +++++++++++++++++++++++-----------------------------------
 1 file changed, 66 insertions(+), 99 deletions(-)

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 5d4261f..039479f 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -58,7 +58,6 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/buffer_head.h>
-#include <linux/sort.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
 #include <linux/vmalloc.h>
@@ -80,8 +79,48 @@
 
 #define MAX_RA_BLOCKS 32 /* max read-ahead blocks */
 
-#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
-#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
+/**
+ * Notes about the directory hash table:
+ *
+ * Hash table index is hash >> (32 - dip->i_depth). Max directory depth is 17.
+ * 32 - 17 = 15. The hash value is 32-bits, so 0x100000000 >> 15 = 0x20000.
+ * So the maximum index to the hash table is 0x1ffff.
+ *
+ * At the same time, pages are at most 4K, and GFS2 dirents are 96 bytes.
+ * According to other comments, 99 is the maximum number of entries that can
+ * fit in a single leaf block. However, if you do the math: Take one 4K block,
+ * subtract out the leaf header, so 0x1000 - 0x68 = 0xf98 bytes available.
+ * The minimum dirent, including a 1-character file name is 0x30 bytes long.
+ * So 0xf98 / 0x30 = 0x53, which is 83 decimal. So the theoretical limit is
+ * 83 index entries per leaf block.
+ *
+ * So we can represent the index number with 17 bits, plus the index in 7.
+ * Therefore, we can represent the offset to any dirent by a 32-bit number:
+ * 18-bits for hash table index, 7 for page index, 8 for leaf block extension
+ * 
+ *                0                 1
+ *                01234567 89abcdef 01234567 89abcdef
+ *                pppppppp iiiiiiii iiiiiiii iooooooo
+ *                leafblk  Hash table index   Offset
+ * Leaf blk:      11111111                            (0xff)
+ * HTI Max Value:          11111111 11111111 1        (0x1ffff)
+ * Pg offset Max:                             1111111 (0x7f)
+ **/
+
+#define gfs2_dir_offset2pge(p) (((u32)p >> 24) & 0xff) /* leaf extension blk */
+#define gfs2_dir_pge2offset(p) (p << 24)
+
+#define gfs2_dir_offset2hti(p) (((u32)p >> 7) & 0x01ffff) /* hash tbl index */
+#define gfs2_dir_hti2offset(p) (p << 7)
+
+#define gfs2_dir_offset2pgo(p) ((u32)p & 0x7f)       /* leaf blk page offset */
+#define gfs2_dir_pgo2offset(p) (p)
+
+static inline u32 dirloc2offset(u32 page, u32 hti, u32 off)
+{
+       return (gfs2_dir_pge2offset(page) | gfs2_dir_hti2offset(hti) |
+               gfs2_dir_pgo2offset(off));
+}
 
 struct qstr gfs2_qdot __read_mostly;
 struct qstr gfs2_qdotdot __read_mostly;
@@ -1176,48 +1215,6 @@ out_kfree:
 }
 
 /**
- * compare_dents - compare directory entries by hash value
- * @a: first dent
- * @b: second dent
- *
- * When comparing the hash entries of @a to @b:
- *   gt: returns 1
- *   lt: returns -1
- *   eq: returns 0
- */
-
-static int compare_dents(const void *a, const void *b)
-{
-       const struct gfs2_dirent *dent_a, *dent_b;
-       u32 hash_a, hash_b;
-       int ret = 0;
-
-       dent_a = *(const struct gfs2_dirent **)a;
-       hash_a = be32_to_cpu(dent_a->de_hash);
-
-       dent_b = *(const struct gfs2_dirent **)b;
-       hash_b = be32_to_cpu(dent_b->de_hash);
-
-       if (hash_a > hash_b)
-               ret = 1;
-       else if (hash_a < hash_b)
-               ret = -1;
-       else {
-               unsigned int len_a = be16_to_cpu(dent_a->de_name_len);
-               unsigned int len_b = be16_to_cpu(dent_b->de_name_len);
-
-               if (len_a > len_b)
-                       ret = 1;
-               else if (len_a < len_b)
-                       ret = -1;
-               else
-                       ret = memcmp(dent_a + 1, dent_b + 1, len_a);
-       }
-
-       return ret;
-}
-
-/**
  * do_filldir_main - read out directory entries
  * @dip: The GFS2 inode
  * @ctx: what to feed the entries to
@@ -1225,11 +1222,6 @@ static int compare_dents(const void *a, const void *b)
  * @entries: the number of entries in darr
  * @copied: pointer to int that's non-zero if a entry has been copied out
  *
- * Jump through some hoops to make sure that if there are hash collsions,
- * they are read out at the beginning of a buffer.  We want to minimize
- * the possibility that they will fall into different readdir buffers or
- * that someone will want to seek to that location.
- *
  * Returns: errno, >0 if the actor tells you to stop
  */
 
@@ -1237,42 +1229,15 @@ static int do_filldir_main(struct gfs2_inode *dip, 
struct dir_context *ctx,
                           const struct gfs2_dirent **darr, u32 entries,
                           int *copied)
 {
-       const struct gfs2_dirent *dent, *dent_next;
-       u64 off, off_next;
+       const struct gfs2_dirent *dent;
        unsigned int x, y;
-       int run = 0;
-
-       sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL);
-
-       dent_next = darr[0];
-       off_next = be32_to_cpu(dent_next->de_hash);
-       off_next = gfs2_disk_hash2offset(off_next);
-
-       for (x = 0, y = 1; x < entries; x++, y++) {
-               dent = dent_next;
-               off = off_next;
-
-               if (y < entries) {
-                       dent_next = darr[y];
-                       off_next = be32_to_cpu(dent_next->de_hash);
-                       off_next = gfs2_disk_hash2offset(off_next);
-
-                       if (off < ctx->pos)
-                               continue;
-                       ctx->pos = off;
-
-                       if (off_next == off) {
-                               if (*copied && !run)
-                                       return 1;
-                               run = 1;
-                       } else
-                               run = 0;
-               } else {
-                       if (off < ctx->pos)
-                               continue;
-                       ctx->pos = off;
-               }
+       u32 page = gfs2_dir_offset2pge((u32)ctx->pos);
+       u32 hti = gfs2_dir_offset2hti((u32)ctx->pos);
+       u32 pgo = gfs2_dir_offset2pgo((u32)ctx->pos);
 
+       for (x = pgo, y = pgo; x < entries; x++, y++) {
+               dent = darr[y];
+               ctx->pos = dirloc2offset(page, hti, y);
                if (!dir_emit(ctx, (const char *)(dent + 1),
                                be16_to_cpu(dent->de_name_len),
                                be64_to_cpu(dent->de_inum.no_addr),
@@ -1282,12 +1247,6 @@ static int do_filldir_main(struct gfs2_inode *dip, 
struct dir_context *ctx,
                *copied = 1;
        }
 
-       /* Increment the ctx->pos by one, so the next time we come into the
-          do_filldir fxn, we get the next entry instead of the last one in the
-          current leaf */
-
-       ctx->pos++;
-
        return 0;
 }
 
@@ -1326,7 +1285,9 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct 
dir_context *ctx,
        int leaf = 0;
        int error, i;
        u64 lfn = leaf_no;
+       int leaf_extn = gfs2_dir_offset2pge(ctx->pos);
 
+       i = 0;
        do {
                error = get_leaf(ip, lfn, &bh);
                if (error)
@@ -1334,10 +1295,13 @@ static int gfs2_dir_read_leaf(struct inode *inode, 
struct dir_context *ctx,
                lf = (struct gfs2_leaf *)bh->b_data;
                if (leaves == 0)
                        *depth = be16_to_cpu(lf->lf_depth);
-               entries += be16_to_cpu(lf->lf_entries);
-               leaves++;
+               if (i >= leaf_extn) {
+                       entries += be16_to_cpu(lf->lf_entries);
+                       leaves++;
+               }
                lfn = be64_to_cpu(lf->lf_next);
                brelse(bh);
+               i++;
        } while(lfn);
 
        if (!entries)
@@ -1357,6 +1321,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct 
dir_context *ctx,
        g.pdent = darr;
        g.offset = 0;
        lfn = leaf_no;
+       i = 0;
 
        do {
                error = get_leaf(ip, lfn, &bh);
@@ -1364,7 +1329,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct 
dir_context *ctx,
                        goto out_free;
                lf = (struct gfs2_leaf *)bh->b_data;
                lfn = be64_to_cpu(lf->lf_next);
-               if (lf->lf_entries) {
+               if (i >= leaf_extn && lf->lf_entries) {
                        entries2 += be16_to_cpu(lf->lf_entries);
                        dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
                                                gfs2_dirent_gather, NULL, &g);
@@ -1386,6 +1351,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct 
dir_context *ctx,
                } else {
                        brelse(bh);
                }
+               i++;
        } while(lfn);
 
        BUG_ON(entries2 != entries);
@@ -1458,15 +1424,15 @@ static int dir_e_read(struct inode *inode, struct 
dir_context *ctx,
 {
        struct gfs2_inode *dip = GFS2_I(inode);
        u32 hsize, len = 0;
-       u32 hash, index;
+       u32 index;
        __be64 *lp;
        int copied = 0;
        int error = 0;
        unsigned depth = 0;
+       u32 page = gfs2_dir_offset2pge((u32)ctx->pos);
 
        hsize = 1 << dip->i_depth;
-       hash = gfs2_dir_offset2hash(ctx->pos);
-       index = hash >> (32 - dip->i_depth);
+       index = gfs2_dir_offset2hti(ctx->pos);
 
        if (dip->i_hash_cache == NULL)
                f_ra->start = 0;
@@ -1485,6 +1451,7 @@ static int dir_e_read(struct inode *inode, struct 
dir_context *ctx,
 
                len = 1 << (dip->i_depth - depth);
                index = (index & ~(len - 1)) + len;
+               ctx->pos = dirloc2offset(page, index, 0);
        }
 
        if (error > 0)
@@ -1502,6 +1469,7 @@ int gfs2_dir_read(struct inode *inode, struct dir_context 
*ctx,
        struct buffer_head *dibh;
        int copied = 0;
        int error;
+       int strtoff = gfs2_dir_offset2pgo(ctx->pos);
 
        if (!dip->i_entries)
                return 0;
@@ -1534,13 +1502,12 @@ int gfs2_dir_read(struct inode *inode, struct 
dir_context *ctx,
                        fs_warn(sdp, "Number of entries corrupt in dir %llu, "
                                "ip->i_entries (%u) != g.offset (%u)\n",
                                (unsigned long long)dip->i_no_addr,
-                               dip->i_entries,
-                               g.offset);
+                               dip->i_entries, g.offset);
                        error = -EIO;
                        goto out;
                }
                error = do_filldir_main(dip, ctx, darr,
-                                       dip->i_entries, &copied);
+                                       dip->i_entries - strtoff, &copied);
 out:
                kfree(darr);
        }

Reply via email to