Signed-off-by: Bob Peterson <rpete...@redhat.com>
---
fs/gfs2/dir.c | 165 +++++++++++++++++++++++-----------------------------------
1 file changed, 66 insertions(+), 99 deletions(-)
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 5d4261f..039479f 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -58,7 +58,6 @@
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/buffer_head.h>
-#include <linux/sort.h>
#include <linux/gfs2_ondisk.h>
#include <linux/crc32.h>
#include <linux/vmalloc.h>
@@ -80,8 +79,48 @@
#define MAX_RA_BLOCKS 32 /* max read-ahead blocks */
-#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
-#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
+/**
+ * Notes about the directory hash table:
+ *
+ * Hash table index is hash >> (32 - dip->i_depth). Max directory depth is 17.
+ * 32 - 17 = 15. The hash value is 32-bits, so 0x100000000 >> 15 = 0x20000.
+ * So the maximum index to the hash table is 0x1ffff.
+ *
+ * At the same time, pages are at most 4K, and GFS2 dirents are 96 bytes.
+ * According to other comments, 99 is the maximum number of entries that can
+ * fit in a single leaf block. However, if you do the math: Take one 4K block,
+ * subtract out the leaf header, so 0x1000 - 0x68 = 0xf98 bytes available.
+ * The minimum dirent, including a 1-character file name is 0x30 bytes long.
+ * So 0xf98 / 0x30 = 0x53, which is 83 decimal. So the theoretical limit is
+ * 83 index entries per leaf block.
+ *
+ * So we can represent the index number with 17 bits, plus the index in 7.
+ * Therefore, we can represent the offset to any dirent by a 32-bit number:
+ * 18-bits for hash table index, 7 for page index, 8 for leaf block extension
+ *
+ * 0 1
+ * 01234567 89abcdef 01234567 89abcdef
+ * pppppppp iiiiiiii iiiiiiii iooooooo
+ * leafblk Hash table index Offset
+ * Leaf blk: 11111111 (0xff)
+ * HTI Max Value: 11111111 11111111 1 (0x1ffff)
+ * Pg offset Max: 1111111 (0x7f)
+ **/
+
+#define gfs2_dir_offset2pge(p) (((u32)p >> 24) & 0xff) /* leaf extension blk */
+#define gfs2_dir_pge2offset(p) (p << 24)
+
+#define gfs2_dir_offset2hti(p) (((u32)p >> 7) & 0x01ffff) /* hash tbl index */
+#define gfs2_dir_hti2offset(p) (p << 7)
+
+#define gfs2_dir_offset2pgo(p) ((u32)p & 0x7f) /* leaf blk page offset */
+#define gfs2_dir_pgo2offset(p) (p)
+
+static inline u32 dirloc2offset(u32 page, u32 hti, u32 off)
+{
+ return (gfs2_dir_pge2offset(page) | gfs2_dir_hti2offset(hti) |
+ gfs2_dir_pgo2offset(off));
+}
struct qstr gfs2_qdot __read_mostly;
struct qstr gfs2_qdotdot __read_mostly;
@@ -1176,48 +1215,6 @@ out_kfree:
}
/**
- * compare_dents - compare directory entries by hash value
- * @a: first dent
- * @b: second dent
- *
- * When comparing the hash entries of @a to @b:
- * gt: returns 1
- * lt: returns -1
- * eq: returns 0
- */
-
-static int compare_dents(const void *a, const void *b)
-{
- const struct gfs2_dirent *dent_a, *dent_b;
- u32 hash_a, hash_b;
- int ret = 0;
-
- dent_a = *(const struct gfs2_dirent **)a;
- hash_a = be32_to_cpu(dent_a->de_hash);
-
- dent_b = *(const struct gfs2_dirent **)b;
- hash_b = be32_to_cpu(dent_b->de_hash);
-
- if (hash_a > hash_b)
- ret = 1;
- else if (hash_a < hash_b)
- ret = -1;
- else {
- unsigned int len_a = be16_to_cpu(dent_a->de_name_len);
- unsigned int len_b = be16_to_cpu(dent_b->de_name_len);
-
- if (len_a > len_b)
- ret = 1;
- else if (len_a < len_b)
- ret = -1;
- else
- ret = memcmp(dent_a + 1, dent_b + 1, len_a);
- }
-
- return ret;
-}
-
-/**
* do_filldir_main - read out directory entries
* @dip: The GFS2 inode
* @ctx: what to feed the entries to
@@ -1225,11 +1222,6 @@ static int compare_dents(const void *a, const void *b)
* @entries: the number of entries in darr
* @copied: pointer to int that's non-zero if a entry has been copied out
*
- * Jump through some hoops to make sure that if there are hash collsions,
- * they are read out at the beginning of a buffer. We want to minimize
- * the possibility that they will fall into different readdir buffers or
- * that someone will want to seek to that location.
- *
* Returns: errno, >0 if the actor tells you to stop
*/
@@ -1237,42 +1229,15 @@ static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx,
const struct gfs2_dirent **darr, u32 entries,
int *copied)
{
- const struct gfs2_dirent *dent, *dent_next;
- u64 off, off_next;
+ const struct gfs2_dirent *dent;
unsigned int x, y;
- int run = 0;
-
- sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL);
-
- dent_next = darr[0];
- off_next = be32_to_cpu(dent_next->de_hash);
- off_next = gfs2_disk_hash2offset(off_next);
-
- for (x = 0, y = 1; x < entries; x++, y++) {
- dent = dent_next;
- off = off_next;
-
- if (y < entries) {
- dent_next = darr[y];
- off_next = be32_to_cpu(dent_next->de_hash);
- off_next = gfs2_disk_hash2offset(off_next);
-
- if (off < ctx->pos)
- continue;
- ctx->pos = off;
-
- if (off_next == off) {
- if (*copied && !run)
- return 1;
- run = 1;
- } else
- run = 0;
- } else {
- if (off < ctx->pos)
- continue;
- ctx->pos = off;
- }
+ u32 page = gfs2_dir_offset2pge((u32)ctx->pos);
+ u32 hti = gfs2_dir_offset2hti((u32)ctx->pos);
+ u32 pgo = gfs2_dir_offset2pgo((u32)ctx->pos);
+ for (x = pgo, y = pgo; x < entries; x++, y++) {
+ dent = darr[y];
+ ctx->pos = dirloc2offset(page, hti, y);
if (!dir_emit(ctx, (const char *)(dent + 1),
be16_to_cpu(dent->de_name_len),
be64_to_cpu(dent->de_inum.no_addr),
@@ -1282,12 +1247,6 @@ static int do_filldir_main(struct gfs2_inode *dip,
struct dir_context *ctx,
*copied = 1;
}
- /* Increment the ctx->pos by one, so the next time we come into the
- do_filldir fxn, we get the next entry instead of the last one in the
- current leaf */
-
- ctx->pos++;
-
return 0;
}
@@ -1326,7 +1285,9 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
int leaf = 0;
int error, i;
u64 lfn = leaf_no;
+ int leaf_extn = gfs2_dir_offset2pge(ctx->pos);
+ i = 0;
do {
error = get_leaf(ip, lfn, &bh);
if (error)
@@ -1334,10 +1295,13 @@ static int gfs2_dir_read_leaf(struct inode *inode,
struct dir_context *ctx,
lf = (struct gfs2_leaf *)bh->b_data;
if (leaves == 0)
*depth = be16_to_cpu(lf->lf_depth);
- entries += be16_to_cpu(lf->lf_entries);
- leaves++;
+ if (i >= leaf_extn) {
+ entries += be16_to_cpu(lf->lf_entries);
+ leaves++;
+ }
lfn = be64_to_cpu(lf->lf_next);
brelse(bh);
+ i++;
} while(lfn);
if (!entries)
@@ -1357,6 +1321,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct
dir_context *ctx,
g.pdent = darr;
g.offset = 0;
lfn = leaf_no;
+ i = 0;
do {
error = get_leaf(ip, lfn, &bh);
@@ -1364,7 +1329,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct
dir_context *ctx,
goto out_free;
lf = (struct gfs2_leaf *)bh->b_data;
lfn = be64_to_cpu(lf->lf_next);
- if (lf->lf_entries) {
+ if (i >= leaf_extn && lf->lf_entries) {
entries2 += be16_to_cpu(lf->lf_entries);
dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
gfs2_dirent_gather, NULL, &g);
@@ -1386,6 +1351,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct
dir_context *ctx,
} else {
brelse(bh);
}
+ i++;
} while(lfn);
BUG_ON(entries2 != entries);
@@ -1458,15 +1424,15 @@ static int dir_e_read(struct inode *inode, struct
dir_context *ctx,
{
struct gfs2_inode *dip = GFS2_I(inode);
u32 hsize, len = 0;
- u32 hash, index;
+ u32 index;
__be64 *lp;
int copied = 0;
int error = 0;
unsigned depth = 0;
+ u32 page = gfs2_dir_offset2pge((u32)ctx->pos);
hsize = 1 << dip->i_depth;
- hash = gfs2_dir_offset2hash(ctx->pos);
- index = hash >> (32 - dip->i_depth);
+ index = gfs2_dir_offset2hti(ctx->pos);
if (dip->i_hash_cache == NULL)
f_ra->start = 0;
@@ -1485,6 +1451,7 @@ static int dir_e_read(struct inode *inode, struct
dir_context *ctx,
len = 1 << (dip->i_depth - depth);
index = (index & ~(len - 1)) + len;
+ ctx->pos = dirloc2offset(page, index, 0);
}
if (error > 0)
@@ -1502,6 +1469,7 @@ int gfs2_dir_read(struct inode *inode, struct dir_context
*ctx,
struct buffer_head *dibh;
int copied = 0;
int error;
+ int strtoff = gfs2_dir_offset2pgo(ctx->pos);
if (!dip->i_entries)
return 0;
@@ -1534,13 +1502,12 @@ int gfs2_dir_read(struct inode *inode, struct
dir_context *ctx,
fs_warn(sdp, "Number of entries corrupt in dir %llu, "
"ip->i_entries (%u) != g.offset (%u)\n",
(unsigned long long)dip->i_no_addr,
- dip->i_entries,
- g.offset);
+ dip->i_entries, g.offset);
error = -EIO;
goto out;
}
error = do_filldir_main(dip, ctx, darr,
- dip->i_entries, &copied);
+ dip->i_entries - strtoff, &copied);
out:
kfree(darr);
}