Jonathan Nieder wrote: > Brian Kroth wrote: >> linux-image-3.5-trunk-amd64 (3.5-1~experimental.1) on the server >> does indeed appear to fix this. [...] > Very nice to hear. > > Please test the attached patches together against a 3.2.y kernel, > for example using the following instructions.
Actually attached this time. Sorry for the confusion.
From: Bernd Schubert <[email protected]> Date: Tue, 13 Mar 2012 22:51:38 -0400 Subject: fs: add new FMODE flags: FMODE_32bithash and FMODE_64bithash commit 6a8a13e03861c0ab83ab07d573ca793cff0e5d00 upstream. Those flags are supposed to be set by NFS readdir() to tell ext3/ext4 to 32bit (NFSv2) or 64bit hash values (offsets) in seekdir(). Signed-off-by: Bernd Schubert <[email protected]> Signed-off-by: "Theodore Ts'o" <[email protected]> Signed-off-by: Jonathan Nieder <[email protected]> --- include/linux/fs.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/fs.h b/include/linux/fs.h index 29b6353..fb7ce74 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -92,6 +92,10 @@ struct inodes_stat_t { /* File is opened using open(.., 3, ..) and is writeable only for ioctls (specialy hack for floppy.c) */ #define FMODE_WRITE_IOCTL ((__force fmode_t)0x100) +/* 32bit hashes as llseek() offset (for directories) */ +#define FMODE_32BITHASH ((__force fmode_t)0x200) +/* 64bit hashes as llseek() offset (for directories) */ +#define FMODE_64BITHASH ((__force fmode_t)0x400) /* * Don't update ctime and mtime. -- 1.7.10.4
From: Fan Yong <[email protected]> Date: Sun, 18 Mar 2012 22:44:40 -0400 Subject: ext4: return 32/64-bit dir name hash according to usage type commit d1f5273e9adb40724a85272f248f210dc4ce919a upstream. Traditionally ext2/3/4 has returned a 32-bit hash value from llseek() to appease NFSv2, which can only handle a 32-bit cookie for seekdir() and telldir(). However, this causes problems if there are 32-bit hash collisions, since the NFSv2 server can get stuck resending the same entries from the directory repeatedly. Allow ext4 to return a full 64-bit hash (both major and minor) for telldir to decrease the chance of hash collisions. This still needs integration on the NFS side. Patch-updated-by: Bernd Schubert <[email protected]> (blame me if something is not correct) Signed-off-by: Fan Yong <[email protected]> Signed-off-by: Andreas Dilger <[email protected]> Signed-off-by: Bernd Schubert <[email protected]> Signed-off-by: "Theodore Ts'o" <[email protected]> Signed-off-by: Jonathan Nieder <[email protected]> --- fs/ext4/dir.c | 214 ++++++++++++++++++++++++++++++++++++++++++++------------ fs/ext4/ext4.h | 6 +- fs/ext4/hash.c | 4 +- 3 files changed, 176 insertions(+), 48 deletions(-) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 164c560..689d1b1 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -32,24 +32,8 @@ static unsigned char ext4_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK }; -static int ext4_readdir(struct file *, void *, filldir_t); static int ext4_dx_readdir(struct file *filp, void *dirent, filldir_t filldir); -static int ext4_release_dir(struct inode *inode, - struct file *filp); - -const struct file_operations ext4_dir_operations = { - .llseek = ext4_llseek, - .read = generic_read_dir, - .readdir = ext4_readdir, /* we take BKL. needed?*/ - .unlocked_ioctl = ext4_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = ext4_compat_ioctl, -#endif - .fsync = ext4_sync_file, - .release = ext4_release_dir, -}; - static unsigned char get_dtype(struct super_block *sb, int filetype) { @@ -60,6 +44,26 @@ static unsigned char get_dtype(struct super_block *sb, int filetype) return (ext4_filetype_table[filetype]); } +/** + * Check if the given dir-inode refers to an htree-indexed directory + * (or a directory which chould potentially get coverted to use htree + * indexing). + * + * Return 1 if it is a dx dir, 0 if not + */ +static int is_dx_dir(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_COMPAT_DIR_INDEX) && + ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) || + ((inode->i_size >> sb->s_blocksize_bits) == 1))) + return 1; + + return 0; +} + /* * Return 0 if the directory entry is OK, and 1 if there is a problem * @@ -115,18 +119,13 @@ static int ext4_readdir(struct file *filp, unsigned int offset; int i, stored; struct ext4_dir_entry_2 *de; - struct super_block *sb; int err; struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; int ret = 0; int dir_has_error = 0; - sb = inode->i_sb; - - if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_COMPAT_DIR_INDEX) && - ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) || - ((inode->i_size >> sb->s_blocksize_bits) == 1))) { + if (is_dx_dir(inode)) { err = ext4_dx_readdir(filp, dirent, filldir); if (err != ERR_BAD_DX_DIR) { ret = err; @@ -254,22 +253,134 @@ out: return ret; } +static inline int is_32bit_api(void) +{ +#ifdef CONFIG_COMPAT + return is_compat_task(); +#else + return (BITS_PER_LONG == 32); +#endif +} + /* * These functions convert from the major/minor hash to an f_pos - * value. + * value for dx directories * - * Currently we only use major hash numer. This is unfortunate, but - * on 32-bit machines, the same VFS interface is used for lseek and - * llseek, so if we use the 64 bit offset, then the 32-bit versions of - * lseek/telldir/seekdir will blow out spectacularly, and from within - * the ext2 low-level routine, we don't know if we're being called by - * a 64-bit version of the system call or the 32-bit version of the - * system call. Worse yet, NFSv2 only allows for a 32-bit readdir - * cookie. Sigh. + * Upper layer (for example NFS) should specify FMODE_32BITHASH or + * FMODE_64BITHASH explicitly. On the other hand, we allow ext4 to be mounted + * directly on both 32-bit and 64-bit nodes, under such case, neither + * FMODE_32BITHASH nor FMODE_64BITHASH is specified. */ -#define hash2pos(major, minor) (major >> 1) -#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff) -#define pos2min_hash(pos) (0) +static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return major >> 1; + else + return ((__u64)(major >> 1) << 32) | (__u64)minor; +} + +static inline __u32 pos2maj_hash(struct file *filp, loff_t pos) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return (pos << 1) & 0xffffffff; + else + return ((pos >> 32) << 1) & 0xffffffff; +} + +static inline __u32 pos2min_hash(struct file *filp, loff_t pos) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return 0; + else + return pos & 0xffffffff; +} + +/* + * Return 32- or 64-bit end-of-file for dx directories + */ +static inline loff_t ext4_get_htree_eof(struct file *filp) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return EXT4_HTREE_EOF_32BIT; + else + return EXT4_HTREE_EOF_64BIT; +} + + +/* + * ext4_dir_llseek() based on generic_file_llseek() to handle both + * non-htree and htree directories, where the "offset" is in terms + * of the filename hash value instead of the byte offset. + * + * NOTE: offsets obtained *before* ext4_set_inode_flag(dir, EXT4_INODE_INDEX) + * will be invalid once the directory was converted into a dx directory + */ +loff_t ext4_dir_llseek(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file->f_mapping->host; + loff_t ret = -EINVAL; + int dx_dir = is_dx_dir(inode); + + mutex_lock(&inode->i_mutex); + + /* NOTE: relative offsets with dx directories might not work + * as expected, as it is difficult to figure out the + * correct offset between dx hashes */ + + switch (origin) { + case SEEK_END: + if (unlikely(offset > 0)) + goto out_err; /* not supported for directories */ + + /* so only negative offsets are left, does that have a + * meaning for directories at all? */ + if (dx_dir) + offset += ext4_get_htree_eof(file); + else + offset += inode->i_size; + break; + case SEEK_CUR: + /* + * Here we special-case the lseek(fd, 0, SEEK_CUR) + * position-querying operation. Avoid rewriting the "same" + * f_pos value back to the file because a concurrent read(), + * write() or lseek() might have altered it + */ + if (offset == 0) { + offset = file->f_pos; + goto out_ok; + } + + offset += file->f_pos; + break; + } + + if (unlikely(offset < 0)) + goto out_err; + + if (!dx_dir) { + if (offset > inode->i_sb->s_maxbytes) + goto out_err; + } else if (offset > ext4_get_htree_eof(file)) + goto out_err; + + /* Special lock needed here? */ + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + } + +out_ok: + ret = offset; +out_err: + mutex_unlock(&inode->i_mutex); + + return ret; +} /* * This structure holds the nodes of the red-black tree used to store @@ -330,15 +441,16 @@ static void free_rb_tree_fname(struct rb_root *root) } -static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos) +static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp, + loff_t pos) { struct dir_private_info *p; p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); if (!p) return NULL; - p->curr_hash = pos2maj_hash(pos); - p->curr_minor_hash = pos2min_hash(pos); + p->curr_hash = pos2maj_hash(filp, pos); + p->curr_minor_hash = pos2min_hash(filp, pos); return p; } @@ -429,7 +541,7 @@ static int call_filldir(struct file *filp, void *dirent, "null fname?!?\n"); return 0; } - curr_pos = hash2pos(fname->hash, fname->minor_hash); + curr_pos = hash2pos(filp, fname->hash, fname->minor_hash); while (fname) { error = filldir(dirent, fname->name, fname->name_len, curr_pos, @@ -454,13 +566,13 @@ static int ext4_dx_readdir(struct file *filp, int ret; if (!info) { - info = ext4_htree_create_dir_info(filp->f_pos); + info = ext4_htree_create_dir_info(filp, filp->f_pos); if (!info) return -ENOMEM; filp->private_data = info; } - if (filp->f_pos == EXT4_HTREE_EOF) + if (filp->f_pos == ext4_get_htree_eof(filp)) return 0; /* EOF */ /* Some one has messed with f_pos; reset the world */ @@ -468,8 +580,8 @@ static int ext4_dx_readdir(struct file *filp, free_rb_tree_fname(&info->root); info->curr_node = NULL; info->extra_fname = NULL; - info->curr_hash = pos2maj_hash(filp->f_pos); - info->curr_minor_hash = pos2min_hash(filp->f_pos); + info->curr_hash = pos2maj_hash(filp, filp->f_pos); + info->curr_minor_hash = pos2min_hash(filp, filp->f_pos); } /* @@ -501,7 +613,7 @@ static int ext4_dx_readdir(struct file *filp, if (ret < 0) return ret; if (ret == 0) { - filp->f_pos = EXT4_HTREE_EOF; + filp->f_pos = ext4_get_htree_eof(filp); break; } info->curr_node = rb_first(&info->root); @@ -521,7 +633,7 @@ static int ext4_dx_readdir(struct file *filp, info->curr_minor_hash = fname->minor_hash; } else { if (info->next_hash == ~0) { - filp->f_pos = EXT4_HTREE_EOF; + filp->f_pos = ext4_get_htree_eof(filp); break; } info->curr_hash = info->next_hash; @@ -540,3 +652,15 @@ static int ext4_release_dir(struct inode *inode, struct file *filp) return 0; } + +const struct file_operations ext4_dir_operations = { + .llseek = ext4_dir_llseek, + .read = generic_read_dir, + .readdir = ext4_readdir, + .unlocked_ioctl = ext4_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext4_compat_ioctl, +#endif + .fsync = ext4_sync_file, + .release = ext4_release_dir, +}; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8cb184c..2ac1eef 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1597,7 +1597,11 @@ struct dx_hash_info u32 *seed; }; -#define EXT4_HTREE_EOF 0x7fffffff + +/* 32 and 64 bit signed EOF for dx directories */ +#define EXT4_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1) +#define EXT4_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1) + /* * Control parameters used by ext4_htree_next_block diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index ac8f168..fa8e491 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -200,8 +200,8 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) return -1; } hash = hash & ~1; - if (hash == (EXT4_HTREE_EOF << 1)) - hash = (EXT4_HTREE_EOF-1) << 1; + if (hash == (EXT4_HTREE_EOF_32BIT << 1)) + hash = (EXT4_HTREE_EOF_32BIT - 1) << 1; hinfo->hash = hash; hinfo->minor_hash = minor_hash; return 0; -- 1.7.10.4
From: Bernd Schubert <[email protected]> Date: Sun, 18 Mar 2012 22:44:49 -0400 Subject: nfsd: rename 'int access' to 'int may_flags' in nfsd_open() commit 999448a8c0202d8c41711c92385323520644527b upstream. Just rename this variable, as the next patch will add a flag and 'access' as variable name would not be correct any more. Signed-off-by: Bernd Schubert <[email protected]> Signed-off-by: "Theodore Ts'o" <[email protected]> Acked-by: J. Bruce Fields <[email protected]> Signed-off-by: Jonathan Nieder <[email protected]> --- fs/nfsd/vfs.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 5c3cd82..b395c61 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -726,12 +726,13 @@ static int nfsd_open_break_lease(struct inode *inode, int access) /* * Open an existing file or directory. - * The access argument indicates the type of open (read/write/lock) + * The may_flags argument indicates the type of open (read/write/lock) + * and additional flags. * N.B. After this call fhp needs an fh_put */ __be32 nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, - int access, struct file **filp) + int may_flags, struct file **filp) { struct dentry *dentry; struct inode *inode; @@ -746,7 +747,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, * and (hopefully) checked permission - so allow OWNER_OVERRIDE * in case a chmod has now revoked permission. */ - err = fh_verify(rqstp, fhp, type, access | NFSD_MAY_OWNER_OVERRIDE); + err = fh_verify(rqstp, fhp, type, may_flags | NFSD_MAY_OWNER_OVERRIDE); if (err) goto out; @@ -757,7 +758,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, * or any access when mandatory locking enabled */ err = nfserr_perm; - if (IS_APPEND(inode) && (access & NFSD_MAY_WRITE)) + if (IS_APPEND(inode) && (may_flags & NFSD_MAY_WRITE)) goto out; /* * We must ignore files (but only files) which might have mandatory @@ -770,12 +771,12 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (!inode->i_fop) goto out; - host_err = nfsd_open_break_lease(inode, access); + host_err = nfsd_open_break_lease(inode, may_flags); if (host_err) /* NOMEM or WOULDBLOCK */ goto out_nfserr; - if (access & NFSD_MAY_WRITE) { - if (access & NFSD_MAY_READ) + if (may_flags & NFSD_MAY_WRITE) { + if (may_flags & NFSD_MAY_READ) flags = O_RDWR|O_LARGEFILE; else flags = O_WRONLY|O_LARGEFILE; @@ -785,7 +786,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (IS_ERR(*filp)) host_err = PTR_ERR(*filp); else - host_err = ima_file_check(*filp, access); + host_err = ima_file_check(*filp, may_flags); + out_nfserr: err = nfserrno(host_err); out: -- 1.7.10.4
From: Bernd Schubert <[email protected]> Date: Sun, 18 Mar 2012 22:44:50 -0400 Subject: nfsd: vfs_llseek() with 32 or 64 bit offsets (hashes) commit 06effdbb49af5f6c7d20affaec74603914acc768 upstream. Use 32-bit or 64-bit llseek() hashes for directory offsets depending on the NFS version. NFSv2 gets 32-bit hashes only. NOTE: This patch got rather complex as Christoph asked to set the filp->f_mode flag in the open call or immediatly after dentry_open() in nfsd_open() to avoid races. Personally I still do not see a reason for that and in my opinion FMODE_32BITHASH/FMODE_64BITHASH flags could be set nfsd_readdir(), as it follows directly after nfsd_open() without a chance of races. Signed-off-by: Bernd Schubert <[email protected]> Signed-off-by: "Theodore Ts'o" <[email protected]> Acked-by: J. Bruce Fields <[email protected]> Signed-off-by: Jonathan Nieder <[email protected]> --- fs/nfsd/vfs.c | 15 +++++++++++++-- fs/nfsd/vfs.h | 2 ++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index b395c61..959039e 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -785,9 +785,15 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, flags, current_cred()); if (IS_ERR(*filp)) host_err = PTR_ERR(*filp); - else + else { host_err = ima_file_check(*filp, may_flags); + if (may_flags & NFSD_MAY_64BIT_COOKIE) + (*filp)->f_mode |= FMODE_64BITHASH; + else + (*filp)->f_mode |= FMODE_32BITHASH; + } + out_nfserr: err = nfserrno(host_err); out: @@ -2011,8 +2017,13 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, __be32 err; struct file *file; loff_t offset = *offsetp; + int may_flags = NFSD_MAY_READ; - err = nfsd_open(rqstp, fhp, S_IFDIR, NFSD_MAY_READ, &file); + /* NFSv2 only supports 32 bit cookies */ + if (rqstp->rq_vers > 2) + may_flags |= NFSD_MAY_64BIT_COOKIE; + + err = nfsd_open(rqstp, fhp, S_IFDIR, may_flags, &file); if (err) goto out; diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 3f54ad0..85d4d42 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -27,6 +27,8 @@ #define NFSD_MAY_BYPASS_GSS 0x400 #define NFSD_MAY_READ_IF_EXEC 0x800 +#define NFSD_MAY_64BIT_COOKIE 0x1000 /* 64 bit readdir cookies for >= NFSv3 */ + #define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) #define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) -- 1.7.10.4
From: Eric Sandeen <[email protected]> Date: Thu, 26 Apr 2012 13:10:39 -0500 Subject: ext3: return 32/64-bit dir name hash according to usage type commit d7dab39b6e16d5eea78ed3c705d2a2d0772b4f06 upstream. This is based on commit d1f5273e9adb40724a85272f248f210dc4ce919a ext4: return 32/64-bit dir name hash according to usage type by Fan Yong <[email protected]> Traditionally ext2/3/4 has returned a 32-bit hash value from llseek() to appease NFSv2, which can only handle a 32-bit cookie for seekdir() and telldir(). However, this causes problems if there are 32-bit hash collisions, since the NFSv2 server can get stuck resending the same entries from the directory repeatedly. Allow ext3 to return a full 64-bit hash (both major and minor) for telldir to decrease the chance of hash collisions. This patch does implement a new ext3_dir_llseek op, because with 64-bit hashes, nfs will attempt to seek to a hash "offset" which is much larger than ext3's s_maxbytes. So for dx dirs, we call generic_file_llseek_size() with the appropriate max hash value as the maximum seekable size. Otherwise we just pass through to generic_file_llseek(). Patch-updated-by: Bernd Schubert <[email protected]> Patch-updated-by: Eric Sandeen <[email protected]> (blame us if something is not correct) Signed-off-by: Eric Sandeen <[email protected]> Signed-off-by: Jan Kara <[email protected]> Signed-off-by: Jonathan Nieder <[email protected]> --- fs/ext3/dir.c | 167 ++++++++++++++++++++++++++++++++++------------- fs/ext3/hash.c | 4 +- include/linux/ext3_fs.h | 6 +- 3 files changed, 129 insertions(+), 48 deletions(-) diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index 34f0a07..3268697 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -25,6 +25,7 @@ #include <linux/jbd.h> #include <linux/ext3_fs.h> #include <linux/buffer_head.h> +#include <linux/compat.h> #include <linux/slab.h> #include <linux/rbtree.h> @@ -32,24 +33,8 @@ static unsigned char ext3_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK }; -static int ext3_readdir(struct file *, void *, filldir_t); static int ext3_dx_readdir(struct file * filp, void * dirent, filldir_t filldir); -static int ext3_release_dir (struct inode * inode, - struct file * filp); - -const struct file_operations ext3_dir_operations = { - .llseek = generic_file_llseek, - .read = generic_read_dir, - .readdir = ext3_readdir, /* we take BKL. needed?*/ - .unlocked_ioctl = ext3_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = ext3_compat_ioctl, -#endif - .fsync = ext3_sync_file, /* BKL held */ - .release = ext3_release_dir, -}; - static unsigned char get_dtype(struct super_block *sb, int filetype) { @@ -60,6 +45,25 @@ static unsigned char get_dtype(struct super_block *sb, int filetype) return (ext3_filetype_table[filetype]); } +/** + * Check if the given dir-inode refers to an htree-indexed directory + * (or a directory which chould potentially get coverted to use htree + * indexing). + * + * Return 1 if it is a dx dir, 0 if not + */ +static int is_dx_dir(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb, + EXT3_FEATURE_COMPAT_DIR_INDEX) && + ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) || + ((inode->i_size >> sb->s_blocksize_bits) == 1))) + return 1; + + return 0; +} int ext3_check_dir_entry (const char * function, struct inode * dir, struct ext3_dir_entry_2 * de, @@ -99,18 +103,13 @@ static int ext3_readdir(struct file * filp, unsigned long offset; int i, stored; struct ext3_dir_entry_2 *de; - struct super_block *sb; int err; struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; int ret = 0; int dir_has_error = 0; - sb = inode->i_sb; - - if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb, - EXT3_FEATURE_COMPAT_DIR_INDEX) && - ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) || - ((inode->i_size >> sb->s_blocksize_bits) == 1))) { + if (is_dx_dir(inode)) { err = ext3_dx_readdir(filp, dirent, filldir); if (err != ERR_BAD_DX_DIR) { ret = err; @@ -232,22 +231,87 @@ out: return ret; } +static inline int is_32bit_api(void) +{ +#ifdef CONFIG_COMPAT + return is_compat_task(); +#else + return (BITS_PER_LONG == 32); +#endif +} + /* * These functions convert from the major/minor hash to an f_pos - * value. + * value for dx directories * - * Currently we only use major hash numer. This is unfortunate, but - * on 32-bit machines, the same VFS interface is used for lseek and - * llseek, so if we use the 64 bit offset, then the 32-bit versions of - * lseek/telldir/seekdir will blow out spectacularly, and from within - * the ext2 low-level routine, we don't know if we're being called by - * a 64-bit version of the system call or the 32-bit version of the - * system call. Worse yet, NFSv2 only allows for a 32-bit readdir - * cookie. Sigh. + * Upper layer (for example NFS) should specify FMODE_32BITHASH or + * FMODE_64BITHASH explicitly. On the other hand, we allow ext3 to be mounted + * directly on both 32-bit and 64-bit nodes, under such case, neither + * FMODE_32BITHASH nor FMODE_64BITHASH is specified. */ -#define hash2pos(major, minor) (major >> 1) -#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff) -#define pos2min_hash(pos) (0) +static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return major >> 1; + else + return ((__u64)(major >> 1) << 32) | (__u64)minor; +} + +static inline __u32 pos2maj_hash(struct file *filp, loff_t pos) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return (pos << 1) & 0xffffffff; + else + return ((pos >> 32) << 1) & 0xffffffff; +} + +static inline __u32 pos2min_hash(struct file *filp, loff_t pos) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return 0; + else + return pos & 0xffffffff; +} + +/* + * Return 32- or 64-bit end-of-file for dx directories + */ +static inline loff_t ext3_get_htree_eof(struct file *filp) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return EXT3_HTREE_EOF_32BIT; + else + return EXT3_HTREE_EOF_64BIT; +} + + +/* + * ext3_dir_llseek() calls generic_file_llseek[_size]() to handle both + * non-htree and htree directories, where the "offset" is in terms + * of the filename hash value instead of the byte offset. + * + * Because we may return a 64-bit hash that is well beyond s_maxbytes, + * we need to pass the max hash as the maximum allowable offset in + * the htree directory case. + * + * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX) + * will be invalid once the directory was converted into a dx directory + */ +loff_t ext3_dir_llseek(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file->f_mapping->host; + int dx_dir = is_dx_dir(inode); + + if (likely(dx_dir)) + return generic_file_llseek_size(file, offset, origin, + ext3_get_htree_eof(file)); + else + return generic_file_llseek(file, offset, origin); +} /* * This structure holds the nodes of the red-black tree used to store @@ -308,15 +372,16 @@ static void free_rb_tree_fname(struct rb_root *root) } -static struct dir_private_info *ext3_htree_create_dir_info(loff_t pos) +static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp, + loff_t pos) { struct dir_private_info *p; p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); if (!p) return NULL; - p->curr_hash = pos2maj_hash(pos); - p->curr_minor_hash = pos2min_hash(pos); + p->curr_hash = pos2maj_hash(filp, pos); + p->curr_minor_hash = pos2min_hash(filp, pos); return p; } @@ -406,7 +471,7 @@ static int call_filldir(struct file * filp, void * dirent, printk("call_filldir: called with null fname?!?\n"); return 0; } - curr_pos = hash2pos(fname->hash, fname->minor_hash); + curr_pos = hash2pos(filp, fname->hash, fname->minor_hash); while (fname) { error = filldir(dirent, fname->name, fname->name_len, curr_pos, @@ -431,13 +496,13 @@ static int ext3_dx_readdir(struct file * filp, int ret; if (!info) { - info = ext3_htree_create_dir_info(filp->f_pos); + info = ext3_htree_create_dir_info(filp, filp->f_pos); if (!info) return -ENOMEM; filp->private_data = info; } - if (filp->f_pos == EXT3_HTREE_EOF) + if (filp->f_pos == ext3_get_htree_eof(filp)) return 0; /* EOF */ /* Some one has messed with f_pos; reset the world */ @@ -445,8 +510,8 @@ static int ext3_dx_readdir(struct file * filp, free_rb_tree_fname(&info->root); info->curr_node = NULL; info->extra_fname = NULL; - info->curr_hash = pos2maj_hash(filp->f_pos); - info->curr_minor_hash = pos2min_hash(filp->f_pos); + info->curr_hash = pos2maj_hash(filp, filp->f_pos); + info->curr_minor_hash = pos2min_hash(filp, filp->f_pos); } /* @@ -478,7 +543,7 @@ static int ext3_dx_readdir(struct file * filp, if (ret < 0) return ret; if (ret == 0) { - filp->f_pos = EXT3_HTREE_EOF; + filp->f_pos = ext3_get_htree_eof(filp); break; } info->curr_node = rb_first(&info->root); @@ -498,7 +563,7 @@ static int ext3_dx_readdir(struct file * filp, info->curr_minor_hash = fname->minor_hash; } else { if (info->next_hash == ~0) { - filp->f_pos = EXT3_HTREE_EOF; + filp->f_pos = ext3_get_htree_eof(filp); break; } info->curr_hash = info->next_hash; @@ -517,3 +582,15 @@ static int ext3_release_dir (struct inode * inode, struct file * filp) return 0; } + +const struct file_operations ext3_dir_operations = { + .llseek = ext3_dir_llseek, + .read = generic_read_dir, + .readdir = ext3_readdir, + .unlocked_ioctl = ext3_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext3_compat_ioctl, +#endif + .fsync = ext3_sync_file, + .release = ext3_release_dir, +}; diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c index 7d215b4..d4d3ade 100644 --- a/fs/ext3/hash.c +++ b/fs/ext3/hash.c @@ -200,8 +200,8 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) return -1; } hash = hash & ~1; - if (hash == (EXT3_HTREE_EOF << 1)) - hash = (EXT3_HTREE_EOF-1) << 1; + if (hash == (EXT3_HTREE_EOF_32BIT << 1)) + hash = (EXT3_HTREE_EOF_32BIT - 1) << 1; hinfo->hash = hash; hinfo->minor_hash = minor_hash; return 0; diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index dec9911..d59ab12 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -781,7 +781,11 @@ struct dx_hash_info u32 *seed; }; -#define EXT3_HTREE_EOF 0x7fffffff + +/* 32 and 64 bit signed EOF for dx directories */ +#define EXT3_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1) +#define EXT3_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1) + /* * Control parameters used by ext3_htree_next_block -- 1.7.10.4

