Author: charles Date: Sat Oct 25 19:36:23 2008 GMT Module: SOURCES Tag: HEAD ---- Log message: - ext4dev latest patchset
---- Files affected: SOURCES: kernel-ext4.patch (NONE -> 1.1) (NEW) ---- Diffs: ================================================================ Index: SOURCES/kernel-ext4.patch diff -u /dev/null SOURCES/kernel-ext4.patch:1.1 --- /dev/null Sat Oct 25 21:36:23 2008 +++ SOURCES/kernel-ext4.patch Sat Oct 25 21:36:17 2008 @@ -0,0 +1,8651 @@ +Patchset: 2.6.26-ext4-7 + +This patch was created by combining the ext4-pushed-post-2.6.27-rc1.gz +patches with the stable patches in 2.6.27-rc3-ext4-1 series. + + Documentation/filesystems/ext4.txt | 131 ++- + fs/buffer.c | 19 +- + fs/ext4/acl.c | 188 ++-- + fs/ext4/balloc.c | 221 +++-- + fs/ext4/dir.c | 37 +- + fs/ext4/ext4.h | 64 +- + fs/ext4/ext4_extents.h | 5 +- + fs/ext4/ext4_i.h | 10 +- + fs/ext4/ext4_jbd2.h | 29 +- + fs/ext4/ext4_sb.h | 5 +- + fs/ext4/extents.c | 277 +++--- + fs/ext4/file.c | 20 +- + fs/ext4/fsync.c | 4 + + fs/ext4/group.h | 2 +- + fs/ext4/ialloc.c | 169 +++- + fs/ext4/inode.c | 1931 ++++++++++++++++++++++++++++++------ + fs/ext4/mballoc.c | 744 +++++++++++---- + fs/ext4/mballoc.h | 10 +- + fs/ext4/migrate.c | 3 +- + fs/ext4/namei.c | 45 +- + fs/ext4/resize.c | 134 ++- + fs/ext4/super.c | 451 ++++++--- + fs/ext4/xattr.c | 4 +- + fs/ext4/xattr_trusted.c | 4 +- + fs/ext4/xattr_user.c | 4 +- + fs/jbd2/checkpoint.c | 1 - + fs/jbd2/commit.c | 308 +++---- + fs/jbd2/journal.c | 54 +- + fs/jbd2/transaction.c | 365 +++---- + fs/mpage.c | 14 +- + include/linux/fs.h | 2 + + include/linux/jbd2.h | 73 +- + include/linux/mpage.h | 10 + + include/linux/percpu_counter.h | 12 +- + include/linux/writeback.h | 1 + + lib/percpu_counter.c | 7 +- + mm/filemap.c | 3 +- + mm/page-writeback.c | 3 + + 38 files changed, 3822 insertions(+), 1542 deletions(-) + +diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt +index 0c5086d..0d53949 100644 +--- a/Documentation/filesystems/ext4.txt ++++ b/Documentation/filesystems/ext4.txt +@@ -13,72 +13,99 @@ Mailing list: [EMAIL PROTECTED] + 1. Quick usage instructions: + =========================== + +- - Grab updated e2fsprogs from +- ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs-interim/ +- This is a patchset on top of e2fsprogs-1.39, which can be found at ++ - Compile and install the latest version of e2fsprogs (as of this ++ writing version 1.41) from: ++ ++ http://sourceforge.net/project/showfiles.php?group_id=2406 ++ ++ or ++ + ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/ + +- - It's still mke2fs -j /dev/hda1 ++ or grab the latest git repository from: ++ ++ git://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git ++ ++ - Note that it is highly important to install the mke2fs.conf file ++ that comes with the e2fsprogs 1.41.x sources in /etc/mke2fs.conf. If ++ you have edited the /etc/mke2fs.conf file installed on your system, ++ you will need to merge your changes with the version from e2fsprogs ++ 1.41.x. ++ ++ - Create a new filesystem using the ext4dev filesystem type: ++ ++ # mke2fs -t ext4dev /dev/hda1 ++ ++ Or configure an existing ext3 filesystem to support extents and set ++ the test_fs flag to indicate that it's ok for an in-development ++ filesystem to touch this filesystem: + +- - mount /dev/hda1 /wherever -t ext4dev ++ # tune2fs -O extents -E test_fs /dev/hda1 + +- - To enable extents, ++ If the filesystem was created with 128 byte inodes, it can be ++ converted to use 256 byte for greater efficiency via: + +- mount /dev/hda1 /wherever -t ext4dev -o extents ++ # tune2fs -I 256 /dev/hda1 + +- - The filesystem is compatible with the ext3 driver until you add a file +- which has extents (ie: `mount -o extents', then create a file). ++ (Note: we currently do not have tools to convert an ext4dev ++ filesystem back to ext3; so please do not do try this on production ++ filesystems.) + +- NOTE: The "extents" mount flag is temporary. It will soon go away and +- extents will be enabled by the "-o extents" flag to mke2fs or tune2fs ++ - Mounting: ++ ++ # mount -t ext4dev /dev/hda1 /wherever + + - When comparing performance with other filesystems, remember that +- ext3/4 by default offers higher data integrity guarantees than most. So +- when comparing with a metadata-only journalling filesystem, use `mount -o +- data=writeback'. And you might as well use `mount -o nobh' too along +- with it. Making the journal larger than the mke2fs default often helps +- performance with metadata-intensive workloads. ++ ext3/4 by default offers higher data integrity guarantees than most. ++ So when comparing with a metadata-only journalling filesystem, such ++ as ext3, use `mount -o data=writeback'. And you might as well use ++ `mount -o nobh' too along with it. Making the journal larger than ++ the mke2fs default often helps performance with metadata-intensive ++ workloads. + + 2. Features + =========== + + 2.1 Currently available + +-* ability to use filesystems > 16TB ++* ability to use filesystems > 16TB (e2fsprogs support not available yet) + * extent format reduces metadata overhead (RAM, IO for access, transactions) + * extent format more robust in face of on-disk corruption due to magics, + * internal redunancy in tree +- +-2.1 Previously available, soon to be enabled by default by "mkefs.ext4": +- +-* dir_index and resize inode will be on by default +-* large inodes will be used by default for fast EAs, nsec timestamps, etc ++* improved file allocation (multi-block alloc) ++* fix 32000 subdirectory limit ++* nsec timestamps for mtime, atime, ctime, create time ++* inode version field on disk (NFSv4, Lustre) ++* reduced e2fsck time via uninit_bg feature ++* journal checksumming for robustness, performance ++* persistent file preallocation (e.g for streaming media, databases) ++* ability to pack bitmaps and inode tables into larger virtual groups via the ++ flex_bg feature ++* large file support ++* Inode allocation using large virtual block groups via flex_bg ++* delayed allocation ++* large block (up to pagesize) support ++* efficent new ordered mode in JBD2 and ext4(avoid using buffer head to force ++ the ordering) + + 2.2 Candidate features for future inclusion + +-There are several under discussion, whether they all make it in is +-partly a function of how much time everyone has to work on them: ++* Online defrag (patches available but not well tested) ++* reduced mke2fs time via lazy itable initialization in conjuction with ++ the uninit_bg feature (capability to do this is available in e2fsprogs ++ but a kernel thread to do lazy zeroing of unused inode table blocks ++ after filesystem is first mounted is required for safety) + +-* improved file allocation (multi-block alloc, delayed alloc; basically done) +-* fix 32000 subdirectory limit (patch exists, needs some e2fsck work) +-* nsec timestamps for mtime, atime, ctime, create time (patch exists, +- needs some e2fsck work) +-* inode version field on disk (NFSv4, Lustre; prototype exists) +-* reduced mke2fs/e2fsck time via uninitialized groups (prototype exists) +-* journal checksumming for robustness, performance (prototype exists) +-* persistent file preallocation (e.g for streaming media, databases) ++There are several others under discussion, whether they all make it in is ++partly a function of how much time everyone has to work on them. Features like ++metadata checksumming have been discussed and planned for a bit but no patches ++exist yet so I'm not sure they're in the near-term roadmap. + +-Features like metadata checksumming have been discussed and planned for +-a bit but no patches exist yet so I'm not sure they're in the near-term +-roadmap. ++The big performance win will come with mballoc, delalloc and flex_bg ++grouping of bitmaps and inode tables. Some test results available here: + +-The big performance win will come with mballoc and delalloc. CFS has +-been using mballoc for a few years already with Lustre, and IBM + Bull +-did a lot of benchmarking on it. The reason it isn't in the first set of +-patches is partly a manageability issue, and partly because it doesn't +-directly affect the on-disk format (outside of much better allocation) +-so it isn't critical to get into the first round of changes. I believe +-Alex is working on a new set of patches right now. ++ - http://www.bullopensource.org/ext4/20080530/ffsb-write-2.6.26-rc2.html ++ - http://www.bullopensource.org/ext4/20080530/ffsb-readwrite-2.6.26-rc2.html + + 3. Options + ========== +@@ -222,9 +249,11 @@ stripe=n Number of filesystem blocks that mballoc will try + to use for allocation size and alignment. For RAID5/6 + systems this should be the number of data + disks * RAID chunk size in file system blocks. +- ++delalloc (*) Deferring block allocation until write-out time. ++nodelalloc Disable delayed allocation. Blocks are allocation ++ when data is copied from user to page cache. + Data Mode +---------- ++========= + There are 3 different data modes: + + * writeback mode +@@ -236,10 +265,10 @@ typically provide the best ext4 performance. + + * ordered mode + In data=ordered mode, ext4 only officially journals metadata, but it logically +-groups metadata and data blocks into a single unit called a transaction. When +-it's time to write the new metadata out to disk, the associated data blocks +-are written first. In general, this mode performs slightly slower than +-writeback but significantly faster than journal mode. ++groups metadata information related to data changes with the data blocks into a ++single unit called a transaction. When it's time to write the new metadata ++out to disk, the associated data blocks are written first. In general, ++this mode performs slightly slower than writeback but significantly faster than journal mode. + + * journal mode + data=journal mode provides full data and metadata journaling. All new data is +@@ -247,7 +276,8 @@ written to the journal first, and then to its final location. + In the event of a crash, the journal can be replayed, bringing both data and + metadata into a consistent state. This mode is the slowest except when data + needs to be read from and written to disk at the same time where it +-outperforms all others modes. ++outperforms all others modes. Curently ext4 does not have delayed ++allocation support if this data journalling mode is selected. + + References + ========== +@@ -256,7 +286,8 @@ kernel source: <file:fs/ext4/> + <file:fs/jbd2/> + + programs: http://e2fsprogs.sourceforge.net/ +- http://ext2resize.sourceforge.net + + useful links: http://fedoraproject.org/wiki/ext3-devel + http://www.bullopensource.org/ext4/ ++ http://ext4.wiki.kernel.org/index.php/Main_Page ++ http://fedoraproject.org/wiki/Features/Ext4 +diff --git a/fs/buffer.c b/fs/buffer.c +index 0f51c0f..5fa1512 100644 +--- a/fs/buffer.c ++++ b/fs/buffer.c +@@ -1691,11 +1691,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page, + */ + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); +- } else if (!buffer_mapped(bh) && buffer_dirty(bh)) { ++ } else if ((!buffer_mapped(bh) || buffer_delay(bh)) && ++ buffer_dirty(bh)) { + WARN_ON(bh->b_size != blocksize); + err = get_block(inode, block, bh, 1); + if (err) + goto recover; ++ clear_buffer_delay(bh); + if (buffer_new(bh)) { + /* blockdev mappings never come here */ + clear_buffer_new(bh); +@@ -1774,7 +1776,8 @@ recover: + bh = head; + /* Recovery: lock and submit the mapped buffers */ + do { +- if (buffer_mapped(bh) && buffer_dirty(bh)) { ++ if (buffer_mapped(bh) && buffer_dirty(bh) && ++ !buffer_delay(bh)) { + lock_buffer(bh); + mark_buffer_async_write(bh); + } else { +@@ -2061,6 +2064,7 @@ int generic_write_end(struct file *file, struct address_space *mapping, + struct page *page, void *fsdata) + { + struct inode *inode = mapping->host; ++ int i_size_changed = 0; + + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + +@@ -2073,12 +2077,21 @@ int generic_write_end(struct file *file, struct address_space *mapping, + */ + if (pos+copied > inode->i_size) { + i_size_write(inode, pos+copied); +- mark_inode_dirty(inode); ++ i_size_changed = 1; + } + + unlock_page(page); + page_cache_release(page); + ++ /* ++ * Don't mark the inode dirty under page lock. First, it unnecessarily ++ * makes the holding time of page lock longer. Second, it forces lock ++ * ordering of page lock and transaction start for journaling ++ * filesystems. ++ */ ++ if (i_size_changed) ++ mark_inode_dirty(inode); ++ + return copied; + } + EXPORT_SYMBOL(generic_write_end); +diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c +index 3c8dab8..a234b54 100644 +--- a/fs/ext4/acl.c ++++ b/fs/ext4/acl.c +@@ -40,34 +40,35 @@ ext4_acl_from_disk(const void *value, size_t size) + acl = posix_acl_alloc(count, GFP_NOFS); + if (!acl) + return ERR_PTR(-ENOMEM); +- for (n=0; n < count; n++) { ++ for (n = 0; n < count; n++) { + ext4_acl_entry *entry = + (ext4_acl_entry *)value; + if ((char *)value + sizeof(ext4_acl_entry_short) > end) + goto fail; + acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); + acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); +- switch(acl->a_entries[n].e_tag) { +- case ACL_USER_OBJ: +- case ACL_GROUP_OBJ: +- case ACL_MASK: +- case ACL_OTHER: +- value = (char *)value + +- sizeof(ext4_acl_entry_short); +- acl->a_entries[n].e_id = ACL_UNDEFINED_ID; +- break; +- +- case ACL_USER: +- case ACL_GROUP: +- value = (char *)value + sizeof(ext4_acl_entry); +- if ((char *)value > end) +- goto fail; +- acl->a_entries[n].e_id = +- le32_to_cpu(entry->e_id); +- break; +- +- default: ++ ++ switch (acl->a_entries[n].e_tag) { ++ case ACL_USER_OBJ: ++ case ACL_GROUP_OBJ: ++ case ACL_MASK: ++ case ACL_OTHER: ++ value = (char *)value + ++ sizeof(ext4_acl_entry_short); ++ acl->a_entries[n].e_id = ACL_UNDEFINED_ID; ++ break; ++ ++ case ACL_USER: ++ case ACL_GROUP: ++ value = (char *)value + sizeof(ext4_acl_entry); ++ if ((char *)value > end) + goto fail; ++ acl->a_entries[n].e_id = ++ le32_to_cpu(entry->e_id); ++ break; ++ ++ default: ++ goto fail; + } + } + if (value != end) +@@ -96,27 +97,26 @@ ext4_acl_to_disk(const struct posix_acl *acl, size_t *size) + return ERR_PTR(-ENOMEM); + ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION); + e = (char *)ext_acl + sizeof(ext4_acl_header); +- for (n=0; n < acl->a_count; n++) { ++ for (n = 0; n < acl->a_count; n++) { + ext4_acl_entry *entry = (ext4_acl_entry *)e; + entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); + entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); +- switch(acl->a_entries[n].e_tag) { +- case ACL_USER: +- case ACL_GROUP: +- entry->e_id = +- cpu_to_le32(acl->a_entries[n].e_id); +- e += sizeof(ext4_acl_entry); +- break; +- +- case ACL_USER_OBJ: +- case ACL_GROUP_OBJ: +- case ACL_MASK: +- case ACL_OTHER: +- e += sizeof(ext4_acl_entry_short); +- break; +- +- default: +- goto fail; ++ switch (acl->a_entries[n].e_tag) { ++ case ACL_USER: ++ case ACL_GROUP: ++ entry->e_id = cpu_to_le32(acl->a_entries[n].e_id); ++ e += sizeof(ext4_acl_entry); ++ break; ++ ++ case ACL_USER_OBJ: ++ case ACL_GROUP_OBJ: ++ case ACL_MASK: ++ case ACL_OTHER: ++ e += sizeof(ext4_acl_entry_short); ++ break; ++ ++ default: ++ goto fail; + } + } + return (char *)ext_acl; +@@ -167,23 +167,23 @@ ext4_get_acl(struct inode *inode, int type) + if (!test_opt(inode->i_sb, POSIX_ACL)) + return NULL; + +- switch(type) { +- case ACL_TYPE_ACCESS: +- acl = ext4_iget_acl(inode, &ei->i_acl); +- if (acl != EXT4_ACL_NOT_CACHED) +- return acl; +- name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; +- break; +- +- case ACL_TYPE_DEFAULT: +- acl = ext4_iget_acl(inode, &ei->i_default_acl); +- if (acl != EXT4_ACL_NOT_CACHED) +- return acl; +- name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; +- break; +- +- default: +- return ERR_PTR(-EINVAL); ++ switch (type) { ++ case ACL_TYPE_ACCESS: ++ acl = ext4_iget_acl(inode, &ei->i_acl); ++ if (acl != EXT4_ACL_NOT_CACHED) ++ return acl; ++ name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; ++ break; ++ ++ case ACL_TYPE_DEFAULT: ++ acl = ext4_iget_acl(inode, &ei->i_default_acl); ++ if (acl != EXT4_ACL_NOT_CACHED) ++ return acl; ++ name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; ++ break; ++ ++ default: ++ return ERR_PTR(-EINVAL); + } + retval = ext4_xattr_get(inode, name_index, "", NULL, 0); + if (retval > 0) { +@@ -201,14 +201,14 @@ ext4_get_acl(struct inode *inode, int type) + kfree(value); + + if (!IS_ERR(acl)) { +- switch(type) { +- case ACL_TYPE_ACCESS: +- ext4_iset_acl(inode, &ei->i_acl, acl); +- break; +- +- case ACL_TYPE_DEFAULT: +- ext4_iset_acl(inode, &ei->i_default_acl, acl); +- break; ++ switch (type) { ++ case ACL_TYPE_ACCESS: ++ ext4_iset_acl(inode, &ei->i_acl, acl); ++ break; ++ ++ case ACL_TYPE_DEFAULT: ++ ext4_iset_acl(inode, &ei->i_default_acl, acl); ++ break; + } + } + return acl; +@@ -232,31 +232,31 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + +- switch(type) { +- case ACL_TYPE_ACCESS: +- name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; +- if (acl) { +- mode_t mode = inode->i_mode; +- error = posix_acl_equiv_mode(acl, &mode); +- if (error < 0) +- return error; +- else { +- inode->i_mode = mode; +- ext4_mark_inode_dirty(handle, inode); +- if (error == 0) +- acl = NULL; +- } ++ switch (type) { ++ case ACL_TYPE_ACCESS: ++ name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; ++ if (acl) { ++ mode_t mode = inode->i_mode; ++ error = posix_acl_equiv_mode(acl, &mode); ++ if (error < 0) ++ return error; ++ else { ++ inode->i_mode = mode; ++ ext4_mark_inode_dirty(handle, inode); ++ if (error == 0) ++ acl = NULL; + } +- break; ++ } ++ break; + +- case ACL_TYPE_DEFAULT: +- name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; +- if (!S_ISDIR(inode->i_mode)) +- return acl ? -EACCES : 0; +- break; ++ case ACL_TYPE_DEFAULT: ++ name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; ++ if (!S_ISDIR(inode->i_mode)) ++ return acl ? -EACCES : 0; ++ break; + +- default: +- return -EINVAL; ++ default: ++ return -EINVAL; + } + if (acl) { + value = ext4_acl_to_disk(acl, &size); +@@ -269,14 +269,14 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, + + kfree(value); + if (!error) { +- switch(type) { +- case ACL_TYPE_ACCESS: +- ext4_iset_acl(inode, &ei->i_acl, acl); +- break; +- +- case ACL_TYPE_DEFAULT: +- ext4_iset_acl(inode, &ei->i_default_acl, acl); +- break; ++ switch (type) { ++ case ACL_TYPE_ACCESS: ++ ext4_iset_acl(inode, &ei->i_acl, acl); ++ break; ++ ++ case ACL_TYPE_DEFAULT: ++ ext4_iset_acl(inode, &ei->i_default_acl, acl); ++ break; + } + } + return error; +diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c +index 9cc80b9..e9fa960 100644 +--- a/fs/ext4/balloc.c ++++ b/fs/ext4/balloc.c +@@ -47,7 +47,7 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block, + ext4_group_t block_group) + { + ext4_group_t actual_group; +- ext4_get_group_no_and_offset(sb, block, &actual_group, 0); ++ ext4_get_group_no_and_offset(sb, block, &actual_group, NULL); + if (actual_group == block_group) + return 1; + return 0; +@@ -121,12 +121,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, + le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); + } + } else { /* For META_BG_BLOCK_GROUPS */ +- int group_rel = (block_group - +- le32_to_cpu(sbi->s_es->s_first_meta_bg)) % +- EXT4_DESC_PER_BLOCK(sb); +- if (group_rel == 0 || group_rel == 1 || +- (group_rel == EXT4_DESC_PER_BLOCK(sb) - 1)) +- bit_max += 1; ++ bit_max += ext4_bg_num_gdb(sb, block_group); + } + + if (block_group == sbi->s_groups_count - 1) { +@@ -295,7 +290,7 @@ err_out: + return 0; + } + /** +- * read_block_bitmap() ++ * ext4_read_block_bitmap() + * @sb: super block + * @block_group: given block group + * +@@ -305,7 +300,7 @@ err_out: + * Return buffer_head on success or NULL in case of failure. + */ + struct buffer_head * +-read_block_bitmap(struct super_block *sb, ext4_group_t block_group) ++ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) + { + struct ext4_group_desc * desc; <<Diff was trimmed, longer than 597 lines>> _______________________________________________ pld-cvs-commit mailing list [email protected] http://lists.pld-linux.org/mailman/listinfo/pld-cvs-commit
