[PATCH 3/4] ext3: add block bitmap validation

2007-11-15 Thread Aneesh Kumar K.V
When a new block bitmap is read from disk in read_block_bitmap()
there are a few bits that should ALWAYS be set.  In particular,
the blocks given corresponding to block bitmap, inode bitmap and inode tables.
Validate the block bitmap against these blocks.

Signed-off-by: Aneesh Kumar K.V [EMAIL PROTECTED]
---
 fs/ext3/balloc.c |   77 -
 1 files changed, 69 insertions(+), 8 deletions(-)

diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index a8ba7e8..8f732b8 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -80,13 +80,52 @@ struct ext3_group_desc * ext3_get_group_desc(struct 
super_block * sb,
return desc + offset;
 }
 
+static int ext3_valid_block_bitmap(struct super_block *sb,
+   struct ext3_group_desc *desc,
+   unsigned int block_group,
+   struct buffer_head *bh)
+{
+   ext3_grpblk_t offset;
+   ext3_grpblk_t next_zero_bit;
+   ext3_fsblk_t bitmap_blk;
+   ext3_fsblk_t group_first_block;
+
+   group_first_block = ext3_group_first_block_no(sb, block_group);
+
+   /* check whether block bitmap block number is set */
+   bitmap_blk = le32_to_cpu(desc-bg_block_bitmap);
+   offset = bitmap_blk - group_first_block;
+   if (!ext3_test_bit(offset, bh-b_data)) {
+   /* bad block bitmap */
+   return 0;
+   }
+   /* check whether the inode bitmap block number is set */
+   bitmap_blk = le32_to_cpu(desc-bg_inode_bitmap);
+   offset = bitmap_blk - group_first_block;
+   if (!ext3_test_bit(offset, bh-b_data)) {
+   /* bad block bitmap */
+   return 0;
+   }
+   /* check whether the inode table block number is set */
+   bitmap_blk = le32_to_cpu(desc-bg_inode_table);
+   offset = bitmap_blk - group_first_block;
+   next_zero_bit = ext3_find_next_zero_bit(bh-b_data,
+   EXT3_SB(sb)-s_itb_per_group + 1,
+   offset);
+   if (next_zero_bit = offset + EXT3_SB(sb)-s_itb_per_group)
+   /* good bitmap for inode tables */
+   return 1;
+
+   return 0;
+}
+
 /**
  * read_block_bitmap()
  * @sb:super block
  * @block_group:   given block group
  *
- * Read the bitmap for a given block_group, reading into the specified
- * slot in the superblock's bitmap cache.
+ * Read the bitmap for a given block_group,and validate the
+ * bits for block/inode/inode tables are set in the bitmaps
  *
  * Return buffer_head on success or NULL in case of failure.
  */
@@ -95,17 +134,39 @@ read_block_bitmap(struct super_block *sb, unsigned int 
block_group)
 {
struct ext3_group_desc * desc;
struct buffer_head * bh = NULL;
+   ext3_fsblk_t bitmap_blk;
 
-   desc = ext3_get_group_desc (sb, block_group, NULL);
+   desc = ext3_get_group_desc(sb, block_group, NULL);
if (!desc)
-   goto error_out;
-   bh = sb_bread(sb, le32_to_cpu(desc-bg_block_bitmap));
-   if (!bh)
-   ext3_error (sb, read_block_bitmap,
+   return NULL;
+   bitmap_blk = le32_to_cpu(desc-bg_block_bitmap);
+   bh = sb_getblk(sb, bitmap_blk);
+   if (unlikely(!bh)) {
+   ext3_error(sb, __FUNCTION__,
+   Cannot read block bitmap - 
+   block_group = %d, block_bitmap = %u,
+   block_group, le32_to_cpu(desc-bg_block_bitmap));
+   return NULL;
+   }
+   if (likely(bh_uptodate_or_lock(bh)))
+   return bh;
+
+   if (bh_submit_read(bh)  0) {
+   brelse(bh);
+   ext3_error(sb, __FUNCTION__,
Cannot read block bitmap - 
block_group = %d, block_bitmap = %u,
block_group, le32_to_cpu(desc-bg_block_bitmap));
-error_out:
+   return NULL;
+   }
+   if (!ext3_valid_block_bitmap(sb, desc, block_group, bh)) {
+   brelse(bh);
+   ext3_error(sb, __FUNCTION__,
+   Invalid block bitmap - 
+   block_group = %d, block = %lu,
+   block_group, bitmap_blk);
+   return NULL;
+   }
return bh;
 }
 /*
-- 
1.5.3.5.652.gf192c-dirty

-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 4/4] ext4: add block bitmap validation

2007-11-15 Thread Aneesh Kumar K.V
When a new block bitmap is read from disk in read_block_bitmap()
there are a few bits that should ALWAYS be set.  In particular,
the blocks given corresponding to block bitmap, inode bitmap and inode tables.
Validate the block bitmap against these blocks.

Signed-off-by: Aneesh Kumar K.V [EMAIL PROTECTED]
---
 fs/ext4/balloc.c |   96 +-
 1 files changed, 80 insertions(+), 16 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 71ee95e..2657a1e 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -189,13 +189,62 @@ struct ext4_group_desc * ext4_get_group_desc(struct 
super_block * sb,
return desc;
 }
 
+static int ext4_valid_block_bitmap(struct super_block *sb,
+   struct ext4_group_desc *desc,
+   unsigned int block_group,
+   struct buffer_head *bh)
+{
+   ext4_grpblk_t offset;
+   ext4_grpblk_t next_zero_bit;
+   ext4_fsblk_t bitmap_blk;
+   ext4_fsblk_t group_first_block;
+
+   if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+   /* with FLEX_BG, the inode/block bitmaps and itable
+* blocks may not be in the group at all
+* so the bitmap validation will be skipped for those groups
+* or it has to also read the block group where the bitmaps
+* are located to verify they are set.
+*/
+   return 1;
+   }
+   group_first_block = ext4_group_first_block_no(sb, block_group);
+
+   /* check whether block bitmap block number is set */
+   bitmap_blk = ext4_block_bitmap(sb, desc);
+   offset = bitmap_blk - group_first_block;
+   if (!ext4_test_bit(offset, bh-b_data)) {
+   /* bad block bitmap */
+   return 0;
+   }
+
+   /* check whether the inode bitmap block number is set */
+   bitmap_blk = ext4_inode_bitmap(sb, desc);
+   offset = bitmap_blk - group_first_block;
+   if (!ext4_test_bit(offset, bh-b_data)) {
+   /* bad block bitmap */
+   return 0;
+   }
+
+   /* check whether the inode table block number is set */
+   bitmap_blk = ext4_inode_table(sb, desc);
+   offset = bitmap_blk - group_first_block;
+   next_zero_bit = ext4_find_next_zero_bit(bh-b_data,
+   EXT4_SB(sb)-s_itb_per_group + 1,
+   offset);
+   if (next_zero_bit = offset + EXT4_SB(sb)-s_itb_per_group)
+   /* good bitmap for inode tables */
+   return 1;
+
+   return 0;
+}
 /**
  * read_block_bitmap()
  * @sb:super block
  * @block_group:   given block group
  *
- * Read the bitmap for a given block_group, reading into the specified
- * slot in the superblock's bitmap cache.
+ * Read the bitmap for a given block_group,and validate the
+ * bits for block/inode/inode tables are set in the bitmaps
  *
  * Return buffer_head on success or NULL in case of failure.
  */
@@ -210,25 +259,40 @@ read_block_bitmap(struct super_block *sb, unsigned int 
block_group)
if (!desc)
return NULL;
bitmap_blk = ext4_block_bitmap(sb, desc);
+   bh = sb_getblk(sb, bitmap_blk);
+   if (unlikely(!bh)) {
+   ext4_error(sb, __FUNCTION__,
+   Cannot read block bitmap - 
+   block_group = %d, block_bitmap = %llu,
+   block_group, bitmap_blk);
+   return NULL;
+   }
+   if (bh_uptodate_or_lock(bh))
+   return bh;
+
if (desc-bg_flags  cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
-   bh = sb_getblk(sb, bitmap_blk);
-   if (!buffer_uptodate(bh)) {
-   lock_buffer(bh);
-   if (!buffer_uptodate(bh)) {
-   ext4_init_block_bitmap(sb, bh, block_group,
-  desc);
-   set_buffer_uptodate(bh);
-   }
-   unlock_buffer(bh);
-   }
-   } else {
-   bh = sb_bread(sb, bitmap_blk);
+   ext4_init_block_bitmap(sb, bh, block_group, desc);
+   set_buffer_uptodate(bh);
+   unlock_buffer(bh);
+   return bh;
}
-   if (!bh)
-   ext4_error (sb, __FUNCTION__,
+   if (bh_submit_read(bh)  0) {
+   brelse(bh);
+   ext4_error(sb, __FUNCTION__,
Cannot read block bitmap - 
block_group = %d, block_bitmap = %llu,
block_group, bitmap_blk);
+   return NULL;
+   }
+   if (!ext4_valid_block_bitmap(sb, desc, block_group, bh)) {
+   brelse(bh);
+  

Re: ext3 metaclustering performance numbers and updated patch

2007-11-15 Thread Abhishek Rai
Please ignore previous patch, it has an issue with operator precedence
in ext3_get_grp_metacluster(), try this instead:

Signed-off-by: Abhishek Rai [EMAIL PROTECTED]

diff -uprdN linux-2.6.23mm1-clean/fs/ext3/balloc.c
linux-2.6.23mm1-ext3mc/fs/ext3/balloc.c
--- linux-2.6.23mm1-clean/fs/ext3/balloc.c  2007-10-17 18:31:42.0 
-0700
+++ linux-2.6.23mm1-ext3mc/fs/ext3/balloc.c 2007-11-15 11:23:51.0 
-0800
@@ -711,6 +711,7 @@ bitmap_search_next_usable_block(ext3_grp
ext3_grpblk_t next;
struct journal_head *jh = bh2jh(bh);

+   BUG_ON(start  maxblocks);
while (start  maxblocks) {
next = ext3_find_next_zero_bit(bh-b_data, maxblocks, start);
if (next = maxblocks)
@@ -841,10 +842,12 @@ claim_block(spinlock_t *lock, ext3_grpbl
 static ext3_grpblk_t
 ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
struct buffer_head *bitmap_bh, ext3_grpblk_t grp_goal,
-   unsigned long *count, struct ext3_reserve_window 
*my_rsv)
+   int use_metacluster, unsigned long *count,
+   struct ext3_reserve_window *my_rsv)
 {
ext3_fsblk_t group_first_block;
ext3_grpblk_t start, end;
+   ext3_grpblk_t mc_start, mc_end, start2 = -1, end2 = -1;
unsigned long num = 0;

/* we do allocation within the reservation window if we have a window */
@@ -872,12 +875,48 @@ ext3_try_to_allocate(struct super_block
}

BUG_ON(start  EXT3_BLOCKS_PER_GROUP(sb));
+   /* start must have been set to grp_goal if one still exists. */
+   BUG_ON(grp_goal = 0  start != grp_goal);
+
+   if (test_opt(sb, METACLUSTER)  !use_metacluster) {
+   ext3_get_grp_metacluster(sb, mc_start, mc_end);
+
+   /*
+* If there is an overlap with metacluster range, adjust our
+* range to remove overlap, splitting our range into two if
+* needed.
+*/
+   if (mc_end  mc_start) {
+   if (mc_start = start)
+   start = max_t(ext3_grpblk_t, start, mc_end);
+   else if (mc_end = end)
+   end = min_t(ext3_grpblk_t, end, mc_start);
+   else {
+   start2 = mc_end;
+   end2 = end;
+   end = mc_start;
+   }
+   }
+   }
+
+   if (start = end)
+   goto fail_access;
+
+   if (grp_goal  0)
+   grp_goal = start;

 repeat:
if (grp_goal  0 || !ext3_test_allocatable(grp_goal, bitmap_bh)) {
grp_goal = find_next_usable_block(start, bitmap_bh, end);
-   if (grp_goal  0)
+   if (grp_goal  0) {
+   if (start2 = 0) {
+   start = start2;
+   end = end2;
+   start2 = -1;
+   goto repeat;
+   }
goto fail_access;
+   }
if (!my_rsv) {
int i;

@@ -898,8 +937,15 @@ repeat:
 */
start++;
grp_goal++;
-   if (start = end)
-   goto fail_access;
+   if (start = end) {
+   if (start2  0)
+   goto fail_access;
+
+   start = start2;
+   end = end2;
+   start2 = -1;
+   grp_goal = -1;
+   }
goto repeat;
}
num++;
@@ -1084,6 +1130,7 @@ static int alloc_new_reservation(struct
unsigned long size;
int ret;
spinlock_t *rsv_lock = EXT3_SB(sb)-s_rsv_window_lock;
+   ext3_grpblk_t mc_start, mc_end;

group_first_block = ext3_group_first_block_no(sb, group);
group_end_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);
@@ -1143,6 +1190,7 @@ static int alloc_new_reservation(struct
 * To make sure the reservation window has a free bit inside it, we
 * need to check the bitmap after we found a reservable window.
 */
+   ext3_get_grp_metacluster(sb, mc_start, mc_end);
 retry:
ret = find_next_reservable_window(search_head, my_rsv, sb,
start_block, group_end_block);
@@ -1170,6 +1218,11 @@ retry:
my_rsv-rsv_start - group_first_block,
bitmap_bh, group_end_block - group_first_block + 1);

+   if (first_free_block = mc_start  first_free_block  mc_end) {
+   start_block = mc_end;
+   goto next;
+   }
+
if (first_free_block  0) {
/*
 * no free block left on 

Re: test for EA validity checking

2007-11-15 Thread Andreas Dilger
On Nov 10, 2007  05:10 -0700, Andreas Dilger wrote:
 Attached is a test image for extended attribute block checking.
 This small image contains a number of different kinds of corruptions
 (bad magic, bad checksum, empty block, bad EA block number), as well
 as an old-format (v1) EA, and a valid v2 EA.

Attached is an updated patch with a new test image, that includes a
corrupt EA set on a block device.  That had been causing us some
e2fsck heartburn because of random inode table corruption.

This goes along with an updated expand-extra-isize patch.  The root of
the problem was that I had added an extra check to verify the EA magic
in ext2fs_read_ext_attr(), but the return value of this function was used
in check_blocks() to set pctx-errcode, and later in check_ext_attr() the
non-zero value of pctx-errcode caused an abort when
ext2fs_inode_has_valid_blocks() was false and it caused PR_1_BLOCK_ITERATE
to be hit (a fatal error).

I ended up checking the magic in all of the ext2fs_read_ext_attr() callsites,
but IMHO the library should at least be doing basic validity checking like
this.

One question that remains unclear is whether pctx-errcode being set from
the early call of check_ext_attr() should cause check_blocks() to abort?
In the common regular-file/directory/slow-symlink case pctx-errcode is
reset by the call to ext2fs_block_iterate2(), so it would seem reasonable
to go back to checking EA magic in ext2fs_read_ext_attr(), and then clearing
pctx-errcode in check_ext_attr() if the problem is fixed.

Cheers, Andreas
--
Andreas Dilger
Sr. Software Engineer, Lustre Group
Sun Microsystems of Canada, Inc.



e2fsprogs-tests-f_ea_checks.patch
Description: Binary data
Index: e2fsprogs-1.40.1/lib/ext2fs/ext_attr.c
===
--- e2fsprogs-1.40.1.orig/lib/ext2fs/ext_attr.c
+++ e2fsprogs-1.40.1/lib/ext2fs/ext_attr.c
@@ -17,6 +17,7 @@
 #endif
 #include string.h
 #include time.h
+#include errno.h
 
 #include ext2_fs.h
 #include ext2_ext_attr.h
@@ -60,11 +61,39 @@ __u32 ext2fs_ext_attr_hash_entry(struct 
 #undef NAME_HASH_SHIFT
 #undef VALUE_HASH_SHIFT
 
+#define BLOCK_HASH_SHIFT 16
+/*
+ * Re-compute the extended attribute hash value after an entry has changed.
+ */
+static void ext2fs_attr_rehash(struct ext2_ext_attr_header *header,
+  struct ext2_ext_attr_entry *entry)
+{
+   struct ext2_ext_attr_entry *here;
+   __u32 hash = 0;
+
+   entry-e_hash = ext2fs_ext_attr_hash_entry(entry, (char *) header +
+  entry-e_value_offs);
+
+   here = ENTRY(header+1);
+   while (!EXT2_EXT_IS_LAST_ENTRY(here)) {
+   if (!here-e_hash) {
+   /* Block is not shared if an entry's hash value == 0 */
+   hash = 0;
+   break;
+   }
+   hash = (hash  BLOCK_HASH_SHIFT) ^
+  (hash  (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^
+  here-e_hash;
+   here = EXT2_EXT_ATTR_NEXT(here);
+   }
+   header-h_hash = hash;
+}
+
 errcode_t ext2fs_read_ext_attr(ext2_filsys fs, blk_t block, void *buf)
 {
errcode_t   retval;
 
-   retval = io_channel_read_blk(fs-io, block, 1, buf);
+   retval = io_channel_read_blk(fs-io, block, 1, buf);
if (retval)
return retval;
 #ifdef EXT2FS_ENABLE_SWAPFS
@@ -92,7 +121,7 @@ errcode_t ext2fs_write_ext_attr(ext2_fil
} else
 #endif
write_buf = (char *) inbuf;
-   retval = io_channel_write_blk(fs-io, block, 1, write_buf);
+   retval = io_channel_write_blk(fs-io, block, 1, write_buf);
if (buf)
ext2fs_free_mem(buf);
if (!retval)
@@ -126,7 +155,10 @@ errcode_t ext2fs_adjust_ea_refcount(ext2
if (retval)
goto errout;
 
-   header = (struct ext2_ext_attr_header *) block_buf;
+   header = BHDR(block_buf);
+   if (header-h_magic != EXT2_EXT_ATTR_MAGIC)
+   return EXT2_ET_EA_BAD_MAGIC;
+
header-h_refcount += adjust;
if (newcount)
*newcount = header-h_refcount;
@@ -140,3 +172,881 @@ errout:
ext2fs_free_mem(buf);
return retval;
 }
+
+struct ext2_attr_info {
+   int name_index;
+   const char *name;
+   const char *value;
+   int value_len;
+};
+
+struct ext2_attr_search {
+   struct ext2_ext_attr_entry *first;
+   char *base;
+   char *end;
+   struct ext2_ext_attr_entry *here;
+   int not_found;
+};
+
+struct ext2_attr_ibody_find {
+   ext2_ino_t ino;
+   struct ext2_attr_search s;
+};
+
+struct ext2_attr_block_find {
+   struct ext2_attr_search s;
+   char *block;
+};
+
+void ext2fs_attr_shift_entries(struct ext2_ext_attr_entry *entry,
+  int value_offs_shift, char *to,
+  char *from, int n)
+{
+   struct 

Re: ext3 metaclustering performance numbers and updated patch

2007-11-15 Thread Abhishek Rai
Andreas,

Thanks for the great review. I've hopefully addressed all your concerns.
Please find my responses inline followed by an updated patch.

On Nov 9, 2007 11:52 PM, Andreas Dilger [EMAIL PROTECTED] wrote:
 On Nov 09, 2007  17:33 -0800, Abhishek Rai wrote:
  Benchmark 5: fsck
  Description: Prepare a newly formated 400GB disk as follows: create
  200 files of 0.5GB each, 100 files of 1GB each, 40 files of 2.5GB ech,
  and 10 files of 10GB each. fsck command line: fsck -f -n
  1. vanilla:
 Total: 12m18.1s
 User: 15.9s
 System: 18.3s
  2. mc:
 Total: 4m47.0s
 User: 16.0s
 System: 17.1s

 The fsck improvement is quite impressive.

 Do you have any benchmarks with small files 64kB-1MB in size
 (e.g. kernbench or mongo)?  I'm wondering if there are any drawbacks
 from the metaclustering block reservation of blocks if the files are
 relatively small because of the extra seeking involved.

Here are the results for kernbench from a 8 cpu machine with 32GB RAM.

Benchmark: kernbench
Description: 6 runs, very low standard deviation
1. vanilla:
   Elapsed: 35.60
   User: 228.79
   System: 21.10
2. mc:
   Elapsed: 35.12
   User: 228.47
   System: 21.08


 Also, it took me a while to determine whether the metacluster is
 done on a per-group basis instead of per-file or per-filesystem.
 It is counter-intuitive because the ext3_get_metacluster_range()
 function only takes the superblock as a parameter and not e.g. the
 group or inode number.

Good point, renamed the function to ext3_get_grp_metacluster(). But
refrained from adding a group parameter as that might suggest we vary
metaclusters from group to group.


  In comparison, approaches that optimize inode table reads or bitmap
  block reads deliver little performance gains on such disks with lots
  of data as they target only a small fraction of the total e2fsck
  latency. However, such approaches (e.g., uninit_groups), deliver
  substantial improvement for disks with little data, but e2fsck on such
  disks doesn't take too long anyway and hence may not be considered a
  serious issue in the first place.

 Did you do a comparison of the metacluster code with extents?  That
 should also improve the e2fsck time in a similar manner by virtue of
 not needing as many (or any) indirect blocks.  In ideal allocation
 cases a single extent can represent a whole block group of blocks,
 and files up to 480MB might not even need any index blocks.

 I understand you aren't very keen on changing the on-disk format, but
 it would be interesting to compare the relative benefits of the two
 on identical hardware.

That's a valid point. I'm definitely going to do a performance
comparison of metaclusters with extents and report my findings.


 I also suspect that using ext3_new_indirect_block() for extent
 index blocks on large files would have a similar positive impact for
 ext4 extents in the case where they are needed).

I think this would be worth investigating though I don't expect to see
long e2fsck times with extents in the first place. One difference with
extents would be that
prefetching might not be as relevant for extent index blocks as it is
for indirect blocks given the much larger data coverage achieved by
the former. However, prefetching is just an optimization, its
not the main point of this change. I'll investigate this in conjunction
with metaclustering vs extent comparison.

 While your patch is fairly complex, I personally don't have a big objection
 to it going into the kernel, but there was a lot of vocal opposition to
 ext3 changes in the past so you may have an uphill battle with other
 kernel developers.

  @@ -1713,6 +1783,161 @@ ext3_fsblk_t ext3_new_block(handle_t *ha
  +int ext3_new_indirect_blocks(handle_t *handle, struct inode *inode,
  + unsigned long group_no, unsigned long *count,
  + ext3_fsblk_t new_blocks[])
  +{
  + if (err  err != -ENOSPC)
  + printk(ext3_new_indirect_blocks: error %d, err);

 Should this be an ext3_error() instead of a printk() or do all of the
 functions that generate errors already call ext3_error()?  If there is
 a good reason NOT to make it an ext3_error() you are missing a KERN_*
 error level.

Done


  +typedef struct {
  + __le32  *p;
  + __le32  key;
  + struct buffer_head *bh;
  +} Indirect;

 Typedefs are frowned upon in Linux kernel code.

  @@ -233,12 +255,6 @@ no_delete:
  -typedef struct {
  - __le32  *p;
  - __le32  key;
  - struct buffer_head *bh;
  -} Indirect;

 Though, hmm, it seems you didn't invent this yourself...

  -static ext3_fsblk_t ext3_find_near(struct inode *inode, Indirect *ind)
  +static inline ext3_fsblk_t ext3_find_near(struct inode *inode, Indirect 
  *ind)

 Function is much too large to inline, even if it has just a single callsite.

In the worst case it will not be inlined, so that shouldn't be a
problem. Still, I removed the inline just in case someone adds an
additional callsite in 

- forbid-user-to-change-file-flags-on-quota-files.patch removed from -mm tree

2007-11-15 Thread akpm

The patch titled
 Forbid user to change file flags on quota files
has been removed from the -mm tree.  Its filename was
 forbid-user-to-change-file-flags-on-quota-files.patch

This patch was dropped because it was merged into mainline or a subsystem tree

--
Subject: Forbid user to change file flags on quota files
From: Jan Kara [EMAIL PROTECTED]

Forbid user from changing file flags on quota files.  User has no bussiness
in playing with these flags when quota is on.  Furthermore there is a
remote possibility of deadlock due to a lock inversion between quota file's
i_mutex and transaction's start (i_mutex for quota file is locked only when
trasaction is started in quota operations) in ext3 and ext4.

Signed-off-by: Jan Kara [EMAIL PROTECTED]
Cc: LIOU Payphone [EMAIL PROTECTED]
Cc: linux-ext4@vger.kernel.org
Acked-by: Dave Kleikamp [EMAIL PROTECTED]
Cc: [EMAIL PROTECTED]
Signed-off-by: Andrew Morton [EMAIL PROTECTED]
---

 fs/ext2/ioctl.c |5 +
 fs/ext3/ioctl.c |5 +
 fs/ext4/ioctl.c |5 +
 fs/jfs/ioctl.c  |3 +++
 fs/reiserfs/ioctl.c |3 +++
 5 files changed, 21 insertions(+)

diff -puN fs/ext2/ioctl.c~forbid-user-to-change-file-flags-on-quota-files 
fs/ext2/ioctl.c
--- a/fs/ext2/ioctl.c~forbid-user-to-change-file-flags-on-quota-files
+++ a/fs/ext2/ioctl.c
@@ -47,6 +47,11 @@ int ext2_ioctl (struct inode * inode, st
flags = ~EXT2_DIRSYNC_FL;
 
mutex_lock(inode-i_mutex);
+   /* Is it quota file? Do not allow user to mess with it */
+   if (IS_NOQUOTA(inode)) {
+   mutex_unlock(inode-i_mutex);
+   return -EPERM;
+   }
oldflags = ei-i_flags;
 
/*
diff -puN fs/ext3/ioctl.c~forbid-user-to-change-file-flags-on-quota-files 
fs/ext3/ioctl.c
--- a/fs/ext3/ioctl.c~forbid-user-to-change-file-flags-on-quota-files
+++ a/fs/ext3/ioctl.c
@@ -51,6 +51,11 @@ int ext3_ioctl (struct inode * inode, st
flags = ~EXT3_DIRSYNC_FL;
 
mutex_lock(inode-i_mutex);
+   /* Is it quota file? Do not allow user to mess with it */
+   if (IS_NOQUOTA(inode)) {
+   mutex_unlock(inode-i_mutex);
+   return -EPERM;
+   }
oldflags = ei-i_flags;
 
/* The JOURNAL_DATA flag is modifiable only by root */
diff -puN fs/ext4/ioctl.c~forbid-user-to-change-file-flags-on-quota-files 
fs/ext4/ioctl.c
--- a/fs/ext4/ioctl.c~forbid-user-to-change-file-flags-on-quota-files
+++ a/fs/ext4/ioctl.c
@@ -51,6 +51,11 @@ int ext4_ioctl (struct inode * inode, st
flags = ~EXT4_DIRSYNC_FL;
 
mutex_lock(inode-i_mutex);
+   /* Is it quota file? Do not allow user to mess with it */
+   if (IS_NOQUOTA(inode)) {
+   mutex_unlock(inode-i_mutex);
+   return -EPERM;
+   }
oldflags = ei-i_flags;
 
/* The JOURNAL_DATA flag is modifiable only by root */
diff -puN fs/jfs/ioctl.c~forbid-user-to-change-file-flags-on-quota-files 
fs/jfs/ioctl.c
--- a/fs/jfs/ioctl.c~forbid-user-to-change-file-flags-on-quota-files
+++ a/fs/jfs/ioctl.c
@@ -79,6 +79,9 @@ int jfs_ioctl(struct inode * inode, stru
if (!S_ISDIR(inode-i_mode))
flags = ~JFS_DIRSYNC_FL;
 
+   /* Is it quota file? Do not allow user to mess with it */
+   if (IS_NOQUOTA(inode))
+   return -EPERM;
jfs_get_inode_flags(jfs_inode);
oldflags = jfs_inode-mode2;
 
diff -puN fs/reiserfs/ioctl.c~forbid-user-to-change-file-flags-on-quota-files 
fs/reiserfs/ioctl.c
--- a/fs/reiserfs/ioctl.c~forbid-user-to-change-file-flags-on-quota-files
+++ a/fs/reiserfs/ioctl.c
@@ -57,6 +57,9 @@ int reiserfs_ioctl(struct inode *inode, 
if (get_user(flags, (int __user *)arg))
return -EFAULT;
 
+   /* Is it quota file? Do not allow user to mess with it. 
*/
+   if (IS_NOQUOTA(inode))
+   return -EPERM;
if (((flags ^ REISERFS_I(inode)-
  i_attrs)  (REISERFS_IMMUTABLE_FL |
  REISERFS_APPEND_FL))
_

Patches currently in -mm which might be from [EMAIL PROTECTED] are

origin.patch
git-ocfs2.patch
r-o-bind-mounts-elevate-write-count-for-some-ioctls-vs-forbid-user-to-change-file-flags-on-quota-files.patch
iget-stop-ext3-from-using-iget-and-read_inode-try.patch
iget-stop-ext3-from-using-iget-and-read_inode-try-checkpatch-fixes.patch
iget-stop-ext4-from-using-iget-and-read_inode-try.patch

-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]

[PATCH] types fixup for mballoc

2007-11-15 Thread Eric Sandeen
I ran into a potential overflow in ext4_mb_scan_aligned, 
and went looking for others in mballoc.  This patch hits a 
few spots, compile-tested only at this point, comments welcome.

This patch:

changes fe_len to an int, I don't think we need it to be a long,
looking at how it's used (should it be a grpblk_t?)  Also change
anything assigned to return value of mb_find_extent, since it returns
fe_len.

changes anything that does groupno * EXT4_BLOCKS_PER_GROUP
or pa-pa_pstart + whatever to an ext4_fsblk_t

fixes up any related formats

The change to ext4_mb_scan_aligned to add a modulo to avoid an overflow
may be too clever... it could use an extra look I think.

Signed-off-by: Eric Sandeen [EMAIL PROTECTED]

---

Index: linux-2.6.24-rc1/fs/ext4/mballoc.c
===
--- linux-2.6.23.orig/fs/ext4/mballoc.c
+++ linux-2.6.23/fs/ext4/mballoc.c
@@ -363,7 +363,7 @@ struct ext4_free_extent {
ext4_lblk_t fe_logical;
ext4_grpblk_t fe_start;
ext4_grpnum_t fe_group;
-   unsigned long fe_len;
+   int fe_len;
 };
 
 /*
@@ -586,14 +586,14 @@ static void mb_free_blocks_double(struct
BUG_ON(!ext4_is_group_locked(sb, e4b-bd_group));
for (i = 0; i  count; i++) {
if (!mb_test_bit(first + i, e4b-bd_info-bb_bitmap)) {
-   unsigned long blocknr;
+   ext4_fsblk_t blocknr;
blocknr = e4b-bd_group * EXT4_BLOCKS_PER_GROUP(sb);
blocknr += first + i;
blocknr +=
le32_to_cpu(EXT4_SB(sb)-s_es-s_first_data_block);
 
ext4_error(sb, __FUNCTION__, double-free of inode
-   %lu's block %lu(bit %u in group %lu)\n,
+   %lu's block %llu(bit %u in group %lu)\n,
   inode ? inode-i_ino : 0, blocknr,
   first + i, e4b-bd_group);
}
@@ -1226,14 +1226,14 @@ static int mb_free_blocks(struct inode *
order = 0;
 
if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) {
-   unsigned long blocknr;
+   ext4_fsblk_t blocknr;
blocknr = e4b-bd_group * EXT4_BLOCKS_PER_GROUP(sb);
blocknr += block;
blocknr +=
le32_to_cpu(EXT4_SB(sb)-s_es-s_first_data_block);
 
ext4_error(sb, __FUNCTION__, double-free of inode
-   %lu's block %lu(bit %u in group %lu)\n,
+   %lu's block %llu(bit %u in group %lu)\n,
   inode ? inode-i_ino : 0, blocknr, block,
   e4b-bd_group);
}
@@ -1416,7 +1416,7 @@ static void ext4_mb_use_best_found(struc
struct ext4_buddy *e4b)
 {
struct ext4_sb_info *sbi = EXT4_SB(ac-ac_sb);
-   unsigned long ret;
+   int ret;
 
BUG_ON(ac-ac_b_ex.fe_group != e4b-bd_group);
BUG_ON(ac-ac_status == AC_STATUS_FOUND);
@@ -1609,7 +1609,7 @@ static int ext4_mb_find_by_goal(struct e
 ac-ac_g_ex.fe_len, ex);
 
if (max = ac-ac_g_ex.fe_len  ac-ac_g_ex.fe_len == sbi-s_stripe) {
-   unsigned long start;
+   ext4_fsblk_t start;
start = (e4b-bd_group * EXT4_BLOCKS_PER_GROUP(ac-ac_sb) +
ex.fe_start + le32_to_cpu(es-s_first_data_block));
if (start % sbi-s_stripe == 0) {
@@ -1732,13 +1732,14 @@ static void ext4_mb_scan_aligned(struct 
struct ext4_sb_info *sbi = EXT4_SB(sb);
void *bitmap = EXT4_MB_BITMAP(e4b);
struct ext4_free_extent ex;
-   unsigned long i;
-   unsigned long max;
+   ext4_grpblk_t i;
+   int max;
 
BUG_ON(sbi-s_stripe == 0);
 
-   /* find first stripe-aligned block */
-   i = e4b-bd_group * EXT4_BLOCKS_PER_GROUP(sb)
+   /* find first stripe-aligned block in group */
+   /* early modulo here to avoid 32-bit overflow */
+   i = (e4b-bd_group % sbi-s_stripe)  * EXT4_BLOCKS_PER_GROUP(sb)
+ le32_to_cpu(sbi-s_es-s_first_data_block);
i = ((i + sbi-s_stripe - 1) / sbi-s_stripe) * sbi-s_stripe;
i = (i - le32_to_cpu(sbi-s_es-s_first_data_block))
@@ -2026,35 +2027,35 @@ static int ext4_mb_seq_history_show(stru
if (hs-op == EXT4_MB_HISTORY_ALLOC) {
fmt = %-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u 
%-5u %-5s %-5u %-6u\n;
-   sprintf(buf2, %lu/%d/[EMAIL PROTECTED], hs-result.fe_group,
+   sprintf(buf2, %lu/%d/[EMAIL PROTECTED], hs-result.fe_group,
hs-result.fe_start, hs-result.fe_len,
-   (unsigned long)hs-result.fe_logical);
-   

Re: [PATCH] ext3,4:fdatasync should skip metadata writeout

2007-11-15 Thread Andrew Morton
On Fri, 16 Nov 2007 11:47:27 +0900 Hisashi Hifumi [EMAIL PROTECTED] wrote:

 Currently fdatasync is identical to fsync in ext3,4.
 I think fdatasync should skip journal flush in data=ordered and 
 data=writeback mode
 because this syscall is not required to synchronize the metadata.

I suppose so.  Although one wonders what earthly point there is in syncing
a file's data if we haven't yet written out the metadata which is required
for locating that data.

IOW, fdatasync() is only useful if the application knows that it is overwriting
already-instantiated blocks.

In which case it might as well have used fsync().  For ext2-style filesystems,
anyway.

hm.  It needs some thought.
-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] ext3,4:fdatasync should skip metadata writeout

2007-11-15 Thread Hisashi Hifumi
Hi.
Currently fdatasync is identical to fsync in ext3,4.
I think fdatasync should skip journal flush in data=ordered and data=writeback 
mode
because this syscall is not required to synchronize the metadata.

My patch as below is similar to the approach of GFS2's fsync code(gfs2_fsync).

Thanks.

Signed-off-by :Hisashi Hifumi [EMAIL PROTECTED]

diff -Nrup linux-2.6.24-rc2.org/fs/ext3/fsync.c linux-2.6.24-rc2/fs/ext3/fsync.c
--- linux-2.6.24-rc2.org/fs/ext3/fsync.c2007-11-07 06:57:46.0 
+0900
+++ linux-2.6.24-rc2/fs/ext3/fsync.c2007-11-15 17:50:24.0 +0900
@@ -72,6 +72,9 @@ int ext3_sync_file(struct file * file, s
goto out;
}
 
+   if (datasync)
+   goto out;   
+
/*
* The VFS has written the file data.  If the inode is unaltered
* then we need not start a commit.
diff -Nrup linux-2.6.24-rc2.org/fs/ext4/fsync.c linux-2.6.24-rc2/fs/ext4/fsync.c
--- linux-2.6.24-rc2.org/fs/ext4/fsync.c2007-11-07 06:57:46.0 
+0900
+++ linux-2.6.24-rc2/fs/ext4/fsync.c2007-11-15 17:50:54.0 +0900
@@ -72,6 +72,9 @@ int ext4_sync_file(struct file * file, s
goto out;
}
 
+   if (datasync)
+   goto out;
+
/*
* The VFS has written the file data.  If the inode is unaltered
* then we need not start a commit.

patch-2624rc2-ext34-fdatasync.txt
Description: Binary data


Re: [PATCH] ext3,4:fdatasync should skip metadata writeout

2007-11-15 Thread Jörn Engel
On Fri, 16 November 2007 11:47:27 +0900, Hisashi Hifumi wrote:
 
 diff -Nrup linux-2.6.24-rc2.org/fs/ext3/fsync.c 
 linux-2.6.24-rc2/fs/ext3/fsync.c
 --- linux-2.6.24-rc2.org/fs/ext3/fsync.c  2007-11-07 06:57:46.0 
 +0900
 +++ linux-2.6.24-rc2/fs/ext3/fsync.c  2007-11-15 17:50:24.0 +0900
 @@ -72,6 +72,9 @@ int ext3_sync_file(struct file * file, s
   goto out;
   }
  
 + if (datasync)
 + goto out;   
 +
   /*
   * The VFS has written the file data.  If the inode is unaltered
   * then we need not start a commit.

This is wrong.  If I_DIRTY_DATASYNC is set, the inode needs to be
written even for datasync.

How about the patch below?

Jörn

-- 
Audacity augments courage; hesitation, fear.
-- Publilius Syrus


Signed-off-by: Jörn Engel [EMAIL PROTECTED]
---

 fs/ext3/fsync.c |3 ++-
 fs/ext4/fsync.c |3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

--- git_I_DIRTY/fs/ext3/fsync.c~ext3_datasync   2007-11-15 20:51:54.0 
+0100
+++ git_I_DIRTY/fs/ext3/fsync.c 2007-11-16 04:42:28.0 +0100
@@ -76,7 +76,8 @@ int ext3_sync_file(struct file * file, s
 * The VFS has written the file data.  If the inode is unaltered
 * then we need not start a commit.
 */
-   if (inode-i_state  (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
+   if (((inode-i_state  I_DIRTY_SYNC)  !datasync)
+   || (inode-i_state  I_DIRTY_DATASYNC)) {
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.nr_to_write = 0, /* sys_fsync did this */
--- git_I_DIRTY/fs/ext4/fsync.c~ext3_datasync   2007-11-15 20:51:54.0 
+0100
+++ git_I_DIRTY/fs/ext4/fsync.c 2007-11-16 04:44:29.0 +0100
@@ -76,7 +76,8 @@ int ext4_sync_file(struct file * file, s
 * The VFS has written the file data.  If the inode is unaltered
 * then we need not start a commit.
 */
-   if (inode-i_state  (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
+   if (((inode-i_state  I_DIRTY_SYNC)  !datasync)
+   || (inode-i_state  I_DIRTY_DATASYNC)) {
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.nr_to_write = 0, /* sys_fsync did this */
-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] ext3,4:fdatasync should skip metadata writeout

2007-11-15 Thread Andrew Morton
On Thu, 15 Nov 2007 22:47:40 -0500 Wendy Cheng [EMAIL PROTECTED] wrote:

 Andrew Morton wrote:
 
 On Fri, 16 Nov 2007 11:47:27 +0900 Hisashi Hifumi [EMAIL PROTECTED] wrote:
 
   
 
 Currently fdatasync is identical to fsync in ext3,4.
 I think fdatasync should skip journal flush in data=ordered and 
 data=writeback mode
 because this syscall is not required to synchronize the metadata.
 
 
 
 I suppose so.  Although one wonders what earthly point there is in syncing
 a file's data if we haven't yet written out the metadata which is required
 for locating that data.
 
 IOW, fdatasync() is only useful if the application knows that it is 
 overwriting
 already-instantiated blocks.
 
 In which case it might as well have used fsync().  For ext2-style 
 filesystems,
 anyway.
 
 hm.  It needs some thought.
 
   
 
 
 There are non-trivial amount of performance critical programs, 
 particularly in financial application segment ported from legacy UNIX 
 platforms, know the difference between fsync() and fdatasync(). Those 
 can certainly take advantages of this separation. Don't underestimate 
 the talents of these application programmers.
 

If they're that good, they'll be using sync_file_range() ;)
-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] ext3,4:fdatasync should skip metadata writeout

2007-11-15 Thread Jörn Engel
On Thu, 15 November 2007 18:59:19 -0800, Andrew Morton wrote:
 On Fri, 16 Nov 2007 11:47:27 +0900 Hisashi Hifumi [EMAIL PROTECTED] wrote:
 
  Currently fdatasync is identical to fsync in ext3,4.
  I think fdatasync should skip journal flush in data=ordered and 
  data=writeback mode
  because this syscall is not required to synchronize the metadata.
 
 I suppose so.  Although one wonders what earthly point there is in syncing
 a file's data if we haven't yet written out the metadata which is required
 for locating that data.
 
 IOW, fdatasync() is only useful if the application knows that it is 
 overwriting
 already-instantiated blocks.
 
 In which case it might as well have used fsync().  For ext2-style filesystems,
 anyway.

fsync() will sync an inode even if only i_atime was changed.
fdatasync() would ignore such changes.  I guess atime was the major
reason for creating fdatasync() in the first place.

The patch I sent you just minutes ago sorta documents this.
I_DIRTY_DATASYNC was added with patch-2.4.0-test12 for just this reason.

So basically an application can almost always use fdatasync() instead of
fsync() and rely on the kernel to only cut corners, if doing so will not
endanger the data synced to disk.

Jörn

-- 
Joern's library part 6:
http://www.gzip.org/zlib/feldspar.html
-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] types fixup for mballoc

2007-11-15 Thread Eric Sandeen
Eric Sandeen wrote:
 I ran into a potential overflow in ext4_mb_scan_aligned, 
 and went looking for others in mballoc.  This patch hits a 
 few spots, compile-tested only at this point, comments welcome.
 
 This patch:
 

... introduces a 64-bit divide.  Oops... will fix that up  resend.

-Eric
-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html