Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

2007-07-16 Thread Mingming Cao
On Fri, 2007-07-13 at 02:05 -0700, Andrew Morton wrote:
 On Tue, 10 Jul 2007 16:32:47 -0700 Andrew Morton [EMAIL PROTECTED] wrote:
 
   + brelse(bh);
   + up_write(EXT4_I(inode)-xattr_sem);
   + return error;
   +}
   +
  
  We're doing GFP_KERNEL memory allocations while holding xattr_sem.  This
  can cause the VM to reenter the filesystem, perhaps taking i_mutex and/or
  i_truncate_sem and/or journal_start() (I forget whether this still
  happens).  Have we checked whether this can occur and if so, whether we are
  OK from a lock ranking POV?  Bear in mind that journalled-data mode is more
  complex in this regard.
 
 I notice that everyone carefully avoided addressing this ;)

I am not sure why we need GFP_KERNEL flag here. I think we should use
GFP_NOFS instead. The following patch use the GFP_NOFS flag, as well as
fixing memory leak issue introduced by the ext4 expand inode extra isize
patch.

Fixing memory allocation issue with expand inode extra isize patch.

- use GFP_NOFS instead of GFP_KERNEL flag for memory allocation
- use kzalloc instead of kmalloc
- fix memory leak in the success case, at the end of while loop.


Signed-off-by: Mingming Cao [EMAIL PROTECTED]
---
 fs/ext4/xattr.c |   16 
 1 file changed, 12 insertions(+), 4 deletions(-)

Index: linux-2.6.22/fs/ext4/xattr.c
===
--- linux-2.6.22.orig/fs/ext4/xattr.c   2007-07-16 16:12:18.0 -0700
+++ linux-2.6.22/fs/ext4/xattr.c2007-07-16 16:35:59.0 -0700
@@ -1204,8 +1204,8 @@ retry:
unsigned int shift_bytes; /* No. of bytes to shift EAs by? */
unsigned int min_total_size = ~0U;
 
-   is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_KERNEL);
-   bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_KERNEL);
+   is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS);
+   bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS);
if (!is || !bs) {
error = -ENOMEM;
goto cleanup;
@@ -1251,8 +1251,8 @@ retry:
size = le32_to_cpu(entry-e_value_size);
entry_size = EXT4_XATTR_LEN(entry-e_name_len);
i.name_index = entry-e_name_index,
-   buffer = kmalloc(EXT4_XATTR_SIZE(size), GFP_KERNEL);
-   b_entry_name = kmalloc(entry-e_name_len + 1, GFP_KERNEL);
+   buffer = kzalloc(EXT4_XATTR_SIZE(size), GFP_NOFS);
+   b_entry_name = kzalloc(entry-e_name_len + 1, GFP_NOFS);
if (!buffer || !b_entry_name) {
error = -ENOMEM;
goto cleanup;
@@ -1302,7 +1302,15 @@ retry:
error = ext4_xattr_block_set(handle, inode, i, bs);
if (error)
goto cleanup;
+   kfree(b_entry_name);
+   kfree(buffer);
+   brelse(is-iloc.bh);
+   kfree(is);
+   kfree(bs);
+   brelse(bh);
}
+   up_write(EXT4_I(inode)-xattr_sem);
+return 0;
 
 cleanup:
kfree(b_entry_name);





-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

2007-07-16 Thread Andreas Dilger
On Jul 16, 2007  16:52 -0700, Mingming Cao wrote:
 I am not sure why we need GFP_KERNEL flag here. I think we should use
 GFP_NOFS instead. The following patch use the GFP_NOFS flag, as well as
 fixing memory leak issue introduced by the ext4 expand inode extra isize
 patch.
 
 Fixing memory allocation issue with expand inode extra isize patch.
 
 - use GFP_NOFS instead of GFP_KERNEL flag for memory allocation
 - use kzalloc instead of kmalloc

This doesn't need kzalloc() for buffer and b_entry_name, since they are
immediately overwritten by memcpy().

 - fix memory leak in the success case, at the end of while loop.
   goto cleanup;
 @@ -1302,7 +1302,15 @@ retry:
   error = ext4_xattr_block_set(handle, inode, i, bs);
   if (error)
   goto cleanup;
 + kfree(b_entry_name);
 + kfree(buffer);
 + brelse(is-iloc.bh);
 + kfree(is);
 + kfree(bs);
 + brelse(bh);
   }
 + up_write(EXT4_I(inode)-xattr_sem);
 +return 0;
  
  cleanup:
   kfree(b_entry_name);

I don't think you should have brelse(bh) inside the loop, since it is
allocated before the loop starts.

Cheers, Andreas
--
Andreas Dilger
Principal Software Engineer
Cluster File Systems, Inc.

-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

2007-07-16 Thread Mingming Cao
On Mon, 2007-07-16 at 18:06 -0600, Andreas Dilger wrote:
 On Jul 16, 2007  16:52 -0700, Mingming Cao wrote:
  I am not sure why we need GFP_KERNEL flag here. I think we should use
  GFP_NOFS instead. The following patch use the GFP_NOFS flag, as well as
  fixing memory leak issue introduced by the ext4 expand inode extra isize
  patch.
  
  Fixing memory allocation issue with expand inode extra isize patch.
  
  - use GFP_NOFS instead of GFP_KERNEL flag for memory allocation
  - use kzalloc instead of kmalloc
 
 This doesn't need kzalloc() for buffer and b_entry_name, since they are
 immediately overwritten by memcpy().
 
ok.
  - fix memory leak in the success case, at the end of while loop.
  goto cleanup;
  @@ -1302,7 +1302,15 @@ retry:
  error = ext4_xattr_block_set(handle, inode, i, bs);
  if (error)
  goto cleanup;
  +   kfree(b_entry_name);
  +   kfree(buffer);
  +   brelse(is-iloc.bh);
  +   kfree(is);
  +   kfree(bs);
  +   brelse(bh);
  }
  +   up_write(EXT4_I(inode)-xattr_sem);
  +return 0;
   
   cleanup:
  kfree(b_entry_name);
 
 I don't think you should have brelse(bh) inside the loop, since it is
 allocated before the loop starts.
 
Ahh right. thanks.

Updated fix.

Fixing memory allocation issue with expand inode extra isize patch.

- use GFP_NOFS instead of GFP_KERNEL to prevent fs reenter
- fix memory leak in the success case, at the end of while loop.


Signed-off-by: Mingming Cao [EMAIL PROTECTED]
---
 fs/ext4/xattr.c |   16 
 1 file changed, 12 insertions(+), 4 deletions(-)

Index: linux-2.6.22/fs/ext4/xattr.c
===
--- linux-2.6.22.orig/fs/ext4/xattr.c   2007-07-16 17:21:14.0 -0700
+++ linux-2.6.22/fs/ext4/xattr.c2007-07-16 17:22:25.0 -0700
@@ -1204,8 +1204,8 @@ retry:
unsigned int shift_bytes; /* No. of bytes to shift EAs by? */
unsigned int min_total_size = ~0U;
 
-   is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_KERNEL);
-   bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_KERNEL);
+   is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS);
+   bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS);
if (!is || !bs) {
error = -ENOMEM;
goto cleanup;
@@ -1251,8 +1251,8 @@ retry:
size = le32_to_cpu(entry-e_value_size);
entry_size = EXT4_XATTR_LEN(entry-e_name_len);
i.name_index = entry-e_name_index,
-   buffer = kmalloc(EXT4_XATTR_SIZE(size), GFP_KERNEL);
-   b_entry_name = kmalloc(entry-e_name_len + 1, GFP_KERNEL);
+   buffer = kmalloc(EXT4_XATTR_SIZE(size), GFP_NOFS);
+   b_entry_name = kmalloc(entry-e_name_len + 1, GFP_NOFS);
if (!buffer || !b_entry_name) {
error = -ENOMEM;
goto cleanup;
@@ -1302,7 +1302,15 @@ retry:
error = ext4_xattr_block_set(handle, inode, i, bs);
if (error)
goto cleanup;
+   kfree(b_entry_name);
+   kfree(buffer);
+   brelse(is-iloc.bh);
+   kfree(is);
+   kfree(bs);
}
+   brelse(bh);
+   up_write(EXT4_I(inode)-xattr_sem);
+   return 0;
 
 cleanup:
kfree(b_entry_name);


-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

2007-07-15 Thread Peter Zijlstra
On Fri, 2007-07-13 at 14:47 -0700, Zach Brown wrote:

 Peter, do you have any interest in seeing how far we can get
 at tracking lock_page()?  I'm not holding my breath, but any little bit
 would probably help.

I ran headfirst into the fact the unlock_page() need not be called by
the same task that did lock_page().

Esp IO-completion interrupts love to unlock pages they did not lock
themselves. Not at all sure that is fixable, it seems to be the nature
of the async structure of the problem :-(



-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

2007-07-15 Thread Peter Zijlstra
On Fri, 2007-07-13 at 14:47 -0700, Zach Brown wrote:

 Peter, do you have any interest in seeing how far we can get
 at tracking lock_page()?  I'm not holding my breath, but any little bit
 would probably help.

Would this be a valid report? 

( /me goes hunt a x86_64 unwinder patch that will apply to this tree.
  These stacktraces are pain )

===
[ INFO: possible circular locking dependency detected ]
[ 2.6.22-rt3-dirty #34
---
mount/1296 is trying to acquire lock:
 (ei-truncate_mutex){--..}, at: [802f75e5] 
ext3_get_blocks_handle+0x1a4/0x8f7

but task is already holding lock:
 (lock_page_0){--..}, at: [80267107] 
generic_file_buffered_write+0x1ee/0x646

which lock already depends on the new lock.


the existing dependency chain (in reverse order) is:

- #1 (lock_page_0){--..}:
   [80251b26] __lock_acquire+0xa72/0xc35
   [802520c9] lock_acquire+0x48/0x61
   [80265e22] add_to_page_cache_lru+0xe/0x23
   [80265d31] add_to_page_cache+0x1de/0x2c1
   [80265e22] add_to_page_cache_lru+0xe/0x23
   [80266985] find_or_create_page+0x4c/0x73
   [802ae716] __getblk+0x118/0x23c
   [802afa91] __bread+0x6/0x9c
   [802f382d] read_block_bitmap+0x34/0x65
   [802f3e1b] ext3_free_blocks_sb+0xec/0x3d4
   [802f4131] ext3_free_blocks+0x2e/0x61
   [802f82bc] ext3_free_data+0xaa/0xda
   [802f8976] ext3_truncate+0x4d2/0x84e
   [8026df5a] pagevec_lookup+0x17/0x1e
   [8026e7b1] truncate_inode_pages_range+0x1f4/0x323
   [802614b4] add_preempt_count+0x14/0xe4
   [80304d13] journal_stop+0x1fe/0x21d
   [8027661a] vmtruncate+0xa2/0xc0
   [802a292b] inode_setattr+0x22/0x10a
   [802f9b51] ext3_setattr+0x136/0x18f
   [802a2b1d] notify_change+0x10a/0x241
   [802a2b3b] notify_change+0x128/0x241
   [8028e35e] do_truncate+0x56/0x7f
   [8028e369] do_truncate+0x61/0x7f
   [80296278] get_write_access+0x3f/0x45
   [802973c7] may_open+0x193/0x1af
   [80299869] open_namei+0x2cb/0x63e
   [8025718b] rt_up_read+0x53/0x5c
   [8056da59] do_page_fault+0x479/0x7cc
   [8028dce1] do_filp_open+0x1c/0x38
   [8056a4f9] rt_spin_unlock+0x17/0x47
   [8028da05] get_unused_fd+0xf9/0x107
   [8028dd45] do_sys_open+0x48/0xd5
   [8020950e] system_call+0x7e/0x83
   [] 0x

- #0 (ei-truncate_mutex){--..}:
   [802503b9] print_circular_bug_header+0xcc/0xd3
   [80251a22] __lock_acquire+0x96e/0xc35
   [802520c9] lock_acquire+0x48/0x61
   [802f75e5] ext3_get_blocks_handle+0x1a4/0x8f7
   [8056a6d4] _mutex_lock+0x26/0x52
   [802f75e5] ext3_get_blocks_handle+0x1a4/0x8f7
   [802504b2] find_usage_backwards+0xb0/0xd9
   [802504b2] find_usage_backwards+0xb0/0xd9
   [80250d7c] debug_check_no_locks_freed+0x11d/0x129
   [80250c33] trace_hardirqs_on_caller+0x115/0x138
   [8024efdc] lockdep_init_map+0xac/0x41f
   [802614b4] add_preempt_count+0x14/0xe4
   [802f8035] ext3_get_block+0xc2/0xe4
   [802aeed3] __block_prepare_write+0x195/0x442
   [802f7f73] ext3_get_block+0x0/0xe4
   [802af19a] block_prepare_write+0x1a/0x25
   [802f93e9] ext3_prepare_write+0xb2/0x17b
   [802671b1] generic_file_buffered_write+0x298/0x646
   [8023944e] current_fs_time+0x3b/0x40
   [802614b4] add_preempt_count+0x14/0xe4
   [802678ae] __generic_file_aio_write_nolock+0x34f/0x3b9
   [8024ed3d] put_lock_stats+0xe/0x2a
   [80267964] generic_file_aio_write+0x4c/0xc4
   [80267979] generic_file_aio_write+0x61/0xc4
   [802fcf18] ext3_orphan_del+0x53/0x19f
   [802f5768] ext3_file_write+0x1c/0x9d
   [8028ef31] do_sync_write+0xcc/0x10f
   [80246f9c] autoremove_wake_function+0x0/0x2e
   [8024ecfe] get_lock_stats+0xe/0x3f
   [8024ed9a] lock_release_holdtime+0x41/0x4f
   [8024ed3d] put_lock_stats+0xe/0x2a
   [8028dfb1] sys_fchmod+0xa3/0xbd
   [8056a717] _mutex_unlock+0x17/0x20
   [8028f6cd] vfs_write+0xb6/0x148
   [8028fc61] sys_write+0x48/0x74
   [8020950e] system_call+0x7e/0x83
   [] 0x

other info that might help us debug this:

2 locks held by mount/1296:
 #0:  (inode-i_mutex){--..}, at: [80267964] 
generic_file_aio_write+0x4c/0xc4
 #1:  (lock_page_0){--..}, at: [80267107] 

Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

2007-07-15 Thread Peter Zijlstra
On Sun, 2007-07-15 at 15:02 +0200, Peter Zijlstra wrote:
 On Fri, 2007-07-13 at 14:47 -0700, Zach Brown wrote:
 
  Peter, do you have any interest in seeing how far we can get
  at tracking lock_page()?  I'm not holding my breath, but any little bit
  would probably help.
 
 Would this be a valid report? 

===
[ INFO: possible circular locking dependency detected ]
[ 2.6.22-rt3-dirty #35
---
mkdir/1662 is trying to acquire lock:
 (lock_page_0){--..}, at: [80265df6] add_to_page_cache_lru+0xe/0x23

but task is already holding lock:
 (jbd_handle){--..}, at: [80305797] journal_start+0x108/0x12c

which lock already depends on the new lock.


the existing dependency chain (in reverse order) is:

- #1 (jbd_handle){--..}:
   [80251b16] __lock_acquire+0xa72/0xc35
   [802520b9] lock_acquire+0x48/0x61
   [80305797] journal_start+0x108/0x12c
   [803057b3] journal_start+0x124/0x12c
   [802f92e9] ext3_prepare_write+0x42/0x17b
   [80267185] generic_file_buffered_write+0x298/0x646
   [8023943e] current_fs_time+0x3b/0x40
   [802614a4] add_preempt_count+0x14/0xe4
   [80267882] __generic_file_aio_write_nolock+0x34f/0x3b9
   [8024ed2d] put_lock_stats+0xe/0x2a
   [80267938] generic_file_aio_write+0x4c/0xc4
   [8026794d] generic_file_aio_write+0x61/0xc4
   [802fce88] ext3_orphan_del+0x53/0x19f
   [802f56d8] ext3_file_write+0x1c/0x9d
   [8028eedd] do_sync_write+0xcc/0x10f
   [80246f8d] autoremove_wake_function+0x0/0x2e
   [8024ecee] get_lock_stats+0xe/0x3f
   [8024ed8a] lock_release_holdtime+0x41/0x4f
   [8024ed2d] put_lock_stats+0xe/0x2a
   [8028df5d] sys_fchmod+0xa3/0xbd
   [8056a6d7] _mutex_unlock+0x17/0x20
   [8028f679] vfs_write+0xb6/0x148
   [8028fc0d] sys_write+0x48/0x74
   [8020950e] system_call+0x7e/0x83
   [] 0x

- #0 (lock_page_0){--..}:
   [802503a9] print_circular_bug_header+0xcc/0xd3
   [80251a12] __lock_acquire+0x96e/0xc35
   [802520b9] lock_acquire+0x48/0x61
   [80265df6] add_to_page_cache_lru+0xe/0x23
   [80265d05] add_to_page_cache+0x1de/0x2c1
   [80265df6] add_to_page_cache_lru+0xe/0x23
   [80266959] find_or_create_page+0x4c/0x73
   [802ae6c2] __getblk+0x118/0x23c
   [802f7d9a] ext3_getblk+0xf2/0x23b
   [80306337] journal_dirty_metadata+0x1a8/0x1b3
   [80301e3e] __ext3_journal_dirty_metadata+0x1e/0x46
   [802f6c63] ext3_mark_iloc_dirty+0x293/0x30a
   [802f70a1] ext3_mark_inode_dirty+0x3f/0x48
   [802f644e] ext3_new_inode+0x8ff/0x943
   [802f8c9c] ext3_bread+0x11/0x84
   [802fccc3] ext3_mkdir+0xdd/0x24f
   [80296893] vfs_mkdir+0x6d/0xb5
   [8029902e] sys_mkdirat+0xa1/0xec
   [80569f4d] trace_hardirqs_on_thunk+0x3a/0x3c
   [80250c23] trace_hardirqs_on_caller+0x115/0x138
   [80569f4d] trace_hardirqs_on_thunk+0x3a/0x3c
   [8020950e] system_call+0x7e/0x83
   [] 0x

other info that might help us debug this:

2 locks held by mkdir/1662:
 #0:  (inode-i_mutex/1){--..}, at: [80297154] 
lookup_create+0x26/0x8b
 #1:  (jbd_handle){--..}, at: [80305797] journal_start+0x108/0x12c

stack backtrace:

Call Trace:
 [8024ffad] print_circular_bug_tail+0x69/0x72
 [802503a9] print_circular_bug_header+0xcc/0xd3
 [80251a12] __lock_acquire+0x96e/0xc35
 [802520b9] lock_acquire+0x48/0x61
 [80265df6] add_to_page_cache_lru+0xe/0x23
 [80265d05] add_to_page_cache+0x1de/0x2c1
 [80265df6] add_to_page_cache_lru+0xe/0x23
 [80266959] find_or_create_page+0x4c/0x73
 [802ae6c2] __getblk+0x118/0x23c
 [802f7d9a] ext3_getblk+0xf2/0x23b
 [80306337] journal_dirty_metadata+0x1a8/0x1b3
 [80301e3e] __ext3_journal_dirty_metadata+0x1e/0x46
 [802f6c63] ext3_mark_iloc_dirty+0x293/0x30a
 [802f70a1] ext3_mark_inode_dirty+0x3f/0x48
 [802f644e] ext3_new_inode+0x8ff/0x943
 [802f8c9c] ext3_bread+0x11/0x84
 [802fccc3] ext3_mkdir+0xdd/0x24f
 [80296893] vfs_mkdir+0x6d/0xb5
 [8029902e] sys_mkdirat+0xa1/0xec
 [80569f4d] trace_hardirqs_on_thunk+0x3a/0x3c
 [80250c23] trace_hardirqs_on_caller+0x115/0x138
 [80569f4d] trace_hardirqs_on_thunk+0x3a/0x3c
 [8020950e] system_call+0x7e/0x83

INFO: lockdep is turned off.
---
| preempt count:  ]
| 0-level deep critical section nesting:



Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

2007-07-15 Thread Peter Zijlstra
On Sun, 2007-07-15 at 11:11 -0700, Andrew Morton wrote:
 On Sun, 15 Jul 2007 15:02:23 +0200 Peter Zijlstra [EMAIL PROTECTED] wrote:
 
  On Fri, 2007-07-13 at 14:47 -0700, Zach Brown wrote:
  
   Peter, do you have any interest in seeing how far we can get
   at tracking lock_page()?  I'm not holding my breath, but any little bit
   would probably help.
  
  Would this be a valid report? 
  
  ( /me goes hunt a x86_64 unwinder patch that will apply to this tree.
These stacktraces are pain )
 
 They are.  lockdep reports are a pain too.  It's still a struggle to
 understand wtf they're trying to tell you.  Mabe it's just me.

It got you confused alright,..

  ===
  [ INFO: possible circular locking dependency detected ]
  [ 2.6.22-rt3-dirty #34
  ---
  mount/1296 is trying to acquire lock:
   (ei-truncate_mutex){--..}, at: [802f75e5] 
  ext3_get_blocks_handle+0x1a4/0x8f7
  
  but task is already holding lock:
   (lock_page_0){--..}, at: [80267107] 
  generic_file_buffered_write+0x1ee/0x646
 
  which lock already depends on the new lock.
 

So, the offence is trying to acquire ei-truncate_mutex while already
holding lock_page_0.

These traces show how the previous (reverse) dependancy came into being

  
  the existing dependency chain (in reverse order) is:
  
  - #1 (lock_page_0){--..}:
 [80251b26] __lock_acquire+0xa72/0xc35
 [802520c9] lock_acquire+0x48/0x61
 [80265e22] add_to_page_cache_lru+0xe/0x23
 [80265d31] add_to_page_cache+0x1de/0x2c1
 [80265e22] add_to_page_cache_lru+0xe/0x23
 [80266985] find_or_create_page+0x4c/0x73
 [802ae716] __getblk+0x118/0x23c
 [802afa91] __bread+0x6/0x9c
 [802f382d] read_block_bitmap+0x34/0x65
 [802f3e1b] ext3_free_blocks_sb+0xec/0x3d4
 [802f4131] ext3_free_blocks+0x2e/0x61
 [802f82bc] ext3_free_data+0xaa/0xda
 [802f8976] ext3_truncate+0x4d2/0x84e
 [8026df5a] pagevec_lookup+0x17/0x1e
 [8026e7b1] truncate_inode_pages_range+0x1f4/0x323
 [802614b4] add_preempt_count+0x14/0xe4
 [80304d13] journal_stop+0x1fe/0x21d
 [8027661a] vmtruncate+0xa2/0xc0
 [802a292b] inode_setattr+0x22/0x10a
 [802f9b51] ext3_setattr+0x136/0x18f
 [802a2b1d] notify_change+0x10a/0x241
 [802a2b3b] notify_change+0x128/0x241
 [8028e35e] do_truncate+0x56/0x7f
 [8028e369] do_truncate+0x61/0x7f
 [80296278] get_write_access+0x3f/0x45
 [802973c7] may_open+0x193/0x1af
 [80299869] open_namei+0x2cb/0x63e
 [8025718b] rt_up_read+0x53/0x5c
 [8056da59] do_page_fault+0x479/0x7cc
 [8028dce1] do_filp_open+0x1c/0x38
 [8056a4f9] rt_spin_unlock+0x17/0x47
 [8028da05] get_unused_fd+0xf9/0x107
 [8028dd45] do_sys_open+0x48/0xd5
 [8020950e] system_call+0x7e/0x83
 [] 0x
 
 I guess we're doing lock_page() against a blockdev pagecache page here
 while holding truncate_mutex against some S_ISREG file.

So this trace ( - #1 ) shows how lock_page_0 became to depend on
ei-truncate_mutex ( - #0 ).

  - #0 (ei-truncate_mutex){--..}:
 [802503b9] print_circular_bug_header+0xcc/0xd3
 [80251a22] __lock_acquire+0x96e/0xc35
 [802520c9] lock_acquire+0x48/0x61
 [802f75e5] ext3_get_blocks_handle+0x1a4/0x8f7
 [8056a6d4] _mutex_lock+0x26/0x52
 [802f75e5] ext3_get_blocks_handle+0x1a4/0x8f7
 [802504b2] find_usage_backwards+0xb0/0xd9
 [802504b2] find_usage_backwards+0xb0/0xd9
 [80250d7c] debug_check_no_locks_freed+0x11d/0x129
 [80250c33] trace_hardirqs_on_caller+0x115/0x138
 [8024efdc] lockdep_init_map+0xac/0x41f
 [802614b4] add_preempt_count+0x14/0xe4
 [802f8035] ext3_get_block+0xc2/0xe4
 [802aeed3] __block_prepare_write+0x195/0x442
 [802f7f73] ext3_get_block+0x0/0xe4
 [802af19a] block_prepare_write+0x1a/0x25
 [802f93e9] ext3_prepare_write+0xb2/0x17b
 [802671b1] generic_file_buffered_write+0x298/0x646
 [8023944e] current_fs_time+0x3b/0x40
 [802614b4] add_preempt_count+0x14/0xe4
 [802678ae] __generic_file_aio_write_nolock+0x34f/0x3b9
 [8024ed3d] put_lock_stats+0xe/0x2a
 [80267964] generic_file_aio_write+0x4c/0xc4
 [80267979] generic_file_aio_write+0x61/0xc4
 [802fcf18] 

Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

2007-07-15 Thread Andrew Morton
On Sun, 15 Jul 2007 21:21:03 +0200 Peter Zijlstra [EMAIL PROTECTED] wrote:

 Shows the current stacktrace where we violate the previously established
 locking order.

yup, but the lock_page() which we did inside truncate_mutex was a 
lock_page() against a different address_space: the blockdev mapping.

So this is OK - we'll never take truncate_mutex against the blockdev
mapping (it doesn't have one, for a start ;))

This is similar to the quite common case where we take inode A's
i_mutex inside inode B's i_mutex, which needs special lockdep annotations.

I think.  I haven't looked into this in detail.
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

2007-07-15 Thread Peter Zijlstra
On Sun, 2007-07-15 at 12:59 -0700, Andrew Morton wrote:
 On Sun, 15 Jul 2007 21:21:03 +0200 Peter Zijlstra [EMAIL PROTECTED] wrote:
 
  Shows the current stacktrace where we violate the previously established
  locking order.
 
 yup, but the lock_page() which we did inside truncate_mutex was a 
 lock_page() against a different address_space: the blockdev mapping.
 
 So this is OK - we'll never take truncate_mutex against the blockdev
 mapping (it doesn't have one, for a start ;))
 
 This is similar to the quite common case where we take inode A's
 i_mutex inside inode B's i_mutex, which needs special lockdep annotations.
 
 I think.  I haven't looked into this in detail.

Right, I can make lock_page classes per address space. Lets see if this
one goes away.

-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

2007-07-14 Thread Peter Zijlstra
On Fri, 2007-07-13 at 14:47 -0700, Zach Brown wrote:
  I fear the consequences of this change :(
 
 I love it.  In the past I've lost time by working with patches which
 didn't quite realize that ext3 holds a transaction open during
 -direct_IO.
 
  Oh well, please keep it alive, maybe beat on it a bit, resend it
  later on?
 
 I can test the patch to make sure that it catches mistakes I've made in
 the past. 

That would be much appreciated.

  Peter, do you have any interest in seeing how far we can get
 at tracking lock_page()?  I'm not holding my breath, but any little bit
 would probably help.

Yeah I'm going to try that one next.. 

-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

2007-07-13 Thread Peter Zijlstra
On Fri, 2007-07-13 at 02:05 -0700, Andrew Morton wrote:

 Except lockdep doesn't know about journal_start(), which has ranking
 requirements similar to a semaphore.  

Something like so?

Or can journal_stop() be done by a different task than the one that did
journal_start()? - in which case nothing much can be done :-/

This seems to boot... albeit I did not push it hard.

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 fs/jbd/transaction.c |9 +
 include/linux/jbd.h  |5 +
 2 files changed, 14 insertions(+)

Index: linux-2.6/fs/jbd/transaction.c
===
--- linux-2.6.orig/fs/jbd/transaction.c
+++ linux-2.6/fs/jbd/transaction.c
@@ -233,6 +233,8 @@ out:
return ret;
 }
 
+static struct lock_class_key jbd_handle_key;
+
 /* Allocate a new handle.  This should probably be in a slab... */
 static handle_t *new_handle(int nblocks)
 {
@@ -243,6 +245,8 @@ static handle_t *new_handle(int nblocks)
handle-h_buffer_credits = nblocks;
handle-h_ref = 1;
 
+   lockdep_init_map(handle-h_lockdep_map, jbd_handle, jbd_handle_key, 
0);
+
return handle;
 }
 
@@ -286,6 +290,9 @@ handle_t *journal_start(journal_t *journ
current-journal_info = NULL;
handle = ERR_PTR(err);
}
+
+   lock_acquire(handle-h_lockdep_map, 0, 0, 0, 2, _THIS_IP_);
+
return handle;
 }
 
@@ -1411,6 +1418,8 @@ int journal_stop(handle_t *handle)
spin_unlock(journal-j_state_lock);
}
 
+   lock_release(handle-h_lockdep_map, 1, _THIS_IP_);
+
jbd_free_handle(handle);
return err;
 }
Index: linux-2.6/include/linux/jbd.h
===
--- linux-2.6.orig/include/linux/jbd.h
+++ linux-2.6/include/linux/jbd.h
@@ -30,6 +30,7 @@
 #include linux/bit_spinlock.h
 #include linux/mutex.h
 #include linux/timer.h
+#include linux/lockdep.h
 
 #include asm/semaphore.h
 #endif
@@ -405,6 +406,10 @@ struct handle_s
unsigned inth_sync: 1;  /* sync-on-close */
unsigned inth_jdata:1;  /* force data journaling */
unsigned inth_aborted:  1;  /* fatal error on handle */
+
+#ifdef CONFIG_LOCKDEP
+   struct lockdep_map  h_lockdep_map;
+#endif
 };
 
 


-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

2007-07-13 Thread Andrew Morton
On Tue, 10 Jul 2007 16:32:47 -0700 Andrew Morton [EMAIL PROTECTED] wrote:

  +   brelse(bh);
  +   up_write(EXT4_I(inode)-xattr_sem);
  +   return error;
  +}
  +
 
 We're doing GFP_KERNEL memory allocations while holding xattr_sem.  This
 can cause the VM to reenter the filesystem, perhaps taking i_mutex and/or
 i_truncate_sem and/or journal_start() (I forget whether this still
 happens).  Have we checked whether this can occur and if so, whether we are
 OK from a lock ranking POV?  Bear in mind that journalled-data mode is more
 complex in this regard.

I notice that everyone carefully avoided addressing this ;)

Oh well, hopefully people are testing with lockdep enabled.  As long
as the fs is put under extreme memory pressure, most bugs should be reported.

Except lockdep doesn't know about journal_start(), which has ranking
requirements similar to a semaphore.  Nor does it know about lock_page().
We already have hard-to-hit but deadlockable bugs in this area.


-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

2007-07-13 Thread Andreas Dilger
On Jul 13, 2007  15:33 +0200, Peter Zijlstra wrote:
 On Fri, 2007-07-13 at 02:05 -0700, Andrew Morton wrote:
 Or can journal_stop() be done by a different task than the one that did
 journal_start()? - in which case nothing much can be done :-/

The call to journal_stop() has to be in the same process, since the
journal handle is also held in current-journal_info so the handle
does not need to be passed as an argument all over the VFS.

 This seems to boot... albeit I did not push it hard.

Can you please also make a patch for jbd2.


Cheers, Andreas
--
Andreas Dilger
Principal Software Engineer
Cluster File Systems, Inc.

-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

2007-07-13 Thread Andrew Morton
On Fri, 13 Jul 2007 15:33:41 +0200
Peter Zijlstra [EMAIL PROTECTED] wrote:

 On Fri, 2007-07-13 at 02:05 -0700, Andrew Morton wrote:
 
  Except lockdep doesn't know about journal_start(), which has ranking
  requirements similar to a semaphore.  
 
 Something like so?

Looks OK.

 Or can journal_stop() be done by a different task than the one that did
 journal_start()? - in which case nothing much can be done :-/

Yeah, journal_start() and journal_stop() are well-behaved.
 
 This seems to boot... albeit I did not push it hard.

I fear the consequences of this change :(

Oh well, please keep it alive, maybe beat on it a bit, resend it
later on?
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

2007-07-13 Thread Zach Brown
 I fear the consequences of this change :(

I love it.  In the past I've lost time by working with patches which
didn't quite realize that ext3 holds a transaction open during
-direct_IO.

 Oh well, please keep it alive, maybe beat on it a bit, resend it
 later on?

I can test the patch to make sure that it catches mistakes I've made in
the past.  Peter, do you have any interest in seeing how far we can get
at tracking lock_page()?  I'm not holding my breath, but any little bit
would probably help.

- z
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

2007-07-13 Thread Andreas Dilger
On Jul 13, 2007  02:05 -0700, Andrew Morton wrote:
 On Tue, 10 Jul 2007 16:32:47 -0700 Andrew Morton [EMAIL PROTECTED] wrote:
 
   + brelse(bh);
   + up_write(EXT4_I(inode)-xattr_sem);
   + return error;
   +}
   +
  
  We're doing GFP_KERNEL memory allocations while holding xattr_sem.  This
  can cause the VM to reenter the filesystem, perhaps taking i_mutex and/or
  i_truncate_sem and/or journal_start() (I forget whether this still
  happens).  Have we checked whether this can occur and if so, whether we are
  OK from a lock ranking POV?  Bear in mind that journalled-data mode is more
  complex in this regard.
 
 I notice that everyone carefully avoided addressing this ;)
 
 Oh well, hopefully people are testing with lockdep enabled.  As long
 as the fs is put under extreme memory pressure, most bugs should be reported.

I have no objection to changing these to GFP_NOFS or GFP_ATOMIC, because
the number of times this function is called is really quite small (only
for existing inodes when the size of the fixed fields in the inode is
increasing) and the buffers are freed immediately so this won't put any
undue strain on the atomic memory pools.

That said, there is also a GFP_KERNEL allocations in ext3_xattr_block_set()
under xattr_sem, so the same problem would exist there.

I also just noticed that buffer and b_entry_name are leaked in
ext4_expand_extra_isize() if the while loop is run more than one time
(again a relatively rare event).

Cheers, Andreas
--
Andreas Dilger
Principal Software Engineer
Cluster File Systems, Inc.

-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

2007-07-12 Thread Kalpak Shah
On Tue, 2007-07-10 at 16:32 -0700, Andrew Morton wrote:
 On Sun, 01 Jul 2007 03:38:01 -0400
 Mingming Cao [EMAIL PROTECTED] wrote:
 
  This patch is on top of the nanosecond timestamp and i_version_hi
  patches. 
 
 This sort of information isn't needed (or desired) when this patch hits the
 git tree.  Please ensure that things like this are cleaned up before the
 patches go to Linus.

The incremental patch I have attached contains an updated Changelog
entry which cleans this up.

  +   !(EXT4_I(inode)-i_state  EXT4_STATE_NO_EXPAND)) {
  +   /* We need extra buffer credits since we may write into EA block
  +* with this same handle */
  +   if ((jbd2_journal_extend(handle,
  +EXT4_DATA_TRANS_BLOCKS(inode-i_sb))) == 0) {
  +   ret = ext4_expand_extra_isize(inode,
  +   EXT4_SB(inode-i_sb)-s_want_extra_isize,
  +   iloc, handle);
  +   if (ret) {
  +   EXT4_I(inode)-i_state |= EXT4_STATE_NO_EXPAND;
  +   if (!expand_message) {
  +   ext4_warning(inode-i_sb, __FUNCTION__,
  +   Unable to expand inode %lu. Delete
  +some EAs or run e2fsck.,
  +   inode-i_ino);
  +   expand_message = 1;
  +   }
  +   }
  +   }
  +   }
 
 Maybe that message should come out once per mount rather than once per boot
 (or once per modprobe)?

Done. Now the message gets printed the first time s_mnt_count changes,
which means once per mount.

  +   if (free  new_extra_isize) {
  +   if (!tried_min_extra_isize  s_min_extra_isize) {
  +   tried_min_extra_isize++;
  +   new_extra_isize = s_min_extra_isize;
 
 Aren't we missing a brelse(bh) here?

I have corrected this.

   
  +#define IHDR(inode, raw_inode) \
  +   ((struct ext4_xattr_ibody_header *) \
  +   ((void *)raw_inode + \
  +   EXT4_GOOD_OLD_INODE_SIZE + \
  +   EXT4_I(inode)-i_extra_isize))
  +#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
 
 Neither of these _have_ to be implemented as macros and hence they should
 not be.

These macros existed before and have just been moved. There are similar
such macros in the ext3/4 xattr code and they should probably be changed
to inlines. But that should be done in a different patch.

Thanks,
Kalpak.
We need to make sure that existing ext3 filesystems can also avail the new
fields that have been added to the ext4 inode. We use s_want_extra_isize and
s_min_extra_isize to decide by how much we should expand the inode. If
EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE feature is set then we expand the inode by
max(s_want_extra_isize, s_min_extra_isize , sizeof(ext4_inode) -
EXT4_GOOD_OLD_INODE_SIZE) bytes. Actually it is still an open question
about whether users should be able to set s_*_extra_isize smaller than
the known fields or not.

This patch also adds the functionality to expand inodes to include the
newly added fields. We start by trying to expand by s_want_extra_isize
bytes and if its fails we try to expand by s_min_extra_isize bytes. This
is done by changing the i_extra_isize if enough space is available in
the inode and no EAs are present. If EAs are present and there is enough
space in the inode then the EAs in the inode are shifted to make space.
If enough space is not available in the inode due to the EAs then 1 or
more EAs are shifted to the external EA block. In the worst case when
even the external EA block does not have enough space we inform the user
that some EA would need to be deleted or s_min_extra_isize would have to
be reduced.

This would be online expansion of inodes. expand_extra_isize option will also
be added to e2fsck which will expand all the inodes and if for any reason 
expansion fails for any inode then the EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE
feature will be reset.

Signed-off-by: Andreas Dilger [EMAIL PROTECTED]
Signed-off-by: Kalpak Shah [EMAIL PROTECTED]
Signed-off-by: Mingming Cao [EMAIL PROTECTED]

Index: linux-2.6.22/fs/ext4/inode.c
===
--- linux-2.6.22.orig/fs/ext4/inode.c
+++ linux-2.6.22/fs/ext4/inode.c
@@ -3130,9 +3130,8 @@ int ext4_expand_extra_isize(struct inode
 	struct ext4_xattr_ibody_header *header;
 	struct ext4_xattr_entry *entry;
 
-	if (EXT4_I(inode)-i_extra_isize = new_extra_isize) {
+	if (EXT4_I(inode)-i_extra_isize = new_extra_isize)
 		return 0;
-	}
 
 	raw_inode = ext4_raw_inode(iloc);
 
@@ -3148,9 +3147,9 @@ int ext4_expand_extra_isize(struct inode
 		return 0;
 	}
 
-	/* try to expand with EA present */
+	/* try to expand with EAs present */
 	return ext4_expand_extra_isize_ea(inode, new_extra_isize,
-		raw_inode, handle);
+	 

Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

2007-07-11 Thread Andreas Dilger
On Jul 10, 2007  16:32 -0700, Andrew Morton wrote:
  err = ext4_reserve_inode_write(handle, inode, iloc);
  +   if (EXT4_I(inode)-i_extra_isize 
  +   EXT4_SB(inode-i_sb)-s_want_extra_isize 
  +   !(EXT4_I(inode)-i_state  EXT4_STATE_NO_EXPAND)) {
  +   /* We need extra buffer credits since we may write into EA block
  +* with this same handle */
  +   if ((jbd2_journal_extend(handle,
  +EXT4_DATA_TRANS_BLOCKS(inode-i_sb))) == 0) {
  +   ret = ext4_expand_extra_isize(inode,
  +   EXT4_SB(inode-i_sb)-s_want_extra_isize,
  +   iloc, handle);
  +   if (ret) {
  +   EXT4_I(inode)-i_state |= EXT4_STATE_NO_EXPAND;
  +   if (!expand_message) {
  +   ext4_warning(inode-i_sb, __FUNCTION__,
  +   Unable to expand inode %lu. Delete
  +some EAs or run e2fsck.,
  +   inode-i_ino);
  +   expand_message = 1;
  +   }
  +   }
  +   }
  +   }
 
 Maybe that message should come out once per mount rather than once per boot
 (or once per modprobe)?

Probably true.

 What are the consequences of a jbd2_journal_extend() failure in here?

Not fatal, just like every user of i_extra_isize.  If the inode isn't a
large inode, or it can't be expanded then there will be a minor loss of
functionality on that inode.  If the i_extra_isize is critical, then
the sysadmin will have run e2fsck to force s_min_extra_isize large enough.

Note that this is only applicable for filesystems which are upgraded.  For
new inodes (i.e. all inodes that exist in the filesystem if it was always
run on a kernel with the currently understood extra fields) then this will
never be invoked (until such a time new extra fields are added).

  +static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry,
  +int value_offs_shift, void *to,
  +void *from, size_t n, int blocksize)
  +{
  +   /* Adjust the value offsets of the entries */
  +   for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
  +   if (!last-e_value_block  last-e_value_size) {
  +   new_offs = le16_to_cpu(last-e_value_offs) +
  +   value_offs_shift;
  +   BUG_ON(new_offs + le32_to_cpu(last-e_value_size)
  + blocksize);
  +   last-e_value_offs = cpu_to_le16(new_offs);
  +   }
  +   }
  +   /* Shift the entries by n bytes */
  +   memmove(to, from, n);
  +}
 
 Under what circumstances will that new BUG_ON trigger?  Can it be triggered
 via incorrect disk contents?  If so, it should not be there.

Only under code defect or memory corruption.  The needed extra space was
already checked in ext4_expand_extra_isize_ea(), but I asked for this
BUG_ON() because it would otherwise seem that there was a chance from
reading just the above code that the shifted EA might overflow the buffer.

  +int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
  +   struct ext4_inode *raw_inode, handle_t *handle)
  +{
  +   down_write(EXT4_I(inode)-xattr_sem);
  +retry:
  +   if (EXT4_I(inode)-i_extra_isize = new_extra_isize) {
  +   up_write(EXT4_I(inode)-xattr_sem);
  +   return 0;
  +   }
 
 So..  xattr_sem is a lock which protects i_extra_isize?  That seems a bit
 strange to me - I'd have expected i_mutex.

Well, since this is the only code that ever changes i_extra_isize, and it
also needs to move the EAs around, it makes sense to use the EA lock for it.
This is also a relatively low-contention lock, and it needs to be held to
access any of the EAs (which are the only things beyond i_extra_isize).

  +   if (EXT4_I(inode)-i_file_acl) {
  +   bh = sb_bread(inode-i_sb, EXT4_I(inode)-i_file_acl);
  +   error = -EIO;
  +   if (!bh)
  +   goto cleanup;
  +   if (ext4_xattr_check_block(bh)) {
  +   ext4_error(inode-i_sb, __FUNCTION__,
  +   inode %lu: bad block %llu, inode-i_ino,
  +   EXT4_I(inode)-i_file_acl);
  +   error = -EIO;
  +   goto cleanup;
  +   }
  +   base = BHDR(bh);
  +   first = BFIRST(bh);
  +   end = bh-b_data + bh-b_size;
  +   min_offs = end - base;
  +   free = ext4_xattr_free_space(first, min_offs, base,
  +total_blk);
  +   if (free  new_extra_isize) {
  +   if (!tried_min_extra_isize  s_min_extra_isize) {
  +   tried_min_extra_isize++;
  + 

Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

2007-07-11 Thread Andrew Morton
On Wed, 11 Jul 2007 06:10:56 -0600 Andreas Dilger [EMAIL PROTECTED] wrote:

 On Jul 10, 2007  16:32 -0700, Andrew Morton wrote:
 err = ext4_reserve_inode_write(handle, inode, iloc);
   + if (EXT4_I(inode)-i_extra_isize 
   + EXT4_SB(inode-i_sb)-s_want_extra_isize 
   + !(EXT4_I(inode)-i_state  EXT4_STATE_NO_EXPAND)) {
   + /* We need extra buffer credits since we may write into EA block
   +  * with this same handle */
   + if ((jbd2_journal_extend(handle,
   +  EXT4_DATA_TRANS_BLOCKS(inode-i_sb))) == 0) {
   + ret = ext4_expand_extra_isize(inode,
   + EXT4_SB(inode-i_sb)-s_want_extra_isize,
   + iloc, handle);
   + if (ret) {
   + EXT4_I(inode)-i_state |= EXT4_STATE_NO_EXPAND;
   + if (!expand_message) {
   + ext4_warning(inode-i_sb, __FUNCTION__,
   + Unable to expand inode %lu. Delete
   +  some EAs or run e2fsck.,
   + inode-i_ino);
   + expand_message = 1;
   + }
   + }
   + }
   + }
  
  Maybe that message should come out once per mount rather than once per boot
  (or once per modprobe)?
 
 Probably true.
 
  What are the consequences of a jbd2_journal_extend() failure in here?
 
 Not fatal, just like every user of i_extra_isize.  If the inode isn't a
 large inode, or it can't be expanded then there will be a minor loss of
 functionality on that inode.  If the i_extra_isize is critical, then
 the sysadmin will have run e2fsck to force s_min_extra_isize large enough.
 
 Note that this is only applicable for filesystems which are upgraded.  For
 new inodes (i.e. all inodes that exist in the filesystem if it was always
 run on a kernel with the currently understood extra fields) then this will
 never be invoked (until such a time new extra fields are added).

I'd suggest that we get a comment in the code explaining this: this
unchecked error does rather stand out.

   + if (EXT4_I(inode)-i_file_acl) {
   + bh = sb_bread(inode-i_sb, EXT4_I(inode)-i_file_acl);
   + error = -EIO;
   + if (!bh)
   + goto cleanup;
   + if (ext4_xattr_check_block(bh)) {
   + ext4_error(inode-i_sb, __FUNCTION__,
   + inode %lu: bad block %llu, inode-i_ino,
   + EXT4_I(inode)-i_file_acl);
   + error = -EIO;
   + goto cleanup;
   + }
   + base = BHDR(bh);
   + first = BFIRST(bh);
   + end = bh-b_data + bh-b_size;
   + min_offs = end - base;
   + free = ext4_xattr_free_space(first, min_offs, base,
   +  total_blk);
   + if (free  new_extra_isize) {
   + if (!tried_min_extra_isize  s_min_extra_isize) {
   + tried_min_extra_isize++;
   + new_extra_isize = s_min_extra_isize;
  
  Aren't we missing a brelse(bh) here?
 
 Seems likely, yes.

OK - could we get a positive ack from someone indicating that this will get
looked at?  Because I am about to forget about it.

-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

2007-07-11 Thread Kalpak Shah
On Wed, 2007-07-11 at 10:34 -0700, Andrew Morton wrote:
 On Wed, 11 Jul 2007 06:10:56 -0600 Andreas Dilger [EMAIL PROTECTED] wrote:
 
  On Jul 10, 2007  16:32 -0700, Andrew Morton wrote:
err = ext4_reserve_inode_write(handle, inode, iloc);
+   if (EXT4_I(inode)-i_extra_isize 
+   EXT4_SB(inode-i_sb)-s_want_extra_isize 
+   !(EXT4_I(inode)-i_state  EXT4_STATE_NO_EXPAND)) {
+   /* We need extra buffer credits since we may write into 
EA block
+* with this same handle */
+   if ((jbd2_journal_extend(handle,
+EXT4_DATA_TRANS_BLOCKS(inode-i_sb))) == 
0) {
+   ret = ext4_expand_extra_isize(inode,
+   
EXT4_SB(inode-i_sb)-s_want_extra_isize,
+   iloc, handle);
+   if (ret) {
+   EXT4_I(inode)-i_state |= 
EXT4_STATE_NO_EXPAND;
+   if (!expand_message) {
+   ext4_warning(inode-i_sb, 
__FUNCTION__,
+   Unable to expand inode %lu. 
Delete
+some EAs or run e2fsck.,
+   inode-i_ino);
+   expand_message = 1;
+   }
+   }
+   }
+   }
   
   Maybe that message should come out once per mount rather than once per 
   boot
   (or once per modprobe)?
  
  Probably true.
  
   What are the consequences of a jbd2_journal_extend() failure in here?
  
  Not fatal, just like every user of i_extra_isize.  If the inode isn't a
  large inode, or it can't be expanded then there will be a minor loss of
  functionality on that inode.  If the i_extra_isize is critical, then
  the sysadmin will have run e2fsck to force s_min_extra_isize large enough.
  
  Note that this is only applicable for filesystems which are upgraded.  For
  new inodes (i.e. all inodes that exist in the filesystem if it was always
  run on a kernel with the currently understood extra fields) then this will
  never be invoked (until such a time new extra fields are added).
 
 I'd suggest that we get a comment in the code explaining this: this
 unchecked error does rather stand out.
 
+   if (EXT4_I(inode)-i_file_acl) {
+   bh = sb_bread(inode-i_sb, EXT4_I(inode)-i_file_acl);
+   error = -EIO;
+   if (!bh)
+   goto cleanup;
+   if (ext4_xattr_check_block(bh)) {
+   ext4_error(inode-i_sb, __FUNCTION__,
+   inode %lu: bad block %llu, 
inode-i_ino,
+   EXT4_I(inode)-i_file_acl);
+   error = -EIO;
+   goto cleanup;
+   }
+   base = BHDR(bh);
+   first = BFIRST(bh);
+   end = bh-b_data + bh-b_size;
+   min_offs = end - base;
+   free = ext4_xattr_free_space(first, min_offs, base,
+total_blk);
+   if (free  new_extra_isize) {
+   if (!tried_min_extra_isize  
s_min_extra_isize) {
+   tried_min_extra_isize++;
+   new_extra_isize = s_min_extra_isize;
   
   Aren't we missing a brelse(bh) here?
  
  Seems likely, yes.
 
 OK - could we get a positive ack from someone indicating that this will get
 looked at?  Because I am about to forget about it.

I will send a patch to add the comments and make the suggested
corrections.

Thanks,
Kalpak.

 To unsubscribe from this list: send the line unsubscribe linux-ext4 in
 the body of a message to [EMAIL PROTECTED]
 More majordomo info at  http://vger.kernel.org/majordomo-info.html

-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

2007-07-10 Thread Andrew Morton
On Sun, 01 Jul 2007 03:38:01 -0400
Mingming Cao [EMAIL PROTECTED] wrote:

 This patch is on top of the nanosecond timestamp and i_version_hi
 patches. 

This sort of information isn't needed (or desired) when this patch hits the
git tree.  Please ensure that things like this are cleaned up before the
patches go to Linus.

 We need to make sure that existing filesystems can also avail the new
 fields that have been added to the inode. We use s_want_extra_isize and
 s_min_extra_isize to decide by how much we should expand the inode. If
 EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE feature is set then we expand by
 max(s_want_extra_isize, s_min_extra_isize , sizeof(ext4_inode) -
 EXT4_GOOD_OLD_INODE_SIZE) bytes. Actually it is still an open question
 about whether users should be able to set s_*_extra_isize smaller than
 the known fields or not. 
 
 This patch also adds the functionality to expand inodes to include the
 newly added fields. We start by trying to expand by s_want_extra_isize
 bytes and if its fails we try to expand by s_min_extra_isize bytes. This
 is done by changing the i_extra_isize if enough space is available in
 the inode and no EAs are present. If EAs are present and there is enough
 space in the inode then the EAs in the inode are shifted to make space.
 If enough space is not available in the inode due to the EAs then 1 or
 more EAs are shifted to the external EA block. In the worst case when
 even the external EA block does not have enough space we inform the user
 that some EA would need to be deleted or s_min_extra_isize would have to
 be reduced.
 
 This would be online expansion of inodes. I am also working on adding an
 expand_inodes option to e2fsck which will expand all the used inodes.
 
 Signed-off-by: Andreas Dilger [EMAIL PROTECTED]
 Signed-off-by: Kalpak Shah [EMAIL PROTECTED]
 Signed-off-by: Mingming Cao [EMAIL PROTECTED]
 
 Index: linux-2.6.22-rc4/fs/ext4/inode.c
 ===
 --- linux-2.6.22-rc4.orig/fs/ext4/inode.c 2007-06-14 17:32:27.0 
 -0700
 +++ linux-2.6.22-rc4/fs/ext4/inode.c  2007-06-14 17:32:41.0 -0700
 @@ -3120,6 +3120,40 @@ ext4_reserve_inode_write(handle_t *handl
  }
  
  /*
 + * Expand an inode by new_extra_isize bytes.
 + * Returns 0 on success or negative error number on failure.
 + */
 +int ext4_expand_extra_isize(struct inode *inode, unsigned int 
 new_extra_isize,
 + struct ext4_iloc iloc, handle_t *handle)
 +{
 + struct ext4_inode *raw_inode;
 + struct ext4_xattr_ibody_header *header;
 + struct ext4_xattr_entry *entry;
 +
 + if (EXT4_I(inode)-i_extra_isize = new_extra_isize) {
 + return 0;
 + }

This patch has lots of unneeded braces in it.  checkpatch appears to catch
them.


 + raw_inode = ext4_raw_inode(iloc);
 +
 + header = IHDR(inode, raw_inode);
 + entry = IFIRST(header);
 +
 + /* No extended attributes present */
 + if (!(EXT4_I(inode)-i_state  EXT4_STATE_XATTR) ||
 + header-h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
 + memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
 + new_extra_isize);
 + EXT4_I(inode)-i_extra_isize = new_extra_isize;
 + return 0;
 + }
 +
 + /* try to expand with EA present */
 + return ext4_expand_extra_isize_ea(inode, new_extra_isize,
 + raw_inode, handle);
 +}
 +
 +/*
   * What we do here is to mark the in-core inode as clean with respect to 
 inode
   * dirtiness (it may still be data-dirty).
   * This means that the in-core inode may be reaped by prune_icache
 @@ -3143,10 +3177,33 @@ ext4_reserve_inode_write(handle_t *handl
  int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
  {
   struct ext4_iloc iloc;
 - int err;
 + int err, ret;
 + static int expand_message;
  
   might_sleep();
   err = ext4_reserve_inode_write(handle, inode, iloc);
 + if (EXT4_I(inode)-i_extra_isize 
 + EXT4_SB(inode-i_sb)-s_want_extra_isize 
 + !(EXT4_I(inode)-i_state  EXT4_STATE_NO_EXPAND)) {
 + /* We need extra buffer credits since we may write into EA block
 +  * with this same handle */
 + if ((jbd2_journal_extend(handle,
 +  EXT4_DATA_TRANS_BLOCKS(inode-i_sb))) == 0) {
 + ret = ext4_expand_extra_isize(inode,
 + EXT4_SB(inode-i_sb)-s_want_extra_isize,
 + iloc, handle);
 + if (ret) {
 + EXT4_I(inode)-i_state |= EXT4_STATE_NO_EXPAND;
 + if (!expand_message) {
 + ext4_warning(inode-i_sb, __FUNCTION__,
 + Unable to expand inode %lu. Delete
 +  some EAs or run e2fsck.,
 + 

[EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

2007-07-01 Thread Mingming Cao
This patch is on top of the nanosecond timestamp and i_version_hi
patches. 

We need to make sure that existing filesystems can also avail the new
fields that have been added to the inode. We use s_want_extra_isize and
s_min_extra_isize to decide by how much we should expand the inode. If
EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE feature is set then we expand by
max(s_want_extra_isize, s_min_extra_isize , sizeof(ext4_inode) -
EXT4_GOOD_OLD_INODE_SIZE) bytes. Actually it is still an open question
about whether users should be able to set s_*_extra_isize smaller than
the known fields or not. 

This patch also adds the functionality to expand inodes to include the
newly added fields. We start by trying to expand by s_want_extra_isize
bytes and if its fails we try to expand by s_min_extra_isize bytes. This
is done by changing the i_extra_isize if enough space is available in
the inode and no EAs are present. If EAs are present and there is enough
space in the inode then the EAs in the inode are shifted to make space.
If enough space is not available in the inode due to the EAs then 1 or
more EAs are shifted to the external EA block. In the worst case when
even the external EA block does not have enough space we inform the user
that some EA would need to be deleted or s_min_extra_isize would have to
be reduced.

This would be online expansion of inodes. I am also working on adding an
expand_inodes option to e2fsck which will expand all the used inodes.

Signed-off-by: Andreas Dilger [EMAIL PROTECTED]
Signed-off-by: Kalpak Shah [EMAIL PROTECTED]
Signed-off-by: Mingming Cao [EMAIL PROTECTED]

Index: linux-2.6.22-rc4/fs/ext4/inode.c
===
--- linux-2.6.22-rc4.orig/fs/ext4/inode.c   2007-06-14 17:32:27.0 
-0700
+++ linux-2.6.22-rc4/fs/ext4/inode.c2007-06-14 17:32:41.0 -0700
@@ -3120,6 +3120,40 @@ ext4_reserve_inode_write(handle_t *handl
 }
 
 /*
+ * Expand an inode by new_extra_isize bytes.
+ * Returns 0 on success or negative error number on failure.
+ */
+int ext4_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize,
+   struct ext4_iloc iloc, handle_t *handle)
+{
+   struct ext4_inode *raw_inode;
+   struct ext4_xattr_ibody_header *header;
+   struct ext4_xattr_entry *entry;
+
+   if (EXT4_I(inode)-i_extra_isize = new_extra_isize) {
+   return 0;
+   }
+
+   raw_inode = ext4_raw_inode(iloc);
+
+   header = IHDR(inode, raw_inode);
+   entry = IFIRST(header);
+
+   /* No extended attributes present */
+   if (!(EXT4_I(inode)-i_state  EXT4_STATE_XATTR) ||
+   header-h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
+   memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
+   new_extra_isize);
+   EXT4_I(inode)-i_extra_isize = new_extra_isize;
+   return 0;
+   }
+
+   /* try to expand with EA present */
+   return ext4_expand_extra_isize_ea(inode, new_extra_isize,
+   raw_inode, handle);
+}
+
+/*
  * What we do here is to mark the in-core inode as clean with respect to inode
  * dirtiness (it may still be data-dirty).
  * This means that the in-core inode may be reaped by prune_icache
@@ -3143,10 +3177,33 @@ ext4_reserve_inode_write(handle_t *handl
 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
 {
struct ext4_iloc iloc;
-   int err;
+   int err, ret;
+   static int expand_message;
 
might_sleep();
err = ext4_reserve_inode_write(handle, inode, iloc);
+   if (EXT4_I(inode)-i_extra_isize 
+   EXT4_SB(inode-i_sb)-s_want_extra_isize 
+   !(EXT4_I(inode)-i_state  EXT4_STATE_NO_EXPAND)) {
+   /* We need extra buffer credits since we may write into EA block
+* with this same handle */
+   if ((jbd2_journal_extend(handle,
+EXT4_DATA_TRANS_BLOCKS(inode-i_sb))) == 0) {
+   ret = ext4_expand_extra_isize(inode,
+   EXT4_SB(inode-i_sb)-s_want_extra_isize,
+   iloc, handle);
+   if (ret) {
+   EXT4_I(inode)-i_state |= EXT4_STATE_NO_EXPAND;
+   if (!expand_message) {
+   ext4_warning(inode-i_sb, __FUNCTION__,
+   Unable to expand inode %lu. Delete
+some EAs or run e2fsck.,
+   inode-i_ino);
+   expand_message = 1;
+   }
+   }
+   }
+   }
if (!err)
err = ext4_mark_iloc_dirty(handle, inode, iloc);
return err;
Index: linux-2.6.22-rc4/include/linux/ext4_fs.h