Re: [PATCH v3 1/2] mbcache: decoupling the locking of local from global data

2013-10-30 Thread Thavatchai Makphaibulchoke
On 10/30/2013 08:42 AM, Theodore Ts'o wrote:
> I tried running xfstests with this patch, and it blew up on
> generic/020 test:
> 
> generic/020   [10:21:50][  105.170352] [ cut here ]
> [  105.171683] kernel BUG at 
> /usr/projects/linux/ext4/include/linux/bit_spinlock.h:76!
> [  105.173346] invalid opcode:  [#1] SMP DEBUG_PAGEALLOC
> [  105.173346] Modules linked in:
> [  105.173346] CPU: 1 PID: 8519 Comm: attr Not tainted 
> 3.12.0-rc5-8-gffbe1d7-dirty #1492
> [  105.173346] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
> [  105.173346] task: f5abe560 ti: f2274000 task.ti: f2274000
> [  105.173346] EIP: 0060:[<c026b464>] EFLAGS: 00010246 CPU: 1
> [  105.173346] EIP is at hlist_bl_unlock+0x7/0x1c
> [  105.173346] EAX: f488d360 EBX: f488d360 ECX:  EDX: f2998800
> [  105.173346] ESI: f29987f0 EDI: 6954c848 EBP: f2275cc8 ESP: f2275cb8
> [  105.173346]  DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
> [  105.173346] CR0: 80050033 CR2: b76bcf54 CR3: 34844000 CR4: 06f0
> [  105.173346] Stack:
> [  105.173346]  c026bc78 f2275d48 6954c848 f29987f0 f2275d24 c02cd7a9 
> f2275ce4 c02e2881
> [  105.173346]  f255d8c8  f1109020 f4a67f00 f2275d54 f2275d08 
> c02cd020 6954c848
> [  105.173346]  f4a67f00 f1109000 f2b0eba8 f2ee3800 f2275d28 f4f811e8 
> f2275d38 
> [  105.173346] Call Trace:
> [  105.173346]  [<c026bc78>] ? mb_cache_entry_find_first+0x4b/0x55
> [  105.173346]  [<c02cd7a9>] ext4_xattr_block_set+0x248/0x6e7
> [  105.173346]  [<c02e2881>] ? jbd2_journal_put_journal_head+0xe2/0xed
> [  105.173346]  [<c02cd020>] ? ext4_xattr_find_entry+0x52/0xac
> [  105.173346]  [<c02ce307>] ext4_xattr_set_handle+0x1c7/0x30f
> [  105.173346]  [<c02ce4f4>] ext4_xattr_set+0xa5/0xe1
> [  105.173346]  [<c02ceb36>] ext4_xattr_user_set+0x46/0x5f
> [  105.173346]  [<c024a4da>] generic_setxattr+0x4c/0x5e
> [  105.173346]  [<c024a48e>] ? generic_listxattr+0x95/0x95
> [  105.173346]  [<c024ab0f>] __vfs_setxattr_noperm+0x56/0xb6
> [  105.173346]  [<c024abd2>] vfs_setxattr+0x63/0x7e
> [  105.173346]  [<c024ace8>] setxattr+0xfb/0x139
> [  105.173346]  [<c01b200a>] ? __lock_acquire+0x540/0xca6
> [  105.173346]  [<c01877a3>] ? lg_local_unlock+0x1b/0x34
> [  105.173346]  [<c01af8dd>] ? trace_hardirqs_off_caller+0x2e/0x98
> [  105.173346]  [<c0227e69>] ? kmem_cache_free+0xd4/0x149
> [  105.173346]  [<c01b2c2b>] ? lock_acquire+0xdd/0x107
> [  105.173346]  [<c023225e>] ? __sb_start_write+0xee/0x11d
> [  105.173346]  [<c0247383>] ? mnt_want_write+0x1e/0x3e
> [  105.173346]  [<c01b3019>] ? trace_hardirqs_on_caller+0x12a/0x17e
> [  105.173346]  [<c0247353>] ? __mnt_want_write+0x4e/0x60
> [  105.173346]  [<c024af3b>] SyS_lsetxattr+0x6a/0x9f
> [  105.173346]  [<c078d0e8>] syscall_call+0x7/0xb
> [  105.173346] Code: 00 00 00 00 5b 5d c3 55 89 e5 53 3e 8d 74 26 00 8b 58 08 
> 89 c2 8b 43 18 e8 3f c9 fb ff f0 ff 4b 0c 5b 5d c3 8b 10 80 e2 01 75 02 <0f> 
> 0b 55 89 e5 0f ba 30 00 89 e0 25 00 e0 ff ff ff 48 14 5d c3
> [  105.173346] EIP: [<c026b464>] hlist_bl_unlock+0x7/0x1c SS:ESP 0068:f2275cb8
> [  105.273781] ---[ end trace 1ee45ddfc1df0935 ]---
> 
> When I tried to find a potential problem, I immediately ran into this.
> I'm not entirely sure it's the problem, but it's raised a number of
> red flags for me in terms of (a) how much testing you've done with
> this patch set, and (b) how maintainable and easy to audit the code
> will be with this extra locking.  The comments are a good start, but
> some additional comments about exactly what assumptions a function
> makes about which locks are held on function entry, or especially if
> the locking is different on function entry and function exit, might
> make it easier for people to audit this patch.
> 
> Or maybe this commit needs to be split up, with first a conversion from
> using list_head to hlist_bl_node, and then changing the locking?  The
> bottom line is that we need to somehow make this patch easier to
> validate/review.
> 

Thanks for the comments.  Yes, I did run it through xfstests.  My guess is that
you probably ran into a race condition that I did not.

I will try to port the patch to a more recent kernel, including the
mb_cache_shrink_scan() rework you sent earlier (BTW, it looks good), and debug
the problem.

Yes, those are good suggestions.  Once I find the problem, I will resubmit with
more comments and also split it into two patches, as suggested.

>> @@ -520,18 +647,23 @@ __mb_cache_entry_find(struct list_head *l, struct 
>> list_head *head,
>>  ce->e_queued++;
>>  prepare_to_wait(&mb_cache_queue, &wait,
>>  TASK_UNINTERRUPTIBLE);
>> -spin_unlock(&mb_cache_spinlock);
>> +hlist_bl_unlock(head);
>>  schedule();
>> -spin_lock(&mb_cache_spinlock);
>> +hlist_bl_lock(head);
>> +mb_assert(ce->e_index_hash_p == head);
>>  ce->e_queued--;
>>  }
>> +hlist_bl_unlock(head);
>>  finish_wait(&mb_cache_queue, &wait);
>>  
>> -if 

Re: [PATCH v3 1/2] mbcache: decoupling the locking of local from global data

2013-10-30 Thread Theodore Ts'o
On Wed, Sep 04, 2013 at 10:39:15AM -0600, T Makphaibulchoke wrote:
> The patch increases the parallelism of mb_cache_entry utilization by
> replacing list_head with hlist_bl_node for the implementation of both the
> block and index hash tables.  Each hlist_bl_node contains a built-in lock
> used to protect mb_cache's local block and index hash chains. The global
> data mb_cache_lru_list and mb_cache_list continue to be protected by the
> global mb_cache_spinlock.

In the process of applying this patch to the ext4 tree, I had to
rework one of the patches to account for a change upstream to the
shrinker interface (which modified mb_cache_shrink_fn() to be
mb_cache_shrink_scan()).

Can you verify that the changes I made look sane?

Thanks,

- Ted

diff --git a/fs/mbcache.c b/fs/mbcache.c
index 1f90cd0..44e7153 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -200,25 +200,38 @@ forget:
 static unsigned long
 mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 {
-   LIST_HEAD(free_list);
-   struct mb_cache_entry *entry, *tmp;
int nr_to_scan = sc->nr_to_scan;
gfp_t gfp_mask = sc->gfp_mask;
unsigned long freed = 0;
 
mb_debug("trying to free %d entries", nr_to_scan);
-   spin_lock(&mb_cache_spinlock);
-   while (nr_to_scan-- && !list_empty(&mb_cache_lru_list)) {
-   struct mb_cache_entry *ce =
-   list_entry(mb_cache_lru_list.next,
-  struct mb_cache_entry, e_lru_list);
-   list_move_tail(&ce->e_lru_list, &free_list);
-   __mb_cache_entry_unhash(ce);
-   freed++;
-   }
-   spin_unlock(&mb_cache_spinlock);
-   list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) {
-   __mb_cache_entry_forget(entry, gfp_mask);
+   while (nr_to_scan > 0) {
+   struct mb_cache_entry *ce;
+
+   spin_lock(&mb_cache_spinlock);
+   if (list_empty(&mb_cache_lru_list)) {
+   spin_unlock(&mb_cache_spinlock);
+   break;
+   }
+   ce = list_entry(mb_cache_lru_list.next,
+   struct mb_cache_entry, e_lru_list);
+   list_del_init(&ce->e_lru_list);
+   spin_unlock(&mb_cache_spinlock);
+
+   hlist_bl_lock(ce->e_block_hash_p);
+   hlist_bl_lock(ce->e_index_hash_p);
+   if (!(ce->e_used || ce->e_queued)) {
+   __mb_cache_entry_unhash_index(ce);
+   hlist_bl_unlock(ce->e_index_hash_p);
+   __mb_cache_entry_unhash_block(ce);
+   hlist_bl_unlock(ce->e_block_hash_p);
+   __mb_cache_entry_forget(ce, gfp_mask);
+   --nr_to_scan;
+   freed++;
+   } else {
+   hlist_bl_unlock(ce->e_index_hash_p);
+   hlist_bl_unlock(ce->e_block_hash_p);
+   }
}
return freed;
 }
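
For reference, the upstream shrinker rework mentioned above split the old
single ->shrink() callback into ->count_objects() and ->scan_objects().  A
minimal, illustrative sketch of that interface (the example_* names are
hypothetical helpers, not mbcache code; only the shrinker fields and the
shrink_control members are the real API):

#include <linux/shrinker.h>

/* Report how many entries could be reclaimed; must not reclaim anything. */
static unsigned long example_count(struct shrinker *shrink,
				   struct shrink_control *sc)
{
	return example_nr_cached_entries();	/* hypothetical helper */
}

/* Try to free up to sc->nr_to_scan entries; return how many were freed. */
static unsigned long example_scan(struct shrinker *shrink,
				  struct shrink_control *sc)
{
	return example_free_entries(sc->nr_to_scan, sc->gfp_mask);
}

static struct shrinker example_shrinker = {
	.count_objects	= example_count,
	.scan_objects	= example_scan,
	.seeks		= DEFAULT_SEEKS,
};

/* register_shrinker(&example_shrinker) hooks this into memory reclaim. */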
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3 1/2] mbcache: decoupling the locking of local from global data

2013-10-30 Thread Theodore Ts'o
On Wed, Sep 04, 2013 at 10:39:15AM -0600, T Makphaibulchoke wrote:
> The patch increases the parallelism of mb_cache_entry utilization by
> replacing list_head with hlist_bl_node for the implementation of both the
> block and index hash tables.  Each hlist_bl_node contains a built-in lock
> used to protect mb_cache's local block and index hash chains. The global
> data mb_cache_lru_list and mb_cache_list continue to be protected by the
> global mb_cache_spinlock.
> 
> Signed-off-by: T. Makphaibulchoke <t...@hp.com>

I tried running xfstests with this patch, and it blew up on
generic/020 test:

generic/020 [10:21:50][  105.170352] [ cut here ]
[  105.171683] kernel BUG at 
/usr/projects/linux/ext4/include/linux/bit_spinlock.h:76!
[  105.173346] invalid opcode:  [#1] SMP DEBUG_PAGEALLOC
[  105.173346] Modules linked in:
[  105.173346] CPU: 1 PID: 8519 Comm: attr Not tainted 
3.12.0-rc5-8-gffbe1d7-dirty #1492
[  105.173346] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[  105.173346] task: f5abe560 ti: f2274000 task.ti: f2274000
[  105.173346] EIP: 0060:[<c026b464>] EFLAGS: 00010246 CPU: 1
[  105.173346] EIP is at hlist_bl_unlock+0x7/0x1c
[  105.173346] EAX: f488d360 EBX: f488d360 ECX:  EDX: f2998800
[  105.173346] ESI: f29987f0 EDI: 6954c848 EBP: f2275cc8 ESP: f2275cb8
[  105.173346]  DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
[  105.173346] CR0: 80050033 CR2: b76bcf54 CR3: 34844000 CR4: 06f0
[  105.173346] Stack:
[  105.173346]  c026bc78 f2275d48 6954c848 f29987f0 f2275d24 c02cd7a9 f2275ce4 
c02e2881
[  105.173346]  f255d8c8  f1109020 f4a67f00 f2275d54 f2275d08 c02cd020 
6954c848
[  105.173346]  f4a67f00 f1109000 f2b0eba8 f2ee3800 f2275d28 f4f811e8 f2275d38 

[  105.173346] Call Trace:
[  105.173346]  [<c026bc78>] ? mb_cache_entry_find_first+0x4b/0x55
[  105.173346]  [<c02cd7a9>] ext4_xattr_block_set+0x248/0x6e7
[  105.173346]  [<c02e2881>] ? jbd2_journal_put_journal_head+0xe2/0xed
[  105.173346]  [<c02cd020>] ? ext4_xattr_find_entry+0x52/0xac
[  105.173346]  [<c02ce307>] ext4_xattr_set_handle+0x1c7/0x30f
[  105.173346]  [<c02ce4f4>] ext4_xattr_set+0xa5/0xe1
[  105.173346]  [<c02ceb36>] ext4_xattr_user_set+0x46/0x5f
[  105.173346]  [<c024a4da>] generic_setxattr+0x4c/0x5e
[  105.173346]  [<c024a48e>] ? generic_listxattr+0x95/0x95
[  105.173346]  [<c024ab0f>] __vfs_setxattr_noperm+0x56/0xb6
[  105.173346]  [<c024abd2>] vfs_setxattr+0x63/0x7e
[  105.173346]  [<c024ace8>] setxattr+0xfb/0x139
[  105.173346]  [<c01b200a>] ? __lock_acquire+0x540/0xca6
[  105.173346]  [<c01877a3>] ? lg_local_unlock+0x1b/0x34
[  105.173346]  [<c01af8dd>] ? trace_hardirqs_off_caller+0x2e/0x98
[  105.173346]  [<c0227e69>] ? kmem_cache_free+0xd4/0x149
[  105.173346]  [<c01b2c2b>] ? lock_acquire+0xdd/0x107
[  105.173346]  [<c023225e>] ? __sb_start_write+0xee/0x11d
[  105.173346]  [<c0247383>] ? mnt_want_write+0x1e/0x3e
[  105.173346]  [<c01b3019>] ? trace_hardirqs_on_caller+0x12a/0x17e
[  105.173346]  [<c0247353>] ? __mnt_want_write+0x4e/0x60
[  105.173346]  [<c024af3b>] SyS_lsetxattr+0x6a/0x9f
[  105.173346]  [<c078d0e8>] syscall_call+0x7/0xb
[  105.173346] Code: 00 00 00 00 5b 5d c3 55 89 e5 53 3e 8d 74 26 00 8b 58 08 
89 c2 8b 43 18 e8 3f c9 fb ff f0 ff 4b 0c 5b 5d c3 8b 10 80 e2 01 75 02 <0f> 0b 
55 89 e5 0f ba 30 00 89 e0 25 00 e0 ff ff ff 48 14 5d c3
[  105.173346] EIP: [<c026b464>] hlist_bl_unlock+0x7/0x1c SS:ESP 0068:f2275cb8
[  105.273781] ---[ end trace 1ee45ddfc1df0935 ]---

When I tried to find a potential problem, I immediately ran into this.
I'm not entirely sure it's the problem, but it's raised a number of
red flags for me in terms of (a) how much testing you've done with
this patch set, and (b) how maintainable and easy to audit the code
will be with this extra locking.  The comments are a good start, but
some additional comments about exactly what assumptions a function
makes about which locks are held on function entry, or especially if
the locking is different on function entry and function exit, might
make it easier for people to audit this patch.

Or maybe this commit needs to be split up, with first a conversion from
using list_head to hlist_bl_node, and then changing the locking?  The
bottom line is that we need to somehow make this patch easier to
validate/review.
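
One conventional way to record those assumptions is a short comment on each
function plus the sparse annotations the old code already used
(__releases()/__acquires() from <linux/compiler.h>).  A hedged sketch, not
taken from the patch (the function name is made up; hlist_bl_lock/unlock and
the annotations are real):

/*
 * __example_find_next - walk an index hash chain
 * @head: the chain to search; its bit lock must be held on entry and is
 *        released before returning (it may also be dropped and re-taken
 *        internally while sleeping).
 */
static struct mb_cache_entry *
__example_find_next(struct hlist_bl_head *head, unsigned int key)
	__releases(head)
{
	struct mb_cache_entry *ce = NULL;

	/* ... search the chain, sleeping with the lock dropped if needed ... */
	hlist_bl_unlock(head);
	return ce;
}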

> @@ -520,18 +647,23 @@ __mb_cache_entry_find(struct list_head *l, struct 
> list_head *head,
>   ce->e_queued++;
>   prepare_to_wait(&mb_cache_queue, &wait,
>   TASK_UNINTERRUPTIBLE);
> - spin_unlock(&mb_cache_spinlock);
> + hlist_bl_unlock(head);
>   schedule();
> - spin_lock(&mb_cache_spinlock);
> + hlist_bl_lock(head);
> + mb_assert(ce->e_index_hash_p == head);
>   ce->e_queued--;
>   }
> + hlist_bl_unlock(head);
>   finish_wait(&mb_cache_queue, &wait);
>  
> - if (!__mb_cache_entry_is_hashed(ce)) {
> + hlist_bl_lock(ce->e_block_hash_p);
> + if 


[PATCH v3 1/2] mbcache: decoupling the locking of local from global data

2013-09-04 Thread T Makphaibulchoke
The patch increases the parallelism of mb_cache_entry utilization by
replacing list_head with hlist_bl_node for the implementation of both the
block and index hash tables.  Each hlist_bl_node contains a built-in lock
used to protect mb_cache's local block and index hash chains. The global
data mb_cache_lru_list and mb_cache_list continue to be protected by the
global mb_cache_spinlock.

Signed-off-by: T. Makphaibulchoke <t...@hp.com>
---
Changed in v3:
- Removed all hash lock macros.
- Fixed a possible race condition updating the e_used and
  e_queued members of an mb_cache_entry between the mb_cache_entry_get
  function, traversing a block hash chain, and the mb_cache_entry_find
  function, traversing an index hash chain.

Changed in v2:
- As per Linus Torvalds' suggestion, instead of allocating spinlock
  arrays to protect the hash chains, use hlist_bl_head, which already
  contains a built-in lock.

 fs/mbcache.c| 306 +++-
 include/linux/mbcache.h |  10 +-
 2 files changed, 229 insertions(+), 87 deletions(-)
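
For readers unfamiliar with hlist_bl: each chain head keeps its lock in bit 0
of its ->first pointer, so no separate spinlock array is needed.  A minimal,
illustrative sketch of a hash table built this way (the example_* names are
made up; only the <linux/list_bl.h> calls are the real API -- a real cache
such as mbcache would also take a reference, e.g. bump e_used, before
dropping the chain lock in the lookup):

#include <linux/list_bl.h>

struct example_entry {
	struct hlist_bl_node	list;	/* linked into one hash chain */
	unsigned int		key;
};

static struct hlist_bl_head example_hash[64];	/* each head embeds its lock */

static void example_insert(struct example_entry *e)
{
	struct hlist_bl_head *head = &example_hash[e->key % 64];

	hlist_bl_lock(head);		/* bit_spin_lock on bit 0 of ->first */
	hlist_bl_add_head(&e->list, head);
	hlist_bl_unlock(head);
}

static struct example_entry *example_lookup(unsigned int key)
{
	struct hlist_bl_head *head = &example_hash[key % 64];
	struct hlist_bl_node *node;
	struct example_entry *e;

	hlist_bl_lock(head);
	hlist_bl_for_each_entry(e, node, head, list) {
		if (e->key == key) {
			/* take a reference here before unlocking */
			hlist_bl_unlock(head);
			return e;
		}
	}
	hlist_bl_unlock(head);
	return NULL;
}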

diff --git a/fs/mbcache.c b/fs/mbcache.c
index 8c32ef3..dd45fe9 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -26,6 +26,38 @@
  * back on the lru list.
  */
 
+/*
+ * Lock descriptions and usage:
+ *
+ * Each hash chain of both the block and index hash tables now contains
+ * a built-in lock used to serialize accesses to the hash chain.
+ *
+ * Accesses to global data structures mb_cache_list and mb_cache_lru_list
+ * are serialized via the global spinlock mb_cache_spinlock.
+ *
+ * Lock ordering:
+ *
+ * Each block hash chain's lock has the highest order, followed by each
+ * index hash chain's lock, with mb_cache_spinlock the lowest.
+ * While holding a block hash chain lock a thread can acquire either
+ * an index hash chain lock or mb_cache_spinlock.
+ *
+ * Synchronization:
+ *
+ * Since both the e_used and e_queued members of each mb_cache_entry can
+ * be updated while traversing either a block hash chain or an index hash
+ * chain, and the index hash chain lock has the lower order, each index hash
+ * chain's lock, in addition to serializing accesses to the index hash
+ * chain itself, is also used to serialize accesses to both e_used and
+ * e_queued.
+ *
+ * To avoid a dangling reference to an already freed mb_cache_entry, an
+ * mb_cache_entry is only freed when it is no longer on a block hash chain
+ * and no longer referenced, i.e. both e_used and e_queued are zero.  When
+ * an mb_cache_entry is explicitly freed, it is first removed from its
+ * block hash chain.
+ */
+
 #include <linux/kernel.h>
 #include <linux/module.h>
 
@@ -35,9 +67,9 @@
 #include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/init.h>
+#include <linux/list_bl.h>
 #include <linux/mbcache.h>
 
-
 #ifdef MB_CACHE_DEBUG
 # define mb_debug(f...) do { \
printk(KERN_DEBUG f); \
@@ -99,23 +131,34 @@ static struct shrinker mb_cache_shrinker = {
 };
 
 static inline int
-__mb_cache_entry_is_hashed(struct mb_cache_entry *ce)
+__mb_cache_entry_is_block_hashed(struct mb_cache_entry *ce)
 {
-   return !list_empty(&ce->e_block_list);
+   return !hlist_bl_unhashed(&ce->e_block_list);
 }
 
 
 static void
-__mb_cache_entry_unhash(struct mb_cache_entry *ce)
+__mb_cache_entry_unhash_block(struct mb_cache_entry *ce)
 {
-   if (__mb_cache_entry_is_hashed(ce)) {
-   list_del_init(&ce->e_block_list);
-   list_del(&ce->e_index.o_list);
-   }
+   if (__mb_cache_entry_is_block_hashed(ce))
+   hlist_bl_del_init(&ce->e_block_list);
+}
+
+static inline int
+__mb_cache_entry_is_index_hashed(struct mb_cache_entry *ce)
+{
+   return !hlist_bl_unhashed(&ce->e_index.o_list);
 }
 
 
 static void
+__mb_cache_entry_unhash_index(struct mb_cache_entry *ce)
+{
+   if (__mb_cache_entry_is_index_hashed(ce))
+   hlist_bl_del(&ce->e_index.o_list);
+}
+
+static void
 __mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
 {
struct mb_cache *cache = ce->e_cache;
@@ -128,8 +171,8 @@ __mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t 
gfp_mask)
 
 static void
 __mb_cache_entry_release_unlock(struct mb_cache_entry *ce)
-   __releases(mb_cache_spinlock)
 {
+   hlist_bl_lock(ce->e_index_hash_p);
/* Wake up all processes queuing for this cache entry. */
if (ce->e_queued)
wake_up_all(&mb_cache_queue);
@@ -137,15 +180,20 @@ __mb_cache_entry_release_unlock(struct mb_cache_entry *ce)
ce->e_used -= MB_CACHE_WRITER;
ce->e_used--;
if (!(ce->e_used || ce->e_queued)) {
-   if (!__mb_cache_entry_is_hashed(ce))
+   hlist_bl_unlock(ce->e_index_hash_p);
+   if (!__mb_cache_entry_is_block_hashed(ce))
goto forget;
+   spin_lock(&mb_cache_spinlock);
mb_assert(list_empty(&ce->e_lru_list));
list_add_tail(&ce->e_lru_list, &mb_cache_lru_list);
-   }
-   spin_unlock(&mb_cache_spinlock);
+   spin_unlock(&mb_cache_spinlock);
