[PATCH] Btrfs: save preloaded extent_state's in a percpu cache V2

2011-06-21 Thread Josef Bacik
When doing DIO tracing I noticed we were doing a ton of allocations, a lot
of the time for extent_states.  Some of the time we don't even use the
prealloc'ed extent_state; it just gets freed up.  So instead create a
per-cpu cache, like the radix tree preload code does.  We check whether our
per-cpu cache already has a prealloc'ed extent_state in it and if so we just
continue, else we allocate a new one and fill the cache.  Then if we need a
prealloc'ed extent_state we can just take it out of our per-cpu cache.  We
also refill the cache on free to try to limit the number of times we have to
ask the allocator for new extent_states.  With this patch dbench 50 goes
from ~210 MB/s to ~260 MB/s.  Thanks,

Signed-off-by: Josef Bacik <jo...@redhat.com>
---
V1->V2: Fix an uneven preempt_disable() when we use a prealloc'ed state but
still have to search again.
 fs/btrfs/extent_io.c |  167 +
 1 files changed, 126 insertions(+), 41 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 7055d11..9adc614 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -12,6 +12,7 @@
 #include <linux/pagevec.h>
 #include <linux/prefetch.h>
 #include <linux/cleancache.h>
+#include <linux/cpu.h>
 #include "extent_io.h"
 #include "extent_map.h"
 #include "compat.h"
@@ -31,6 +32,8 @@ static DEFINE_SPINLOCK(leak_lock);
 
 #define BUFFER_LRU_MAX 64
 
+static DEFINE_PER_CPU(struct extent_state *, extent_state_preloads) = NULL;
+
 struct tree_entry {
u64 start;
u64 end;
@@ -71,10 +74,36 @@ free_state_cache:
return -ENOMEM;
 }
 
+static void __free_extent_state(struct extent_state *state)
+{
+        if (!state)
+                return;
+        if (atomic_dec_and_test(&state->refs)) {
+#if LEAK_DEBUG
+                unsigned long flags;
+#endif
+                WARN_ON(state->tree);
+#if LEAK_DEBUG
+                spin_lock_irqsave(&leak_lock, flags);
+                list_del(&state->leak_list);
+                spin_unlock_irqrestore(&leak_lock, flags);
+#endif
+                kmem_cache_free(extent_state_cache, state);
+        }
+}
 void extent_io_exit(void)
 {
         struct extent_state *state;
         struct extent_buffer *eb;
+        int cpu;
+
+        for_each_possible_cpu(cpu) {
+                state = per_cpu(extent_state_preloads, cpu);
+                if (!state)
+                        continue;
+                per_cpu(extent_state_preloads, cpu) = NULL;
+                __free_extent_state(state);
+        }
 
         while (!list_empty(&states)) {
                 state = list_entry(states.next, struct extent_state, leak_list);
@@ -114,16 +143,11 @@ void extent_io_tree_init(struct extent_io_tree *tree,
         tree->mapping = mapping;
 }
 
-static struct extent_state *alloc_extent_state(gfp_t mask)
+static void init_extent_state(struct extent_state *state)
 {
-        struct extent_state *state;
 #if LEAK_DEBUG
         unsigned long flags;
 #endif
-
-        state = kmem_cache_alloc(extent_state_cache, mask);
-        if (!state)
-                return state;
         state->state = 0;
         state->private = 0;
         state->tree = NULL;
@@ -134,6 +158,16 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
 #endif
         atomic_set(&state->refs, 1);
         init_waitqueue_head(&state->wq);
+}
+
+static struct extent_state *alloc_extent_state(gfp_t mask)
+{
+        struct extent_state *state;
+
+        state = kmem_cache_alloc(extent_state_cache, mask);
+        if (!state)
+                return state;
+        init_extent_state(state);
         return state;
 }
 
@@ -142,6 +176,7 @@ void free_extent_state(struct extent_state *state)
         if (!state)
                 return;
         if (atomic_dec_and_test(&state->refs)) {
+                struct extent_state *tmp;
 #if LEAK_DEBUG
                 unsigned long flags;
 #endif
@@ -151,10 +186,53 @@ void free_extent_state(struct extent_state *state)
                 list_del(&state->leak_list);
                 spin_unlock_irqrestore(&leak_lock, flags);
 #endif
-                kmem_cache_free(extent_state_cache, state);
+                preempt_disable();
+                tmp = __get_cpu_var(extent_state_preloads);
+                if (!tmp) {
+                        init_extent_state(state);
+                        __get_cpu_var(extent_state_preloads) = state;
+                } else {
+                        kmem_cache_free(extent_state_cache, state);
+                }
+                preempt_enable();
         }
 }
 
+/*
+ * Pre-load an extent state.
+ */
+static int extent_state_preload(gfp_t mask)
+{
+        struct extent_state *state;
+
+        preempt_disable();
+        state = __get_cpu_var(extent_state_preloads);
+        if (!state) {
+                struct extent_state *tmp;
+                preempt_enable();
+                tmp = alloc_extent_state(mask);
+                if (!tmp)
+                        return -ENOMEM;
+                preempt_disable();
+                state = __get_cpu_var(extent_state_preloads);
+  
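The diff is cut off at this point in the archive, so the rest of extent_state_preload() and its callers are not visible.  As a rough, self-contained illustration of the pattern the changelog describes (preload an object before the critical section, take it from the cache when needed, refill the cache on free), here is a user-space sketch: the _Thread_local pointer stands in for the per-CPU variable, calloc()/free() stand in for the kmem_cache calls, and the names state_preload(), state_take() and state_put() are invented for the example, not taken from the patch.

/*
 * User-space analogue of the per-CPU preload cache described above.
 * _Thread_local replaces DEFINE_PER_CPU and calloc()/free() replace the
 * slab calls; everything here is illustrative, not kernel code.
 */
#include <stdlib.h>

struct state {
        int refs;
        unsigned long bits;
};

/* One cached, pre-initialized object per thread (per CPU in the kernel). */
static _Thread_local struct state *state_cache;

/* Make sure the cache holds an object before entering the critical section. */
static int state_preload(void)
{
        if (state_cache)
                return 0;
        state_cache = calloc(1, sizeof(*state_cache));
        return state_cache ? 0 : -1;
}

/* Take the cached object; only valid after a successful state_preload(). */
static struct state *state_take(void)
{
        struct state *s = state_cache;

        state_cache = NULL;
        return s;
}

/* On free, refill the cache instead of handing the object straight back. */
static void state_put(struct state *s)
{
        if (!state_cache) {
                s->refs = 0;
                s->bits = 0;
                state_cache = s;
        } else {
                free(s);
        }
}

int main(void)
{
        if (state_preload())
                return 1;
        struct state *s = state_take();   /* non-NULL: cache was preloaded */
        s->bits = 1;
        state_put(s);                     /* refills the thread-local cache */
        free(state_take());               /* drain the cache before exit */
        return 0;
}

The kernel version additionally has to disable preemption around every access to the per-CPU slot, which is what the V1->V2 note about an uneven preempt_disable() is addressing.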

Re: [PATCH] Btrfs: save preloaded extent_state's in a percpu cache V2

2011-06-21 Thread Andi Kleen
Josef Bacik <jo...@redhat.com> writes:

> When doing DIO tracing I noticed we were doing a ton of allocations, a lot
> of the time for extent_states.  Some of the time we don't even use the
> prealloc'ed extent_state; it just gets freed up.  So instead create a
> per-cpu cache, like the radix tree preload code does.  We check whether our
> per-cpu cache already has a prealloc'ed extent_state in it and if so we just
> continue, else we allocate a new one and fill the cache.  Then if we need a
> prealloc'ed extent_state we can just take it out of our per-cpu cache.  We
> also refill the cache on free to try to limit the number of times we have to
> ask the allocator for new extent_states.  With this patch dbench 50 goes
> from ~210 MB/s to ~260 MB/s.  Thanks,

You're just reimplementing a poor man's custom slab cache -- all of this is
already done in slab.

If the difference is really that big, better fix slab and have everyone
benefit?

Did you use slub or slab?
Did you analyze where the cycles are spent?

-Andi

-- 
a...@linux.intel.com -- Speaking for myself only
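For context on "already done in slab": the extent_state objects are already allocated from a dedicated kmem_cache (the extent_state_cache used in the patch above), and with SLUB and debugging disabled, kmem_cache_alloc()/kmem_cache_free() are served from per-CPU freelists.  A minimal sketch of such a dedicated cache, using the standard slab API rather than quoting the btrfs code (the flag choice and helper names here are illustrative):

/*
 * Sketch of a dedicated slab cache for extent_state objects.  With SLUB and
 * debugging off, the alloc/free calls below already hit per-CPU freelists,
 * which is the point being made above.  Flags and helper names are
 * illustrative, not copied from btrfs.
 */
#include <linux/slab.h>
#include <linux/errno.h>
#include "extent_io.h"          /* for struct extent_state */

static struct kmem_cache *extent_state_cache;

static int extent_state_cache_init(void)
{
        extent_state_cache = kmem_cache_create("extent_state",
                                               sizeof(struct extent_state), 0,
                                               SLAB_RECLAIM_ACCOUNT, NULL);
        return extent_state_cache ? 0 : -ENOMEM;
}

static struct extent_state *extent_state_alloc(gfp_t mask)
{
        return kmem_cache_alloc(extent_state_cache, mask);   /* per-CPU fast path */
}

static void extent_state_release(struct extent_state *state)
{
        kmem_cache_free(extent_state_cache, state);          /* back to the per-CPU freelist */
}

Josef's follow-up below points to SLUB debugging, which disables that per-CPU fast path, as the source of the slowdown.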


Re: [PATCH] Btrfs: save preloaded extent_state's in a percpu cache V2

2011-06-21 Thread Josef Bacik
On 06/21/2011 04:20 PM, Andi Kleen wrote:
> Josef Bacik <jo...@redhat.com> writes:
>
>> When doing DIO tracing I noticed we were doing a ton of allocations, a lot
>> of the time for extent_states.  Some of the time we don't even use the
>> prealloc'ed extent_state; it just gets freed up.  So instead create a
>> per-cpu cache, like the radix tree preload code does.  We check whether our
>> per-cpu cache already has a prealloc'ed extent_state in it and if so we just
>> continue, else we allocate a new one and fill the cache.  Then if we need a
>> prealloc'ed extent_state we can just take it out of our per-cpu cache.  We
>> also refill the cache on free to try to limit the number of times we have to
>> ask the allocator for new extent_states.  With this patch dbench 50 goes
>> from ~210 MB/s to ~260 MB/s.  Thanks,
>
> You're just reimplementing a poor man's custom slab cache -- all of this is
> already done in slab.
>
> If the difference is really that big, better fix slab and have everyone
> benefit?
>
> Did you use slub or slab?
> Did you analyze where the cycles are spent?
>

Ugh slub debugging bites me again.  Thanks,

Josef
