From 884e2b022a43e31e38f3779a9ee317c30e2a5e17 Mon Sep 17 00:00:00 2001
From: root <root@localhost.localdomain>
Date: Mon, 23 Nov 2015 16:57:54 +0800
Subject: [PATCH] spinlock-test
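
Replace the slow-path call in queued_spin_lock() with a simple cmpxchg
retry loop, and add include/linux/slab_lock.h, which carries a copy of
the stock queued-spinlock slow path (org_queued_spin_lock) alongside an
experimental combining lock (new_spin_lock) that queues the caller's
critical-section callback and lets the current lock holder execute it.

In mm/slab.c, move the n->list_lock critical sections of
cache_alloc_refill() and cache_flusharray() into refill_fn() and
flusharray_fn() so they can run under either lock (selected with
ORG_QUEUED_SPINLOCK), time the locked sections with rdtsc, and print
the accumulated cycle count after every 1,000,000 timed refills.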

---
 include/asm-generic/qspinlock.h |    5 +-
 include/linux/slab_lock.h       |  609 +++++++++++++++++++++++++++++++++++++++
 mm/slab.c                       |  214 +++++++++-----
 3 files changed, 748 insertions(+), 80 deletions(-)
 create mode 100644 include/linux/slab_lock.h

diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
index e2aadbc..5d65b0f 100644
--- a/include/asm-generic/qspinlock.h
+++ b/include/asm-generic/qspinlock.h
@@ -76,11 +76,12 @@ extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
 static __always_inline void queued_spin_lock(struct qspinlock *lock)
 {
 	u32 val;
-
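+	/* spinlock-test: keep retrying the fast path instead of calling the slowpath */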
+repeat:
 	val = atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL);
 	if (likely(val == 0))
 		return;
-	queued_spin_lock_slowpath(lock, val);
+	goto repeat;
+	/* queued_spin_lock_slowpath(lock, val); */
 }
 
 #ifndef queued_spin_unlock
diff --git a/include/linux/slab_lock.h b/include/linux/slab_lock.h
new file mode 100644
index 0000000..1b0c6da
--- /dev/null
+++ b/include/linux/slab_lock.h
@@ -0,0 +1,609 @@
+#ifndef _LINUX_SLAB_LOCK_H
+#define	_LINUX_SLAB_LOCK_H
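+
+/* Read the x86 time-stamp counter; used to time the locked sections in mm/slab.c. */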
+#define HP_TIMING_NOW(Var) \
+	({ unsigned long long _hi, _lo; \
+	   asm volatile ("rdtsc" : "=a" (_lo), "=d" (_hi)); \
+	   (Var) = (_hi << 32) | _lo; })
+
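+/* Cycles accumulated inside the timed lock sections and number of samples taken. */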
+static unsigned long _total_ = 0;
+static unsigned long _number_ = 0;
+static void atomic_addq(unsigned long *v, unsigned long i)
+{
+	asm volatile("lock addq %1,%0"
+		     : "+m" (*v)
+		     : "ir" (i));
+}
+
+#define NEW_MAX_NODES	4
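+
+/*
+ * Per-CPU queue node for the combining lock: a contended caller publishes its
+ * critical-section callback (fn/para) here and spins until ->locked is
+ * cleared, which means the current lock holder has executed it.
+ */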
+struct new_spinlock {
+	struct new_spinlock *next;
+	void *para;
+	void (*fn)(void *);
+	struct new_spinlock *locked;
+	int count;  /* nesting count, see qspinlock.c */
+	int tail;
+};
+
+static DEFINE_PER_CPU_ALIGNED(struct new_spinlock, new_nodes[NEW_MAX_NODES]);
+#define	_Q_SET_MASK(type)	(((1U << _Q_ ## type ## _BITS) - 1)\
+				      << _Q_ ## type ## _OFFSET)
+#define _Q_LOCKED_OFFSET	0
+#define _Q_LOCKED_BITS		8
+#define _Q_LOCKED_MASK		_Q_SET_MASK(LOCKED)
+
+#define _Q_PENDING_OFFSET	(_Q_LOCKED_OFFSET + _Q_LOCKED_BITS)
+#if CONFIG_NR_CPUS < (1U << 14)
+#define _Q_PENDING_BITS		8
+#else
+#define _Q_PENDING_BITS		1
+#endif
+#define _Q_PENDING_MASK		_Q_SET_MASK(PENDING)
+
+#define _Q_TAIL_IDX_OFFSET	(_Q_PENDING_OFFSET + _Q_PENDING_BITS)
+#define _Q_TAIL_IDX_BITS	2
+#define _Q_TAIL_IDX_MASK	_Q_SET_MASK(TAIL_IDX)
+
+#define _Q_TAIL_CPU_OFFSET	(_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS)
+#define _Q_TAIL_CPU_BITS	(32 - _Q_TAIL_CPU_OFFSET)
+#define _Q_TAIL_CPU_MASK	_Q_SET_MASK(TAIL_CPU)
+
+#define _Q_TAIL_OFFSET		_Q_TAIL_IDX_OFFSET
+#define _Q_TAIL_MASK		(_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK)
+
+#define _Q_LOCKED_VAL		(1U << _Q_LOCKED_OFFSET)
+#define _Q_PENDING_VAL		(1U << _Q_PENDING_OFFSET)
+
+
+#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
+
+struct __qspinlock {
+	union {
+		atomic_t val;
+#ifdef __LITTLE_ENDIAN
+		struct {
+			u8	locked;
+			u8	pending;
+		};
+		struct {
+			u16	locked_pending;
+			u16	tail;
+		};
+#else
+		struct {
+			u16	tail;
+			u16	locked_pending;
+		};
+		struct {
+			u8	reserved[2];
+			u8	pending;
+			u8	locked;
+		};
+#endif
+	};
+};
+
+typedef struct nspinlock {
+	atomic_t	val;
+} adl_spinlock_t;
+
+/*
+ * new_xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail)
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 new_xchg_tail(struct nspinlock *lock, u32 tail)
+{
+	struct __qspinlock *l = (void *)lock;
+
+	return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+}
+static inline u32 new_encode_tail(int cpu, int idx)
+{
+	u32 tail;
+
+	tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
+	tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */
+
+	return tail;
+}
+
+static inline struct new_spinlock *new_decode_tail(u32 tail)
+{
+	int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
+	int idx = (tail &  _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
+
+	return per_cpu_ptr(&new_nodes[idx], cpu);
+}
+
+static __always_inline void new_set_locked(struct nspinlock *lock)
+{
+	struct __qspinlock *l = (void *)lock;
+
+	WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
+}
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define _a_queued_spin_lock_slowpath	native_queued_spin_lock_slowpath
+#endif
+#ifndef arch_new_spin_lock_contended
+/*
+ * Using smp_load_acquire() provides a memory barrier that ensures
+ * subsequent operations happen after the lock is acquired.
+ */
+#define arch_new_spin_lock_contended(l)					\
+do {									\
+	while (!(smp_load_acquire(l)))					\
+		cpu_relax_lowlatency();					\
+} while (0)
+#endif
+
+#ifndef arch_new_spin_unlock_contended
+/*
+ * smp_store_release() provides a memory barrier to ensure all
+ * operations in the critical section have been completed before
+ * unlocking.
+ */
+#define arch_new_spin_unlock_contended(l)				\
+	smp_store_release((l), 1)
+#endif
+static __always_inline int new_queued_spin_trylock(struct nspinlock *lock)
+{
+	if (!atomic_read(&lock->val) &&
+	   (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) == 0))
+		return 1;
+	return 0;
+}
+
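+/*
+ * Combining-lock slow path: queue this CPU's node behind the current tail.
+ * If there is a predecessor, spin until ->locked is cleared -- the lock
+ * holder has then already run our callback -- and return.  Otherwise wait
+ * for the lock, take it, run the queued callbacks (including our own) in
+ * order while clearing each node's ->locked, and release.
+ */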
+void new_queued_spin_lock_slowpath(struct nspinlock *lock, u32 val, void (*fn)(void *), void *para)
+{
+	struct new_spinlock *prev, *next, *node;
+	u32  old, tail;
+	int idx;
+
+	node = this_cpu_ptr(&new_nodes[0]);
+	idx = node->count++;
+	tail = new_encode_tail(smp_processor_id(), idx);
+
+	node += idx;
+	node->locked = node;
+	node->next = NULL;
+	node->fn = fn;
+	node->para = para;
+	node->tail = tail;
+
+	if (new_queued_spin_trylock(lock)) {
+		this_cpu_dec(new_nodes[0].count);
+		fn(para);
+		atomic_sub(_Q_LOCKED_VAL, &lock->val);
+		return;
+	}
+
+	old = new_xchg_tail(lock, tail);
+
+	if (old & _Q_TAIL_MASK) {
+		prev = new_decode_tail(old);
+		WRITE_ONCE(prev->next, node);
+
+		/* wait until the lock holder has run our callback */
+		while (READ_ONCE(node->locked))
+			cpu_relax();
+
+		this_cpu_dec(new_nodes[0].count);
+		return;
+	}
+	while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK)
+		cpu_relax();
+
+	new_set_locked(lock);
+
+	old = new_xchg_tail(lock, 0);
+repeat:
+	tail = node->tail;
+	if (old == tail)
+		goto end;
+
+	while (!(next = READ_ONCE(node->next)))
+		cpu_relax();
+
+	node->fn(node->para);
+	node->locked = NULL;
+	tail = next->tail;
+	if (old != tail) {
+		while (!(node = READ_ONCE(next->next)))
+			cpu_relax();
+
+		next->fn(next->para);
+		next->locked = NULL;
+		goto repeat;
+	} else {
+		node = next;
+	}
+
+end:
+	node->fn(node->para);
+	node->locked = NULL;
+
+	this_cpu_dec(new_nodes[0].count);
+	atomic_sub(_Q_LOCKED_VAL, &lock->val);
+}
+
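+/*
+ * Fast path of the combining lock: take the lock with a single cmpxchg and
+ * run fn(para) inline; on contention, fall into the combining slow path.
+ */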
+static __always_inline void new_spin_lock(struct nspinlock *lock, void (*fn)(void *), void *para)
+{
+	u32 val = 0;
+
+	if (atomic_read(&lock->val))
+		goto end;
+
+	val = atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL);
+	if (likely(val == 0)) {
+		fn(para);
+		atomic_sub(_Q_LOCKED_VAL, &lock->val);
+		return;
+	}
+
+end:
+	new_queued_spin_lock_slowpath(lock, val, fn, para);
+}
+/**********************************************************
+ * MCS lock: copy of the stock queued-spinlock slow path
+ * (org_queued_spin_lock), used by the ORG_QUEUED_SPINLOCK
+ * paths in mm/slab.c.
+ **********************************************************/
+
+#define MAX_NODES	4
+
+struct mcs_spinlock {
+	struct mcs_spinlock *next;
+	int locked; /* 1 if lock acquired */
+	int count;  /* nesting count, see qspinlock.c */
+};
+
+static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]);
+#if _Q_PENDING_BITS == 8
+/**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,0 -> *,0,1
+ *
+ * Lock stealing is not allowed if this function is used.
+ */
+static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
+{
+	struct __qspinlock *l = (void *)lock;
+
+	WRITE_ONCE(l->locked_pending, _Q_LOCKED_VAL);
+}
+
+/*
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail)
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+	struct __qspinlock *l = (void *)lock;
+
+	return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+}
+
+#else /* _Q_PENDING_BITS == 8 */
+
+/**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,0 -> *,0,1
+ */
+static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
+{
+	atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
+}
+
+/**
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail)
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+	u32 old, new, val = atomic_read(&lock->val);
+
+	for (;;) {
+		new = (val & _Q_LOCKED_PENDING_MASK) | tail;
+		old = atomic_cmpxchg(&lock->val, val, new);
+		if (old == val)
+			break;
+
+		val = old;
+	}
+	return old;
+}
+#endif /* _Q_PENDING_BITS == 8 */
+
+
+static inline u32 encode_tail(int cpu, int idx)
+{
+	u32 tail;
+
+	tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
+	tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */
+
+	return tail;
+}
+
+static inline struct mcs_spinlock *decode_tail(u32 tail)
+{
+	int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
+	int idx = (tail &  _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
+
+	return per_cpu_ptr(&mcs_nodes[idx], cpu);
+}
+
+static __always_inline void set_locked(struct qspinlock *lock)
+{
+	struct __qspinlock *l = (void *)lock;
+
+	WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
+}
+static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_kick_node(struct qspinlock *lock,
+					   struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_head(struct qspinlock *lock,
+					   struct mcs_spinlock *node) { }
+
+
+#define pv_enabled()		false
+
+#define pv_init_node		__pv_init_node
+#define pv_wait_node		__pv_wait_node
+#define pv_kick_node		__pv_kick_node
+#define pv_wait_head		__pv_wait_head
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define org_queued_spin_lock_slowpath	native_queued_spin_lock_slowpath
+#endif
+#ifndef arch_mcs_spin_lock_contended
+/*
+ * Using smp_load_acquire() provides a memory barrier that ensures
+ * subsequent operations happen after the lock is acquired.
+ */
+#define arch_mcs_spin_lock_contended(l)					\
+do {									\
+	while (!(smp_load_acquire(l)))					\
+		cpu_relax_lowlatency();					\
+} while (0)
+#endif
+
+#ifndef arch_mcs_spin_unlock_contended
+/*
+ * smp_store_release() provides a memory barrier to ensure all
+ * operations in the critical section have been completed before
+ * unlocking.
+ */
+#define arch_mcs_spin_unlock_contended(l)				\
+	smp_store_release((l), 1)
+#endif
+
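+/* Spin until the pending->locked hand-over of the lock word succeeds. */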
+static __always_inline void _queued_pending_lock(struct qspinlock *lock)
+{
+	struct __qspinlock *l = (void *)lock;
+	while (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL) != _Q_PENDING_VAL)
+		cpu_relax();
+}
+
+void  org_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+{
+	struct mcs_spinlock *prev, *next, *node;
+	u32 new, old, tail;
+	int idx;
+
+	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
+
+	if (pv_enabled())
+		goto queue;
+
+	if (virt_spin_lock(lock))
+		return;
+
+	/*
+	 * wait for in-progress pending->locked hand-overs
+	 *
+	 * 0,1,0 -> 0,0,1
+	 */
+
+	if (val == _Q_PENDING_VAL) {
+		while ((val = atomic_read(&lock->val)) == _Q_PENDING_VAL)
+			cpu_relax();
+	}
+
+	/*
+	 * trylock || pending
+	 *
+	 * 0,0,0 -> 0,0,1 ; trylock
+	 * 0,0,1 -> 0,1,1 ; pending
+	 */
+	for (;;) {
+		/*
+		 * If we observe any contention; queue.
+		 */
+		if (val & ~_Q_LOCKED_MASK)
+			goto queue;
+
+		new = _Q_LOCKED_VAL;
+		if (val == new)
+			new |= _Q_PENDING_VAL;
+
+		old = atomic_cmpxchg(&lock->val, val, new);
+		if (old == val)
+			break;
+
+		val = old;
+	}
+
+	/*
+	 * we won the trylock
+	 */
+	if (new == _Q_LOCKED_VAL)
+		return;
+
+	/*
+	 * we're pending, wait for the owner to go away.
+	 *
+	 * *,1,1 -> *,1,0
+	 *
+	 * this wait loop must be a load-acquire such that we match the
+	 * store-release that clears the locked bit and create lock
+	 * sequentiality; this is because not all clear_pending_set_locked()
+	 * implementations imply full barriers.
+	 */
+	while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_MASK)
+		cpu_relax();
+
+	/*
+	 * take ownership and clear the pending bit.
+	 *
+	 * *,1,0 -> *,0,1
+	 */
+	clear_pending_set_locked(lock);
+	return;
+
+	/*
+	 * End of pending bit optimistic spinning and beginning of MCS
+	 * queuing.
+	 */
+queue:
+	node = this_cpu_ptr(&mcs_nodes[0]);
+	idx = node->count++;
+	tail = encode_tail(smp_processor_id(), idx);
+
+	node += idx;
+	node->locked = 0;
+	node->next = NULL;
+	pv_init_node(node);
+
+	/*
+	 * We touched a (possibly) cold cacheline in the per-cpu queue node;
+	 * attempt the trylock once more in the hope someone let go while we
+	 * weren't watching.
+	 */
+	if (queued_spin_trylock(lock))
+		goto release;
+
+	/*
+	 * We have already touched the queueing cacheline; don't bother with
+	 * pending stuff.
+	 *
+	 * p,*,* -> n,*,*
+	 */
+	old = xchg_tail(lock, tail);
+
+	/*
+	 * if there was a previous node; link it and wait until reaching the
+	 * head of the waitqueue.
+	 */
+	if (old & _Q_TAIL_MASK) {
+		prev = decode_tail(old);
+		WRITE_ONCE(prev->next, node);
+
+		pv_wait_node(node);
+		arch_mcs_spin_lock_contended(&node->locked);
+	}
+
+	/*
+	 * we're at the head of the waitqueue, wait for the owner & pending to
+	 * go away.
+	 *
+	 * *,x,y -> *,0,0
+	 *
+	 * this wait loop must use a load-acquire such that we match the
+	 * store-release that clears the locked bit and create lock
+	 * sequentiality; this is because the set_locked() function below
+	 * does not imply a full barrier.
+	 *
+	 */
+	pv_wait_head(lock, node);
+	while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK) {
+		cpu_relax();
+	}
+
+	/*
+	 * claim the lock:
+	 *
+	 * n,0,0 -> 0,0,1 : lock, uncontended
+	 * *,0,0 -> *,0,1 : lock, contended
+	 *
+	 * If the queue head is the only one in the queue (lock value == tail),
+	 * clear the tail code and grab the lock. Otherwise, we only need
+	 * to grab the lock.
+	 */
+	for (;;) {
+		if (val != tail) {
+			set_locked(lock);
+			break;
+		}
+		old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL);
+		if (old == val)
+			goto release;	/* No contention */
+
+		val = old;
+	}
+
+	/*
+	 * contended path; wait for next, release.
+	 */
+	while (!(next = READ_ONCE(node->next)))
+		cpu_relax();
+
+	arch_mcs_spin_unlock_contended(&next->locked);
+	pv_kick_node(lock, next);
+
+release:
+	/*
+	 * release the node
+	 */
+	this_cpu_dec(mcs_nodes[0].count);
+}
+
+static __always_inline void org_queued_spin_lock(struct qspinlock *lock)
+{
+	u32 val;
+
+	val = atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL);
+	if (likely(val == 0))
+		return;
+	org_queued_spin_lock_slowpath(lock, val);
+}
+
+static __always_inline void org_queued_spin_unlock(struct qspinlock *lock)
+{
+	/*
+	 * smp_mb__before_atomic() in order to guarantee release semantics
+	 */
+	/* smp_mb__before_atomic_dec(); */
+	atomic_sub(_Q_LOCKED_VAL, &lock->val);
+}
+#endif	/* _LINUX_SLAB_LOCK_H */
diff --git a/mm/slab.c b/mm/slab.c
index 4fcc5dd..c1c4821 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -87,6 +87,7 @@
  */
 
 #include	<linux/slab.h>
+#include	<linux/slab_lock.h>
 #include	<linux/mm.h>
 #include	<linux/poison.h>
 #include	<linux/swap.h>
@@ -128,7 +129,6 @@
 #include	"internal.h"
 
 #include	"slab.h"
-
 /*
  * DEBUG	- 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
  *		  0 for faster, smaller code (especially in the critical paths).
@@ -2765,104 +2765,135 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
 #define kfree_debugcheck(x) do { } while(0)
 #define cache_free_debugcheck(x,objp,z) (objp)
 #endif
-
-static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
-							bool force_refill)
-{
+struct refill_para {
+	struct kmem_cache *cachep;
 	int batchcount;
 	struct kmem_cache_node *n;
 	struct array_cache *ac;
 	int node;
+};
 
-	check_irq_off();
-	node = numa_mem_id();
-	if (unlikely(force_refill))
-		goto force_grow;
-retry:
-	ac = cpu_cache_get(cachep);
-	batchcount = ac->batchcount;
-	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
-		/*
-		 * If there was little recent activity on this cache, then
-		 * perform only a partial refill.  Otherwise we could generate
-		 * refill bouncing.
-		 */
-		batchcount = BATCHREFILL_LIMIT;
-	}
-	n = get_node(cachep, node);
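+/*
+ * Critical section of cache_alloc_refill(), run with n->list_lock held --
+ * either taken directly or entered via the combining lock.
+ */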
+static void refill_fn(void *p)
+{
 
-	BUG_ON(ac->avail > 0 || !n);
-	spin_lock(&n->list_lock);
+	struct refill_para *pa = p;
 
 	/* See if we can refill from the shared array */
-	if (n->shared && transfer_objects(ac, n->shared, batchcount)) {
-		n->shared->touched = 1;
+	if (pa->n->shared && transfer_objects(pa->ac, pa->n->shared, pa->batchcount)) {
+		pa->n->shared->touched = 1;
 		goto alloc_done;
 	}
 
-	while (batchcount > 0) {
+	while (pa->batchcount > 0) {
 		struct list_head *entry;
 		struct page *page;
 		/* Get slab alloc is to come from. */
-		entry = n->slabs_partial.next;
-		if (entry == &n->slabs_partial) {
-			n->free_touched = 1;
-			entry = n->slabs_free.next;
-			if (entry == &n->slabs_free)
+		entry = pa->n->slabs_partial.next;
+		if (entry == &pa->n->slabs_partial) {
+			pa->n->free_touched = 1;
+			entry = pa->n->slabs_free.next;
+			if (entry == &pa->n->slabs_free)
 				goto must_grow;
 		}
 
 		page = list_entry(entry, struct page, lru);
-		check_spinlock_acquired(cachep);
+		check_spinlock_acquired(pa->cachep);
 
 		/*
 		 * The slab was either on partial or free list so
 		 * there must be at least one object available for
 		 * allocation.
 		 */
-		BUG_ON(page->active >= cachep->num);
+		BUG_ON(page->active >= pa->cachep->num);
 
-		while (page->active < cachep->num && batchcount--) {
-			STATS_INC_ALLOCED(cachep);
-			STATS_INC_ACTIVE(cachep);
-			STATS_SET_HIGH(cachep);
+		while (page->active < pa->cachep->num && pa->batchcount--) {
+			STATS_INC_ALLOCED(pa->cachep);
+			STATS_INC_ACTIVE(pa->cachep);
+			STATS_SET_HIGH(pa->cachep);
 
-			ac_put_obj(cachep, ac, slab_get_obj(cachep, page,
-									node));
+			ac_put_obj(pa->cachep, pa->ac, slab_get_obj(pa->cachep, page,
+									pa->node));
 		}
 
 		/* move slabp to correct slabp list: */
 		list_del(&page->lru);
-		if (page->active == cachep->num)
-			list_add(&page->lru, &n->slabs_full);
+		if (page->active == pa->cachep->num)
+			list_add(&page->lru, &pa->n->slabs_full);
 		else
-			list_add(&page->lru, &n->slabs_partial);
+			list_add(&page->lru, &pa->n->slabs_partial);
 	}
 
 must_grow:
-	n->free_objects -= ac->avail;
+	pa->n->free_objects -= pa->ac->avail;
 alloc_done:
-	spin_unlock(&n->list_lock);
+	return;
+
+}
+
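+/*
+ * Lock used for the timed n->list_lock sections below:
+ * 1 - take the lock with the copied queued spinlock and call the
+ *     critical-section function directly,
+ * 0 - hand the critical-section function to the combining lock instead.
+ */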
+#define ORG_QUEUED_SPINLOCK (1)
+static void *cache_alloc_refill(struct kmem_cache *pcachep, gfp_t flags,
+							bool force_refill)
+{
+	struct refill_para pa;
+	unsigned long start, end;
+
+	pa.cachep = pcachep;
+
+	check_irq_off();
+	pa.node = numa_mem_id();
+	if (unlikely(force_refill))
+		goto force_grow;
+retry:
+	pa.ac = cpu_cache_get(pa.cachep);
+	pa.batchcount = pa.ac->batchcount;
+	if (!pa.ac->touched && pa.batchcount > BATCHREFILL_LIMIT) {
+		/*
+		 * If there was little recent activity on this cache, then
+		 * perform only a partial refill.  Otherwise we could generate
+		 * refill bouncing.
+		 */
+		pa.batchcount = BATCHREFILL_LIMIT;
+	}
+	pa.n = get_node(pa.cachep, pa.node);
+
+	BUG_ON(pa.ac->avail > 0 || !pa.n);
+	HP_TIMING_NOW(start);
+#if ORG_QUEUED_SPINLOCK
+	org_queued_spin_lock((struct qspinlock *)&pa.n->list_lock);
+	refill_fn(&pa);
+	org_queued_spin_unlock((struct qspinlock *)&pa.n->list_lock);
+#else
+	new_spin_lock((struct nspinlock *)&pa.n->list_lock, refill_fn, &pa);
+#endif
+	HP_TIMING_NOW(end);
+
+	atomic_addq(&_total_, end - start);
+	atomic_addq(&_number_, 1);
+
+	if (cmpxchg(&_number_, 1000000, 1) == 1000000) {
+		printk("\n cost time is %lu\n", _total_);
+		_total_ = 0;
+	}
 
-	if (unlikely(!ac->avail)) {
+	if (unlikely(!pa.ac->avail)) {
 		int x;
 force_grow:
-		x = cache_grow(cachep, gfp_exact_node(flags), node, NULL);
+		x = cache_grow(pa.cachep, gfp_exact_node(flags), pa.node, NULL);
 
 		/* cache_grow can reenable interrupts, then ac could change. */
-		ac = cpu_cache_get(cachep);
-		node = numa_mem_id();
+		pa.ac = cpu_cache_get(pa.cachep);
+		pa.node = numa_mem_id();
 
 		/* no objects in sight? abort */
-		if (!x && (ac->avail == 0 || force_refill))
+		if (!x && (pa.ac->avail == 0 || force_refill))
 			return NULL;
 
-		if (!ac->avail)		/* objects refilled by interrupt? */
+		if (!pa.ac->avail)		/* objects refilled by interrupt? */
 			goto retry;
 	}
-	ac->touched = 1;
+	pa.ac->touched = 1;
 
-	return ac_get_obj(cachep, ac, flags, force_refill);
+	return ac_get_obj(pa.cachep, pa.ac, flags, force_refill);
 }
 
 static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
@@ -3316,42 +3347,42 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
 	}
 }
 
-static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
-{
-	int batchcount;
+struct flusharray_para {
 	struct kmem_cache_node *n;
-	int node = numa_mem_id();
-	LIST_HEAD(list);
+	int batchcount;
+	int node;
+	struct list_head list;
+	struct kmem_cache *cachep;
+	struct array_cache *ac;
+};
 
-	batchcount = ac->batchcount;
-#if DEBUG
-	BUG_ON(!batchcount || batchcount > ac->avail);
-#endif
-	check_irq_off();
-	n = get_node(cachep, node);
-	spin_lock(&n->list_lock);
-	if (n->shared) {
-		struct array_cache *shared_array = n->shared;
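+/* Critical section of cache_flusharray(), run with n->list_lock held. */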
+static void flusharray_fn(void *p)
+{
+	struct flusharray_para *pa = p;
+
+	if (pa->n->shared) {
+		struct array_cache *shared_array = pa->n->shared;
 		int max = shared_array->limit - shared_array->avail;
 		if (max) {
-			if (batchcount > max)
-				batchcount = max;
+			if (pa->batchcount > max)
+				pa->batchcount = max;
 			memcpy(&(shared_array->entry[shared_array->avail]),
-			       ac->entry, sizeof(void *) * batchcount);
-			shared_array->avail += batchcount;
+			       pa->ac->entry, sizeof(void *) * pa->batchcount);
+			shared_array->avail += pa->batchcount;
 			goto free_done;
 		}
 	}
 
-	free_block(cachep, ac->entry, batchcount, node, &list);
+	free_block(pa->cachep, pa->ac->entry, pa->batchcount, pa->node, &pa->list);
 free_done:
 #if STATS
 	{
 		int i = 0;
 		struct list_head *p;
 
-		p = n->slabs_free.next;
-		while (p != &(n->slabs_free)) {
+		p = pa->n->slabs_free.next;
+		while (p != &(pa->n->slabs_free)) {
 			struct page *page;
 
 			page = list_entry(p, struct page, lru);
@@ -3360,13 +3391,40 @@ free_done:
 			i++;
 			p = p->next;
 		}
-		STATS_SET_FREEABLE(cachep, i);
+		STATS_SET_FREEABLE(pa->cachep, i);
 	}
 #endif
-	spin_unlock(&n->list_lock);
-	slabs_destroy(cachep, &list);
-	ac->avail -= batchcount;
-	memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
+
+	return;
+}
+static void cache_flusharray(struct kmem_cache *pcachep, struct array_cache *pac)
+{
+	struct flusharray_para pa;
+	unsigned long start, end;
+
+	INIT_LIST_HEAD(&pa.list);
+	pa.node = numa_mem_id();
+	pa.cachep = pcachep;
+	pa.ac = pac;
+	pa.batchcount = pa.ac->batchcount;
+#if DEBUG
+	BUG_ON(!pa.batchcount || pa.batchcount > pa.ac->avail);
+#endif
+	check_irq_off();
+	pa.n = get_node(pa.cachep, pa.node);
+	HP_TIMING_NOW(start);
+#if ORG_QUEUED_SPINLOCK
+	org_queued_spin_lock((struct qspinlock *)&pa.n->list_lock);
+	flusharray_fn(&pa);
+	org_queued_spin_unlock((struct qspinlock *)&pa.n->list_lock);
+#else
+	new_spin_lock((struct nspinlock *)&pa.n->list_lock, flusharray_fn, &pa);
+#endif
+	HP_TIMING_NOW(end);
+	atomic_addq(&_total_, end - start);
+	slabs_destroy(pa.cachep, &pa.list);
+	pa.ac->avail -= pa.batchcount;
+	memmove(pa.ac->entry, &(pa.ac->entry[pa.batchcount]), sizeof(void *)*pa.ac->avail);
 }
 
 /*
-- 
1.7.1

