[PATCH] async_tx: replace page_address with kmap_atomic

2015-07-02 Thread Yuanhan Liu
As a page might belong to highmem.

Strictly nested kmap_atomic() ordering is followed, as described in
Documentation/vm/highmem.txt.
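
For reference, a minimal sketch of the nesting rule this relies on: kmap_atomic()
mappings must be released in the reverse order they were taken. The function name
below is hypothetical, for illustration only; it is not part of the patch.

    #include <linux/highmem.h>
    #include <linux/string.h>

    /* Illustration only: LIFO ordering of atomic kmaps. */
    static void example_copy_page(struct page *dst, struct page *src, size_t len)
    {
            void *d = kmap_atomic(dst);
            void *s = kmap_atomic(src);     /* nested inside the dst mapping */

            memcpy(d, s, len);

            kunmap_atomic(s);               /* release in reverse (LIFO) order */
            kunmap_atomic(d);
    }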

CC: Dan Williams <dan.j.willi...@intel.com>
CC: Shaohua Li <s...@fb.com>
Signed-off-by: Yuanhan Liu <yuanhan@linux.intel.com>
---
 crypto/async_tx/async_pq.c  | 18 +-
 crypto/async_tx/async_raid6_recov.c | 31 ---
 crypto/async_tx/async_xor.c | 17 ++---
 3 files changed, 51 insertions(+), 15 deletions(-)

diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c
index 5d355e0..a408b7e 100644
--- a/crypto/async_tx/async_pq.c
+++ b/crypto/async_tx/async_pq.c
@@ -136,7 +136,7 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int 
offset, int disks,
BUG_ON(i > disks - 3); /* P or Q can't be zero */
srcs[i] = (void*)raid6_empty_zero_page;
} else {
-   srcs[i] = page_address(blocks[i]) + offset;
+   srcs[i] = kmap_atomic(blocks[i]) + offset;
if (i < disks - 2) {
stop = i;
if (start == -1)
@@ -150,6 +150,12 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int 
offset, int disks,
raid6_call.xor_syndrome(disks, start, stop, len, srcs);
} else
raid6_call.gen_syndrome(disks, len, srcs);
+
+   for (i = disks; i--; ) {
+   if (blocks[i])
+   kunmap_atomic(srcs[i]);
+   }
+
async_tx_sync_epilog(submit);
 }
 
@@ -395,14 +401,15 @@ async_syndrome_val(struct page **blocks, unsigned int 
offset, int disks,
 */
tx = NULL;
*pqres = 0;
+   s = kmap_atomic(spare) + offset;
if (p_src) {
init_async_submit(submit, ASYNC_TX_XOR_ZERO_DST, NULL,
  NULL, NULL, scribble);
tx = async_xor(spare, blocks, offset, disks-2, len, 
submit);
async_tx_quiesce(&tx);
-   p = page_address(p_src) + offset;
-   s = page_address(spare) + offset;
+   p = kmap_atomic(p_src) + offset;
*pqres |= !!memcmp(p, s, len) << SUM_CHECK_P;
+   kunmap_atomic(p);
}
 
if (q_src) {
@@ -411,10 +418,11 @@ async_syndrome_val(struct page **blocks, unsigned int 
offset, int disks,
init_async_submit(submit, 0, NULL, NULL, NULL, 
scribble);
tx = async_gen_syndrome(blocks, offset, disks, len, 
submit);
async_tx_quiesce(&tx);
-   q = page_address(q_src) + offset;
-   s = page_address(spare) + offset;
+   q = kmap_atomic(q_src) + offset;
*pqres |= !!memcmp(q, s, len) << SUM_CHECK_Q;
+   kunmap_atomic(q);
}
+   kunmap_atomic(s);
 
/* restore P, Q and submit */
P(blocks, disks) = p_src;
diff --git a/crypto/async_tx/async_raid6_recov.c 
b/crypto/async_tx/async_raid6_recov.c
index 934a849..abcacb0 100644
--- a/crypto/async_tx/async_raid6_recov.c
+++ b/crypto/async_tx/async_raid6_recov.c
@@ -80,9 +80,9 @@ async_sum_product(struct page *dest, struct page **srcs, 
unsigned char *coef,
async_tx_quiesce(&submit->depend_tx);
amul = raid6_gfmul[coef[0]];
bmul = raid6_gfmul[coef[1]];
-   a = page_address(srcs[0]);
-   b = page_address(srcs[1]);
-   c = page_address(dest);
+   a = kmap_atomic(srcs[0]);
+   b = kmap_atomic(srcs[1]);
+   c = kmap_atomic(dest);
 
while (len--) {
ax= amul[*a++];
@@ -90,6 +90,10 @@ async_sum_product(struct page *dest, struct page **srcs, 
unsigned char *coef,
*c++ = ax ^ bx;
}
 
+   kunmap_atomic(c);
+   kunmap_atomic(b);
+   kunmap_atomic(a);
+
return NULL;
 }
 
@@ -147,12 +151,15 @@ async_mult(struct page *dest, struct page *src, u8 coef, 
size_t len,
 */
async_tx_quiesce(&submit->depend_tx);
qmul  = raid6_gfmul[coef];
-   d = page_address(dest);
-   s = page_address(src);
+   d = kmap_atomic(dest);
+   s = kmap_atomic(src);
 
while (len--)
*d++ = qmul[*s++];
 
+   kunmap_atomic(s);
+   kunmap_atomic(d);
+
return NULL;
 }
 
@@ -372,10 +379,15 @@ async_raid6_2data_recov(int disks, size_t bytes, int 
faila, int failb,
if (blocks[i] == NULL)
ptrs[i] = (void *) raid6_empty_zero_page;
else
-   ptrs[i] = page_address(blocks[i]);
+   ptrs[i] = kmap_atomic(blocks[i]);
 
raid6_2data_recov(disks, bytes, faila, failb, ptrs);
 
+ 


Re: [PATCH 3/3] md/raid5: per hash value and exclusive wait_for_stripe

2015-05-13 Thread Yuanhan Liu
On Thu, May 14, 2015 at 03:45:11PM +1000, NeilBrown wrote:
> On Wed, 29 Apr 2015 10:48:55 +0800 Yuanhan Liu <yuanhan@linux.intel.com>
> wrote:
> 
> > diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> > index 64d5bea..697d77a 100644
> > --- a/drivers/md/raid5.c
> > +++ b/drivers/md/raid5.c
> > @@ -344,7 +344,8 @@ static void release_inactive_stripe_list(struct r5conf 
> > *conf,
> >  int hash)
> >  {
> > int size;
> > -   bool do_wakeup = false;
> > +   unsigned long do_wakeup = 0;
> > +   int i = 0;
> > unsigned long flags;
> >  
> > if (hash == NR_STRIPE_HASH_LOCKS) {
> > @@ -365,15 +366,19 @@ static void release_inactive_stripe_list(struct 
> > r5conf *conf,
> > !list_empty(list))
> > atomic_dec(&conf->empty_inactive_list_nr);
> > list_splice_tail_init(list, conf->inactive_list + hash);
> > -   do_wakeup = true;
> > +   do_wakeup |= 1 << (size - 1);
> > spin_unlock_irqrestore(conf->hash_locks + hash, flags);
> > }
> > size--;
> > hash--;
> > }
> >  
> > +   for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) {
> > +   if (do_wakeup & (1 << i))
> > +   wake_up(&conf->wait_for_stripe[i]);
> > +   }
> > +
> 
> hi,
>  I've been doing some testing and got a lock-up in resize_stripes, waiting
>  on wait_for_stripe[].
> 
>  Looking at the above code,  I think
>   do_wakeup |= 1 << (size - 1);
>  should be
>   do_wakeup |= 1 << hash;
> 
>  do you agree?  Or am I missing something?

Right. Sorry for the careless mistake.
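
For clarity, here is a minimal sketch of the intended bookkeeping (illustration
only; the struct and function names are made up, only NR_STRIPE_HASH_LOCKS and
the per-hash wait queues mirror the patch above):

    #include <linux/wait.h>

    #define NR_STRIPE_HASH_LOCKS 8

    struct example_conf {
            wait_queue_head_t wait_for_stripe[NR_STRIPE_HASH_LOCKS];
    };

    /*
     * released_mask has bit "hash" set for every bucket that got stripes
     * back, i.e. the caller does "released_mask |= 1UL << hash" -- not
     * "1 << (size - 1)" -- while holding that bucket's hash lock.
     */
    static void example_wake_released(struct example_conf *conf,
                                      unsigned long released_mask)
    {
            int i;

            for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
                    if (released_mask & (1UL << i))
                            wake_up(&conf->wait_for_stripe[i]);
    }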

--yliu



Re: [PATCH 1/2] md/raid5: avoid duplicate code

2015-05-07 Thread Yuanhan Liu
On Fri, May 08, 2015 at 03:28:00PM +1000, NeilBrown wrote:
> On Wed,  6 May 2015 17:45:49 +0800 Yuanhan Liu <yuanhan@linux.intel.com>
> wrote:
> 
> > Move the code that put one idle sh(hot in cache, but happens to be
> > zero referenced) back to active stage to __find_stripe(). Because
> > that's what need to do every time you invoke __find_stripe().
> > 
> > Moving it there avoids duplicate code, as well as makes a bit more
> > sense, IMO, as it tells a whole story now.
> 
> Thanks for this.  It is a good cleanup.
> 
> However I don't want to make any new changes to the RAID5 code until I find a
> couple of bugs that I'm hunting.  So I won't apply it just yet.
> Remind me in a couple of weeks if I seem to have forgotten.

Got it. Thanks.


--yliu
> 
> > 
> > Signed-off-by: Yuanhan Liu <yuanhan@linux.intel.com>
> > ---
> >  drivers/md/raid5.c | 50 ++
> >  1 file changed, 18 insertions(+), 32 deletions(-)
> > 
> > diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> > index 77dfd72..e7fa818 100644
> > --- a/drivers/md/raid5.c
> > +++ b/drivers/md/raid5.c
> > @@ -567,8 +567,25 @@ static struct stripe_head *__find_stripe(struct r5conf 
> > *conf, sector_t sector,
> >  
> > pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
> > hlist_for_each_entry(sh, stripe_hash(conf, sector), hash)
> > -   if (sh->sector == sector && sh->generation == generation)
> > +   if (sh->sector == sector && sh->generation == generation) {
> > +   if (!atomic_inc_not_zero(&sh->count)) {
> > +   spin_lock(&conf->device_lock);
> > +   if (!atomic_read(&sh->count)) {
> > +   if (!test_bit(STRIPE_HANDLE, &sh->state))
> > +   atomic_inc(&conf->active_stripes);
> > +   BUG_ON(list_empty(&sh->lru) &&
> > +  !test_bit(STRIPE_EXPANDING, &sh->state));
> > +   list_del_init(&sh->lru);
> > +   if (sh->group) {
> > +   sh->group->stripes_cnt--;
> > +   sh->group = NULL;
> > +   }
> > +   }
> > +   atomic_inc(&sh->count);
> > +   spin_unlock(&conf->device_lock);
> > +   }
> > return sh;
> > +   }
> > pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
> > return NULL;
> >  }
> > @@ -698,21 +715,6 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
> > init_stripe(sh, sector, previous);
> > atomic_inc(&sh->count);
> > }
> > -   } else if (!atomic_inc_not_zero(&sh->count)) {
> > -   spin_lock(&conf->device_lock);
> > -   if (!atomic_read(&sh->count)) {
> > -   if (!test_bit(STRIPE_HANDLE, &sh->state))
> > -   atomic_inc(&conf->active_stripes);
> > -   BUG_ON(list_empty(&sh->lru) &&
> > -  !test_bit(STRIPE_EXPANDING, &sh->state));
> > -   list_del_init(&sh->lru);
> > -   if (sh->group) {
> > -   sh->group->stripes_cnt--;
> > -   sh->group = NULL;
> > -   }
> > -   }
> > -   atomic_inc(&sh->count);
> > -   spin_unlock(&conf->device_lock);
> > }
> > } while (sh == NULL);
> >  
> > @@ -771,22 +773,6 @@ static void stripe_add_to_batch_list(struct r5conf 
> > *conf, struct stripe_head *sh
> > hash = stripe_hash_locks_hash(head_sector);
> > spin_lock_irq(conf->hash_locks + hash);
> > head = __find_stripe(conf, head_sector, conf->generation);
> > -   if (head && !atomic_inc_not_zero(&head->count)) {
> > -   spin_lock(&conf->device_lock);
> > -   if (!atomic_read(&head->count)) {
> > -   if (!test_bit(STRIPE_HANDLE, &head->state))
> > -   atomic_inc(&conf->active_stripes);
> > -   BUG_ON(list_empty(&head->lru) &&


[PATCH 2/2] md/raid5: remove unnecessary sh->count check

2015-05-06 Thread Yuanhan Liu
Remove the unnecessary "!atomic_read(&sh->count)" check, as the previous
"atomic_inc_not_zero(&sh->count)" check assures sh->count to be 0.

The only reason I can think of that we need such check is to consider
the lock race issue.

First of all, I doubt there is another process could modify an in-cache
but zero referenced sh while it's being protected by a hash lock. Hence,
I would say sh->count will be consistent to 0 in that  "if !atomic_inc_not_zero"
block.

Secondly, just assume there is a chance that someone outside the lock
modifies sh->count(by atomic_inc?). It could lead to some problem.

To make it clear, here I paste few lines of key code:

if (!atomic_inc_not_zero(&sh->count)) {
spin_lock(&conf->device_lock);
if (!atomic_read(&sh->count)) {

}
...
}

At the time we enter the first if block, sh->count is zero. And just assume
someone increases sh->count from somewhere while acquiring the lock,
the following if block will not be executed then, leaving some fields,
such as conf->active_stripes, not being set properly.

So, we should execute the second if block whenever we entered the first
if block no matter sh->count stays with 0 or not.
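
To make that easier to follow, here is a rough sketch of what
atomic_inc_not_zero() guarantees (illustration only, not the real kernel
implementation): it increments the counter unless it is zero, and reports
whether the increment happened.

    #include <linux/atomic.h>

    /* Sketch: increment *v unless it is 0; return 1 if incremented, else 0. */
    static inline int example_inc_not_zero(atomic_t *v)
    {
            int old;

            do {
                    old = atomic_read(v);
                    if (old == 0)
                            return 0;       /* caller takes the device_lock path */
            } while (atomic_cmpxchg(v, old, old + 1) != old);

            return 1;
    }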

Signed-off-by: Yuanhan Liu <yuanhan@linux.intel.com>
---

Neil, I'm a bit concerned that I missed something in this patch. Please
kindly correct me if I'm wrong :)

---
 drivers/md/raid5.c | 18 --
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e7fa818..17ece2a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -570,16 +570,14 @@ static struct stripe_head *__find_stripe(struct r5conf 
*conf, sector_t sector,
if (sh->sector == sector && sh->generation == generation) {
if (!atomic_inc_not_zero(&sh->count)) {
spin_lock(&conf->device_lock);
-   if (!atomic_read(&sh->count)) {
-   if (!test_bit(STRIPE_HANDLE, &sh->state))
-   atomic_inc(&conf->active_stripes);
-   BUG_ON(list_empty(&sh->lru) &&
-  !test_bit(STRIPE_EXPANDING, &sh->state));
-   list_del_init(&sh->lru);
-   if (sh->group) {
-   sh->group->stripes_cnt--;
-   sh->group = NULL;
-   }
+   if (!test_bit(STRIPE_HANDLE, &sh->state))
+   atomic_inc(&conf->active_stripes);
+   BUG_ON(list_empty(&sh->lru) &&
+  !test_bit(STRIPE_EXPANDING, &sh->state));
+   list_del_init(&sh->lru);
+   if (sh->group) {
+   sh->group->stripes_cnt--;
+   sh->group = NULL;
}
atomic_inc(&sh->count);
spin_unlock(&conf->device_lock);
-- 
1.9.0



[PATCH 1/2] md/raid5: avoid duplicate code

2015-05-06 Thread Yuanhan Liu
Move the code that put one idle sh(hot in cache, but happens to be
zero referenced) back to active stage to __find_stripe(). Because
that's what need to do every time you invoke __find_stripe().

Moving it there avoids duplicate code, as well as makes a bit more
sense, IMO, as it tells a whole story now.
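
Roughly, the intent is the following (illustration only, simplified from the
diff below; error handling and locking context omitted):

    /* Before: every caller of __find_stripe() repeated the re-activation dance. */
    sh = __find_stripe(conf, sector, conf->generation);
    if (sh && !atomic_inc_not_zero(&sh->count)) {
            spin_lock(&conf->device_lock);
            /* ...put the cached, zero-referenced stripe back to active... */
            atomic_inc(&sh->count);
            spin_unlock(&conf->device_lock);
    }

    /*
     * After: __find_stripe() takes the reference (and re-activates an idle
     * stripe) itself, so callers just use the returned stripe or NULL.
     */
    sh = __find_stripe(conf, sector, conf->generation);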

Signed-off-by: Yuanhan Liu <yuanhan@linux.intel.com>
---
 drivers/md/raid5.c | 50 ++
 1 file changed, 18 insertions(+), 32 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 77dfd72..e7fa818 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -567,8 +567,25 @@ static struct stripe_head *__find_stripe(struct r5conf 
*conf, sector_t sector,
 
pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
hlist_for_each_entry(sh, stripe_hash(conf, sector), hash)
-   if (sh->sector == sector && sh->generation == generation)
+   if (sh->sector == sector && sh->generation == generation) {
+   if (!atomic_inc_not_zero(&sh->count)) {
+   spin_lock(&conf->device_lock);
+   if (!atomic_read(&sh->count)) {
+   if (!test_bit(STRIPE_HANDLE, &sh->state))
+   atomic_inc(&conf->active_stripes);
+   BUG_ON(list_empty(&sh->lru) &&
+  !test_bit(STRIPE_EXPANDING, &sh->state));
+   list_del_init(&sh->lru);
+   if (sh->group) {
+   sh->group->stripes_cnt--;
+   sh->group = NULL;
+   }
+   }
+   atomic_inc(&sh->count);
+   spin_unlock(&conf->device_lock);
+   }
return sh;
+   }
pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
return NULL;
 }
@@ -698,21 +715,6 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
init_stripe(sh, sector, previous);
atomic_inc(&sh->count);
}
-   } else if (!atomic_inc_not_zero(&sh->count)) {
-   spin_lock(&conf->device_lock);
-   if (!atomic_read(&sh->count)) {
-   if (!test_bit(STRIPE_HANDLE, &sh->state))
-   atomic_inc(&conf->active_stripes);
-   BUG_ON(list_empty(&sh->lru) &&
-  !test_bit(STRIPE_EXPANDING, &sh->state));
-   list_del_init(&sh->lru);
-   if (sh->group) {
-   sh->group->stripes_cnt--;
-   sh->group = NULL;
-   }
-   }
-   atomic_inc(&sh->count);
-   spin_unlock(&conf->device_lock);
}
} while (sh == NULL);
 
@@ -771,22 +773,6 @@ static void stripe_add_to_batch_list(struct r5conf *conf, 
struct stripe_head *sh
hash = stripe_hash_locks_hash(head_sector);
spin_lock_irq(conf->hash_locks + hash);
head = __find_stripe(conf, head_sector, conf->generation);
-   if (head && !atomic_inc_not_zero(&head->count)) {
-   spin_lock(&conf->device_lock);
-   if (!atomic_read(&head->count)) {
-   if (!test_bit(STRIPE_HANDLE, &head->state))
-   atomic_inc(&conf->active_stripes);
-   BUG_ON(list_empty(&head->lru) &&
-  !test_bit(STRIPE_EXPANDING, &head->state));
-   list_del_init(&head->lru);
-   if (head->group) {
-   head->group->stripes_cnt--;
-   head->group = NULL;
-   }
-   }
-   atomic_inc(&head->count);
-   spin_unlock(&conf->device_lock);
-   }
spin_unlock_irq(conf->hash_locks + hash);
 
if (!head)
-- 
1.9.0





Re: [PATCH] md/raid5: init batch_xxx for new sh at resize_stripes

2015-05-04 Thread Yuanhan Liu
On Mon, May 04, 2015 at 05:24:24PM +1000, NeilBrown wrote:
> On Mon,  4 May 2015 13:50:24 +0800 Yuanhan Liu <yuanhan@linux.intel.com>
> wrote:
> 
> > This is to fix a kernel NULL dereference oops introduced by commit
> > 59fc630b("RAID5: batch adjacent full stripe write"), which introduced
> > several batch_xxx fields, and did initiation for them at grow_one_stripes(),
> > but forgot to do same at resize_stripes().
> > 
> > This oops can be easily triggered by following steps:
> > 
> > __create RAID5 /dev/md0
> > __grow /dev/md0
> > mdadm --wait /dev/md0
> > dd if=/dev/zero of=/dev/md0
> > 
> > Here is the detailed oops log:
...
> > 
> > Cc: Shaohua Li <s...@kernel.org>
> > Signed-off-by: Yuanhan Liu <yuanhan@linux.intel.com>
> > ---
> >  drivers/md/raid5.c | 4 
> >  1 file changed, 4 insertions(+)
> > 
> > diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> > index 697d77a..7b074f7 100644
> > --- a/drivers/md/raid5.c
> > +++ b/drivers/md/raid5.c
> > @@ -2217,6 +2217,10 @@ static int resize_stripes(struct r5conf *conf, int 
> > newsize)
> > if (!p)
> > err = -ENOMEM;
> > }
> > +
> > +   spin_lock_init(&nsh->batch_lock);
> > +   INIT_LIST_HEAD(&nsh->batch_list);
> > +   nsh->batch_head = NULL;
> > release_stripe(nsh);
> > }
> > /* critical section pass, GFP_NOIO no longer needed */
> 
> Thanks!
> 
> However I already have the following fix queued - though not pushed  out

Yeah, much cleaner.


> you.  I probably would have got it into -rc2 except that I was chasing
> another raid5 bug.  The
>   BUG_ON(sh->batch_head);
> 
> in handle_stripe_fill() fires when I run the mdadm selftests.  I got caught
> up chasing that and didn't push the other fix.

I am not aware of there is a selftests for raid. I'd like to add it to our 0day
kernel testing in near future so that we could catch bugs and bisect it down in
first time ;)

--yliu
> 
> 
> From 3dd8ba734349e602fe17d647ce3da5f4a13748aa Mon Sep 17 00:00:00 2001
> From: NeilBrown <ne...@suse.de>
> Date: Thu, 30 Apr 2015 11:24:28 +1000
> Subject: [PATCH] md/raid5 new alloc_stripe function.
> 
> 
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 77dfd720aaa0..91a1e8b26b52 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -1971,17 +1971,30 @@ static void raid_run_ops(struct stripe_head *sh, 
> unsigned long ops_request)
>   put_cpu();
>  }
>  
> +static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp)
> +{
> + struct stripe_head *sh;
> +
> + sh = kmem_cache_zalloc(sc, gfp);
> + if (sh) {
> + spin_lock_init(&sh->stripe_lock);
> + spin_lock_init(&sh->batch_lock);
> + INIT_LIST_HEAD(&sh->batch_list);
> + INIT_LIST_HEAD(&sh->lru);
> + atomic_set(&sh->count, 1);
> + }
> + return sh;
> +}
>  static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
>  {
>   struct stripe_head *sh;
> - sh = kmem_cache_zalloc(conf->slab_cache, gfp);
> +
> + sh = alloc_stripe(conf->slab_cache, gfp);
>   if (!sh)
>   return 0;
>  
>   sh->raid_conf = conf;
>  
> - spin_lock_init(&sh->stripe_lock);
> -
>   if (grow_buffers(sh, gfp)) {
>   shrink_buffers(sh);
>   kmem_cache_free(conf->slab_cache, sh);
> @@ -1990,13 +2003,8 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t 
> gfp)
>   sh->hash_lock_index =
>   conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
>   /* we just created an active stripe so... */
> - atomic_set(&sh->count, 1);
>   atomic_inc(&conf->active_stripes);
> - INIT_LIST_HEAD(&sh->lru);
> 
> - spin_lock_init(&sh->batch_lock);
> - INIT_LIST_HEAD(&sh->batch_list);
> - sh->batch_head = NULL;
>   release_stripe(sh);
>   conf->max_nr_stripes++;
>   return 1;
> @@ -2109,13 +2117,11 @@ static int resize_stripes(struct r5conf *conf, int 
> newsize)
>   return -ENOMEM;
>  
>   for (i = conf->max_nr_stripes; i; i--) {
> - nsh = kmem_cache_zalloc(sc, GFP_KERNEL);
> + nsh = alloc_stripe(sc, GFP_KERNEL);
>   if (!nsh)
>   break;
>  
>   nsh->raid_conf = conf;
> - spin_lock_init(&nsh->stripe_lock);
> -
>   list_add(&nsh->lru, &newstripes);
>   }
>   if (i) {
> @@ -2142,13 +2148,11 @@ static int resize_stripes(struct r5conf *conf, int 
> newsize)
>   lock_device_hash_lock(conf, hash));
>   osh = get_free_stripe(conf, hash);
>   unlock_device_hash_lock(conf, hash);
> - atomic_set(&nsh->count, 1);
> +
>   for(i=0; i<conf->pool_size; i++) {
>   nsh->dev[i].page = osh->dev[i].page;
>   nsh->dev[i].orig_page = osh->dev[i].page;
>   }
> - for( ; i<newsize; i++)
> - nsh->dev[i].page = NULL;
>   nsh->hash_lock_index = hash;
>   kmem_cache_free(conf->slab_cache, osh);
>   cnt++;

[PATCH] md/raid5: init batch_xxx for new sh at resize_stripes

2015-05-03 Thread Yuanhan Liu
This is to fix a kernel NULL dereference oops introduced by commit
59fc630b("RAID5: batch adjacent full stripe write"), which introduced
several batch_xxx fields, and did initiation for them at grow_one_stripes(),
but forgot to do same at resize_stripes().

This oops can be easily triggered by following steps:

__create RAID5 /dev/md0
__grow /dev/md0
mdadm --wait /dev/md0
dd if=/dev/zero of=/dev/md0

Here is the detailed oops log:

[   32.384499] BUG: unable to handle kernel NULL pointer dereference at 
  (null)
[   32.385366] IP: [] add_stripe_bio+0x48d/0x544
[   32.385955] PGD 373f3067 PUD 36e34067 PMD 0
[   32.386404] Oops: 0002 [#1] SMP
[   32.386740] Modules linked in:
[   32.387040] CPU: 0 PID: 1059 Comm: kworker/u2:2 Not tainted 
4.0.0-next-20150427+ #107
[   32.387762] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014
[   32.388044] Workqueue: writeback bdi_writeback_workfn (flush-9:0)
[   32.388044] task: 88003d038000 ti: 88003d40c000 task.ti: 
88003d40c000
[   32.388044] RIP: 0010:[]  [] 
add_stripe_bio+0x48d/0x544
[   32.388044] RSP: :88003d40f6f8  EFLAGS: 00010046
[   32.388044] RAX:  RBX: 880037168cd0 RCX: 880037179a28
[   32.388044] RDX: 880037168d58 RSI:  RDI: 880037179a20
[   32.388044] RBP: 88003d40f738 R08: 0410 R09: 0410
[   32.388044] R10: 0410 R11: 0002 R12: 8800371799a0
[   32.388044] R13: 88003c3d0800 R14: 0001 R15: 880037179a08
[   32.388044] FS:  () GS:88003fc0() 
knlGS:
[   32.388044] CS:  0010 DS:  ES:  CR0: 8005003b
[   32.388044] CR2:  CR3: 36e33000 CR4: 06f0
[   32.388044] Stack:
[   32.388044]  0002 880037168d38 88003d40f738 
88003c3abd00
[   32.388044]  88003c2df800 88003c3d0800 0408 
88003c3d0b54
[   32.388044]  88003d40f828 8184b9ea 3d40f7e8 
0292
[   32.388044] Call Trace:
[   32.388044]  [] make_request+0x7a8/0xaee
[   32.388044]  [] ? wait_woken+0x79/0x79
[   32.388044]  [] ? kmem_cache_alloc+0x95/0x1b6
[   32.388044]  [] md_make_request+0xeb/0x1c3
[   32.388044]  [] ? mempool_alloc+0x64/0x127
[   32.388044]  [] generic_make_request+0x9c/0xdb
[   32.388044]  [] submit_bio+0xf6/0x134
[   32.388044]  [] _submit_bh+0x119/0x141
[   32.388044]  [] submit_bh+0x10/0x12
[   32.388044]  [] 
__block_write_full_page.constprop.30+0x1a3/0x2a4
[   32.388044]  [] ? I_BDEV+0xd/0xd
[   32.388044]  [] block_write_full_page+0xab/0xaf
[   32.388044]  [] blkdev_writepage+0x18/0x1a
[   32.388044]  [] __writepage+0x14/0x2d
[   32.388044]  [] write_cache_pages+0x29a/0x3a7
[   32.388044]  [] ? mapping_tagged+0x14/0x14
[   32.388044]  [] generic_writepages+0x3e/0x56
[   32.388044]  [] do_writepages+0x1e/0x2c
[   32.388044]  [] __writeback_single_inode+0x5b/0x27e
[   32.388044]  [] writeback_sb_inodes+0x1dc/0x358
[   32.388044]  [] __writeback_inodes_wb+0x7f/0xb8
[   32.388044]  [] wb_writeback+0x11a/0x271
[   32.388044]  [] ? global_dirty_limits+0x1b/0xfd
[   32.388044]  [] bdi_writeback_workfn+0x1ae/0x360
[   32.388044]  [] process_one_work+0x1c2/0x340
[   32.388044]  [] worker_thread+0x28b/0x389
[   32.388044]  [] ? cancel_delayed_work_sync+0x15/0x15
[   32.388044]  [] kthread+0xd2/0xda
[   32.388044]  [] ? kthread_create_on_node+0x17c/0x17c
[   32.388044]  [] ret_from_fork+0x42/0x70
[   32.388044]  [] ? kthread_create_on_node+0x17c/0x17c
[   32.388044] Code: 84 24 90 00 00 00 48 8d 93 88 00 00 00 49 8d 8c 24 88 00 
00 00 49 89 94 24 90 00 00 00 48 89 8b 88 00 00 00 48 89 83 90 00 00 00 <48> 89 
10 66 41 83 84 24 80 00 00 00 01 3e 0f ba 73 48 06 72 02
[   32.388044] RIP  [] add_stripe_bio+0x48d/0x544
[   32.388044]  RSP 
[   32.388044] CR2: 
[   32.388044] ---[ end trace 2b255d3f55be9eb3 ]---

Cc: Shaohua Li <s...@kernel.org>
Signed-off-by: Yuanhan Liu <yuanhan@linux.intel.com>
---
 drivers/md/raid5.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 697d77a..7b074f7 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2217,6 +2217,10 @@ static int resize_stripes(struct r5conf *conf, int 
newsize)
if (!p)
err = -ENOMEM;
}
+
+   spin_lock_init(&nsh->batch_lock);
+   INIT_LIST_HEAD(&nsh->batch_list);
+   nsh->batch_head = NULL;
release_stripe(nsh);
}
/* critical section pass, GFP_NOIO no longer needed */
-- 
1.9.0



[LKP] [genirq] d5b2eacdbc2: BUG: unable to handle kernel NULL pointer dereference at (null)

2015-04-30 Thread Yuanhan Liu
FYI, we noticed the below changes on

https://github.com/jiangliu/linux.git test/irq_common_data_v2
commit d5b2eacdbc280da7c6dfbe0f52bb293ef227d349 ("genirq: Introduce struct 
irq_common_data to host shared irq data")


+-+++
| | 39fb394021 | 
d5b2eacdbc |
+-+++
| boot_successes  | 0  | 0  
|
| boot_failures   | 22 | 20 
|
| PM:Hibernation_image_not_present_or_could_not_be_loaded | 22 |
|
| BUG:unable_to_handle_kernel | 0  | 20 
|
| Oops| 0  | 20 
|
| Kernel_panic-not_syncing:Fatal_exception_in_interrupt   | 0  | 20 
|
| backtrace:__pci_register_driver | 0  | 6  
|
| backtrace:e1000_init_module | 0  | 6  
|
| backtrace:kernel_init_freeable  | 0  | 6  
|
| backtrace:ata_sff_pio_task  | 0  | 14 
|
+-+++


[1.351055] ata2.01: NODEV after polling detection
[1.352179] ata2.00: ATAPI: QEMU DVD-ROM, 2.1.2, max UDMA/100
[1.353501] ata2.00: configured for MWDMA2
[1.354423] BUG: unable to handle kernel NULL pointer dereference at 
  (null)
[1.356074] IP: [<  (null)>]   (null)
[1.356074] PGD 0 
[1.356074] Oops: 0010 [#1] SMP 
[1.356074] Modules linked in:
[1.356074] CPU: 0 PID: 584 Comm: kworker/0:1 Not tainted 
4.1.0-rc1-wl-ath-00905-geb3b9ec #1
[1.356074] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
1.7.5-20140531_083030-gandalf 04/01/2014
[1.356074] Workqueue: ata_sff ata_sff_pio_task
[1.356074] task: 880011c2af30 ti: 8800123bc000 task.ti: 
8800123bc000
[1.356074] RIP: 0010:[<>]  [<  (null)>]   
(null)
[1.356074] RSP: :880013803ee0  EFLAGS: 00010046
[1.356074] RAX: 8222b2c0 RBX: 88001349fc80 RCX: 0009
[1.356074] RDX: 88001348f400 RSI: ffc0 RDI: 88001349fc80
[1.356074] RBP: 880013803ef8 R08:  R09: 0013
[1.356074] R10: 0006 R11:  R12: 88001348f400
[1.356074] R13: 000f R14: 8800123bfc78 R15: 
[1.356074] FS:  () GS:88001380() 
knlGS:
[1.356074] CS:  0010 DS:  ES:  CR0: 8005003b
[1.356074] CR2:  CR3: 0220b000 CR4: 06f0
[1.356074] Stack:
[1.356074]  8113aa96 88001349fc80 88001348f458 
880013803f18
[1.356074]  8106bc49 8222b2c0 88001348f400 
880013803f28
[1.356074]  81138421 880013803f48 811380db 
000f
[1.356074] Call Trace:
[1.356074]   
[1.356074]  [] ? irq_move_irq+0x34/0x50
[1.356074]  [] apic_ack_edge+0x23/0x3b
[1.356074]  [] irq_chip_ack_parent+0x14/0x16
[1.356074]  [] handle_edge_irq+0xa5/0x110
[1.356074]  [] handle_irq+0x27/0x2d
[1.356074]  [] do_IRQ+0x4c/0xcf
[1.356074]  [] common_interrupt+0x73/0x73
[1.356074]   
[1.356074]  [] ? __ata_qc_complete+0xe1/0xe9
[1.356074]  [] ? _raw_spin_unlock_irqrestore+0x32/0x42
[1.356074]  [] ata_sff_hsm_move+0x258/0x66a
[1.356074]  [] ata_sff_pio_task+0x140/0x15e
[1.356074]  [] process_one_work+0x1c6/0x37b
[1.356074]  [] worker_thread+0x2ad/0x3b6
[1.356074]  [] ? rescuer_thread+0x318/0x318
[1.356074]  [] kthread+0xf8/0x100
[1.356074]  [] ? kthread_create_on_node+0x184/0x184
[1.356074]  [] ret_from_fork+0x42/0x70
[1.356074]  [] ? kthread_create_on_node+0x184/0x184
[1.356074] Code:  Bad RIP value.
[1.356074] RIP  [<  (null)>]   (null)
[1.356074]  RSP 
[1.356074] CR2: 
[1.356074] ---[ end trace d37ae2366ce94eef ]---
[1.356074] Kernel panic - not syncing: Fatal exception in interrupt



Thanks,
lkp
#
# Automatically generated file; DO NOT EDIT.
# Linux/x86_64 4.0.0 Kernel Configuration
#
CONFIG_64BIT=y
CONFIG_X86_64=y
CONFIG_X86=y
CONFIG_INSTRUCTION_DECODER=y
CONFIG_PERF_EVENTS_INTEL_UNCORE=y
CONFIG_OUTPUT_FORMAT="elf64-x86-64"
CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig"
CONFIG_LOCKDEP_SUPPORT=y
CONFIG_STACKTRACE_SUPPORT=y
CONFIG_HAVE_LATENCYTOP_SUPPORT=y
CONFIG_MMU=y
CONFIG_NEED_DMA_MAP_STATE=y
CONFIG_NEED_SG_DMA_LENGTH=y
CONFIG_GENERIC_ISA_DMA=y
CONFIG_GENERIC_BUG=y
CONFIG_GENERIC_BUG_RELATIVE_POINTERS=y

Re: [PATCH 2/2] md/raid5: trivial coding style fix

2015-04-30 Thread Yuanhan Liu
On Thu, Apr 30, 2015 at 05:16:50PM +1000, NeilBrown wrote:
> On Thu, 30 Apr 2015 15:01:17 +0800 Yuanhan Liu <yuanhan@linux.intel.com>
> wrote:
> 
> > Signed-off-by: Yuanhan Liu <yuanhan@linux.intel.com>
> > ---
> >  drivers/md/raid5.c | 3 +--
> >  1 file changed, 1 insertion(+), 2 deletions(-)
> > 
> > diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> > index 2651bda..bae3e2c 100644
> > --- a/drivers/md/raid5.c
> > +++ b/drivers/md/raid5.c
> > @@ -5789,8 +5789,7 @@ static void raid5d(struct md_thread *thread)
> > if (released)
> > clear_bit(R5_DID_ALLOC, &conf->cache_state);
> >  
> > -   if (
> > -   !list_empty(&conf->bitmap_list)) {
> > +   if (!list_empty(&conf->bitmap_list)) {
> > /* Now is a good time to flush some bitmap updates */
> > conf->seq_flush++;
> > spin_unlock_irq(&conf->device_lock);
> 
> 
> I'm happy for these sorts of changes when you are fixing up nearby code, or
> if the change significantly improves readability.
> But I'd rather not bother is one-off trivial fixes like this.

Got it.

--yliu


Re: [PATCH 1/2] md/raid5: fix typo

2015-04-30 Thread Yuanhan Liu
On Thu, Apr 30, 2015 at 05:14:26PM +1000, NeilBrown wrote:
> On Thu, 30 Apr 2015 15:01:16 +0800 Yuanhan Liu <yuanhan@linux.intel.com>
> wrote:
> 
> > bion -> bios
> > 
> > Signed-off-by: Yuanhan Liu <yuanhan@linux.intel.com>
> > ---
> >  drivers/md/raid5.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> > index 697d77a..2651bda 100644
> > --- a/drivers/md/raid5.c
> > +++ b/drivers/md/raid5.c
> > @@ -2919,7 +2919,7 @@ schedule_reconstruction(struct stripe_head *sh, 
> > struct stripe_head_state *s,
> >  }
> >  
> >  /*
> > - * Each stripe/dev can have one or more bion attached.
> > + * Each stripe/dev can have one or more bios attached.
> >   * toread/towrite point to the first in a chain.
> >   * The bi_next chain must be in order.
> >   */
> 
> That was intentional.  "bios" as a plural looks too much like "BIOS" which is
> in the ROM of computers.
> 
> Children and oxen are plurals with an 'n' at the end.  So I used 'bion'.
> Private joke?

Interesting.

> 
> I'd rather leave it as it is.

Okay, and sorry for the noise.

--yliu


[PATCH 1/2] md/raid5: fix typo

2015-04-30 Thread Yuanhan Liu
bion -> bios

Signed-off-by: Yuanhan Liu <yuanhan@linux.intel.com>
---
 drivers/md/raid5.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 697d77a..2651bda 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2919,7 +2919,7 @@ schedule_reconstruction(struct stripe_head *sh, struct 
stripe_head_state *s,
 }
 
 /*
- * Each stripe/dev can have one or more bion attached.
+ * Each stripe/dev can have one or more bios attached.
  * toread/towrite point to the first in a chain.
  * The bi_next chain must be in order.
  */
-- 
1.9.0



[PATCH 2/2] md/raid5: trivial coding style fix

2015-04-30 Thread Yuanhan Liu
Signed-off-by: Yuanhan Liu <yuanhan@linux.intel.com>
---
 drivers/md/raid5.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2651bda..bae3e2c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5789,8 +5789,7 @@ static void raid5d(struct md_thread *thread)
if (released)
clear_bit(R5_DID_ALLOC, &conf->cache_state);
 
-   if (
-   !list_empty(&conf->bitmap_list)) {
+   if (!list_empty(&conf->bitmap_list)) {
/* Now is a good time to flush some bitmap updates */
conf->seq_flush++;
spin_unlock_irq(&conf->device_lock);
-- 
1.9.0



Re: [LKP] [RAID5] 878ee679279: -1.8% vmstat.io.bo, +40.5% perf-stat.LLC-load-misses

2015-04-30 Thread Yuanhan Liu
On Fri, Apr 24, 2015 at 12:15:59PM +1000, NeilBrown wrote:
> On Thu, 23 Apr 2015 14:55:59 +0800 Huang Ying  wrote:
> 
> > FYI, we noticed the below changes on
> > 
> > git://neil.brown.name/md for-next
> > commit 878ee6792799e2f88bdcac329845efadb205252f ("RAID5: batch adjacent 
> > full stripe write")
> 
> Hi,
>  is there any chance that you could explain what some of this means?
> There is lots of data and some very pretty graphs, but no explanation.

Hi Neil,

(Sorry for late response: Ying is on vacation)

I guess you can simply ignore this report, as I already reported to you a
month ago that this patch made fsmark perform better in most cases:

https://lists.01.org/pipermail/lkp/2015-March/002411.html

> 
> Which numbers are "good", which are "bad"?  Which is "worst".
> What do the graphs really show? and what would we like to see in them?
> 
> I think it is really great that you are doing this testing and reporting the
> results.  It's just so sad that I completely fail to understand them.

Sorry, it's our bad that they are hard to understand, and that we
reported a duplicate one (well, the commit hash is different ;).

We might need to take some time to make those data easier to understand.

--yliu

> 
> > 
> > 
> > testbox/testcase/testparams: 
> > lkp-st02/dd-write/300-5m-11HDD-RAID5-cfq-xfs-1dd
> > 
> > a87d7f782b47e030  878ee6792799e2f88bdcac3298  
> >   --  
> >  %stddev %change %stddev
> >  \  |\  
> >  59035 ±  0% +18.4%  69913 ±  1%  softirqs.SCHED
> >   1330 ± 10% +17.4%   1561 ±  4%  slabinfo.kmalloc-512.num_objs
> >   1330 ± 10% +17.4%   1561 ±  4%  
> > slabinfo.kmalloc-512.active_objs
> > 305908 ±  0%  -1.8% 300427 ±  0%  vmstat.io.bo
> >  1 ±  0%+100.0%  2 ±  0%  vmstat.procs.r
> >   8266 ±  1% -15.7%   6968 ±  0%  vmstat.system.cs
> >  14819 ±  0%  -2.1%  14503 ±  0%  vmstat.system.in
> >  18.20 ±  6% +10.2%  20.05 ±  4%  
> > perf-profile.cpu-cycles.raid_run_ops.handle_stripe.handle_active_stripes.raid5d.md_thread
> >   1.94 ±  9% +90.6%   3.70 ±  9%  
> > perf-profile.cpu-cycles.async_xor.raid_run_ops.handle_stripe.handle_active_stripes.raid5d
> >   0.00 ±  0%  +Inf%  25.18 ±  3%  
> > perf-profile.cpu-cycles.handle_active_stripes.isra.45.raid5d.md_thread.kthread.ret_from_fork
> >   0.00 ±  0%  +Inf%  14.14 ±  4%  
> > perf-profile.cpu-cycles.async_copy_data.isra.42.raid_run_ops.handle_stripe.handle_active_stripes.raid5d
> >   1.79 ±  7%+102.9%   3.64 ±  9%  
> > perf-profile.cpu-cycles.xor_blocks.async_xor.raid_run_ops.handle_stripe.handle_active_stripes
> >   3.09 ±  4% -10.8%   2.76 ±  4%  
> > perf-profile.cpu-cycles.get_active_stripe.make_request.md_make_request.generic_make_request.submit_bio
> >   0.80 ± 14% +28.1%   1.02 ± 10%  
> > perf-profile.cpu-cycles.mutex_lock.xfs_file_buffered_aio_write.xfs_file_write_iter.new_sync_write.vfs_write
> >  14.78 ±  6%-100.0%   0.00 ±  0%  
> > perf-profile.cpu-cycles.async_copy_data.isra.38.raid_run_ops.handle_stripe.handle_active_stripes.raid5d
> >  25.68 ±  4%-100.0%   0.00 ±  0%  
> > perf-profile.cpu-cycles.handle_active_stripes.isra.41.raid5d.md_thread.kthread.ret_from_fork
> >   1.23 ±  5%+140.0%   2.96 ±  7%  
> > perf-profile.cpu-cycles.xor_sse_5_pf64.xor_blocks.async_xor.raid_run_ops.handle_stripe
> >   2.62 ±  6% -95.6%   0.12 ± 33%  
> > perf-profile.cpu-cycles.analyse_stripe.handle_stripe.handle_active_stripes.raid5d.md_thread
> >   0.96 ±  9% +17.5%   1.12 ±  2%  
> > perf-profile.cpu-cycles.xfs_ilock.xfs_file_buffered_aio_write.xfs_file_write_iter.new_sync_write.vfs_write
> >  1.461e+10 ±  0%  -5.3%  1.384e+10 ±  1%  
> > perf-stat.L1-dcache-load-misses
> >  3.688e+11 ±  0%  -2.7%   3.59e+11 ±  0%  perf-stat.L1-dcache-loads
> >  1.124e+09 ±  0% -27.7%  8.125e+08 ±  0%  perf-stat.L1-dcache-prefetches
> >  2.767e+10 ±  0%  -1.8%  2.717e+10 ±  0%  
> > perf-stat.L1-dcache-store-misses
> >  2.352e+11 ±  0%  -2.8%  2.287e+11 ±  0%  perf-stat.L1-dcache-stores
> >  6.774e+09 ±  0%  -2.3%   6.62e+09 ±  0%  
> > perf-stat.L1-icache-load-misses
> >  5.571e+08 ±  0% +40.5%  7.826e+08 ±  1%  perf-stat.LLC-load-misses
> >  6.263e+09 ±  0% -13.7%  5.407e+09 ±  1%  perf-stat.LLC-loads
> >  1.914e+11 ±  0%  -4.2%  1.833e+11 ±  0%  perf-stat.branch-instructions
> >  1.145e+09 ±  2%  -5.6%  1.081e+09 ±  0%  perf-stat.branch-load-misses
> >  1.911e+11 ±  0%  -4.3%  1.829e+11 ±  0%  perf-stat.branch-loads
> >  1.142e+09 ±  2%  -5.1%  1.083e+09 ±  0%  perf-stat.branch-misses
> >  1.218e+09 ±  0% +19.8%   1.46e+09 ±  0%  perf-stat.cache-misses
> >  2.118e+10 ±  0%  -5.2%  2.007e+10 ±  0%  perf-stat.cache-references
> >2510308 ±  1% -15.7%2115410 ±  

Re: [PATCH 1/2] md/raid5: fix typo

2015-04-30 Thread Yuanhan Liu
On Thu, Apr 30, 2015 at 05:14:26PM +1000, NeilBrown wrote:
 On Thu, 30 Apr 2015 15:01:16 +0800 Yuanhan Liu yuanhan@linux.intel.com
 wrote:
 
  bion -> bios
  
  Signed-off-by: Yuanhan Liu yuanhan@linux.intel.com
  ---
   drivers/md/raid5.c | 2 +-
   1 file changed, 1 insertion(+), 1 deletion(-)
  
  diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
  index 697d77a..2651bda 100644
  --- a/drivers/md/raid5.c
  +++ b/drivers/md/raid5.c
  @@ -2919,7 +2919,7 @@ schedule_reconstruction(struct stripe_head *sh, 
  struct stripe_head_state *s,
   }
   
   /*
  - * Each stripe/dev can have one or more bion attached.
  + * Each stripe/dev can have one or more bios attached.
* toread/towrite point to the first in a chain.
* The bi_next chain must be in order.
*/
 
 That was intentional.  bios as a plural looks too much like BIOS which is
 in the ROM of computers.
 
 Children and oxen are plurals with an 'n' at the end.  So I used 'bion'.
 Private joke?

Interesting.

 
 I'd rather leave it as it is.

Okay, and sorry for the noise.

--yliu
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/2] md/raid5: trivial coding style fix

2015-04-30 Thread Yuanhan Liu
On Thu, Apr 30, 2015 at 05:16:50PM +1000, NeilBrown wrote:
 On Thu, 30 Apr 2015 15:01:17 +0800 Yuanhan Liu yuanhan@linux.intel.com
 wrote:
 
  Signed-off-by: Yuanhan Liu yuanhan@linux.intel.com
  ---
   drivers/md/raid5.c | 3 +--
   1 file changed, 1 insertion(+), 2 deletions(-)
  
  diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
  index 2651bda..bae3e2c 100644
  --- a/drivers/md/raid5.c
  +++ b/drivers/md/raid5.c
  @@ -5789,8 +5789,7 @@ static void raid5d(struct md_thread *thread)
  if (released)
  clear_bit(R5_DID_ALLOC, &conf->cache_state);
   
  -   if (
  -   !list_empty(&conf->bitmap_list)) {
  +   if (!list_empty(&conf->bitmap_list)) {
  /* Now is a good time to flush some bitmap updates */
  conf->seq_flush++;
  spin_unlock_irq(&conf->device_lock);
 
 
 I'm happy for these sorts of changes when you are fixing up nearby code, or
 if the change significantly improves readability.
 But I'd rather not bother is one-off trivial fixes like this.

Got it.

--yliu
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[LKP] [genirq] d5b2eacdbc2: BUG: unable to handle kernel NULL pointer dereference at (null)

2015-04-30 Thread Yuanhan Liu
FYI, we noticed the below changes on

https://github.com/jiangliu/linux.git test/irq_common_data_v2
commit d5b2eacdbc280da7c6dfbe0f52bb293ef227d349 (genirq: Introduce struct 
irq_common_data to host shared irq data)


+-+++
| | 39fb394021 | 
d5b2eacdbc |
+-+++
| boot_successes  | 0  | 0  
|
| boot_failures   | 22 | 20 
|
| PM:Hibernation_image_not_present_or_could_not_be_loaded | 22 |
|
| BUG:unable_to_handle_kernel | 0  | 20 
|
| Oops| 0  | 20 
|
| Kernel_panic-not_syncing:Fatal_exception_in_interrupt   | 0  | 20 
|
| backtrace:__pci_register_driver | 0  | 6  
|
| backtrace:e1000_init_module | 0  | 6  
|
| backtrace:kernel_init_freeable  | 0  | 6  
|
| backtrace:ata_sff_pio_task  | 0  | 14 
|
+-+++


[1.351055] ata2.01: NODEV after polling detection
[1.352179] ata2.00: ATAPI: QEMU DVD-ROM, 2.1.2, max UDMA/100
[1.353501] ata2.00: configured for MWDMA2
[1.354423] BUG: unable to handle kernel NULL pointer dereference at 
  (null)
[1.356074] IP: [  (null)]   (null)
[1.356074] PGD 0 
[1.356074] Oops: 0010 [#1] SMP 
[1.356074] Modules linked in:
[1.356074] CPU: 0 PID: 584 Comm: kworker/0:1 Not tainted 
4.1.0-rc1-wl-ath-00905-geb3b9ec #1
[1.356074] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
1.7.5-20140531_083030-gandalf 04/01/2014
[1.356074] Workqueue: ata_sff ata_sff_pio_task
[1.356074] task: 880011c2af30 ti: 8800123bc000 task.ti: 
8800123bc000
[1.356074] RIP: 0010:[]  [  (null)]   
(null)
[1.356074] RSP: :880013803ee0  EFLAGS: 00010046
[1.356074] RAX: 8222b2c0 RBX: 88001349fc80 RCX: 0009
[1.356074] RDX: 88001348f400 RSI: ffc0 RDI: 88001349fc80
[1.356074] RBP: 880013803ef8 R08:  R09: 0013
[1.356074] R10: 0006 R11:  R12: 88001348f400
[1.356074] R13: 000f R14: 8800123bfc78 R15: 
[1.356074] FS:  () GS:88001380() 
knlGS:
[1.356074] CS:  0010 DS:  ES:  CR0: 8005003b
[1.356074] CR2:  CR3: 0220b000 CR4: 06f0
[1.356074] Stack:
[1.356074]  8113aa96 88001349fc80 88001348f458 
880013803f18
[1.356074]  8106bc49 8222b2c0 88001348f400 
880013803f28
[1.356074]  81138421 880013803f48 811380db 
000f
[1.356074] Call Trace:
[1.356074]  IRQ 
[1.356074]  [8113aa96] ? irq_move_irq+0x34/0x50
[1.356074]  [8106bc49] apic_ack_edge+0x23/0x3b
[1.356074]  [81138421] irq_chip_ack_parent+0x14/0x16
[1.356074]  [811380db] handle_edge_irq+0xa5/0x110
[1.356074]  [8103f761] handle_irq+0x27/0x2d
[1.356074]  [81a3ad3c] do_IRQ+0x4c/0xcf
[1.356074]  [81a38f33] common_interrupt+0x73/0x73
[1.356074]  EOI 
[1.356074]  [816823a3] ? __ata_qc_complete+0xe1/0xe9
[1.356074]  [81a37f5e] ? _raw_spin_unlock_irqrestore+0x32/0x42
[1.356074]  [8169246a] ata_sff_hsm_move+0x258/0x66a
[1.356074]  [816929bc] ata_sff_pio_task+0x140/0x15e
[1.356074]  [81105591] process_one_work+0x1c6/0x37b
[1.356074]  [81106222] worker_thread+0x2ad/0x3b6
[1.356074]  [81105f75] ? rescuer_thread+0x318/0x318
[1.356074]  [8110a42c] kthread+0xf8/0x100
[1.356074]  [8110a334] ? kthread_create_on_node+0x184/0x184
[1.356074]  [81a38802] ret_from_fork+0x42/0x70
[1.356074]  [8110a334] ? kthread_create_on_node+0x184/0x184
[1.356074] Code:  Bad RIP value.
[1.356074] RIP  [  (null)]   (null)
[1.356074]  RSP 880013803ee0
[1.356074] CR2: 
[1.356074] ---[ end trace d37ae2366ce94eef ]---
[1.356074] Kernel panic - not syncing: Fatal exception in interrupt



Thanks,
lkp
#
# Automatically generated file; DO NOT EDIT.
# Linux/x86_64 4.0.0 Kernel Configuration
#
CONFIG_64BIT=y
CONFIG_X86_64=y
CONFIG_X86=y
CONFIG_INSTRUCTION_DECODER=y
CONFIG_PERF_EVENTS_INTEL_UNCORE=y
CONFIG_OUTPUT_FORMAT=elf64-x86-64

[LKP] [block] 5a19fe29ba7: +5.4% boot-slabinfo.num_objs

2015-04-29 Thread Yuanhan Liu
FYI, we noticed the below changes on

git://git.kernel.org/pub/scm/linux/kernel/git/mlin/linux.git block-generic-req
commit 5a19fe29ba7d052c0d8fa8a2bf461abc1e4d89bb ("block: make 
generic_make_request handle arbitrarily sized bios")


testbox/testcase/testparams: vm-kbuild-1G/boot/1

v4.1-rc1  5a19fe29ba7d052c0d8fa8a2bf  
  --  
 %stddev %change %stddev
 \  |\  
152092 ±  0%  +5.4% 160249 ±  0%  boot-slabinfo.num_objs
 10106 ±  0% +21.6%  12293 ±  0%  boot-slabinfo.num_pages
  8.30 ± 21% -33.9%   5.48 ±  1%  boot-time.boot
  7.44 ± 23% -34.9%   4.84 ±  1%  boot-time.dhcp
 10.01 ± 17% -27.0%   7.31 ±  1%  boot-time.idle
 35507 ±  2% +17.9%  41856 ± 10%  boot-meminfo.DirectMap4k
  1558 ±  8%+276.5%   5868 ±  1%  boot-meminfo.KernelStack
480717 ±  0%  -2.8% 467414 ±  0%  boot-meminfo.MemFree
 11462 ±  1% +70.0%  19488 ±  0%  boot-meminfo.SUnreclaim
 40390 ±  0% +21.7%  49146 ±  0%  boot-meminfo.Slab

vm-kbuild-1G: qemu-system-x86_64 -enable-kvm -cpu Haswell,+smep,+smap
Memory: 1G




   boot-slabinfo.num_objs

  162000 ++-+
 |  O OOO
  16 O+ O   O   OOOO O  |
 |O  O O O OOO   O O  O |
 | O  OOO   |
  158000 ++ |
 |O |
  156000 ++ |
 |  |
  154000 ++ |
 |  |
 |  *.  .*.*.. *..*.*.. |
  152000 ++.  *.  *. ..*|
 *  *   |
  15 ++-+


   boot-slabinfo.num_pages

  12500 ++--+
O  O O  O O  O  O O O O  O O  O  O O  O O  O  O O  O  OO O  O
|O  O   |
  12000 ++  |
|   |
|   |
  11500 ++  |
|   |
  11000 ++  |
|   |
|   |
  10500 ++  |
|   |
|   .*..*.  .*..*.*..*..*.* |
  1 *+-*--*-+


boot-meminfo.MemFree

  484000 ++-+
  482000 ++*|
 | .*.*.. .*..*   : +   *.. |
  48 *+  * :  :  + +|
  478000 ++: :**|
 |  *   |
  476000 ++ |
  474000 ++ |
  472000 ++ |
 |  |
  47 ++O O   O O O O|
  468000 ++   O  O O  OO  O O O   OO O  |
 |  O   O   O  O  O O   |
  466000 O+  O  O   O
  464000 ++-+


  boot-meminfo.Slab

  5 

[PATCH 2/3] md/raid5: split wait_for_stripe and introduce wait_for_quiescent

2015-04-28 Thread Yuanhan Liu
I noticed heavy spin lock contention at get_active_stripe(), introduced
at the wake-up stage, where a bunch of processes try to re-acquire the
spin lock at once.

After giving this issue some thought, I found the contention could be
relieved (and even avoided) if we turn wait_for_stripe into one
waitqueue per lock hash and make the wake-up exclusive: wake up one
process at a time, which avoids the lock contention naturally.

Before hacking on wait_for_stripe, I found it actually has two usages:
to wait for the array to enter or leave the quiescent state, and also
to wait for an available stripe in each of the hash lists.

So this patch splits the first usage off into a separate wait queue,
wait_for_quiescent, and the next patch will turn the second usage into
one waitqueue per hash value and make it exclusive, to relieve the
lock contention.

v2: wake_up(wait_for_quiescent) when (active_stripes == 0)
Commit log refactor suggestion from Neil.
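
Condensed into a hypothetical helper (not part of this patch; it just
restates the wake-up hunk in release_inactive_stripe_list() below), the
rule after the split is: releasing stripes wakes the stripe waiters, and
the quiesce/drain waiters are only woken once the last active stripe is
gone.

static inline void wake_stripe_waiters(struct r5conf *conf)
{
	/* free stripes became available: wake processes waiting for one */
	wake_up(&conf->wait_for_stripe);

	/* quiesce waiters only care about active_stripes reaching zero */
	if (atomic_read(&conf->active_stripes) == 0)
		wake_up(&conf->wait_for_quiescent);
}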

Signed-off-by: Yuanhan Liu 
---
 drivers/md/raid5.c | 15 +--
 drivers/md/raid5.h |  1 +
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 77dfd72..64d5bea 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -374,6 +374,8 @@ static void release_inactive_stripe_list(struct r5conf 
*conf,
 
if (do_wakeup) {
wake_up(&conf->wait_for_stripe);
+   if (atomic_read(&conf->active_stripes) == 0)
+   wake_up(&conf->wait_for_quiescent);
if (conf->retry_read_aligned)
md_wakeup_thread(conf->mddev->thread);
}
@@ -667,7 +669,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
spin_lock_irq(conf->hash_locks + hash);
 
do {
-   wait_event_lock_irq(conf->wait_for_stripe,
+   wait_event_lock_irq(conf->wait_for_quiescent,
conf->quiesce == 0 || noquiesce,
*(conf->hash_locks + hash));
sh = __find_stripe(conf, sector, conf->generation - previous);
@@ -4729,7 +4731,7 @@ static void raid5_align_endio(struct bio *bi, int error)
 raid_bi, 0);
bio_endio(raid_bi, 0);
if (atomic_dec_and_test(&conf->active_aligned_reads))
-   wake_up(&conf->wait_for_stripe);
+   wake_up(&conf->wait_for_quiescent);
return;
}
 
@@ -4824,7 +4826,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct 
bio * raid_bio)
align_bi->bi_iter.bi_sector += rdev->data_offset;
 
spin_lock_irq(&conf->device_lock);
-   wait_event_lock_irq(conf->wait_for_stripe,
+   wait_event_lock_irq(conf->wait_for_quiescent,
conf->quiesce == 0,
conf->device_lock);
atomic_inc(&conf->active_aligned_reads);
@@ -5668,7 +5670,7 @@ static int  retry_aligned_read(struct r5conf *conf, 
struct bio *raid_bio)
bio_endio(raid_bio, 0);
}
if (atomic_dec_and_test(&conf->active_aligned_reads))
-   wake_up(&conf->wait_for_stripe);
+   wake_up(&conf->wait_for_quiescent);
return handled;
 }
 
@@ -6399,6 +6401,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
goto abort;
spin_lock_init(&conf->device_lock);
seqcount_init(&conf->gen_lock);
+   init_waitqueue_head(&conf->wait_for_quiescent);
init_waitqueue_head(&conf->wait_for_stripe);
init_waitqueue_head(&conf->wait_for_overlap);
INIT_LIST_HEAD(&conf->handle_list);
@@ -7422,7 +7425,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
 * active stripes can drain
 */
conf->quiesce = 2;
-   wait_event_cmd(conf->wait_for_stripe,
+   wait_event_cmd(conf->wait_for_quiescent,
atomic_read(&conf->active_stripes) == 0 &&
atomic_read(&conf->active_aligned_reads) == 
0,
unlock_all_device_hash_locks_irq(conf),
@@ -7436,7 +7439,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
case 0: /* re-enable writes */
lock_all_device_hash_locks_irq(conf);
conf->quiesce = 0;
-   wake_up(&conf->wait_for_stripe);
+   wake_up(&conf->wait_for_quiescent);
wake_up(&conf->wait_for_overlap);
unlock_all_device_hash_locks_irq(conf);
break;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 7dc0dd8..4cc05ec 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -508,6 +508,7 @@ struct r5conf {
struct list_headinactive_list[NR_STRIPE_HASH_LOCKS];
atomic_t 

[PATCH 3/3] md/raid5: per hash value and exclusive wait_for_stripe

2015-04-28 Thread Yuanhan Liu
The spin lock contention dropped heavily, up to 97%. And as expected,
the performance increased a lot, up to 260%, for fast devices (ram disk).

v2: use bits instead of an array to note down which wait queues need to be woken up.
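
That bookkeeping boils down to something like the following hypothetical
helper (the real loop sits inline in release_inactive_stripe_list() in
the diff below): the caller sets one bit for every hash list it refills,
then passes the mask in so that only those per-hash wait queues are
woken.

static void wake_released_hashes(struct r5conf *conf, unsigned long do_wakeup)
{
	int hash;

	/* one bit per hash list; NR_STRIPE_HASH_LOCKS (8) fits in a long */
	for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++)
		if (do_wakeup & (1UL << hash))
			wake_up(&conf->wait_for_stripe[hash]);
}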

Signed-off-by: Yuanhan Liu 
---
 drivers/md/raid5.c | 27 +++
 drivers/md/raid5.h |  2 +-
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 64d5bea..697d77a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -344,7 +344,8 @@ static void release_inactive_stripe_list(struct r5conf 
*conf,
 int hash)
 {
int size;
-   bool do_wakeup = false;
+   unsigned long do_wakeup = 0;
+   int i = 0;
unsigned long flags;
 
if (hash == NR_STRIPE_HASH_LOCKS) {
@@ -365,15 +366,19 @@ static void release_inactive_stripe_list(struct r5conf 
*conf,
!list_empty(list))
atomic_dec(&conf->empty_inactive_list_nr);
list_splice_tail_init(list, conf->inactive_list + hash);
-   do_wakeup = true;
+   do_wakeup |= 1 << (size - 1);
spin_unlock_irqrestore(conf->hash_locks + hash, flags);
}
size--;
hash--;
}
 
+   for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) {
+   if (do_wakeup & (1 << i))
+   wake_up(&conf->wait_for_stripe[i]);
+   }
+
if (do_wakeup) {
-   wake_up(&conf->wait_for_stripe);
if (atomic_read(&conf->active_stripes) == 0)
wake_up(&conf->wait_for_quiescent);
if (conf->retry_read_aligned)
@@ -686,14 +691,15 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
if (!sh) {
set_bit(R5_INACTIVE_BLOCKED,
&conf->cache_state);
-   wait_event_lock_irq(
-   conf->wait_for_stripe,
+   wait_event_exclusive_cmd(
+   conf->wait_for_stripe[hash],
!list_empty(conf->inactive_list + hash) 
&&
(atomic_read(&conf->active_stripes)
 < (conf->max_nr_stripes * 3 / 4)
 || !test_bit(R5_INACTIVE_BLOCKED,
&conf->cache_state)),
-   *(conf->hash_locks + hash));
+   spin_unlock_irq(conf->hash_locks + 
hash),
+   spin_lock_irq(conf->hash_locks + hash));
clear_bit(R5_INACTIVE_BLOCKED,
&conf->cache_state);
} else {
@@ -718,6 +724,9 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
}
} while (sh == NULL);
 
+   if (!list_empty(conf->inactive_list + hash))
+   wake_up(&conf->wait_for_stripe[hash]);
+
spin_unlock_irq(conf->hash_locks + hash);
return sh;
 }
@@ -2138,7 +2147,7 @@ static int resize_stripes(struct r5conf *conf, int 
newsize)
cnt = 0;
list_for_each_entry(nsh, &newstripes, lru) {
lock_device_hash_lock(conf, hash);
-   wait_event_cmd(conf->wait_for_stripe,
+   wait_event_exclusive_cmd(conf->wait_for_stripe[hash],
!list_empty(conf->inactive_list + hash),
unlock_device_hash_lock(conf, hash),
lock_device_hash_lock(conf, hash));
@@ -6402,7 +6411,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
spin_lock_init(&conf->device_lock);
seqcount_init(&conf->gen_lock);
init_waitqueue_head(&conf->wait_for_quiescent);
-   init_waitqueue_head(&conf->wait_for_stripe);
+   for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) {
+   init_waitqueue_head(&conf->wait_for_stripe[i]);
+   }
init_waitqueue_head(&conf->wait_for_overlap);
INIT_LIST_HEAD(&conf->handle_list);
INIT_LIST_HEAD(&conf->hold_list);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 4cc05ec..6307b90 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -509,7 +509,7 @@ struct r5conf {
atomic_tempty_inactive_list_nr;
struct llist_head   released_stripes;
wait_queue_head_t   wait_for_quiescent;
-   wait_queue_head_t   wait_for_stripe;
+   wait_queue_head_t   wait_for_stripe[NR_STRIPE_HASH_LOCKS];
wait_queue_head_t   wait_for_overlap;
unsigned long   cache_state;
 #define R5_INACTIVE_BLOCKED1   /*

[PATCH 1/3 v2] wait: introduce wait_event_exclusive_cmd

2015-04-28 Thread Yuanhan Liu
It's just a variant of wait_event_cmd(), with the exclusive flag set.

For cases like RAID5, which puts many processes to sleep until 1/4 of
the resources are free, a wake_up wakes up all of them, but only one
process is able to get the resource, as it is protected by a spin
lock. That ends up introducing heavy lock contention and hurts
performance badly.

Here introduce wait_event_exclusive_cmd to relieve the lock contention
naturally, by letting wake_up() wake up just one process.
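
For illustration, a minimal caller sketch (struct pool and its fields
are hypothetical, not part of this patch) of how the cmd1/cmd2 hooks
pair with a spin lock: cmd1 drops the lock before schedule(), cmd2
re-takes it after waking, and the exclusive flag means wake_up() pulls
only the first waiter off the queue.

static struct item *get_free_item(struct pool *pool)
{
	struct item *it;

	spin_lock_irq(&pool->lock);
	wait_event_exclusive_cmd(pool->wait_for_free,
				 !list_empty(&pool->free_list),
				 spin_unlock_irq(&pool->lock),	/* cmd1: before schedule() */
				 spin_lock_irq(&pool->lock));	/* cmd2: after wake-up */
	it = list_first_entry(&pool->free_list, struct item, node);
	list_del(&it->node);
	spin_unlock_irq(&pool->lock);
	return it;
}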

Cc: Ingo Molnar 
Cc: Peter Zijlstra 
v2: its assumed that wait*() and __wait*() have the same arguments - peterz

Signed-off-by: Yuanhan Liu 
---
 include/linux/wait.h | 13 +
 1 file changed, 13 insertions(+)

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 2db8334..db78c72 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -358,6 +358,19 @@ do {   
\
__ret;  \
 })
 
+#define __wait_event_exclusive_cmd(wq, condition, cmd1, cmd2)  \
+   (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 1, 0,  \
+   cmd1; schedule(); cmd2)
+/*
+ * Just like wait_event_cmd(), except it sets exclusive flag
+ */
+#define wait_event_exclusive_cmd(wq, condition, cmd1, cmd2)\
+do {   \
+   if (condition)  \
+   break;  \
+   __wait_event_exclusive_cmd(wq, condition, cmd1, cmd2);  \
+} while (0)
+
 #define __wait_event_cmd(wq, condition, cmd1, cmd2)\
(void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0,  \
cmd1; schedule(); cmd2)
-- 
1.9.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/3] wait: introduce wait_event_cmd_exclusive

2015-04-28 Thread Yuanhan Liu
On Tue, Apr 28, 2015 at 04:13:15PM +0200, Peter Zijlstra wrote:
> On Mon, Apr 27, 2015 at 12:51:01PM +0800, Yuanhan Liu wrote:
> > It's just a variant of wait_event_cmd, with exclusive flag being set.
> > 
> > For cases like RAID5, which puts many processes to sleep until 1/4
> > resources are free, a wake_up wakes up all processes to run, but
> > there is one process being able to get the resource as it's protected
> > by a spin lock. That ends up introducing heavy lock contentions, and
> > hurts performance badly.
> > 
> > Here introduce wait_event_cmd_exclusive to relieve the lock contention
> > naturally by letting wake_up() just wake up one process.
> > 
> > Cc: Ingo Molnar 
> > Cc: Peter Zijlstra 
> > Signed-off-by: Yuanhan Liu 
> > ---
> >  include/linux/wait.h | 14 +++---
> >  1 file changed, 11 insertions(+), 3 deletions(-)
> > 
> > diff --git a/include/linux/wait.h b/include/linux/wait.h
> > index 2db8334..6c3b4de 100644
> > --- a/include/linux/wait.h
> > +++ b/include/linux/wait.h
> > @@ -358,10 +358,18 @@ do {  
> > \
> > __ret;  \
> >  })
> >  
> > -#define __wait_event_cmd(wq, condition, cmd1, cmd2)
> > \
> > -   (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0,  \
> > +#define __wait_event_cmd(wq, condition, cmd1, cmd2, exclusive) 
> > \
> > +   (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, exclusive, 0, \
> > cmd1; schedule(); cmd2)
> >  
> > +
> > +#define wait_event_cmd_exclusive(wq, condition, cmd1, cmd2)
> > \
> > +do {   
> > \
> > +   if (condition)  \
> > +   break;  \
> > +   __wait_event_cmd(wq, condition, cmd1, cmd2, 1); \
> > +} while (0)
> > +
> >  /**
> >   * wait_event_cmd - sleep until a condition gets true
> >   * @wq: the waitqueue to wait on
> > @@ -380,7 +388,7 @@ do {
> > \
> >  do {   
> > \
> > if (condition)  \
> > break;  \
> > -   __wait_event_cmd(wq, condition, cmd1, cmd2);\
> > +   __wait_event_cmd(wq, condition, cmd1, cmd2, 0); \
> >  } while (0)
> >  
> 
> No, that's wrong, its assumed that wait*() and __wait*() have the same
> arguments.

Thanks. Will send an updated patch soon.
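
In other words (a stripped-down sketch of the convention, with a
hypothetical wait_event_foo pair; the actual fix is what [PATCH 1/3 v2]
does): the wait*() wrapper and its __wait*() helper keep identical
argument lists, so exclusive behaviour becomes a new macro pair rather
than an extra flag on the existing helper.

/* existing pair: wrapper and helper take the same arguments */
#define __wait_event_foo(wq, condition)					\
	(void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0,	\
			    schedule())
#define wait_event_foo(wq, condition)					\
do {									\
	if (condition)							\
		break;							\
	__wait_event_foo(wq, condition);				\
} while (0)

/* exclusive variant: a new pair, not a new argument on the old helper */
#define __wait_event_foo_exclusive(wq, condition)			\
	(void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 1, 0,	\
			    schedule())
#define wait_event_foo_exclusive(wq, condition)				\
do {									\
	if (condition)							\
		break;							\
	__wait_event_foo_exclusive(wq, condition);			\
} while (0)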


--yliu
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 3/3 v2] md/raid5: per hash value and exclusive wait_for_stripe

2015-04-26 Thread Yuanhan Liu
The spin lock contention dropped heavily, up to 97%. And as expected,
the performance increased a lot, up to 260%, for fast devices (ram disk).

v2: use bits instead of an array to note down which wait queues need to be woken up.

Signed-off-by: Yuanhan Liu 
---
 drivers/md/raid5.c | 27 +++
 drivers/md/raid5.h |  2 +-
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 64d5bea..1b11bbf 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -344,7 +344,8 @@ static void release_inactive_stripe_list(struct r5conf 
*conf,
 int hash)
 {
int size;
-   bool do_wakeup = false;
+   unsigned long do_wakeup = 0;
+   int i = 0;
unsigned long flags;
 
if (hash == NR_STRIPE_HASH_LOCKS) {
@@ -365,15 +366,19 @@ static void release_inactive_stripe_list(struct r5conf 
*conf,
!list_empty(list))
atomic_dec(&conf->empty_inactive_list_nr);
list_splice_tail_init(list, conf->inactive_list + hash);
-   do_wakeup = true;
+   do_wakeup |= 1 << (size - 1);
spin_unlock_irqrestore(conf->hash_locks + hash, flags);
}
size--;
hash--;
}
 
+   for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) {
+   if (do_wakeup & (1 << i))
+   wake_up(&conf->wait_for_stripe[i]);
+   }
+
if (do_wakeup) {
-   wake_up(&conf->wait_for_stripe);
if (atomic_read(&conf->active_stripes) == 0)
wake_up(&conf->wait_for_quiescent);
if (conf->retry_read_aligned)
@@ -686,14 +691,15 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
if (!sh) {
set_bit(R5_INACTIVE_BLOCKED,
&conf->cache_state);
-   wait_event_lock_irq(
-   conf->wait_for_stripe,
+   wait_event_cmd_exclusive(
+   conf->wait_for_stripe[hash],
!list_empty(conf->inactive_list + hash) 
&&
(atomic_read(&conf->active_stripes)
 < (conf->max_nr_stripes * 3 / 4)
 || !test_bit(R5_INACTIVE_BLOCKED,
&conf->cache_state)),
-   *(conf->hash_locks + hash));
+   spin_unlock_irq(conf->hash_locks + 
hash),
+   spin_lock_irq(conf->hash_locks + hash));
clear_bit(R5_INACTIVE_BLOCKED,
&conf->cache_state);
} else {
@@ -718,6 +724,9 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
}
} while (sh == NULL);
 
+   if (!list_empty(conf->inactive_list + hash))
+   wake_up(&conf->wait_for_stripe[hash]);
+
spin_unlock_irq(conf->hash_locks + hash);
return sh;
 }
@@ -2138,7 +2147,7 @@ static int resize_stripes(struct r5conf *conf, int 
newsize)
cnt = 0;
list_for_each_entry(nsh, &newstripes, lru) {
lock_device_hash_lock(conf, hash);
-   wait_event_cmd(conf->wait_for_stripe,
+   wait_event_cmd_exclusive(conf->wait_for_stripe[hash],
!list_empty(conf->inactive_list + hash),
unlock_device_hash_lock(conf, hash),
lock_device_hash_lock(conf, hash));
@@ -6402,7 +6411,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
spin_lock_init(&conf->device_lock);
seqcount_init(&conf->gen_lock);
init_waitqueue_head(&conf->wait_for_quiescent);
-   init_waitqueue_head(&conf->wait_for_stripe);
+   for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) {
+   init_waitqueue_head(&conf->wait_for_stripe[i]);
+   }
init_waitqueue_head(&conf->wait_for_overlap);
INIT_LIST_HEAD(&conf->handle_list);
INIT_LIST_HEAD(&conf->hold_list);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 4cc05ec..6307b90 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -509,7 +509,7 @@ struct r5conf {
atomic_tempty_inactive_list_nr;
struct llist_head   released_stripes;
wait_queue_head_t   wait_for_quiescent;
-   wait_queue_head_t   wait_for_stripe;
+   wait_queue_head_t   wait_for_stripe[NR_STRIPE_HASH_LOCKS];
wait_queue_head_t   wait_for_overlap;
unsigned long   cache_state;
 #define R5_INACTIVE_BLOCKED1   /*

[PATCH 1/3] wait: introduce wait_event_cmd_exclusive

2015-04-26 Thread Yuanhan Liu
It's just a variant of wait_event_cmd, with the exclusive flag set.

For cases like RAID5, which puts many processes to sleep until 1/4 of
the resources are free, a wake_up wakes up all of them, but only one
process is able to get the resource, as it is protected by a spin
lock. That ends up introducing heavy lock contention and hurts
performance badly.

Here introduce wait_event_cmd_exclusive to relieve the lock contention
naturally by letting wake_up() just wake up one process.

Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Signed-off-by: Yuanhan Liu 
---
 include/linux/wait.h | 14 +++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 2db8334..6c3b4de 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -358,10 +358,18 @@ do {  
\
__ret;  \
 })
 
-#define __wait_event_cmd(wq, condition, cmd1, cmd2)\
-   (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0,  \
+#define __wait_event_cmd(wq, condition, cmd1, cmd2, exclusive) \
+   (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, exclusive, 0, \
cmd1; schedule(); cmd2)
 
+
+#define wait_event_cmd_exclusive(wq, condition, cmd1, cmd2)
\
+do {   \
+   if (condition)  \
+   break;  \
+   __wait_event_cmd(wq, condition, cmd1, cmd2, 1); \
+} while (0)
+
 /**
  * wait_event_cmd - sleep until a condition gets true
  * @wq: the waitqueue to wait on
@@ -380,7 +388,7 @@ do {
\
 do {   \
if (condition)  \
break;  \
-   __wait_event_cmd(wq, condition, cmd1, cmd2);\
+   __wait_event_cmd(wq, condition, cmd1, cmd2, 0); \
 } while (0)
 
 #define __wait_event_interruptible(wq, condition)  \
-- 
1.9.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/3 v2] md/raid5: split wait_for_stripe and introduce wait_for_quiescent

2015-04-26 Thread Yuanhan Liu
I noticed heavy spin lock contention at get_active_stripe(), introduced
at the wake-up stage, where a bunch of processes try to re-acquire the
spin lock at once.

After giving this issue some thought, I found the contention could be
relieved (and even avoided) if we turn wait_for_stripe into one
waitqueue per lock hash and make the wake-up exclusive: wake up one
process at a time, which avoids the lock contention naturally.

Before hacking on wait_for_stripe, I found it actually has two usages:
to wait for the array to enter or leave the quiescent state, and also
to wait for an available stripe in each of the hash lists.

So this patch splits the first usage off into a separate wait queue,
wait_for_quiescent, and the next patch will turn the second usage into
one waitqueue per hash value and make it exclusive, to relieve the
lock contention.

v2: wake_up(wait_for_quiescent) when (active_stripes == 0)
Commit log refactor suggestion from Neil.

Signed-off-by: Yuanhan Liu 
---
 drivers/md/raid5.c | 15 +--
 drivers/md/raid5.h |  1 +
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 77dfd72..64d5bea 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -374,6 +374,8 @@ static void release_inactive_stripe_list(struct r5conf 
*conf,
 
if (do_wakeup) {
wake_up(&conf->wait_for_stripe);
+   if (atomic_read(&conf->active_stripes) == 0)
+   wake_up(&conf->wait_for_quiescent);
if (conf->retry_read_aligned)
md_wakeup_thread(conf->mddev->thread);
}
@@ -667,7 +669,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
spin_lock_irq(conf->hash_locks + hash);
 
do {
-   wait_event_lock_irq(conf->wait_for_stripe,
+   wait_event_lock_irq(conf->wait_for_quiescent,
conf->quiesce == 0 || noquiesce,
*(conf->hash_locks + hash));
sh = __find_stripe(conf, sector, conf->generation - previous);
@@ -4729,7 +4731,7 @@ static void raid5_align_endio(struct bio *bi, int error)
 raid_bi, 0);
bio_endio(raid_bi, 0);
if (atomic_dec_and_test(&conf->active_aligned_reads))
-   wake_up(&conf->wait_for_stripe);
+   wake_up(&conf->wait_for_quiescent);
return;
}
 
@@ -4824,7 +4826,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct 
bio * raid_bio)
align_bi->bi_iter.bi_sector += rdev->data_offset;
 
spin_lock_irq(&conf->device_lock);
-   wait_event_lock_irq(conf->wait_for_stripe,
+   wait_event_lock_irq(conf->wait_for_quiescent,
conf->quiesce == 0,
conf->device_lock);
atomic_inc(&conf->active_aligned_reads);
@@ -5668,7 +5670,7 @@ static int  retry_aligned_read(struct r5conf *conf, 
struct bio *raid_bio)
bio_endio(raid_bio, 0);
}
if (atomic_dec_and_test(&conf->active_aligned_reads))
-   wake_up(&conf->wait_for_stripe);
+   wake_up(&conf->wait_for_quiescent);
return handled;
 }
 
@@ -6399,6 +6401,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
goto abort;
spin_lock_init(&conf->device_lock);
seqcount_init(&conf->gen_lock);
+   init_waitqueue_head(&conf->wait_for_quiescent);
init_waitqueue_head(&conf->wait_for_stripe);
init_waitqueue_head(&conf->wait_for_overlap);
INIT_LIST_HEAD(&conf->handle_list);
@@ -7422,7 +7425,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
 * active stripes can drain
 */
conf->quiesce = 2;
-   wait_event_cmd(conf->wait_for_stripe,
+   wait_event_cmd(conf->wait_for_quiescent,
atomic_read(&conf->active_stripes) == 0 &&
atomic_read(&conf->active_aligned_reads) == 
0,
unlock_all_device_hash_locks_irq(conf),
@@ -7436,7 +7439,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
case 0: /* re-enable writes */
lock_all_device_hash_locks_irq(conf);
conf->quiesce = 0;
-   wake_up(&conf->wait_for_stripe);
+   wake_up(&conf->wait_for_quiescent);
wake_up(&conf->wait_for_overlap);
unlock_all_device_hash_locks_irq(conf);
break;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 7dc0dd8..4cc05ec 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -508,6 +508,7 @@ struct r5conf {
struct list_headinactive_list[NR_STRIPE_HASH_LOCKS];
atomic_t 

Re: [PATCH 2/2] md/raid5: exclusive wait_for_stripe

2015-04-26 Thread Yuanhan Liu
On Mon, Apr 27, 2015 at 10:24:05AM +1000, NeilBrown wrote:
> On Fri, 24 Apr 2015 21:39:04 +0800 Yuanhan Liu 
> wrote:
> 
> > I noticed heavy spin lock contention at get_active_stripe() with fsmark
> > multiple thread write workloads.
> > 
> > Here is how this hot contention comes from. We have limited stripes, and
> > it's a multiple thread write workload. Hence, those stripes will be taken
> > soon, which puts later processes to sleep for waiting free stripes. When
> > enough stripes(> 1/4 total stripes) are released, all process are woken,
> > trying to get the lock. But there is one only being able to get this lock
> > for each hash lock, making other processes spinning out there for acquiring
> > the lock.
> > 
> > Thus, it's effectiveless to wakeup all processes and let them battle for
> > a lock that permits one to access only each time. Instead, we could make
> > it be a exclusive wake up: wake up one process only. That avoids the heavy
> > spin lock contention naturally.
> > 
> > Here are some test results I have got with this patch applied(all test run
> > 3 times):
> > 
> > `fsmark.files_per_sec'
> > =
> > 
> > next-20150317 this patch
> > - -
> > metric_value ±stddev  metric_value ±stddev change  
> > testbox/benchmark/testcase-params
> > - -    
> > --
> >   25.600 ±0.0  92.700 ±2.5  262.1% 
> > ivb44/fsmark/1x-64t-4BRD_12G-RAID5-btrfs-4M-30G-fsyncBeforeClose
> >   25.600 ±0.0  77.800 ±0.6  203.9% 
> > ivb44/fsmark/1x-64t-9BRD_6G-RAID5-btrfs-4M-30G-fsyncBeforeClose
> >   32.000 ±0.0  93.800 ±1.7  193.1% 
> > ivb44/fsmark/1x-64t-4BRD_12G-RAID5-ext4-4M-30G-fsyncBeforeClose
> >   32.000 ±0.0  81.233 ±1.7  153.9% 
> > ivb44/fsmark/1x-64t-9BRD_6G-RAID5-ext4-4M-30G-fsyncBeforeClose
> >   48.800 ±14.5 99.667 ±2.0  104.2% 
> > ivb44/fsmark/1x-64t-4BRD_12G-RAID5-xfs-4M-30G-fsyncBeforeClose
> >6.400 ±0.0  12.800 ±0.0  100.0% 
> > ivb44/fsmark/1x-64t-3HDD-RAID5-btrfs-4M-40G-fsyncBeforeClose
> >   63.133 ±8.2  82.800 ±0.7   31.2% 
> > ivb44/fsmark/1x-64t-9BRD_6G-RAID5-xfs-4M-30G-fsyncBeforeClose
> >  245.067 ±0.7 306.567 ±7.9   25.1% 
> > ivb44/fsmark/1x-64t-4BRD_12G-RAID5-f2fs-4M-30G-fsyncBeforeClose
> >   17.533 ±0.3  21.000 ±0.8   19.8% 
> > ivb44/fsmark/1x-1t-3HDD-RAID5-xfs-4M-40G-fsyncBeforeClose
> >  188.167 ±1.9 215.033 ±3.1   14.3% 
> > ivb44/fsmark/1x-1t-4BRD_12G-RAID5-btrfs-4M-30G-NoSync
> >  254.500 ±1.8 290.733 ±2.4   14.2% 
> > ivb44/fsmark/1x-1t-9BRD_6G-RAID5-btrfs-4M-30G-NoSync
> > 
> > `time.system_time'
> > =
> > 
> > next-20150317 this patch
> > --
> > metric_value ±stddev metric_value ±stddev change   
> > testbox/benchmark/testcase-params
> > -- 
> > --
> > 7235.603 ±1.2 185.163 ±1.9  -97.4% 
> > ivb44/fsmark/1x-64t-4BRD_12G-RAID5-btrfs-4M-30G-fsyncBeforeClose
> > 7666.883 ±2.9 202.750 ±1.0  -97.4% 
> > ivb44/fsmark/1x-64t-9BRD_6G-RAID5-btrfs-4M-30G-fsyncBeforeClose
> >14567.893 ±0.7 421.230 ±0.4  -97.1% 
> > ivb44/fsmark/1x-64t-3HDD-RAID5-btrfs-4M-40G-fsyncBeforeClose
> > 3697.667 ±14.0148.190 ±1.7  -96.0% 
> > ivb44/fsmark/1x-64t-4BRD_12G-RAID5-xfs-4M-30G-fsyncBeforeClose
> > 5572.867 ±3.8 310.717 ±1.4  -94.4% 
> > ivb44/fsmark/1x-64t-9BRD_6G-RAID5-ext4-4M-30G-fsyncBeforeClose
> > 5565.050 ±0.5 313.277 ±1.5  -94.4% 
> > ivb44/fsmark/1x-64t-4BRD_12G-RAID5-ext4-4M-30G-fsyncBeforeClose
> > 2420.707 ±17.1171.043 ±2.7  -92.9% 
> > ivb44/fsmark/1x-64t-9BRD_6G-RAID5-xfs-4M-30G-fsyncBeforeClose
> > 3743.300 ±4.6 379.827 ±3.5  -89.9% 
> > ivb44/fsmark/1x-64

Re: [PATCH 1/2] md/raid5: split wait_for_stripe and introduce wait_for_quiesce

2015-04-26 Thread Yuanhan Liu
On Mon, Apr 27, 2015 at 10:10:24AM +1000, NeilBrown wrote:
> On Fri, 24 Apr 2015 21:39:03 +0800 Yuanhan Liu 
> wrote:
> 
> > If I read the code correctly, the current wait_for_stripe actually has 2 usages:
> > 
> > - wait until there is enough free stripe cache, triggered when
> >   get_free_stripe() fails. This is what wait_for_stripe is literally
> >   intended for.
> > 
> > - wait for quiesce == 0 or
> >    active_aligned_reads == 0 && active_stripes == 0
> > 
> >   It has nothing to do with wait_for_stripe literally, and releasing
> >   an active stripe won't actually wake them up. On the contrary, a wake_up
> >   from this case won't actually wake up a process waiting for
> >   a free stripe to become available.
> 
> I disagree.  Releasing an active stripe *will* (or *can*) wake up that third
> case, as it decrements "active_stripes" which will eventually reach zero.
> 
> I don't think your new code will properly wake up a process which is waiting
> for "active_stripes == 0".

Right, and thanks for pointing it out. So, is this enough?

---
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2d8fcc1..3f23035 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -383,6 +383,9 @@ static void release_inactive_stripe_list(struct r5conf *conf,
}
}
}
+
+   if (!atomic_read(&conf->active_stripes))
+   wake_up(&conf->wait_for_quiesce);
 }

 /* should hold conf->device_lock already */


Or, should I put it a bit ahead, trying to invoke
wake_up(&conf->wait_for_quiesce)
after each atomic_dec(&conf->active_stripes)?

if (atomic_dec_return(&conf->active_stripes) == 0)
wake_up(&conf->wait_for_quiesce);
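
(For illustration only, not the kernel code: a minimal userspace analogue of
the "whoever drops the count to zero issues the wakeup" pattern, using C11
atomics and a pthread condition variable in place of atomic_t and the md
wait queue; all names below are made up for the example.)

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int active_stripes = 4;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  quiesced = PTHREAD_COND_INITIALIZER;

static void *release_stripe(void *arg)
{
	(void)arg;
	/* atomic_fetch_sub() returns the old value, so "== 1" means this
	 * thread just dropped the count to zero and must do the wakeup. */
	if (atomic_fetch_sub(&active_stripes, 1) == 1) {
		pthread_mutex_lock(&lock);
		pthread_cond_signal(&quiesced);
		pthread_mutex_unlock(&lock);
	}
	return NULL;
}

int main(void)
{
	pthread_t t[4];

	for (int i = 0; i < 4; i++)
		pthread_create(&t[i], NULL, release_stripe, NULL);

	/* The waiter re-checks the condition under the lock, so a wakeup
	 * issued before it goes to sleep is not lost. */
	pthread_mutex_lock(&lock);
	while (atomic_load(&active_stripes) != 0)
		pthread_cond_wait(&quiesced, &lock);
	pthread_mutex_unlock(&lock);

	for (int i = 0; i < 4; i++)
		pthread_join(t[i], NULL);
	printf("active_stripes reached zero\n");
	return 0;
}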

> 
> > 
> > Hence, we'd better split wait_for_stripe, and here I introduce
> > wait_for_quiesce for the second usage. The name may not be well chosen,
> > or may even be wrong. Feel free to correct me.
> > 
> > This is also a preparation patch for the next one: making wait_for_stripe
> > exclusive.
> 
> I think you have this commit description upside down :-)
> 
> The real motivation is that you are seeing contention on some spinlock and so
> you want to split 'wait_for_stripe' up in to multiple wait_queues so that you
> can use exclusive wakeup.  As this is the main motivation, it should be
> stated first.
> 
> Then explain that 'wait_for_stripe' is used to wait for the array to enter or
> leave the quiescent state, and also to wait for an available stripe in each
> of the hash lists.
> 
> So this patch splits the first usage off into a separate wait_queue, and the
> next patch will split the second usage into one waitqueue for each hash value.
> 
> Then explain just what is needed for that first step.
> 
> When you put it that way around, the patch makes lots of sense.

It does, and thanks!

> 
> So: could you please resubmit with the description the right way around, and

To make sure I followed you correctly: my patch order is correct (I mean,
split the lock first, and make wait_for_stripe per lock hash and exclusive
second), and what I need to do is rewrite the commit log as you suggested
and fix all the issues you pointed out. Right?

--yliu

> with an appropriate wakeup call to ensure raid5_quiesce is woken up when
> active_stripes reaches zero?
> 
> Thanks,
> NeilBrown
> 
> 
> > 
> > Signed-off-by: Yuanhan Liu 
> > ---
> >  drivers/md/raid5.c | 13 +++--
> >  drivers/md/raid5.h |  1 +
> >  2 files changed, 8 insertions(+), 6 deletions(-)
> > 
> > diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> > index 9716319..b7e385f 100644
> > --- a/drivers/md/raid5.c
> > +++ b/drivers/md/raid5.c
> > @@ -667,7 +667,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
> > spin_lock_irq(conf->hash_locks + hash);
> >  
> > do {
> > -   wait_event_lock_irq(conf->wait_for_stripe,
> > +   wait_event_lock_irq(conf->wait_for_quiesce,
> > conf->quiesce == 0 || noquiesce,
> > *(conf->hash_locks + hash));
> > sh = __find_stripe(conf, sector, conf->generation - previous);
> > @@ -4725,7 +4725,7 @@ static void raid5_align_endio(struct bio *bi, int 
> > error)
> >  raid_bi, 0);
> > bio_endio(raid_bi, 0);
> > if (atomic_dec_and_test(&conf->active_aligned_reads))
> > -   wake_up(&conf->wait_for_stripe);
> > +   wake_up(&conf->wait_for_quiesce);
> > return;
> > }
> >  
> > @@ -

[PATCH 2/3 v2] md/raid5: split wait_for_stripe and introduce wait_for_quiescent

2015-04-26 Thread Yuanhan Liu
I noticed heavy spin lock contention at get_active_stripe(), introduced
at the wake-up stage, where a bunch of processes try to re-acquire the
spin lock at the same time.

After giving this issue some thought, I found the contention could be
relieved (and even avoided) if we turn wait_for_stripe into one waitqueue
per lock hash and make the wake-up exclusive: wake up one process at a
time, which avoids the lock contention naturally.

Before hacking on wait_for_stripe, I found it actually has 2
usages: for the array to enter or leave the quiescent state, and also
to wait for an available stripe in each of the hash lists.

So this patch splits the first usage off into a separate wait_queue,
wait_for_quiescent, and the next patch will turn the second usage into
one waitqueue for each hash value, and make it exclusive, to relieve
the lock contention.

v2: wake_up(wait_for_quiescent) when (active_stripes == 0)
Commit log refactor suggestion from Neil.

Signed-off-by: Yuanhan Liu yuanhan@linux.intel.com
---
 drivers/md/raid5.c | 15 +--
 drivers/md/raid5.h |  1 +
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 77dfd72..64d5bea 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -374,6 +374,8 @@ static void release_inactive_stripe_list(struct r5conf *conf,
 
if (do_wakeup) {
wake_up(&conf->wait_for_stripe);
+   if (atomic_read(&conf->active_stripes) == 0)
+   wake_up(&conf->wait_for_quiescent);
if (conf->retry_read_aligned)
md_wakeup_thread(conf->mddev->thread);
}
@@ -667,7 +669,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
spin_lock_irq(conf->hash_locks + hash);
 
do {
-   wait_event_lock_irq(conf->wait_for_stripe,
+   wait_event_lock_irq(conf->wait_for_quiescent,
conf->quiesce == 0 || noquiesce,
*(conf->hash_locks + hash));
sh = __find_stripe(conf, sector, conf->generation - previous);
@@ -4729,7 +4731,7 @@ static void raid5_align_endio(struct bio *bi, int error)
 raid_bi, 0);
bio_endio(raid_bi, 0);
if (atomic_dec_and_test(&conf->active_aligned_reads))
-   wake_up(&conf->wait_for_stripe);
+   wake_up(&conf->wait_for_quiescent);
return;
}
 
@@ -4824,7 +4826,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
align_bi->bi_iter.bi_sector += rdev->data_offset;
 
spin_lock_irq(&conf->device_lock);
-   wait_event_lock_irq(conf->wait_for_stripe,
+   wait_event_lock_irq(conf->wait_for_quiescent,
conf->quiesce == 0,
conf->device_lock);
atomic_inc(&conf->active_aligned_reads);
@@ -5668,7 +5670,7 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
bio_endio(raid_bio, 0);
}
if (atomic_dec_and_test(&conf->active_aligned_reads))
-   wake_up(&conf->wait_for_stripe);
+   wake_up(&conf->wait_for_quiescent);
return handled;
 }
 
@@ -6399,6 +6401,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
goto abort;
spin_lock_init(&conf->device_lock);
seqcount_init(&conf->gen_lock);
+   init_waitqueue_head(&conf->wait_for_quiescent);
init_waitqueue_head(&conf->wait_for_stripe);
init_waitqueue_head(&conf->wait_for_overlap);
INIT_LIST_HEAD(&conf->handle_list);
@@ -7422,7 +7425,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
 * active stripes can drain
 */
conf->quiesce = 2;
-   wait_event_cmd(conf->wait_for_stripe,
+   wait_event_cmd(conf->wait_for_quiescent,
atomic_read(&conf->active_stripes) == 0 &&
atomic_read(&conf->active_aligned_reads) == 0,
unlock_all_device_hash_locks_irq(conf),
@@ -7436,7 +7439,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
case 0: /* re-enable writes */
lock_all_device_hash_locks_irq(conf);
conf->quiesce = 0;
-   wake_up(&conf->wait_for_stripe);
+   wake_up(&conf->wait_for_quiescent);
wake_up(&conf->wait_for_overlap);
unlock_all_device_hash_locks_irq(conf);
break;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 7dc0dd8..4cc05ec 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -508,6 +508,7 @@ struct r5conf {
struct list_headinactive_list[NR_STRIPE_HASH_LOCKS];
atomic_tempty_inactive_list_nr;
struct

[PATCH 1/3] wait: introduce wait_event_cmd_exclusive

2015-04-26 Thread Yuanhan Liu
It's just a variant of wait_event_cmd, with the exclusive flag set.

For cases like RAID5, which puts many processes to sleep until 1/4 of
the resources are free, a wake_up wakes all processes up, but only one
of them can get the resource, as it's protected by a spin lock. That
ends up introducing heavy lock contention and hurts performance badly.

Introduce wait_event_cmd_exclusive here to relieve the lock contention
naturally, by letting wake_up() wake up just one process.
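
(Purely as a hedged userspace analogue, not how the kernel implements it:
the behavioural difference is essentially pthread_cond_signal() versus
pthread_cond_broadcast(). A broadcast makes every sleeper runnable even
though only one can win the lock, while signalling wakes a single waiter
per freed resource. The names below are made up for the example.)

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  free_stripes = PTHREAD_COND_INITIALIZER;
static int nr_free;

static void get_resource(void)
{
	pthread_mutex_lock(&lock);
	while (nr_free == 0)
		pthread_cond_wait(&free_stripes, &lock);
	nr_free--;
	pthread_mutex_unlock(&lock);
}

static void put_resource(void)
{
	pthread_mutex_lock(&lock);
	nr_free++;
	/* Exclusive-style wakeup: one waiter per released resource.
	 * pthread_cond_broadcast() here would wake every waiter and let
	 * them all fight for the mutex, like a non-exclusive wake_up(). */
	pthread_cond_signal(&free_stripes);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	put_resource();
	get_resource();
	printf("one waiter, one wakeup\n");
	return 0;
}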

Cc: Ingo Molnar mi...@redhat.com
Cc: Peter Zijlstra pet...@infradead.org
Signed-off-by: Yuanhan Liu yuanhan@linux.intel.com
---
 include/linux/wait.h | 14 +++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 2db8334..6c3b4de 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -358,10 +358,18 @@ do {  
\
__ret;  \
 })
 
-#define __wait_event_cmd(wq, condition, cmd1, cmd2)\
-   (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0,  \
+#define __wait_event_cmd(wq, condition, cmd1, cmd2, exclusive) \
+   (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, exclusive, 0, \
cmd1; schedule(); cmd2)
 
+
+#define wait_event_cmd_exclusive(wq, condition, cmd1, cmd2)   \
+do {   \
+   if (condition)  \
+   break;  \
+   __wait_event_cmd(wq, condition, cmd1, cmd2, 1); \
+} while (0)
+
 /**
  * wait_event_cmd - sleep until a condition gets true
  * @wq: the waitqueue to wait on
@@ -380,7 +388,7 @@ do {
\
 do {   \
if (condition)  \
break;  \
-   __wait_event_cmd(wq, condition, cmd1, cmd2);\
+   __wait_event_cmd(wq, condition, cmd1, cmd2, 0); \
 } while (0)
 
 #define __wait_event_interruptible(wq, condition)  \
-- 
1.9.0



[PATCH 3/3 v2] md/raid5: per hash value and exclusive wait_for_stripe

2015-04-26 Thread Yuanhan Liu
 heavily, up to 97%. And as expected,
the performance increased a lot, up to 260%, for fast device(ram disk).

v2: use bits instead of an array to note down which wait queues need to be woken up.
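
(A minimal, self-contained illustration of that bookkeeping, not the kernel
code itself; the hash values below are made up for the example.)

#include <stdio.h>

#define NR_STRIPE_HASH_LOCKS 8

int main(void)
{
	unsigned long do_wakeup = 0;

	/* Instead of a bool (or an array of bools), record each hash that
	 * needs a wakeup as one bit in an unsigned long. */
	do_wakeup |= 1UL << 3;
	do_wakeup |= 1UL << 5;

	/* Later, walk the set bits and wake only those wait queues. */
	for (int i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
		if (do_wakeup & (1UL << i))
			printf("wake_up(&conf->wait_for_stripe[%d])\n", i);

	return 0;
}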

Signed-off-by: Yuanhan Liu yuanhan@linux.intel.com
---
 drivers/md/raid5.c | 27 +++
 drivers/md/raid5.h |  2 +-
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 64d5bea..1b11bbf 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -344,7 +344,8 @@ static void release_inactive_stripe_list(struct r5conf *conf,
 int hash)
 {
int size;
-   bool do_wakeup = false;
+   unsigned long do_wakeup = 0;
+   int i = 0;
unsigned long flags;
 
if (hash == NR_STRIPE_HASH_LOCKS) {
@@ -365,15 +366,19 @@ static void release_inactive_stripe_list(struct r5conf *conf,
!list_empty(list))
atomic_dec(&conf->empty_inactive_list_nr);
list_splice_tail_init(list, conf->inactive_list + hash);
-   do_wakeup = true;
+   do_wakeup |= 1 << (size - 1);
spin_unlock_irqrestore(conf->hash_locks + hash, flags);
}
size--;
hash--;
}
 
+   for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) {
+   if (do_wakeup & (1 << i))
+   wake_up(&conf->wait_for_stripe[i]);
+   }
+
if (do_wakeup) {
-   wake_up(&conf->wait_for_stripe);
if (atomic_read(&conf->active_stripes) == 0)
wake_up(&conf->wait_for_quiescent);
if (conf->retry_read_aligned)
@@ -686,14 +691,15 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
if (!sh) {
set_bit(R5_INACTIVE_BLOCKED,
&conf->cache_state);
-   wait_event_lock_irq(
-   conf->wait_for_stripe,
+   wait_event_cmd_exclusive(
+   conf->wait_for_stripe[hash],
!list_empty(conf->inactive_list + hash) &&
(atomic_read(&conf->active_stripes)
 < (conf->max_nr_stripes * 3 / 4)
 || !test_bit(R5_INACTIVE_BLOCKED,
  &conf->cache_state)),
-   *(conf->hash_locks + hash));
+   spin_unlock_irq(conf->hash_locks + hash),
+   spin_lock_irq(conf->hash_locks + hash));
clear_bit(R5_INACTIVE_BLOCKED,
  &conf->cache_state);
} else {
@@ -718,6 +724,9 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
}
} while (sh == NULL);
 
+   if (!list_empty(conf->inactive_list + hash))
+   wake_up(&conf->wait_for_stripe[hash]);
+
spin_unlock_irq(conf->hash_locks + hash);
return sh;
 }
@@ -2138,7 +2147,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
cnt = 0;
list_for_each_entry(nsh, newstripes, lru) {
lock_device_hash_lock(conf, hash);
-   wait_event_cmd(conf->wait_for_stripe,
+   wait_event_cmd_exclusive(conf->wait_for_stripe[hash],
!list_empty(conf->inactive_list + hash),
unlock_device_hash_lock(conf, hash),
lock_device_hash_lock(conf, hash));
@@ -6402,7 +6411,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
spin_lock_init(&conf->device_lock);
seqcount_init(&conf->gen_lock);
init_waitqueue_head(&conf->wait_for_quiescent);
-   init_waitqueue_head(&conf->wait_for_stripe);
+   for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) {
+   init_waitqueue_head(&conf->wait_for_stripe[i]);
+   }
init_waitqueue_head(&conf->wait_for_overlap);
INIT_LIST_HEAD(&conf->handle_list);
INIT_LIST_HEAD(&conf->hold_list);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 4cc05ec..6307b90 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -509,7 +509,7 @@ struct r5conf {
atomic_tempty_inactive_list_nr;
struct llist_head   released_stripes;
wait_queue_head_t   wait_for_quiescent;
-   wait_queue_head_t   wait_for_stripe;
+   wait_queue_head_t   wait_for_stripe[NR_STRIPE_HASH_LOCKS];
wait_queue_head_t   wait_for_overlap;
unsigned long   cache_state;
 #define R5_INACTIVE_BLOCKED1   /* release of inactive stripes blocked,
-- 
1.9.0

[PATCH 1/2] md/raid5: split wait_for_stripe and introduce wait_for_quiesce

2015-04-24 Thread Yuanhan Liu
If I read the code correctly, the current wait_for_stripe actually has 2 usages:

- wait until there is enough free stripe cache, triggered when
  get_free_stripe() fails. This is what wait_for_stripe is literally
  intended for.

- wait for quiesce == 0 or
   active_aligned_reads == 0 && active_stripes == 0

  It has nothing to do with wait_for_stripe literally, and releasing
  an active stripe won't actually wake them up. On the contrary, a wake_up
  from this case won't actually wake up a process waiting for
  a free stripe to become available.

Hence, we'd better split wait_for_stripe, and here I introduce
wait_for_quiesce for the second usage. The name may not be well chosen,
or may even be wrong. Feel free to correct me.

This is also a preparation patch for the next one: making wait_for_stripe
exclusive.

Signed-off-by: Yuanhan Liu 
---
 drivers/md/raid5.c | 13 +++--
 drivers/md/raid5.h |  1 +
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9716319..b7e385f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -667,7 +667,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
spin_lock_irq(conf->hash_locks + hash);
 
do {
-   wait_event_lock_irq(conf->wait_for_stripe,
+   wait_event_lock_irq(conf->wait_for_quiesce,
conf->quiesce == 0 || noquiesce,
*(conf->hash_locks + hash));
sh = __find_stripe(conf, sector, conf->generation - previous);
@@ -4725,7 +4725,7 @@ static void raid5_align_endio(struct bio *bi, int error)
 raid_bi, 0);
bio_endio(raid_bi, 0);
if (atomic_dec_and_test(&conf->active_aligned_reads))
-   wake_up(&conf->wait_for_stripe);
+   wake_up(&conf->wait_for_quiesce);
return;
}
 
@@ -4820,7 +4820,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
align_bi->bi_iter.bi_sector += rdev->data_offset;
 
spin_lock_irq(&conf->device_lock);
-   wait_event_lock_irq(conf->wait_for_stripe,
+   wait_event_lock_irq(conf->wait_for_quiesce,
conf->quiesce == 0,
conf->device_lock);
atomic_inc(&conf->active_aligned_reads);
@@ -5659,7 +5659,7 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
bio_endio(raid_bio, 0);
}
if (atomic_dec_and_test(&conf->active_aligned_reads))
-   wake_up(&conf->wait_for_stripe);
+   wake_up(&conf->wait_for_quiesce);
return handled;
 }
 
@@ -6390,6 +6390,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
goto abort;
spin_lock_init(&conf->device_lock);
seqcount_init(&conf->gen_lock);
+   init_waitqueue_head(&conf->wait_for_quiesce);
init_waitqueue_head(&conf->wait_for_stripe);
init_waitqueue_head(&conf->wait_for_overlap);
INIT_LIST_HEAD(&conf->handle_list);
@@ -7413,7 +7414,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
 * active stripes can drain
 */
conf->quiesce = 2;
-   wait_event_cmd(conf->wait_for_stripe,
+   wait_event_cmd(conf->wait_for_quiesce,
atomic_read(&conf->active_stripes) == 0 &&
atomic_read(&conf->active_aligned_reads) == 0,
unlock_all_device_hash_locks_irq(conf),
@@ -7427,7 +7428,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
case 0: /* re-enable writes */
lock_all_device_hash_locks_irq(conf);
conf->quiesce = 0;
-   wake_up(&conf->wait_for_stripe);
+   wake_up(&conf->wait_for_quiesce);
wake_up(&conf->wait_for_overlap);
unlock_all_device_hash_locks_irq(conf);
break;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 7dc0dd8..fab53a3 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -508,6 +508,7 @@ struct r5conf {
struct list_headinactive_list[NR_STRIPE_HASH_LOCKS];
atomic_tempty_inactive_list_nr;
struct llist_head   released_stripes;
+   wait_queue_head_t   wait_for_quiesce;
wait_queue_head_t   wait_for_stripe;
wait_queue_head_t   wait_for_overlap;
unsigned long   cache_state;
-- 
1.9.0



[PATCH 2/2] md/raid5: exclusive wait_for_stripe

2015-04-24 Thread Yuanhan Liu
I noticed heavy spin lock contention at get_active_stripe() with fsmark
multiple thread write workloads.

Here is how this hot contention comes from. We have limited stripes, and
it's a multiple thread write workload. Hence, those stripes will be taken
soon, which puts later processes to sleep for waiting free stripes. When
enough stripes(> 1/4 total stripes) are released, all process are woken,
trying to get the lock. But there is one only being able to get this lock
for each hash lock, making other processes spinning out there for acquiring
the lock.

Thus, it's effectiveless to wakeup all processes and let them battle for
a lock that permits one to access only each time. Instead, we could make
it be a exclusive wake up: wake up one process only. That avoids the heavy
spin lock contention naturally.

Here are some test results I have got with this patch applied(all test run
3 times):

`fsmark.files_per_sec'
=

next-20150317 this patch
- -
metric_value ±stddev  metric_value ±stddev change  
testbox/benchmark/testcase-params
- -    
--
  25.600 ±0.0  92.700 ±2.5  262.1% 
ivb44/fsmark/1x-64t-4BRD_12G-RAID5-btrfs-4M-30G-fsyncBeforeClose
  25.600 ±0.0  77.800 ±0.6  203.9% 
ivb44/fsmark/1x-64t-9BRD_6G-RAID5-btrfs-4M-30G-fsyncBeforeClose
  32.000 ±0.0  93.800 ±1.7  193.1% 
ivb44/fsmark/1x-64t-4BRD_12G-RAID5-ext4-4M-30G-fsyncBeforeClose
  32.000 ±0.0  81.233 ±1.7  153.9% 
ivb44/fsmark/1x-64t-9BRD_6G-RAID5-ext4-4M-30G-fsyncBeforeClose
  48.800 ±14.5 99.667 ±2.0  104.2% 
ivb44/fsmark/1x-64t-4BRD_12G-RAID5-xfs-4M-30G-fsyncBeforeClose
   6.400 ±0.0  12.800 ±0.0  100.0% 
ivb44/fsmark/1x-64t-3HDD-RAID5-btrfs-4M-40G-fsyncBeforeClose
  63.133 ±8.2  82.800 ±0.7   31.2% 
ivb44/fsmark/1x-64t-9BRD_6G-RAID5-xfs-4M-30G-fsyncBeforeClose
 245.067 ±0.7 306.567 ±7.9   25.1% 
ivb44/fsmark/1x-64t-4BRD_12G-RAID5-f2fs-4M-30G-fsyncBeforeClose
  17.533 ±0.3  21.000 ±0.8   19.8% 
ivb44/fsmark/1x-1t-3HDD-RAID5-xfs-4M-40G-fsyncBeforeClose
 188.167 ±1.9 215.033 ±3.1   14.3% 
ivb44/fsmark/1x-1t-4BRD_12G-RAID5-btrfs-4M-30G-NoSync
 254.500 ±1.8 290.733 ±2.4   14.2% 
ivb44/fsmark/1x-1t-9BRD_6G-RAID5-btrfs-4M-30G-NoSync

`time.system_time'
=

next-20150317 this patch
--
metric_value ±stddev metric_value ±stddev change   
testbox/benchmark/testcase-params
-- 
--
7235.603 ±1.2 185.163 ±1.9  -97.4% 
ivb44/fsmark/1x-64t-4BRD_12G-RAID5-btrfs-4M-30G-fsyncBeforeClose
7666.883 ±2.9 202.750 ±1.0  -97.4% 
ivb44/fsmark/1x-64t-9BRD_6G-RAID5-btrfs-4M-30G-fsyncBeforeClose
   14567.893 ±0.7 421.230 ±0.4  -97.1% 
ivb44/fsmark/1x-64t-3HDD-RAID5-btrfs-4M-40G-fsyncBeforeClose
3697.667 ±14.0148.190 ±1.7  -96.0% 
ivb44/fsmark/1x-64t-4BRD_12G-RAID5-xfs-4M-30G-fsyncBeforeClose
5572.867 ±3.8 310.717 ±1.4  -94.4% 
ivb44/fsmark/1x-64t-9BRD_6G-RAID5-ext4-4M-30G-fsyncBeforeClose
5565.050 ±0.5 313.277 ±1.5  -94.4% 
ivb44/fsmark/1x-64t-4BRD_12G-RAID5-ext4-4M-30G-fsyncBeforeClose
2420.707 ±17.1171.043 ±2.7  -92.9% 
ivb44/fsmark/1x-64t-9BRD_6G-RAID5-xfs-4M-30G-fsyncBeforeClose
3743.300 ±4.6 379.827 ±3.5  -89.9% 
ivb44/fsmark/1x-64t-3HDD-RAID5-ext4-4M-40G-fsyncBeforeClose
3308.687 ±6.3 363.050 ±2.0  -89.0% 
ivb44/fsmark/1x-64t-3HDD-RAID5-xfs-4M-40G-fsyncBeforeClose

Where,

 1x: where 'x' means iterations or loop, corresponding to the 'L' option of 
fsmark

 1t, 64t: where 't' means thread

 4M: means the single file size, corresponding to the '-s' option of fsmark
 40G, 30G, 120G: means the total test size

 4BRD_12G: BRD is the ramdisk, where '4' means 4 ramdisk, and where '12G' 
means
   the size of one ramdisk. So, it would be 48G in total. And we 
made a
   raid on those ramdisk
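
For example, 1x-64t-4BRD_12G-RAID5-btrfs-4M-30G-fsyncBeforeClose reads as:
1 iteration, 64 threads, a RAID5 made of 4 ramdisks of 12G each (48G in
total), formatted with btrfs, writing 4M files for a 30G total test size,
with fsync called before each close.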

As you can see, though there is not much performance gain for the hard disk
workload, the system time drops heavily, by up to 97%. And as expected,
the performance increased a lot, by up to 260%, for fast devices (ram disks).

Signed-off-by: Yuanhan 


performance changes on c9dc4c65: 9.8% fsmark.files_per_sec

2015-04-22 Thread Yuanhan Liu
FYI, we found a performance increase, which is expected as the commit says,
on `fsmark.files_per_sec' by c9dc4c6578502c2085705347375b82089aad18d0:

> commit c9dc4c6578502c2085705347375b82089aad18d0
> Author: Chris Mason 
> AuthorDate: Sat Apr 4 17:14:42 2015 -0700
> Commit: Chris Mason 
> CommitDate: Fri Apr 10 14:07:11 2015 -0700
> 
> Btrfs: two stage dirty block group writeout

4c6d1d85ad89fd8e32dc9204b7f944854399bda9 
c9dc4c6578502c2085705347375b82089aad18d0
 

run time(m) metric_value ±stddev run time(m) metric_value 
±stddev change   testbox/benchmark/testcase-params
--- --   --- --  
  --
3   7.3  |35.267|±0.55   6.6  |38.740|
±1.69.8% ivb44/fsmark/1x-1t-1HDD-btrfs-4M-60G-NoSync


NOTE: here is some more explanation of those test parameters, to help you
  understand better what the testcase does:

  1x: where 'x' means iterations or loop, corresponding to the 'L' option 
of fsmark

  1t, 64t: where 't' means thread

  4M: means the single file size, corresponding to the '-s' option of fsmark
  60G: means the total test size


And FYI, here are more changes by the same commit:

4c6d1d85ad89fd8e  c9dc4c6578502c208570534737  
  --  
 %stddev %change %stddev
 \  |\  
  9864 ±  2%+156.9%  25345 ±  4%  
fsmark.time.voluntary_context_switches
 9 ±  0% +17.8% 10 ±  4%  
fsmark.time.percent_of_cpu_this_job_got
462211 ±  1% +16.8% 539707 ±  0%  fsmark.app_overhead
 35.27 ±  0%  +9.8%  38.74 ±  1%  fsmark.files_per_sec
   435 ±  0%  -9.0%396 ±  1%  fsmark.time.elapsed_time.max
   435 ±  0%  -9.0%396 ±  1%  fsmark.time.elapsed_time
  5.20 ±  2% -70.3%   1.54 ±  6%  turbostat.Pkg%pc6
   2447873 ± 42% -67.9% 785086 ± 33%  numa-numastat.node1.numa_hit
   2413662 ± 43% -68.1% 771115 ± 31%  numa-numastat.node1.local_node
  9864 ±  2%+156.9%  25345 ±  4%  time.voluntary_context_switches
187680 ± 10%+126.8% 425676 ±  7%  numa-vmstat.node1.nr_dirty
747361 ±  9%+127.8%1702809 ±  7%  numa-meminfo.node1.Dirty
   1787510 ±  1%+117.0%3878984 ±  2%  meminfo.Dirty
446861 ±  1%+117.0% 969472 ±  2%  proc-vmstat.nr_dirty
   1655962 ± 37% -59.3% 673988 ± 29%  numa-vmstat.node1.numa_local
   1036191 ±  8%+110.3%2179311 ±  3%  numa-meminfo.node0.Dirty
259069 ±  8%+110.3% 544783 ±  3%  numa-vmstat.node0.nr_dirty
   1687987 ± 37% -58.6% 698626 ± 29%  numa-vmstat.node1.numa_hit
 1 ±  0%+100.0%  2 ±  0%  vmstat.procs.b
  0.02 ±  0%+100.0%   0.04 ± 22%  turbostat.CPU%c3
  6.03 ±  1% +76.9%  10.67 ±  1%  turbostat.CPU%c1
 5.189e+08 ±  0% +72.6%  8.956e+08 ±  1%  cpuidle.C1-IVT.time
   2646692 ±  7% +75.0%4630890 ± 23%  cpuidle.C3-IVT.time
  5301 ±  6% -31.7%   3620 ±  3%  
slabinfo.btrfs_ordered_extent.active_objs
 10549 ± 16% -30.3%   7349 ± 12%  
numa-vmstat.node1.nr_slab_reclaimable
  5353 ±  6% -31.4%   3670 ±  3%  
slabinfo.btrfs_ordered_extent.num_objs
 42169 ± 16% -30.3%  29397 ± 12%  numa-meminfo.node1.SReclaimable
   1619825 ± 22% +39.4%2258188 ±  4%  proc-vmstat.pgfree
  4611 ±  7% -28.0%   3318 ±  1%  
slabinfo.btrfs_delayed_ref_head.num_objs
  4471 ±  8% -27.0%   3264 ±  2%  
slabinfo.btrfs_delayed_ref_head.active_objs
 67.93 ±  1% -24.7%  51.15 ±  4%  turbostat.Pkg%pc2
   2332975 ± 21% +45.6%3396446 ±  4%  numa-vmstat.node1.numa_other
   2300949 ± 22% +46.5%3371807 ±  4%  numa-vmstat.node1.numa_miss
   2300941 ± 22% +46.5%3371793 ±  4%  numa-vmstat.node0.numa_foreign
  2952 ±  8% -23.3%   2263 ±  3%  
slabinfo.btrfs_delayed_data_ref.num_objs
   2570716 ±  3% +25.7%3230157 ±  2%  numa-meminfo.node1.Writeback
642367 ±  3% +25.7% 807533 ±  2%  numa-vmstat.node1.nr_writeback
 95408 ± 13% -17.3%  78910 ±  6%  numa-meminfo.node1.Slab
  2803 ±  7% -21.1%   2210 ±  3%  
slabinfo.btrfs_delayed_data_ref.active_objs
   240 ±  9% +23.1%295 ± 16%  
numa-vmstat.node0.nr_page_table_pages
   4626942 ± 19% +49.6%6924087 ± 22%  cpuidle.C1E-IVT.time
   5585235 ±  0% +25.5%7011242 ±  0%  meminfo.Writeback
   1396232 ±  0% +25.5%1752892 ±  0%  proc-vmstat.nr_writeback
   962 ±  9% +23.0%   1184 ± 16%  numa-meminfo.node0.PageTables
 9 ±  0% +17.8% 10 ±  4%  time.percent_of_cpu_this_job_got
754027 ±  2% +25.2% 944312 

performance changes on 78373b73: -46.6% fsmark.files_per_sec, and few more

2015-04-20 Thread Yuanhan Liu
FYI, we found changes on `fsmark.files_per_sec' by 
78373b7319abdf15050af5b1632c4c8b8b398f33:

> commit 78373b7319abdf15050af5b1632c4c8b8b398f33
> Author: Jaegeuk Kim 
> AuthorDate: Fri Mar 13 21:44:36 2015 -0700
> Commit: Jaegeuk Kim 
> CommitDate: Fri Apr 10 15:08:45 2015 -0700
> 
> f2fs: enhance multi-threads performance

3402e87cfb5e762f9c95071bf4a2ad65fd9392a2 
78373b7319abdf15050af5b1632c4c8b8b398f33
 

run time(m) metric_value ±stddev run time(m) metric_value 
±stddev change   testbox/benchmark/testcase-params
--- --   --- --  
  --
3   0.3 |490.800|±5.73   0.5 |262.067|
±0.4  -46.6% 
ivb44/fsmark/1x-64t-4BRD_12G-RAID0-f2fs-4M-30G-fsyncBeforeClose
3   0.3 |468.367|±3.53   0.5 |264.467|
±0.2  -43.5% 
ivb44/fsmark/1x-64t-9BRD_6G-RAID0-f2fs-4M-30G-fsyncBeforeClose
3   0.6 |211.867|±0.73   0.7 |191.067|
±0.5   -9.8% 
ivb44/fsmark/1x-64t-4BRD_12G-RAID5-f2fs-4M-30G-fsyncBeforeClose

NOTE: here is some more info about those test parameters, to help you
  understand better what the testcase does:

  1x: where 'x' means iterations or loop, corresponding to the 'L' option 
of fsmark

  1t, 64t: where 't' means thread

  4M: means the single file size, corresponding to the '-s' option of fsmark
  40G, 30G, 120G: means the total test size

  4BRD_12G: BRD is the ramdisk, where '4' means 4 ramdisk, and where '12G' 
means
the size of one ramdisk. So, it would be 48G in total. And we 
made a
raid on those ramdisk


The change is a bit interesting, as you already stated clearly that this
patch is meant for a performance gain. The patch itself is simple, too: it
removes a mutex lock. So the only reasonable cause I can think of, without
digging too much, would be that removing this lock reduces sleep time and
lets more processes run, but somehow increases the context switches and CPU
usage somewhere in the meantime. I guess this is what the following
changes are trying to tell us:

 29708 ±  2%   +5720.0%1729051 ±  1%  
fsmark.time.voluntary_context_switches
   302 ±  0%+113.8%647 ±  0%  
fsmark.time.percent_of_cpu_this_job_got
 61.05 ±  0%+214.0% 191.70 ±  0%  fsmark.time.system_time


FYI, here I list all changes for the most outstanding case:

3   0.3 |490.800|±5.73   0.5 |262.067|
±0.4  -46.6% 
ivb44/fsmark/1x-64t-4BRD_12G-RAID0-f2fs-4M-30G-fsyncBeforeClose

3402e87cfb5e762f  78373b7319abdf15050af5b163  
  --  
 %stddev %change %stddev
 \  |\  
 29708 ±  2%   +5720.0%1729051 ±  1%  
fsmark.time.voluntary_context_switches
 61.05 ±  0%+214.0% 191.70 ±  0%  fsmark.time.system_time
   302 ±  0%+113.8%647 ±  0%  
fsmark.time.percent_of_cpu_this_job_got
 10476 ±  0% +95.4%  20467 ±  5%  fsmark.time.minor_page_faults
   490 ±  5% -46.6%262 ±  0%  fsmark.files_per_sec
 20.21 ±  0% +46.7%  29.65 ±  0%  fsmark.time.elapsed_time
 20.21 ±  0% +46.7%  29.65 ±  0%  fsmark.time.elapsed_time.max
226379 ±  0% +32.5% 299882 ±  0%  fsmark.app_overhead
 0 ±  0%  +Inf%   1045 ±  2%  proc-vmstat.numa_pages_migrated
   209 ± 26%   +3272.3%   7059 ±  3%  cpuidle.C1E-IVT.usage
   228 ± 42%+686.7%   1799 ± 14%  numa-meminfo.node0.Writeback
 14633 ±  5%   +7573.2%1122849 ±  1%  cpuidle.C1-IVT.usage
 0 ±  0%  +Inf%   1045 ±  2%  proc-vmstat.pgmigrate_success
 29708 ±  2%   +5720.0%1729051 ±  1%  time.voluntary_context_switches
 55663 ±  0%+776.9% 488081 ±  0%  cpuidle.C6-IVT.usage
56 ± 42%+718.8%464 ± 11%  numa-vmstat.node0.nr_writeback
   535 ± 29%+334.4%   2325 ± 10%  meminfo.Writeback
   129 ± 30%+295.6%511 ±  4%  proc-vmstat.nr_writeback
 59.25 ±  5% -74.2%  15.26 ±  3%  turbostat.CPU%c6
  2.58 ±  8% -74.5%   0.66 ± 11%  turbostat.Pkg%pc2
 1.551e+08 ± 14%+233.4%  5.171e+08 ±  4%  cpuidle.C1-IVT.time
 32564 ± 24%+208.1% 100330 ±  5%  softirqs.RCU
 61.05 ±  0%+214.0% 191.70 ±  0%  time.system_time
60 ± 32%+165.7%160 ± 16%  numa-vmstat.node1.nr_writeback
 2 ±  0%+200.0%  6 ±  0%  vmstat.procs.r
  3057 ±  2%+166.1%   8136 ± 22%  numa-vmstat.node0.nr_mapped
 12240 ±  2%+165.9%  32547 ± 22%  numa-meminfo.node0.Mapped
  6324 ±  3%+148.4%  15709 ±  0%  proc-vmstat.nr_mapped
   

Re: performance changes on 4400755e: 200.0% fsmark.files_per_sec, -18.1% fsmark.files_per_sec, and few more

2015-03-25 Thread Yuanhan Liu
On Wed, Mar 25, 2015 at 02:03:59PM +1100, NeilBrown wrote:
> On Wed, 18 Mar 2015 13:00:30 +0800 Yuanahn Liu 
> wrote:
> 
> > Hi,
> > 
> > FYI, we noticed performance changes on `fsmark.files_per_sec' by 
> > 4400755e356f9a2b0b7ceaa02f57b1c7546c3765:
> > 
> > > commit 4400755e356f9a2b0b7ceaa02f57b1c7546c3765
> > > Author: NeilBrown 
> > > AuthorDate: Thu Feb 26 12:47:56 2015 +1100
> > > Commit: NeilBrown 
> > > CommitDate: Wed Mar 4 13:40:19 2015 +1100
> > > 
> > > md/raid5: allow the stripe_cache to grow and shrink.
> 
> Thanks a lot for this testing!!! I was wondering how I could do some proper
> testing of this patch, and you've done it for me :-)

Welcome!

> 
> The large number of improvements is very encouraging - that is what I was
> hoping for of course.
> 
> The few regressions could be a concern.  I note they are all NoSync.
> That seems to suggest that they could just be writing more data.

It's not a time-based test, but a size-based test:

> >   40G, 30G, 120G: means the total test size

Hence, I doubt it is writing more data.


> i.e. the data is written a bit earlier (certainly possible) so it happen to
> introduce more delay 
> 
> I guess I'm not really sure how to interpret NoSync results, and suspect that
> poor NoSync result don't really reflect much on the underlying block device.
> Could that be right?

Sorry, I'm not quite sure I followed you. Poor NoSync results? Do you
mean the small numbers like 63.133 and 57.600? Their unit is
files_per_sec, and the file size is 4M. Hence, it would be 200+ MB/s, which
is not that bad in this case, as it's a 3-hard-disk RAID5.
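(For example, 63.133 files/s × 4 MB/file is roughly 253 MB/s of aggregate
write throughput across the array.)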

> > 3   8.1   63.133 ±0.5%   3   9.2   55.633   
> >   ±0.2% -11.9% ivb44/fsmark/1x-1t-3HDD-RAID5-btrfs-4M-120G-NoSync

Here are a few iostat samples from 26089f4902595a2f64c512066af07af6e82eb096
of the above test:

avg-cpu:  %user   %nice %system %iowait  %steal   %idle
   0.000.000.631.670.00   97.70

Device: rrqm/s   wrqm/s r/s w/srkB/swkB/s avgrq-sz 
avgqu-sz   await r_await w_await  svctm  %util
sdb   0.00 30353.000.00  240.00 0.00 121860.00  1015.50 
1.295.350.005.35   3.50  83.90
sdc   0.00 30353.000.00  241.00 0.00 122372.00  1015.54 
0.662.740.002.74   2.53  60.90
sda   0.00 30353.000.00  242.00 0.00 122884.00  1015.57 
1.295.360.005.36   3.52  85.20
md0   0.00 0.000.00  956.00 0.00 244736.00   512.00 
227231.390.000.000.00   1.05 100.00

avg-cpu:  %user   %nice %system %iowait  %steal   %idle
   0.020.000.691.690.00   97.60

Device: rrqm/s   wrqm/s r/s w/srkB/swkB/s avgrq-sz 
avgqu-sz   await r_await w_await  svctm  %util
sdb   0.00 30988.000.00  247.00 0.00 125444.00  1015.74 
1.777.170.007.17   4.02  99.40
sdc   0.00 30988.000.00  245.00 0.00 124420.00  1015.67 
1.194.820.004.82   3.67  89.90
sda   0.00 30988.000.00  247.00 0.00 125444.00  1015.74 
0.652.650.002.65   2.54  62.70
md0   0.00 0.000.00  976.00 0.00 249856.00   512.00 
228206.370.000.000.00   1.02 100.00

avg-cpu:  %user   %nice %system %iowait  %steal   %idle
   0.000.000.611.670.00   97.72

Device: rrqm/s   wrqm/s r/s w/srkB/swkB/s avgrq-sz 
avgqu-sz   await r_await w_await  svctm  %util
sdb   0.00 29718.000.00  235.00 0.00 119300.00  1015.32 
1.355.710.005.71   3.71  87.20
sdc   0.00 29718.000.00  236.00 0.00 119812.00  1015.36 
1.195.060.005.06   3.43  80.90
sda   0.00 29718.000.00  235.00 0.00 119300.00  1015.32 
0.873.690.003.69   2.99  70.20
md0   0.00 0.000.00  936.00 0.00 239616.00   512.00 
229157.330.000.000.00   1.07 100.00


And a few iostat samples from 4400755e356f9a2b0b7ceaa02f57b1c7546c3765 (the first
bad commit):

avg-cpu:  %user   %nice %system %iowait  %steal   %idle
   0.020.001.091.540.00   97.35

Device: rrqm/s   wrqm/s r/s w/srkB/swkB/s avgrq-sz 
avgqu-sz   await r_await w_await  svctm  %util
sdb   1.00 27677.001.00  206.00 8.00 100516.00   971.25 
   27.40  130.56  196.00  130.24   4.72  97.70
sdc   0.00 27677.000.00  207.00 0.00 101028.00   976.12 
   27.05  129.430.00  129.43   4.61  95.50
sda   5.00 27677.001.00  211.0016.00 102984.00   971.70 
   26.61  127.00  201.00  126.64   4.50  95.50
md0   0.00 0.000.00  824.00 


[LKP] [sched] WARNING: CPU: 0 PID: 13608 at kernel/sched/core.c:7323 __might_sleep+0xbd/0xd0()

2014-12-02 Thread Yuanhan Liu
FYI, we noticed the below changes on

git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git master
commit 8eb23b9f35aae413140d3fda766a98092c21e9b0 ("sched: Debug nested sleeps")


+-+++
| | 26cabd3125 | 8eb23b9f35 |
+-+++
| boot_successes  | 10 | 15 |
| boot_failures   | 0  | 25 |
| WARNING:at_kernel/sched/core.c:__might_sleep()  | 0  | 5  |
| backtrace:SyS_read  | 0  | 5  |
| backtrace:vfs_read  | 0  | 5  |
| WARNING:at_kernel/sched/core.c:#__might_sleep() | 0  | 20 |
| backtrace:SyS_io_getevents  | 0  | 10 |
| backtrace:read_events   | 0  | 7  |
| backtrace:kauditd_thread| 0  | 10 |
+-+++



<4>[  839.494114] [ cut here ]
<4>[  839.494131] WARNING: CPU: 0 PID: 13608 at 
/kbuild/src/lkp/kernel/sched/core.c:7323 __might_sleep+0xbd/0xd0()
<4>[  839.494137] do not call blocking ops when !TASK_RUNNING; state=1 set at 
[] prepare_to_wait+0x2f/0x90
<4>[  839.494256] Modules linked in: tun ipmi_watchdog loop btrfs xor raid6_pq 
sg sd_mod ast snd_pcm syscopyarea sysfillrect snd_timer sysimgblt snd ie6xx_wdt 
ttm i2c_isch drm_kms_helper soundcore drm ahci libahci pcspkr i2c_ismt lpc_sch 
ipmi_si libata shpchp ipmi_msghandler acpi_cpufreq
<4>[  839.494264] CPU: 0 PID: 13608 Comm: fanotify01 Not tainted 
3.18.0-rc4-next-20141117 #1
<4>[  839.494266] Hardware name: To be filled by O.E.M. To be filled by 
O.E.M./Double Cove , BIOS BWDEXT.86B.000.012.D127 10/08/2012
<4>[  839.494273]  81b5ebb8 88023cf37d18 81892f54 
64026402
<4>[  839.494277]  88023cf37d68 88023cf37d58 8107047a 
88023cf37db8
<4>[  839.494281]  81b5f5e8 0061  
6000
<4>[  839.494285] Call Trace:
<4>[  839.494315]  [] dump_stack+0x4c/0x65
<4>[  839.494323]  [] warn_slowpath_common+0x8a/0xc0
<4>[  839.494327]  [] warn_slowpath_fmt+0x46/0x50
<4>[  839.494333]  [] ? prepare_to_wait+0x2f/0x90
<4>[  839.494337]  [] ? prepare_to_wait+0x2f/0x90
<4>[  839.494341]  [] __might_sleep+0xbd/0xd0
<4>[  839.494348]  [] mutex_lock+0x24/0x50
<4>[  839.494354]  [] fanotify_read+0xd5/0x620
<4>[  839.494370]  [] ? selinux_file_permission+0xa6/0x120
<4>[  839.494374]  [] ? wait_woken+0xc0/0xc0
<4>[  839.494381]  [] __vfs_read+0x18/0x50
<4>[  839.494385]  [] vfs_read+0x8a/0x140
<4>[  839.494390]  [] SyS_read+0x46/0xb0
<4>[  839.494403]  [] system_call_fastpath+0x12/0x17
<4>[  839.494409] ---[ end trace 5a2207521429f889 ]---
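
For context: commit 8eb23b9f35 ("sched: Debug nested sleeps") adds a debug
check that warns when something which may sleep is called after the task state
has already been set by prepare_to_wait(). A minimal sketch of the offending
pattern (illustrative only; not the actual fanotify_read() code) would be:

#include <linux/wait.h>
#include <linux/mutex.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(wq);
static DEFINE_MUTEX(lock);

static void nested_sleep_sketch(void)
{
	DEFINE_WAIT(wait);

	/* sets current->state to TASK_INTERRUPTIBLE, i.e. !TASK_RUNNING */
	prepare_to_wait(&wq, &wait, TASK_INTERRUPTIBLE);

	/* mutex_lock() may sleep, so the __might_sleep() check warns here */
	mutex_lock(&lock);
	mutex_unlock(&lock);

	finish_wait(&wq, &wait);
}

A common fix for such a warning is to move the blocking call outside the
prepare_to_wait()/finish_wait() window, or to restructure the wait with the
wait_event*() helpers.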



--yliu
___
LKP mailing list
l...@linux.intel.com
[0.00] Initializing cgroup subsys cpuset
[0.00] Initializing cgroup subsys cpu
[0.00] Linux version 3.18.0-rc4-next-20141117 (kbuild@roam) (gcc 
version 4.9.1 (Debian 4.9.1-19) ) #1 SMP Tue Nov 18 11:46:52 CST 2014
[0.00] Command line: user=lkp 
job=/lkp/scheduled/lkp-a06/cyclic_ltp-performance-syscalls-x86_64-rhel-HEAD-efefb5ca5da52f7537c7ced03d6e53408f13a26e-0.yaml
 ARCH=x86_64 
BOOT_IMAGE=/kernel/x86_64-rhel/efefb5ca5da52f7537c7ced03d6e53408f13a26e/vmlinuz-3.18.0-rc4-next-20141117
 kconfig=x86_64-rhel commit=efefb5ca5da52f7537c7ced03d6e53408f13a26e 
branch=next/master root=/dev/ram0 max_uptime=3600 
RESULT_ROOT=/result/lkp-a06/ltp/performance-syscalls/debian-x86_64.cgz/x86_64-rhel/efefb5ca5da52f7537c7ced03d6e53408f13a26e/0
 ip=lkp-a06::dhcp earlyprintk=ttyS0,115200 debug apic=debug 
sysrq_always_enabled rcupdate.rcu_cpu_stall_timeout=100 panic=-1 
softlockup_panic=1 nmi_watchdog=panic oops=panic load_ramdisk=2 
prompt_ramdisk=0 console=ttyS0,115200 console=tty0 vga=normal rw
[0.00] e820: BIOS-provided physical RAM map:
[0.00] BIOS-e820: [mem 0x0100-0x0009e3ff] usable
[0.00] BIOS-e820: [mem 0x0009e400-0x0009] reserved
[0.00] BIOS-e820: [mem 0x000e-0x000f] reserved
[0.00] BIOS-e820: [mem 0x0010-0xbf67afff] usable
[0.00] BIOS-e820: [mem 0xbf67b000-0xbfb3dfff] ACPI NVS
[0.00] BIOS-e820: [mem 0xbfb3e000-0xbfc50fff] reserved
[0.00] BIOS-e820: [mem 0xbfc51000-0xbfc51fff] ACPI NVS
[0.00] BIOS-e820: [mem 0xbfc52000-0xbfc62fff] reserved
[0.00] BIOS-e820: [mem 0xbfc63000-0xbfc65fff] ACPI NVS
[0.00] BIOS-e820: [mem 0xbfc66000-0xbfc83fff] reserved
[

[LKP] [drm/fb] f5ef139cbe5: *ERROR* not all connectors configured

2014-12-02 Thread Yuanhan Liu
FYI, we noticed the below changes on

git://people.freedesktop.org/~airlied/linux.git radeon-mst-hacks
commit f5ef139cbe5dbd755dab3706022d7147800099a8 ("drm/fb: add support for tiled 
monitor configurations.")


testbox/testcase/testparams: vm-kbuild-1G/xfstests/4HDD-btrfs-generic-113

9cf13203b1fd7cc3  f5ef139cbe5dbd755dab370602  
  --  
   fail:runs  %reproductionfail:runs
   | | |
   :10 100%  10:10
kmsg.drm:drm_setup_crtcs[drm_kms_helper]]*ERROR*not_all_connectors_configured

vm-kbuild-1G: qemu-system-x86_64 -enable-kvm -cpu Haswell,+smep,+smap
Memory: 1G




To reproduce:

apt-get install ruby ruby-oj
git clone 
git://git.kernel.org/pub/scm/linux/kernel/git/wfg/lkp-tests.git
cd lkp-tests
bin/setup-local job.yaml # the job file attached in this email
bin/run-local   job.yaml


Disclaimer:
Results have been estimated based on internal Intel analysis and are provided
for informational purposes only. Any difference in system hardware or software
design or configuration may affect actual performance.



--yliu
---
testcase: xfstests
default_monitors:
  wait: pre-test
  vmstat: 
default_watchdogs:
  watch-oom: 
  watchdog: 
cpufreq_governor: 
model: qemu-system-x86_64 -enable-kvm -cpu Haswell,+smep,+smap
nr_vm: 16
nr_cpu: 2
memory: 1G
disk_type: virtio-scsi
rootfs: debian-x86_64.cgz
hdd_partitions: "/dev/sda /dev/sdb /dev/sdc /dev/sdd"
swap_partitions: "/dev/sde"
disk: 4HDD
fs:
- btrfs
xfstests:
  test:
  - generic-113
enqueue_time: 2014-11-26 13:11:19.191840759 +08:00
branch: linux-devel/devel-hourly-2014112611
commit: a7a1168f6b45bb0d29a20c942e1ca300ef54dc6e
repeat_to: 2
testbox: vm-kbuild-1G-3
tbox_group: vm-kbuild-1G
kconfig: x86_64-rhel
kernel: 
"/kernel/x86_64-rhel/a7a1168f6b45bb0d29a20c942e1ca300ef54dc6e/vmlinuz-3.18.0-rc6-wl-ath-ga7a1168f"
user: lkp
queue: rand
result_root: 
"/result/vm-kbuild-1G/xfstests/4HDD-btrfs-generic-113/debian-x86_64.cgz/x86_64-rhel/a7a1168f6b45bb0d29a20c942e1ca300ef54dc6e/0"
job_file: 
"/lkp/scheduled/vm-kbuild-1G-3/rand_xfstests-4HDD-btrfs-generic-113-debian-x86_64.cgz-x86_64-rhel-a7a1168f6b45bb0d29a20c942e1ca300ef54dc6e-1.yaml"
dequeue_time: 2014-11-26 13:25:10.605471464 +08:00
job_state: finished
loadavg: 96.37 33.89 12.20 1/593 3339
start_time: '1416979556'
end_time: '1416979727'
version: "/lkp/lkp/.src-20141126-053142"
mkfs -t btrfs /dev/sdd
mkfs -t btrfs /dev/sdc
mkfs -t btrfs /dev/sdb
mkfs -t btrfs /dev/sda
mount -t btrfs /dev/sda /fs/sda
mount -t btrfs /dev/sdb /fs/sdb
mount -t btrfs /dev/sdc /fs/sdc
mount -t btrfs /dev/sdd /fs/sdd
export TEST_DIR=/fs/sda
export TEST_DEV=/dev/sda
export FSTYP=btrfs
export SCRATCH_MNT=/fs/scratch
mkdir /fs/scratch -p
export SCRATCH_DEV_POOL="/dev/sdb /dev/sdc /dev/sdd"
./check generic/113


[LKP] [net] 4ed2d765dfa:

2014-11-24 Thread Yuanhan Liu
FYI, we noticed the below changes on

commit 4ed2d765dfaccff5ebdac68e2064b59125033a3b ("net-timestamp: TCP 
timestamping")


testbox/testcase/testparams: vm-vp-2G/ltp/syscalls

e7fd2885385157d4  4ed2d765dfaccff5ebdac68e20  
  --  
   fail:runs  %reproductionfail:runs
   | | |
   :5  100%   5:5 ltp.recv01.fail
   :5  100%   5:5 ltp.recvfrom01.fail
   :5  100%   5:5 ltp.recvmsg01.fail
   :5   20%   1:5 
kmsg.APIC_calibration_not_consistent_with_PM-Timer:#ms_instead_of#ms
   :5   20%   1:5 kmsg.hrtimer:interrupt_took#ns
   :5   20%   1:5 
kmsg.TINFO:mlock_failed:errno=ENOMEM(#):Cannot_allocate_memory
   :5   20%   1:5 
kmsg.estcases/kernel/syscalls/getgroups/../utils/compat_16.h::#-bit_version_of_getgroups()is_not_supported_on_your_platform

testbox/testcase/testparams: nhm-white/ltp/syscalls

e7fd2885385157d4  4ed2d765dfaccff5ebdac68e20  
  --  
   :10 100%   5:5 ltp.recv01.fail
   :10 100%   5:5 ltp.recvfrom01.fail
   :10 100%   5:5 ltp.recvmsg01.fail

vm-vp-2G: qemu-system-x86_64 -enable-kvm -cpu Penryn
Memory: 2G

nhm-white: Nehalem
Memory: 6G




To reproduce:

  apt-get install ruby ruby-oj
  git clone git://git.kernel.org/pub/scm/linux/kernel/git/wfg/lkp-tests.git
  cd lkp-tests
  bin/setup-local job.yaml # the job file attached in this email
  bin/run-local   job.yaml


--yliu
---
testcase: ltp
default_monitors:
  wait: pre-test
  vmstat: 
model: qemu-system-x86_64 -enable-kvm -cpu Penryn
nr_vm: 4
nr_cpu: 4
memory: 2G
rootfs: debian-x86_64.cgz
hdd_partitions: "/dev/vdb /dev/vdc /dev/vdd /dev/vde /dev/vdf"
swap_partitions: "/dev/vda"
ltp:
  test:
  - syscalls
enqueue_time: 2014-10-02 10:07:25.199207485 +08:00
branch: net/master
commit: 0754476419f127eb8c294b17b6fc8b6787ded1e2
testbox: vm-vp-2G-3
kconfig: x86_64-rhel
kernel: 
"/kernel/x86_64-rhel/0754476419f127eb8c294b17b6fc8b6787ded1e2/vmlinuz-3.17.0-rc6-00145-g0754476"
user: lkp
queue: rand
result_root: 
"/result/vm-vp-2G/ltp/syscalls/debian-x86_64.cgz/x86_64-rhel/0754476419f127eb8c294b17b6fc8b6787ded1e2/0"
job_file: 
"/lkp/scheduled/vm-vp-2G-3/rand_ltp-syscalls-debian-x86_64.cgz-x86_64-rhel-0754476419f127eb8c294b17b6fc8b6787ded1e2-0.yaml"
dequeue_time: 2014-10-02 11:55:51.761588446 +08:00
job_state: finished
loadavg: 4.39 5.61 2.69 1/85 10461
start_time: '141188'
end_time: '141759'
version: "/lkp/lkp/.src-20141001-203321"
./runltp -f syscalls


[LKP] [nohz] 2a16fc93d2c:

2014-11-23 Thread Yuanhan Liu
FYI, we noticed the below changes on

commit 2a16fc93d2c9568e16d45db77c7b5f15e1921cf1 ("nohz: Avoid tick's double 
reprogramming in highres mode")


testbox/testcase/testparams: snb-drag/piglit/performance-igt-001

b5e995e671d8e4d7  2a16fc93d2c9568e16d45db77c  
  --  
   fail:runs  %reproductionfail:runs
   | | |
   :5  100%   5:5 
kmsg.drm:__gen6_gt_force_wake_get]*ERROR*Timed_out_waiting_for_forcewake_to_ack_request
   :5  100%   5:5 
piglit.igt/gem_ctx_exec/reset-pin-leak.dmesg-warn

snb-drag: Sandy Bridge
Memory: 6G


<3>[   90.915459] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for 
forcewake to ack request.
<3>[   90.925094] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for 
forcewake to ack request.
<3>[   90.934725] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for 
forcewake to ack request.
<3>[   90.944347] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for 
forcewake to ack request.
<3>[   90.953956] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for 
forcewake to ack request.
<3>[   90.963559] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for 
forcewake to ack request.
<3>[   90.973173] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for 
forcewake to ack request.
<3>[   90.982793] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for 
forcewake to ack request.
<3>[   90.992405] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for 
forcewake to ack request.
<3>[   91.002008] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for 
forcewake to ack request.
<3>[   91.011618] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for 
forcewake to ack request.
<3>[   91.021222] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for 
forcewake to ack request.
<3>[   91.030825] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for 
forcewake to ack request.
<3>[   91.040430] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for 
forcewake to ack request.
<3>[   91.050016] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for 
forcewake to ack request.
<3>[   91.059593] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for 
forcewake to ack request.
<3>[   91.069152] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for 
forcewake to ack request.


To reproduce:

apt-get install ruby ruby-oj
git clone 
git://git.kernel.org/pub/scm/linux/kernel/git/wfg/lkp-tests.git
cd lkp-tests
bin/setup-local job.yaml # the job file attached in this email
bin/run-local   job.yaml


Disclaimer:
Results have been estimated based on internal Intel analysis and are provided
for informational purposes only. Any difference in system hardware or software
design or configuration may affect actual performance.


--yliu
---
testcase: piglit
default_monitors:
  wait: pre-test
  vmstat: 
default_watchdogs:
  watch-oom: 
  watchdog: 
cpufreq_governor:
- performance
commit: 9bdebfefe1de2b6fa7e193c10411ef209b0ebc96
model: Sandy Bridge
memory: 6G
hdd_partitions: "/dev/disk/by-id/ata-ST3750528AS_6VP2W0PA-part5 
/dev/disk/by-id/ata-ST3750528AS_6VP2W0PA-part6
  /dev/disk/by-id/ata-ST3750528AS_6VP2W0PA-part7 
/dev/disk/by-id/ata-ST3750528AS_6VP2W0PA-part8
  /dev/disk/by-id/ata-ST3750528AS_6VP2W0PA-part9 
/dev/disk/by-id/ata-ST3750528AS_6VP2W0PA-part10"
swap_partitions: 
rootfs_partition: "/dev/disk/by-id/ata-ST3750528AS_6VP2W0PA-part2"
timeout: 30m
piglit:
  group:
  - igt-001
enqueue_time: 2014-10-27 03:51:37.871425766 +08:00
testbox: snb-drag
tbox_group: snb-drag
kconfig: x86_64-rhel
head_commit: 9bdebfefe1de2b6fa7e193c10411ef209b0ebc96
base_commit: cac7f2429872d3733dc3f9915857b1691da2eb2f
branch: linux-devel/devel-hourly-2014103002
kernel: 
"/kernel/x86_64-rhel/9bdebfefe1de2b6fa7e193c10411ef209b0ebc96/vmlinuz-3.18.0-rc2-g9bdebfe"
user: lkp
queue: cyclic
rootfs: debian-x86_64.cgz
result_root: 
"/result/snb-drag/piglit/performance-igt-001/debian-x86_64.cgz/x86_64-rhel/9bdebfefe1de2b6fa7e193c10411ef209b0ebc96/0"
job_file: 
"/lkp/scheduled/snb-drag/cyclic_piglit-performance-igt-001-x86_64-rhel-HEAD-9bdebfefe1de2b6fa7e193c10411ef209b0ebc96-0.yaml"
dequeue_time: 2014-10-30 03:46:50.534182476 +08:00
job_state: finished
loadavg: 0.62 0.46 0.25 1/96 9645
start_time: '1414612069'
end_time: '1414612536'
version: "/lkp/lkp/.src-20141029-214343"
echo performance > /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
echo performance > /sys/devices/system/cpu/cpu1/cpufreq/scaling_governor
echo performance > /sys/devices/system/cpu/cpu2/cpufreq/scaling_governor
echo performance > /sys/devices/system/cpu/cpu3/cpufreq/scaling_governor
piglit run igt -t igt/drv_hangman/error-state-capture-bsd 
/lkp/lkp/src/tmp/piglit-results-0
piglit summary console /lkp/lkp/src/tmp/piglit-results-0
piglit run igt -t igt/gem_reset_stats/reset-count-ctx-vebox 
/lkp/lkp/src/tmp/piglit-results-1

[LKP] [x86, irq, ACPI] 5fcb864ef90: -3.3%(vm-scalability.throughput) +12.9%(turbostat.%c0)

2014-11-23 Thread Yuanhan Liu

Hi,

We noticed the below changes on (NOTE: I'm not sure whether the bisect is
correct or not; I report it here just FYI).

git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git master
commit 5fcb864ef90df093d964171539c87ffa0ab49f0f ("x86, irq, ACPI: Implement 
interfaces to support ACPI based IOAPIC hot-removal")


testbox/testcase/testparams: 
lkp-nex06/vm-scalability/performance-300s-small-allocs-mt

ff6213974cd90e1e  5fcb864ef90df093d964171539  
  --  
   fail:runs  %reproductionfail:runs
   | | |
   :5   20%   1:5 
kmsg.CE:hpet_increased_min_delta_ns_to#nsec
 %stddev %change %stddev
 \  |\  
315326 ±  0%  -3.3% 304841 ±  0%  vm-scalability.throughput
 11.82 ±  0% +12.9%  13.34 ±  0%  turbostat.%c0
  1.34 ±  0%  +9.4%   1.46 ±  0%  turbostat.GHz
12 ± 47% +78.7% 21 ± 32%  sched_debug.cfs_rq[29]:/.load
   113 ± 26% +86.3%212 ± 28%  
sched_debug.cfs_rq[39]:/.tg_load_contrib
   106 ± 28% +89.5%202 ± 30%  
sched_debug.cfs_rq[39]:/.blocked_load_avg
66 ± 23%+120.6%145 ± 29%  
sched_debug.cfs_rq[40]:/.blocked_load_avg
70 ± 23%+113.0%150 ± 29%  
sched_debug.cfs_rq[40]:/.tg_load_contrib
 10145 ± 23% -38.3%   6255 ± 35%  numa-meminfo.node1.AnonPages
  2535 ± 23% -38.3%   1564 ± 35%  numa-vmstat.node1.nr_anon_pages
   605 ± 16% -22.0%471 ±  5%  
sched_debug.cpu#58.nr_uninterruptible
 58904 ±  7% -13.8%  50762 ±  7%  
sched_debug.cfs_rq[0]:/.min_vruntime
481299 ±  8% -13.4% 416975 ±  7%  sched_debug.cpu#0.sched_count
409009 ± 11% -15.7% 344638 ±  2%  sched_debug.cpu#4.sched_count
 52022 ± 10% -16.1%  43623 ±  2%  
sched_debug.cfs_rq[4]:/.min_vruntime
68 ±  3% -12.2% 60 ±  3%  
sched_debug.cfs_rq[4]:/.tg_runnable_contrib
  3175 ±  3% -12.1%   2791 ±  3%  
sched_debug.cfs_rq[4]:/.avg->runnable_avg_sum
 50060 ±  6% -12.3%  43914 ±  4%  
sched_debug.cfs_rq[29]:/.min_vruntime
  1751 ± 12% -15.5%   1480 ±  6%  
sched_debug.cpu#63.nr_uninterruptible
  2967 ±  6% -13.7%   2562 ±  4%  
sched_debug.cfs_rq[37]:/.avg->runnable_avg_sum
63 ±  6% -13.8% 55 ±  4%  
sched_debug.cfs_rq[37]:/.tg_runnable_contrib
  1.07 ±  2% -10.9%   0.95 ±  3%  
perf-profile.cpu-cycles.tick_nohz_restart.tick_nohz_idle_exit.cpu_startup_entry.start_secondary
  1.64 ±  2%  -8.4%   1.50 ±  4%  
perf-profile.cpu-cycles.__tick_nohz_idle_enter.tick_nohz_idle_enter.cpu_startup_entry.start_secondary
 35173 ±  5%  -9.1%  31983 ±  3%  
sched_debug.cfs_rq[56]:/.min_vruntime
  1.41 ±  2%  -8.3%   1.29 ±  4%  
perf-profile.cpu-cycles.tick_nohz_stop_sched_tick.__tick_nohz_idle_enter.tick_nohz_idle_enter.cpu_startup_entry.start_secondary
  1.63 ±  1%  -9.3%   1.48 ±  3%  
perf-profile.cpu-cycles.tick_nohz_idle_exit.cpu_startup_entry.start_secondary
 45161 ± 11% -12.8%  39358 ±  4%  
sched_debug.cfs_rq[25]:/.min_vruntime
 39201 ±  5% +17.3%  45969 ± 18%  
sched_debug.cfs_rq[8]:/.min_vruntime
  21071502 ±  0%  -3.3%   20379730 ±  0%  time.minor_page_faults
   299 ±  0%  -3.1%290 ±  0%  time.user_time
  21763267 ±  0%  -3.3%   21055329 ±  0%  time.voluntary_context_switches
142199 ±  0%  -3.1% 137732 ±  0%  vmstat.system.cs
   737 ±  0%  -2.1%721 ±  1%  time.system_time
   341 ±  0%  -2.5%333 ±  0%  time.percent_of_cpu_this_job_got

lkp-nex06: Nehalem-EX
Memory: 64G




   turbostat.%c0

14 ++---+
   |O   |
   |  O  O   O  O  O|
  13.5 O+   O O   O  O  O O  O  |
   |  O   OO   O O  |
   | O OO
13 ++   |
   ||
  12.5 ++   |
   ||
   ||
12 *+.*...*.. .*... |
   | *..*. *..  .*...*..*..*..*...  .*..*   |
   |  *.  *.|
  11.5 


[LKP] [x86, PCI, MSI] BUG: unable to handle kernel NULL pointer dereference at 0000000000000002

2014-11-16 Thread Yuanhan Liu
FYI, we noticed the below changes on

https://github.com/jiangliu/linux.git irqdomain/p2v7
commit 515b463a5a4c2bac0593c6d88a475a32d65f4bcc ("x86, PCI, MSI: Use hierarchy 
irqdomain to manage MSI interrupts")


+--+++
|  | dadb7cd295 | 515b463a5a |
+--+++
| boot_successes   | 6  | 1  |
| early-boot-hang  | 1  ||
| boot_failures| 0  | 4  |
| BUG:unable_to_handle_kernel  | 0  | 4  |
| Oops | 0  | 4  |
| RIP:init_irq_alloc_info  | 0  | 4  |
| Kernel_panic-not_syncing:Fatal_exception | 0  | 4  |
| backtrace:init_irq_alloc_info| 0  | 4  |
| backtrace:vp_find_vqs| 0  | 4  |
| backtrace:init_vq| 0  | 4  |
| backtrace:init   | 0  | 4  |
| backtrace:kernel_init_freeable   | 0  | 4  |
+--+++


[   20.962013] BUG: unable to handle kernel NULL pointer dereference at 
0002
[   20.964023] IP: [] init_irq_alloc_info+0x13/0x1b
[   20.964023] PGD 0 
[   20.964023] Oops: 0002 [#1] SMP DEBUG_PAGEALLOC
[   20.964023] Modules linked in:
[   20.964023] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.18.0-rc4-g4ae16b6 
#1457
[   20.964023] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[   20.964023] task: 8801289c0010 ti: 8801289c4000 task.ti: 
8801289c4000
[   20.964023] RIP: 0010:[]  [] 
init_irq_alloc_info+0x13/0x1b
[   20.964023] RSP: :8801289c7928  EFLAGS: 00010246
[   20.964023] RAX:  RBX: 0002 RCX: 000a
[   20.964023] RDX: 0002 RSI:  RDI: 0002
[   20.964023] RBP: 8801289c7928 R08: 0008 R09: 
[   20.964023] R10: 8800b8399f80 R11: 0023 R12: 8800db055000
[   20.964023] R13: 8800d1ee8f98 R14: 880129cc3f80 R15: 83e36800
[   20.964023] FS:  () GS:88012a20() 
knlGS:
[   20.964023] CS:  0010 DS:  ES:  CR0: 8005003b
[   20.964023] CR2: 0002 CR3: 03e1a000 CR4: 06f0
[   20.964023] Stack:
[   20.964023]  8801289c7958 810770be 8801289c7980 
0002
[   20.964023]  83e36840 8800db055098 8801289c79d8 
8110fd29
[   20.964023]    8800db055000 
0011
[   20.964023] Call Trace:
[   20.964023]  [] pci_msi_prepare+0x2d/0x54
[   20.964023]  [] msi_domain_alloc_irqs+0x4a/0x162
[   20.964023]  [] ? dmar_find_matched_drhd_unit+0xf7/0x10b
[   20.964023]  [] pci_msi_domain_alloc_irqs+0x15/0x17
[   20.964023]  [] native_setup_msi_irqs+0x61/0x6c
[   20.964023]  [] arch_setup_msi_irqs+0xf/0x11
[   20.964023]  [] pci_msi_setup_msi_irqs+0x45/0x4c
[   20.964023]  [] pci_enable_msix+0x1d8/0x2d0
[   20.964023]  [] pci_enable_msix_range+0x31/0x50
[   20.964023]  [] vp_request_msix_vectors+0xb6/0x1f8
[   20.964023]  [] vp_try_to_find_vqs+0xae/0x43e
[   20.964023]  [] ? vsnprintf+0x374/0x3ad
[   20.964023]  [] vp_find_vqs+0x32/0x8d
[   20.964023]  [] init_vq+0x14f/0x1f8
[   20.964023]  [] virtblk_probe+0xf3/0x501
[   20.964023]  [] ? sysfs_do_create_link_sd+0x78/0xa8
[   20.964023]  [] ? vp_set_status+0x25/0x27
[   20.964023]  [] virtio_dev_probe+0xbd/0x104
[   20.964023]  [] driver_probe_device+0xb0/0x1d7
[   20.964023]  [] __driver_attach+0x62/0x85
[   20.964023]  [] ? __device_attach+0x3d/0x3d
[   20.964023]  [] bus_for_each_dev+0x6f/0x89
[   20.964023]  [] driver_attach+0x1e/0x20
[   20.964023]  [] bus_add_driver+0x110/0x1cf
[   20.964023]  [] ? nbd_init+0x39c/0x39c
[   20.964023]  [] driver_register+0x8f/0xcc
[   20.964023]  [] ? nbd_init+0x39c/0x39c
[   20.964023]  [] register_virtio_driver+0x2b/0x2d
[   20.964023]  [] init+0x5d/0x8b
[   20.964023]  [] do_one_initcall+0xee/0x17e
[   20.964023]  [] kernel_init_freeable+0x1ec/0x274
[   20.964023]  [] ? rest_init+0xcc/0xcc
[   20.964023]  [] kernel_init+0xe/0xdf
[   20.964023]  [] ret_from_fork+0x7c/0xb0
[   20.964023]  [] ? rest_init+0xcc/0xcc
[   20.964023] Code: eb 05 bb da ff ff ff 48 83 c4 28 89 d8 5b 41 5c 41 5d 41 
5e 41 5f 5d c3 0f 1f 44 00 00 55 48 89 fa b9 0a 00 00 00 31 c0 48 89 e5  ab 
5d 48 89 72 08 c3 0f 1f 44 00 00 55 48 85 f6 b9 0a 00 00 
[   20.964023] RIP  [] init_irq_alloc_info+0x13/0x1b
[   20.964023]  RSP 
[   20.964023] CR2: 0002
[   20.964023] ---[ end trace 21200aca189fb8f5 ]---
[   20.964023] Kernel panic - not syncing: Fatal exception
[   20.964023] Kernel 

[LKP] [LSM] Kernel panic - not syncing: No working init found.

2014-11-16 Thread Yuanhan Liu
FYI, we noticed the below changes on (TBH, I don't know whether the bisect is
correct or not; sorry for the noise if not)

git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git lsm/stacking
commit 58c4f9e3be81a85839ea229b1dd36bf55232d440 ("LSM: Refactor existing LSM 
stacking")


++++
|| c9979f3c6e | 58c4f9e3be |
++++
| boot_successes | 15 | 0  |
| early-boot-hang| 1  ||
| boot_failures  | 0  | 15 |
| Kernel_panic-not_syncing:No_working_init_found | 0  | 15 |
| backtrace:panic| 0  | 15 |
++++


[3.437279] Starting init: /sbin/init exists but couldn't execute it (error 
-12)
[3.438655] Starting init: /etc/init exists but couldn't execute it (error 
-13)
[3.440136] Starting init: /bin/sh exists but couldn't execute it (error -12)
[3.441487] Kernel panic - not syncing: No working init found.  Try passing 
init= option to kernel. See Linux Documentation/init.txt for guidance.
[3.443352] CPU: 0 PID: 1 Comm: swapper Not tainted 3.18.0-rc4-g49aba53 #1949
[3.443352]   f783d540 80017f88 8138c3bd 80017fa0 8138b30b 815e1f40 
f783d540
[3.443352]  815e1f40  80017fac 81389523 8152ab4d 80016000 813918e0 
81389474
[3.443352]        007b 
007b
[3.443352] Call Trace:
[3.443352]  [<8138c3bd>] dump_stack+0x16/0x18
[3.443352]  [<8138b30b>] panic+0x86/0x19e
[3.443352]  [<81389523>] kernel_init+0xaf/0xb3
[3.443352]  [<813918e0>] ret_from_kernel_thread+0x20/0x30
[3.443352]  [<81389474>] ? rest_init+0xa2/0xa2
[3.443352] Kernel Offset: 0x0 from 0x8100 (relocation range: 
0x8000-0x947fdfff)

Elapsed time: 10



--yliu
early console in setup code
Probing EDD (edd=off to disable)... ok
[0.00] Linux version 3.18.0-rc4-g49aba53 (kbuild@lkp-hsx01) (gcc 
version 4.9.1 (Debian 4.9.1-19) ) #1949 Sat Nov 15 06:21:52 CST 2014
[0.00] e820: BIOS-provided physical RAM map:
[0.00] BIOS-e820: [mem 0x-0x0009fbff] usable
[0.00] BIOS-e820: [mem 0x0009fc00-0x0009] reserved
[0.00] BIOS-e820: [mem 0x000f-0x000f] reserved
[0.00] BIOS-e820: [mem 0x0010-0x13ffdfff] usable
[0.00] BIOS-e820: [mem 0x13ffe000-0x13ff] reserved
[0.00] BIOS-e820: [mem 0xfeffc000-0xfeff] reserved
[0.00] BIOS-e820: [mem 0xfffc-0x] reserved
[0.00] Notice: NX (Execute Disable) protection missing in CPU!
[0.00] Hypervisor detected: KVM
[0.00] e820: update [mem 0x-0x0fff] usable ==> reserved
[0.00] e820: remove [mem 0x000a-0x000f] usable
[0.00] e820: last_pfn = 0x13ffe max_arch_pfn = 0x100
[0.00] initial memory mapped: [mem 0x-0x027f]
[0.00] Base memory trampoline at [8009b000] 9b000 size 16384
[0.00] init_memory_mapping: [mem 0x-0x000f]
[0.00]  [mem 0x-0x000f] page 4k
[0.00] init_memory_mapping: [mem 0x1320-0x133f]
[0.00]  [mem 0x1320-0x133f] page 2M
[0.00] init_memory_mapping: [mem 0x1000-0x131f]
[0.00]  [mem 0x1000-0x131f] page 2M
[0.00] init_memory_mapping: [mem 0x0010-0x0fff]
[0.00]  [mem 0x0010-0x001f] page 4k
[0.00]  [mem 0x0020-0x0fff] page 2M
[0.00] init_memory_mapping: [mem 0x1340-0x13ffdfff]
[0.00]  [mem 0x1340-0x13df] page 2M
[0.00]  [mem 0x13e0-0x13ffdfff] page 4k
[0.00] BRK [0x01f22000, 0x01f22fff] PGTABLE
[0.00] BRK [0x01f23000, 0x01f23fff] PGTABLE
[0.00] RAMDISK: [mem 0x135e9000-0x13fe]
[0.00] ACPI: Early table checksum verification disabled
[0.00] ACPI: RSDP 0x000FD950 14 (v00 BOCHS )
[0.00] ACPI: RSDT 0x13FFE450 34 (v01 BOCHS  BXPCRSDT 0001 BXPC 
0001)
[0.00] ACPI: FACP 0x1380 74 (v01 BOCHS  BXPCFACP 0001 BXPC 
0001)
[0.00] ACPI: DSDT 0x13FFE490 0011A9 (v01 BXPC   BXDSDT   0001 INTL 
20100528)
[0.00] ACPI: FACS 0x1340 40
[0.00] ACPI: SSDT 0x13FFF7A0 000796 (v01 BOCHS  BXPCSSDT 0001 BXPC 
0001)
[0.00] ACPI: APIC 0x13FFF680 80 (v01 BOCHS  BXPCAPIC 0001 BXPC 
0001)
[0.00] ACPI: HPET 0x13FFF640 38 (v01 BOCHS  BXPCHPET 0001 BXPC 
0001)
[0.00] ACPI: Local APIC address 0xfee0
[ 

[LKP] [x86, mm] BUG: Bad page state in process swapper/0 pfn:02500

2014-11-16 Thread Yuanhan Liu
FYI, we noticed the below changes on

git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git x86/pmd-nx
commit 3622dcc2b4f4eaf23bae2511a30fc449d0e5f0d9 ("x86, mm: set NX across entire 
PMD at boot")


+--+++
|  | b23dc5a7cc | 3622dcc2b4 |
+--+++
| boot_successes   | 4  | 0  |
| boot_failures| 0  | 19 |
| BUG:Bad_page_state_in_process| 0  | 19 |
| BUG:Bad_page_map_in_process  | 0  | 14 |
| BUG:Bad_rss-counter_state_mm:#idx:val| 0  | 2  |
| backtrace:free_reserved_area | 0  | 19 |
| backtrace:free_init_pages| 0  | 19 |
| backtrace:mark_rodata_ro | 0  | 19 |
| backtrace:vm_munmap  | 0  | 2  |
| backtrace:SyS_munmap | 0  | 2  |
| backtrace:do_execve  | 0  | 12 |
| backtrace:SyS_execve | 0  | 12 |
| backtrace:do_group_exit  | 0  | 10 |
| backtrace:SyS_exit_group | 0  | 10 |
| backtrace:vfs_read   | 0  | 3  |
| backtrace:SyS_read   | 0  | 3  |
| general_protection_fault | 0  | 3  |
| RIP:release_pages| 0  | 3  |
| Kernel_panic-not_syncing:Fatal_exception | 0  | 3  |
+--+++


[5.435374] PM: Hibernation image not present or could not be loaded.
[5.437869] Freeing unused kernel memory: 1448K (8215b000 - 
822c5000)
[5.439558] Write protecting the kernel read-only data: 16384k
[5.441103] BUG: Bad page state in process swapper/0  pfn:02500
[5.442204] page:ea094000 count:0 mapcount:-127 mapping:  
(null) index:0x2
[5.443939] flags: 0x180()
[5.444891] page dumped because: nonzero mapcount
[5.445861] Modules linked in:
[5.446711] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 
3.18.0-rc4-00185-g3622dcc #1
[5.448369] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[5.449450]  81cf3ba4 880037a33d78 819ea6b0 
10ac
[5.451360]  ea094000 880037a33da8 8119b29c 

[5.453289]  ea094000 0001  
880037a33de8
[5.455234] Call Trace:
[5.455942]  [] dump_stack+0x4e/0x68
[5.456971]  [] bad_page+0xf5/0x113
[5.457972]  [] free_pages_prepare+0xbf/0x13f
[5.459067]  [] free_hot_cold_page+0x35/0x1a0
[5.460178]  [] __free_pages+0x1b/0x24
[5.461219]  [] free_reserved_area+0xaf/0x10b
[5.462339]  [] free_init_pages+0x8d/0x99
[5.463407]  [] mark_rodata_ro+0xb6/0x11c
[5.464522]  [] ? rest_init+0x89/0x89
[5.465533]  [] kernel_init+0x1d/0xdf
[5.466596]  [] ret_from_fork+0x7c/0xb0
[5.467633]  [] ? rest_init+0x89/0x89
[5.468711] Disabling lock debugging due to kernel taint
[5.470302] Freeing unused kernel memory: 1488K (8248c000 - 
8260)
[5.472182] Freeing unused kernel memory: 20K (8800019fb000 - 
880001a0)
[5.477823] Freeing unused kernel memory: 1812K (880001e3b000 - 
88000200)
[5.582078] BUG: Bad page state in process udevd  pfn:0248c
[5.582103] BUG: Bad page state in process udevd  pfn:024a0
[5.582104] page:ea092800 count:2 mapcount:0 
mapping:88003ec8ea69 index:0x2
[5.582107] flags: 0x1880068(uptodate|lru|active|swapbacked)



--yliu
early console in setup code
Probing EDD (edd=off to disable)... ok
early console in decompress_kernel

Decompressing Linux... Parsing ELF... done.
Booting the kernel.
[0.00] Initializing cgroup subsys cpuset
[0.00] Initializing cgroup subsys cpu
[0.00] Linux version 3.18.0-rc4-00185-g3622dcc (kbuild@roam) (gcc 
version 4.9.1 (Debian 4.9.1-19) ) #1 SMP Sat Nov 15 17:25:59 CST 2014
[0.00] Command line: user=lkp 
job=/lkp/scheduled/vm-vp-1G-6/rand_boot-1-debian-x86_64.cgz-x86_64-lkp-3622dcc2b4f4eaf23bae2511a30fc449d0e5f0d9-1.yaml
 ARCH=x86_64 
BOOT_IMAGE=/kernel/x86_64-lkp/3622dcc2b4f4eaf23bae2511a30fc449d0e5f0d9/vmlinuz-3.18.0-rc4-00185-g3622dcc
 kconfig=x86_64-lkp commit=3622dcc2b4f4eaf23bae2511a30fc449d0e5f0d9 
branch=kees/x86/pmd-nx root=/dev/ram0 max_uptime=3600 
RESULT_ROOT=/result/vm-vp-1G/boot/1/debian-x86_64.cgz/x86_64-lkp/3622dcc2b4f4eaf23bae2511a30fc449d0e5f0d9/0
 ip=vm-vp-1G-6::dhcp earlyprintk=ttyS0,115200 debug apic=debug 
sysrq_always_enabled rcupdate.rcu_cpu_stall_timeout=100 

[LKP] [x86, mm] BUG: Bad page state in process swapper/0 pfn:02500

2014-11-16 Thread Yuanhan Liu
FYI, we noticed the below changes on

git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git x86/pmd-nx
commit 3622dcc2b4f4eaf23bae2511a30fc449d0e5f0d9 (x86, mm: set NX across entire 
PMD at boot)


+--+++
|  | b23dc5a7cc | 3622dcc2b4 |
+--+++
| boot_successes   | 4  | 0  |
| boot_failures| 0  | 19 |
| BUG:Bad_page_state_in_process| 0  | 19 |
| BUG:Bad_page_map_in_process  | 0  | 14 |
| BUG:Bad_rss-counter_state_mm:#idx:val| 0  | 2  |
| backtrace:free_reserved_area | 0  | 19 |
| backtrace:free_init_pages| 0  | 19 |
| backtrace:mark_rodata_ro | 0  | 19 |
| backtrace:vm_munmap  | 0  | 2  |
| backtrace:SyS_munmap | 0  | 2  |
| backtrace:do_execve  | 0  | 12 |
| backtrace:SyS_execve | 0  | 12 |
| backtrace:do_group_exit  | 0  | 10 |
| backtrace:SyS_exit_group | 0  | 10 |
| backtrace:vfs_read   | 0  | 3  |
| backtrace:SyS_read   | 0  | 3  |
| general_protection_fault | 0  | 3  |
| RIP:release_pages| 0  | 3  |
| Kernel_panic-not_syncing:Fatal_exception | 0  | 3  |
+--+++


[5.435374] PM: Hibernation image not present or could not be loaded.
[5.437869] Freeing unused kernel memory: 1448K (8215b000 - 
822c5000)
[5.439558] Write protecting the kernel read-only data: 16384k
[5.441103] BUG: Bad page state in process swapper/0  pfn:02500
[5.442204] page:ea094000 count:0 mapcount:-127 mapping:  
(null) index:0x2
[5.443939] flags: 0x180()
[5.444891] page dumped because: nonzero mapcount
[5.445861] Modules linked in:
[5.446711] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 
3.18.0-rc4-00185-g3622dcc #1
[5.448369] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[5.449450]  81cf3ba4 880037a33d78 819ea6b0 
10ac
[5.451360]  ea094000 880037a33da8 8119b29c 

[5.453289]  ea094000 0001  
880037a33de8
[5.455234] Call Trace:
[5.455942]  [819ea6b0] dump_stack+0x4e/0x68
[5.456971]  [8119b29c] bad_page+0xf5/0x113
[5.457972]  [8119b379] free_pages_prepare+0xbf/0x13f
[5.459067]  [8119d65e] free_hot_cold_page+0x35/0x1a0
[5.460178]  [8119d878] __free_pages+0x1b/0x24
[5.461219]  [8119d930] free_reserved_area+0xaf/0x10b
[5.462339]  [8106c126] free_init_pages+0x8d/0x99
[5.463407]  [8106cb1a] mark_rodata_ro+0xb6/0x11c
[5.464522]  [819e15b5] ? rest_init+0x89/0x89
[5.465533]  [819e15d2] kernel_init+0x1d/0xdf
[5.466596]  [819f15bc] ret_from_fork+0x7c/0xb0
[5.467633]  [819e15b5] ? rest_init+0x89/0x89
[5.468711] Disabling lock debugging due to kernel taint
[5.470302] Freeing unused kernel memory: 1488K (8248c000 - 
8260)
[5.472182] Freeing unused kernel memory: 20K (8800019fb000 - 
880001a0)
[5.477823] Freeing unused kernel memory: 1812K (880001e3b000 - 
88000200)
[5.582078] BUG: Bad page state in process udevd  pfn:0248c
[5.582103] BUG: Bad page state in process udevd  pfn:024a0
[5.582104] page:ea092800 count:2 mapcount:0 
mapping:88003ec8ea69 index:0x2
[5.582107] flags: 0x1880068(uptodate|lru|active|swapbacked)



--yliu
early console in setup code
Probing EDD (edd=off to disable)... ok
early console in decompress_kernel

Decompressing Linux... Parsing ELF... done.
Booting the kernel.
[0.00] Initializing cgroup subsys cpuset
[0.00] Initializing cgroup subsys cpu
[0.00] Linux version 3.18.0-rc4-00185-g3622dcc (kbuild@roam) (gcc 
version 4.9.1 (Debian 4.9.1-19) ) #1 SMP Sat Nov 15 17:25:59 CST 2014
[0.00] Command line: user=lkp 
job=/lkp/scheduled/vm-vp-1G-6/rand_boot-1-debian-x86_64.cgz-x86_64-lkp-3622dcc2b4f4eaf23bae2511a30fc449d0e5f0d9-1.yaml
 ARCH=x86_64 
BOOT_IMAGE=/kernel/x86_64-lkp/3622dcc2b4f4eaf23bae2511a30fc449d0e5f0d9/vmlinuz-3.18.0-rc4-00185-g3622dcc
 kconfig=x86_64-lkp commit=3622dcc2b4f4eaf23bae2511a30fc449d0e5f0d9 
branch=kees/x86/pmd-nx root=/dev/ram0 max_uptime=3600 

[LKP] [LSM] Kernel panic - not syncing: No working init found.

2014-11-16 Thread Yuanhan Liu
FYI, we noticed the below changes on(TBH, I don't know the bisect is
correct or not; sorry for the noise if not)

git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git lsm/stacking
commit 58c4f9e3be81a85839ea229b1dd36bf55232d440 (LSM: Refactor existing LSM 
stacking)


+------------------------------------------------+------------+------------+
|                                                | c9979f3c6e | 58c4f9e3be |
+------------------------------------------------+------------+------------+
| boot_successes                                 | 15         | 0          |
| early-boot-hang                                | 1          |            |
| boot_failures                                  | 0          | 15         |
| Kernel_panic-not_syncing:No_working_init_found | 0          | 15         |
| backtrace:panic                                | 0          | 15         |
+------------------------------------------------+------------+------------+


[3.437279] Starting init: /sbin/init exists but couldn't execute it (error 
-12)
[3.438655] Starting init: /etc/init exists but couldn't execute it (error 
-13)
[3.440136] Starting init: /bin/sh exists but couldn't execute it (error -12)
[3.441487] Kernel panic - not syncing: No working init found.  Try passing 
init= option to kernel. See Linux Documentation/init.txt for guidance.
[3.443352] CPU: 0 PID: 1 Comm: swapper Not tainted 3.18.0-rc4-g49aba53 #1949
[3.443352]   f783d540 80017f88 8138c3bd 80017fa0 8138b30b 815e1f40 
f783d540
[3.443352]  815e1f40  80017fac 81389523 8152ab4d 80016000 813918e0 
81389474
[3.443352]        007b 
007b
[3.443352] Call Trace:
[3.443352]  [8138c3bd] dump_stack+0x16/0x18
[3.443352]  [8138b30b] panic+0x86/0x19e
[3.443352]  [81389523] kernel_init+0xaf/0xb3
[3.443352]  [813918e0] ret_from_kernel_thread+0x20/0x30
[3.443352]  [81389474] ? rest_init+0xa2/0xa2
[3.443352] Kernel Offset: 0x0 from 0x8100 (relocation range: 
0x8000-0x947fdfff)
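
For reference, the "error -12" and "error -13" values in the init failures
above are negative errno codes (12 is ENOMEM, 13 is EACCES on Linux), i.e.
the execs failed with out-of-memory and permission-denied respectively. A
minimal user-space sketch (illustrative only, not kernel code) to print
their meanings:

#include <errno.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        /* 12 == ENOMEM, 13 == EACCES on Linux */
        printf("error -%d: %s\n", ENOMEM, strerror(ENOMEM));
        printf("error -%d: %s\n", EACCES, strerror(EACCES));
        return 0;
}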

Elapsed time: 10



--yliu
early console in setup code
Probing EDD (edd=off to disable)... ok
[0.00] Linux version 3.18.0-rc4-g49aba53 (kbuild@lkp-hsx01) (gcc 
version 4.9.1 (Debian 4.9.1-19) ) #1949 Sat Nov 15 06:21:52 CST 2014
[0.00] e820: BIOS-provided physical RAM map:
[0.00] BIOS-e820: [mem 0x-0x0009fbff] usable
[0.00] BIOS-e820: [mem 0x0009fc00-0x0009] reserved
[0.00] BIOS-e820: [mem 0x000f-0x000f] reserved
[0.00] BIOS-e820: [mem 0x0010-0x13ffdfff] usable
[0.00] BIOS-e820: [mem 0x13ffe000-0x13ff] reserved
[0.00] BIOS-e820: [mem 0xfeffc000-0xfeff] reserved
[0.00] BIOS-e820: [mem 0xfffc-0x] reserved
[0.00] Notice: NX (Execute Disable) protection missing in CPU!
[0.00] Hypervisor detected: KVM
[0.00] e820: update [mem 0x-0x0fff] usable == reserved
[0.00] e820: remove [mem 0x000a-0x000f] usable
[0.00] e820: last_pfn = 0x13ffe max_arch_pfn = 0x100
[0.00] initial memory mapped: [mem 0x-0x027f]
[0.00] Base memory trampoline at [8009b000] 9b000 size 16384
[0.00] init_memory_mapping: [mem 0x-0x000f]
[0.00]  [mem 0x-0x000f] page 4k
[0.00] init_memory_mapping: [mem 0x1320-0x133f]
[0.00]  [mem 0x1320-0x133f] page 2M
[0.00] init_memory_mapping: [mem 0x1000-0x131f]
[0.00]  [mem 0x1000-0x131f] page 2M
[0.00] init_memory_mapping: [mem 0x0010-0x0fff]
[0.00]  [mem 0x0010-0x001f] page 4k
[0.00]  [mem 0x0020-0x0fff] page 2M
[0.00] init_memory_mapping: [mem 0x1340-0x13ffdfff]
[0.00]  [mem 0x1340-0x13df] page 2M
[0.00]  [mem 0x13e0-0x13ffdfff] page 4k
[0.00] BRK [0x01f22000, 0x01f22fff] PGTABLE
[0.00] BRK [0x01f23000, 0x01f23fff] PGTABLE
[0.00] RAMDISK: [mem 0x135e9000-0x13fe]
[0.00] ACPI: Early table checksum verification disabled
[0.00] ACPI: RSDP 0x000FD950 14 (v00 BOCHS )
[0.00] ACPI: RSDT 0x13FFE450 34 (v01 BOCHS  BXPCRSDT 0001 BXPC 
0001)
[0.00] ACPI: FACP 0x1380 74 (v01 BOCHS  BXPCFACP 0001 BXPC 
0001)
[0.00] ACPI: DSDT 0x13FFE490 0011A9 (v01 BXPC   BXDSDT   0001 INTL 
20100528)
[0.00] ACPI: FACS 0x1340 40
[0.00] ACPI: SSDT 0x13FFF7A0 000796 (v01 BOCHS  BXPCSSDT 0001 BXPC 
0001)
[0.00] ACPI: APIC 0x13FFF680 80 (v01 BOCHS  BXPCAPIC 0001 BXPC 
0001)
[0.00] ACPI: HPET 0x13FFF640 38 (v01 BOCHS  BXPCHPET 0001 BXPC 
0001)
[0.00] ACPI: Local APIC address 0xfee0
[0.00] 

[LKP] [x86, PCI, MSI] BUG: unable to handle kernel NULL pointer dereference at 0000000000000002

2014-11-16 Thread Yuanhan Liu
FYI, we noticed the below changes on

https://github.com/jiangliu/linux.git irqdomain/p2v7
commit 515b463a5a4c2bac0593c6d88a475a32d65f4bcc (x86, PCI, MSI: Use hierarchy 
irqdomain to manage MSI interrupts)


+-------------------------------------------+------------+------------+
|                                           | dadb7cd295 | 515b463a5a |
+-------------------------------------------+------------+------------+
| boot_successes                            | 6          | 1          |
| early-boot-hang                           | 1          |            |
| boot_failures                             | 0          | 4          |
| BUG:unable_to_handle_kernel               | 0          | 4          |
| Oops                                      | 0          | 4          |
| RIP:init_irq_alloc_info                   | 0          | 4          |
| Kernel_panic-not_syncing:Fatal_exception  | 0          | 4          |
| backtrace:init_irq_alloc_info             | 0          | 4          |
| backtrace:vp_find_vqs                     | 0          | 4          |
| backtrace:init_vq                         | 0          | 4          |
| backtrace:init                            | 0          | 4          |
| backtrace:kernel_init_freeable            | 0          | 4          |
+-------------------------------------------+------------+------------+


[   20.962013] BUG: unable to handle kernel NULL pointer dereference at 
0002
[   20.964023] IP: [81074795] init_irq_alloc_info+0x13/0x1b
[   20.964023] PGD 0 
[   20.964023] Oops: 0002 [#1] SMP DEBUG_PAGEALLOC
[   20.964023] Modules linked in:
[   20.964023] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.18.0-rc4-g4ae16b6 
#1457
[   20.964023] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[   20.964023] task: 8801289c0010 ti: 8801289c4000 task.ti: 
8801289c4000
[   20.964023] RIP: 0010:[81074795]  [81074795] 
init_irq_alloc_info+0x13/0x1b
[   20.964023] RSP: :8801289c7928  EFLAGS: 00010246
[   20.964023] RAX:  RBX: 0002 RCX: 000a
[   20.964023] RDX: 0002 RSI:  RDI: 0002
[   20.964023] RBP: 8801289c7928 R08: 0008 R09: 
[   20.964023] R10: 8800b8399f80 R11: 0023 R12: 8800db055000
[   20.964023] R13: 8800d1ee8f98 R14: 880129cc3f80 R15: 83e36800
[   20.964023] FS:  () GS:88012a20() 
knlGS:
[   20.964023] CS:  0010 DS:  ES:  CR0: 8005003b
[   20.964023] CR2: 0002 CR3: 03e1a000 CR4: 06f0
[   20.964023] Stack:
[   20.964023]  8801289c7958 810770be 8801289c7980 
0002
[   20.964023]  83e36840 8800db055098 8801289c79d8 
8110fd29
[   20.964023]    8800db055000 
0011
[   20.964023] Call Trace:
[   20.964023]  [810770be] pci_msi_prepare+0x2d/0x54
[   20.964023]  [8110fd29] msi_domain_alloc_irqs+0x4a/0x162
[   20.964023]  [8285063a] ? dmar_find_matched_drhd_unit+0xf7/0x10b
[   20.964023]  [8177e2ee] pci_msi_domain_alloc_irqs+0x15/0x17
[   20.964023]  [8107727c] native_setup_msi_irqs+0x61/0x6c
[   20.964023]  [8104f786] arch_setup_msi_irqs+0xf/0x11
[   20.964023]  [8177d3e0] pci_msi_setup_msi_irqs+0x45/0x4c
[   20.964023]  [8177daf7] pci_enable_msix+0x1d8/0x2d0
[   20.964023]  [8177dc20] pci_enable_msix_range+0x31/0x50
[   20.964023]  [8185dfa6] vp_request_msix_vectors+0xb6/0x1f8
[   20.964023]  [8185e196] vp_try_to_find_vqs+0xae/0x43e
[   20.964023]  [8172fbc5] ? vsnprintf+0x374/0x3ad
[   20.964023]  [8185e558] vp_find_vqs+0x32/0x8d
[   20.964023]  [81b416aa] init_vq+0x14f/0x1f8
[   20.964023]  [81b41896] virtblk_probe+0xf3/0x501
[   20.964023]  [81238727] ? sysfs_do_create_link_sd+0x78/0xa8
[   20.964023]  [8185dba0] ? vp_set_status+0x25/0x27
[   20.964023]  [8185c2ec] virtio_dev_probe+0xbd/0x104
[   20.964023]  [81b09a19] driver_probe_device+0xb0/0x1d7
[   20.964023]  [81b09bdf] __driver_attach+0x62/0x85
[   20.964023]  [81b09b7d] ? __device_attach+0x3d/0x3d
[   20.964023]  [81b08009] bus_for_each_dev+0x6f/0x89
[   20.964023]  [81b0957d] driver_attach+0x1e/0x20
[   20.964023]  [81b09229] bus_add_driver+0x110/0x1cf
[   20.964023]  [84452673] ? nbd_init+0x39c/0x39c
[   20.964023]  [81b0a235] driver_register+0x8f/0xcc
[   20.964023]  [84452673] ? nbd_init+0x39c/0x39c
[   20.964023]  [8185c5fd] register_virtio_driver+0x2b/0x2d
[   20.964023]  [844526d0] init+0x5d/0x8b
[   20.964023]  [8100216d] do_one_initcall+0xee/0x17e
[   20.964023]  [843e60ef] kernel_init_freeable+0x1ec/0x274
[   20.964023]  [82d3c238] ? rest_init+0xcc/0xcc
[   20.964023]  

Re: [LKP] [sched] 9597d64116d: -16.1% hackbench.throughput

2014-11-13 Thread Yuanhan Liu
On Wed, Nov 12, 2014 at 03:44:34PM +0100, Vincent Guittot wrote:
> On 10 November 2014 06:54,   wrote:
> > FYI, we noticed the below changes on
> >
> > https://git.linaro.org/people/mturquette/linux.git eas-next
> > commit 9597d64116d0d441dea32e7f5f05fa135d16f44b ("sched: replace 
> > capacity_factor by usage")
> >
> >
> > b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f  testbox/testcase/testparams
> >   --  ---
> >  %stddev %change %stddev
> >  \  |\
> > 104249 ą  0% -16.1%  87436 ą  0%  
> > ivb42/hackbench/performance-50%-threads-socket
> > 104249   -16.1%  87436GEO-MEAN hackbench.throughput
> 
> Hi yuanhan,
> 
> i understand this email as a 16% drop in hackbench performance when
> the number of group is half the number of CPUs. Is it the only test
> for which you  have seen some decreases ? where can i find the list of
> tests that you have passed ?

Sorry, the list is not accessible from outside; besides, you have to run
some commands to generate it on the fly. Anyway, I checked it for you:
we have run only hackbench/performance-50%-threads-socket on that
commit, which is expected in our system since we already bisected this
issue once.

But I can run more tests (say, with 100% and 1600% cpu) on that commit if
you like, and it would also be good if you could name some of the
benchmarks you care about most so that we can run them for you.

--yliu
> 
> I'm going to try to reproduce the test in my local setup
> 
> Regards,
> Vincent
> 
> 
> >
> > b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f
> >   --
> >   0.88 ą 25%+209.7%   2.74 ą  5%  
> > ivb42/hackbench/performance-50%-threads-socket
> >   0.88  +209.7%   2.74GEO-MEAN 
> > perf-profile.cpu-cycles.ttwu_do_activate.constprop.87.sched_ttwu_pending.scheduler_ipi.smp_reschedule_interrupt.reschedule_interrupt
> >
> > b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f
> >   --
> >   0.76 ą 26%+209.2%   2.36 ą  5%  
> > ivb42/hackbench/performance-50%-threads-socket
> >   0.76  +209.2%   2.36GEO-MEAN 
> > perf-profile.cpu-cycles.activate_task.ttwu_do_activate.sched_ttwu_pending.scheduler_ipi.smp_reschedule_interrupt
> >
> > b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f
> >   --
> >   0.76 ą 26%+210.6%   2.35 ą  5%  
> > ivb42/hackbench/performance-50%-threads-socket
> >   0.76  +210.6%   2.35GEO-MEAN 
> > perf-profile.cpu-cycles.enqueue_task.activate_task.ttwu_do_activate.sched_ttwu_pending.scheduler_ipi
> >
> > b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f
> >   --
> >   0.70 ą 25%+203.1%   2.13 ą  6%  
> > ivb42/hackbench/performance-50%-threads-socket
> >   0.70  +203.1%   2.13GEO-MEAN 
> > perf-profile.cpu-cycles.enqueue_task_fair.enqueue_task.activate_task.ttwu_do_activate.sched_ttwu_pending
> >
> > b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f
> >   --
> > 243252 ą 46%+242.5% 833240 ą 42%  
> > ivb42/hackbench/performance-50%-threads-socket
> > 243252  +242.5% 833240GEO-MEAN 
> > sched_debug.cfs_rq[2]:/.spread0
> >
> > b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f
> >   --
> > 98 ą 36% -49.1% 50 ą 34%  
> > ivb42/hackbench/performance-50%-threads-socket
> > 98   -49.1% 50GEO-MEAN 
> > sched_debug.cfs_rq[18]:/.blocked_load_avg
> >
> > b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f
> >   --
> >1067752 ą 25% +65.3%1764542 ą 11%  
> > ivb42/hackbench/performance-50%-threads-socket
> >1067752   +65.3%1764542GEO-MEAN 
> > sched_debug.cfs_rq[16]:/.spread0
> >
> > b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f
> >   --
> > 923375 ą 22% +96.3%1812750 ą 21%  
> > ivb42/hackbench/performance-50%-threads-socket
> > 923375   +96.3%1812750GEO-MEAN 
> > sched_debug.cfs_rq[14]:/.spread0
> >
> > b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f
> >   --
> >1008818 ą 20% +70.9%1724167 ą 14%  
> > ivb42/hackbench/performance-50%-threads-socket
> >1008818   +70.9%1724167GEO-MEAN 
> > sched_debug.cfs_rq[6]:/.spread0
> >
> > b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f
> >   --
> >1109100 ą 25% +53.9%1707190 ą 16%  
> > ivb42/hackbench/performance-50%-threads-socket
> >1109100   +53.9%1707190GEO-MEAN 
> > sched_debug.cfs_rq[15]:/.spread0
> >
> > b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f
> > 

Re: [LKP] [sched] 9597d64116d: -16.1% hackbench.throughput

2014-11-13 Thread Yuanhan Liu
On Wed, Nov 12, 2014 at 03:44:34PM +0100, Vincent Guittot wrote:
 On 10 November 2014 06:54,  l...@01.org wrote:
  FYI, we noticed the below changes on
 
  https://git.linaro.org/people/mturquette/linux.git eas-next
  commit 9597d64116d0d441dea32e7f5f05fa135d16f44b (sched: replace 
  capacity_factor by usage)
 
 
  b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f  testbox/testcase/testparams
    --  ---
   %stddev %change %stddev
   \  |\
  104249 ą  0% -16.1%  87436 ą  0%  
  ivb42/hackbench/performance-50%-threads-socket
  104249   -16.1%  87436GEO-MEAN hackbench.throughput
 
 Hi yuanhan,
 
 i understand this email as a 16% drop in hackbench performance when
 the number of group is half the number of CPUs. Is it the only test
 for which you  have seen some decreases ? where can i find the list of
 tests that you have passed ?

Sorry, the list is not accessible from outside; besides, you have to run
some commands to generate it on the fly. Anyway, I checked it for you:
we have run only hackbench/performance-50%-threads-socket on that
commit, which is expected in our system since we already bisected this
issue once.

But I can run more tests (say, with 100% and 1600% cpu) on that commit if
you like, and it would also be good if you could name some of the
benchmarks you care about most so that we can run them for you.

--yliu
 
 I'm going to try to reproduce the test in my local setup
 
 Regards,
 Vincent
 
 
 
  b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f
    --
0.88 ą 25%+209.7%   2.74 ą  5%  
  ivb42/hackbench/performance-50%-threads-socket
0.88  +209.7%   2.74GEO-MEAN 
  perf-profile.cpu-cycles.ttwu_do_activate.constprop.87.sched_ttwu_pending.scheduler_ipi.smp_reschedule_interrupt.reschedule_interrupt
 
  b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f
    --
0.76 ą 26%+209.2%   2.36 ą  5%  
  ivb42/hackbench/performance-50%-threads-socket
0.76  +209.2%   2.36GEO-MEAN 
  perf-profile.cpu-cycles.activate_task.ttwu_do_activate.sched_ttwu_pending.scheduler_ipi.smp_reschedule_interrupt
 
  b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f
    --
0.76 ą 26%+210.6%   2.35 ą  5%  
  ivb42/hackbench/performance-50%-threads-socket
0.76  +210.6%   2.35GEO-MEAN 
  perf-profile.cpu-cycles.enqueue_task.activate_task.ttwu_do_activate.sched_ttwu_pending.scheduler_ipi
 
  b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f
    --
0.70 ą 25%+203.1%   2.13 ą  6%  
  ivb42/hackbench/performance-50%-threads-socket
0.70  +203.1%   2.13GEO-MEAN 
  perf-profile.cpu-cycles.enqueue_task_fair.enqueue_task.activate_task.ttwu_do_activate.sched_ttwu_pending
 
  b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f
    --
  243252 ą 46%+242.5% 833240 ą 42%  
  ivb42/hackbench/performance-50%-threads-socket
  243252  +242.5% 833240GEO-MEAN 
  sched_debug.cfs_rq[2]:/.spread0
 
  b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f
    --
  98 ą 36% -49.1% 50 ą 34%  
  ivb42/hackbench/performance-50%-threads-socket
  98   -49.1% 50GEO-MEAN 
  sched_debug.cfs_rq[18]:/.blocked_load_avg
 
  b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f
    --
 1067752 ą 25% +65.3%1764542 ą 11%  
  ivb42/hackbench/performance-50%-threads-socket
 1067752   +65.3%1764542GEO-MEAN 
  sched_debug.cfs_rq[16]:/.spread0
 
  b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f
    --
  923375 ą 22% +96.3%1812750 ą 21%  
  ivb42/hackbench/performance-50%-threads-socket
  923375   +96.3%1812750GEO-MEAN 
  sched_debug.cfs_rq[14]:/.spread0
 
  b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f
    --
 1008818 ą 20% +70.9%1724167 ą 14%  
  ivb42/hackbench/performance-50%-threads-socket
 1008818   +70.9%1724167GEO-MEAN 
  sched_debug.cfs_rq[6]:/.spread0
 
  b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f
    --
 1109100 ą 25% +53.9%1707190 ą 16%  
  ivb42/hackbench/performance-50%-threads-socket
 1109100   +53.9%1707190GEO-MEAN 
  sched_debug.cfs_rq[15]:/.spread0
 
  b57a1e0afff2cbac  9597d64116d0d441dea32e7f5f
    --
 1006499 ą 33% +67.8%1688436 ą 22%  
  ivb42/hackbench/performance-50%-threads-socket
 1006499   +67.8%1688436 

Re: [LKP] [dmi] PANIC: early exception 0e rip 10:ffffffff81899e6b error 9 cr2 ffffffffff240000

2014-11-07 Thread Yuanhan Liu
On Fri, Nov 07, 2014 at 10:35:44AM +0100, Ard Biesheuvel wrote:
> On 7 November 2014 10:26, Yuanhan Liu  wrote:
> > On Fri, Nov 07, 2014 at 10:03:55AM +0100, Ard Biesheuvel wrote:
> >> On 7 November 2014 09:46, Yuanhan Liu  wrote:
> >> > On Fri, Nov 07, 2014 at 09:23:56AM +0100, Ard Biesheuvel wrote:
> >> >> On 7 November 2014 09:13, Yuanhan Liu  
> >> >> wrote:
> >> >> > On Fri, Nov 07, 2014 at 08:44:40AM +0100, Ard Biesheuvel wrote:
> >> >> >> On 7 November 2014 08:37, Yuanhan Liu  
> >> >> >> wrote:
> >> >> >> > On Fri, Nov 07, 2014 at 08:17:36AM +0100, Ard Biesheuvel wrote:
> >> >> >> >> On 7 November 2014 06:47, LKP  wrote:
> >> >> >> >> > FYI, we noticed the below changes on
> >> >> >> >> >
> >> >> >> >> > https://git.linaro.org/people/ard.biesheuvel/linux-arm 
> >> >> >> >> > efi-for-3.19
> >> >> >> >> > commit aacdce6e880894acb57d71dcb2e3fc61b4ed4e96 ("dmi: add 
> >> >> >> >> > support for SMBIOS 3.0 64-bit entry point")
> >> >> >> >> >
> >> >> >> >> >
> >> >> >> >> > +---+++
> >> >> >> >> > |   | 2fa165a26c | aacdce6e88 |
> >> >> >> >> > +---+++
> >> >> >> >> > | boot_successes| 20 | 10 |
> >> >> >> >> > | early-boot-hang   | 1  ||
> >> >> >> >> > | boot_failures | 0  | 5  |
> >> >> >> >> > | PANIC:early_exception | 0  | 5  |
> >> >> >> >> > +---+++
> >> >> >> >> >
> >> >> >> >> >
> >> >> >> >> > [0.00] BIOS-e820: [mem 
> >> >> >> >> > 0x0001-0x00036fff] usable
> >> >> >> >> > [0.00] bootconsole [earlyser0] enabled
> >> >> >> >> > [0.00] NX (Execute Disable) protection: active
> >> >> >> >> > PANIC: early exception 0e rip 10:81899e6b error 9 cr2 
> >> >> >> >> > ff24
> >> >> >> >> > [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted 
> >> >> >> >> > 3.18.0-rc2-gc5221e6 #1
> >> >> >> >> > [0.00]   82203d30 
> >> >> >> >> > 819f0a6e 03f8
> >> >> >> >> > [0.00]  ff24 82203e18 
> >> >> >> >> > 823701b0 82511401
> >> >> >> >> > [0.00]   0ba3 
> >> >> >> >> >  ff24
> >> >> >> >> > [0.00] Call Trace:
> >> >> >> >> > [0.00]  [] dump_stack+0x4e/0x68
> >> >> >> >> > [0.00]  [] early_idt_handler+0x90/0xb7
> >> >> >> >> > [0.00]  [] ? 
> >> >> >> >> > dmi_save_one_device+0x81/0x81
> >> >> >> >> > [0.00]  [] ? dmi_table+0x3f/0x94
> >> >> >> >> > [0.00]  [] ? dmi_table+0x16/0x94
> >> >> >> >> > [0.00]  [] ? 
> >> >> >> >> > dmi_save_one_device+0x81/0x81
> >> >> >> >> > [0.00]  [] ? 
> >> >> >> >> > dmi_save_one_device+0x81/0x81
> >> >> >> >> > [0.00]  [] dmi_walk_early+0x44/0x69
> >> >> >> >> > [0.00]  [] dmi_present+0x180/0x1ff
> >> >> >> >> > [0.00]  [] 
> >> >> >> >> > dmi_scan_machine+0x144/0x191
> >> >> >> >> > [0.00]  [] ? loglevel+0x31/0x31
> >> >> >> >> > [0.00]  [] setup_arch+0x490/0xc73
> >> >> >> >> > [0.00]  [] ? printk+0x4d/0x4f
> >> >> 

Re: [LKP] [dmi] PANIC: early exception 0e rip 10:ffffffff81899e6b error 9 cr2 ffffffffff240000

2014-11-07 Thread Yuanhan Liu
On Fri, Nov 07, 2014 at 09:16:02AM +, Matt Fleming wrote:
> On Fri, 2014-11-07 at 08:17 +0100, Ard Biesheuvel wrote:
> > On 7 November 2014 06:47, LKP  wrote:
> > > FYI, we noticed the below changes on
> > >
> > > https://git.linaro.org/people/ard.biesheuvel/linux-arm efi-for-3.19
> > > commit aacdce6e880894acb57d71dcb2e3fc61b4ed4e96 ("dmi: add support for 
> > > SMBIOS 3.0 64-bit entry point")
> > >
> > >
> > > +---+++
> > > |   | 2fa165a26c | aacdce6e88 |
> > > +---+++
> > > | boot_successes| 20 | 10 |
> > > | early-boot-hang   | 1  ||
> > > | boot_failures | 0  | 5  |
> > > | PANIC:early_exception | 0  | 5  |
> > > +---+++
> > >
> > >
> > > [0.00] BIOS-e820: [mem 0x0001-0x00036fff] 
> > > usable
> > > [0.00] bootconsole [earlyser0] enabled
> > > [0.00] NX (Execute Disable) protection: active
> > > PANIC: early exception 0e rip 10:81899e6b error 9 cr2 
> > > ff24
> > > [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted 
> > > 3.18.0-rc2-gc5221e6 #1
> > > [0.00]   82203d30 819f0a6e 
> > > 03f8
> > > [0.00]  ff24 82203e18 823701b0 
> > > 82511401
> > > [0.00]   0ba3  
> > > ff24
> > > [0.00] Call Trace:
> > > [0.00]  [] dump_stack+0x4e/0x68
> > > [0.00]  [] early_idt_handler+0x90/0xb7
> > > [0.00]  [] ? dmi_save_one_device+0x81/0x81
> > > [0.00]  [] ? dmi_table+0x3f/0x94
> > > [0.00]  [] ? dmi_table+0x16/0x94
> > > [0.00]  [] ? dmi_save_one_device+0x81/0x81
> > > [0.00]  [] ? dmi_save_one_device+0x81/0x81
> > > [0.00]  [] dmi_walk_early+0x44/0x69
> > > [0.00]  [] dmi_present+0x180/0x1ff
> > > [0.00]  [] dmi_scan_machine+0x144/0x191
> > > [0.00]  [] ? loglevel+0x31/0x31
> > > [0.00]  [] setup_arch+0x490/0xc73
> > > [0.00]  [] ? printk+0x4d/0x4f
> > > [0.00]  [] start_kernel+0x9c/0x43f
> > > [0.00]  [] ? early_idt_handlers+0x120/0x120
> > > [0.00]  [] x86_64_start_reservations+0x2a/0x2c
> > > [0.00]  [] x86_64_start_kernel+0x13b/0x14a
> > > [0.00] RIP 0x4
> > >
> > 
> > This is most puzzling. Could anyone decode the exception?
> > This looks like the non-EFI path through dmi_scan_machine(), which
> > calls dmi_present() /after/ calling dmi_smbios3_present(), which
> > apparently has not found the _SM3_ header tag. Or could the call stack
> > be inaccurate?
> 
> The code triggered a page fault while trying to access
> 0xff24, caused because the reserved bit was set in the page
> table and no page was found. Looks like it jumped through a bogus
> pointer.
> 
> And yes, the callstack may definitely be wrong - the stack dumper is
> just scraping addresses from the stack, as indicated by the '?' symbol.
> 
> Yuanhan, what symbol does 0x81899e6b (the faulting instruction)
> translate to?
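
As an aside, the "error 9" in the PANIC line is the x86 page-fault error
code pushed by the CPU. A minimal sketch (illustrative only, using the
architectural bit layout rather than any kernel API) that decodes it:

#include <stdio.h>

int main(void)
{
        unsigned long err = 0x9;        /* "error 9" from the PANIC line */

        printf("present/protection violation: %lu\n",  err        & 1);
        printf("write access:                 %lu\n", (err >> 1)  & 1);
        printf("user-mode access:             %lu\n", (err >> 2)  & 1);
        printf("reserved bit set in paging:   %lu\n", (err >> 3)  & 1);
        printf("instruction fetch:            %lu\n", (err >> 4)  & 1);
        return 0;
}

With err = 0x9, bits 0 and 3 are set, which matches the reserved-bit
page-table fault described above.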

I found no System.map for that kernel, so I switched to another kernel;
here is the new panic dmesg:

PANIC: early exception 0e rip 10:8167aa1a error 9 cr2 ff240001
[0.00] CPU: 0 PID: 0 Comm: swapper Not tainted 
3.18.0-rc2-8-g4d3a0be #66
[0.00]  0ba3 81bcfd10 818010a4 
03f8
[0.00]  003e 81bcfdf8 81d801b0 
617420534f49424d
[0.00]  001f ff24  
ff24
[0.00] Call Trace:
[0.00]  [] dump_stack+0x46/0x58
[0.00]  [] early_idt_handler+0x90/0xb7
[0.00]  [] ? dmi_format_ids.constprop.9+0x13c/0x13c
[0.00]  [] ? dmi_table+0x4a/0xf0
[0.00]  [] ? printk+0x61/0x63
[0.00]  [] ? dmi_format_ids.constprop.9+0x13c/0x13c
[0.00]  [] ? dmi_format_ids.constprop.9+0x13c/0x13c
[0.00]  [] dmi_walk_early+0x6b/0x90
[0.00]  [] dmi_present+0x1b4/0x23f
[0.00]  [] dmi_scan_machine+0x1d4/0x23a
[0.00]  [] ? early_idt_handlers+0x120/0x120
[0.00]  [] setup_arch+0x462/0xcc6
[0.00]  [] ? early_idt_handlers+0x120/0x120
[0.00]  [] ? early_idt_handler+0x47/0xb7
[0.00]  [] ? early_idt_handlers+0x120/0x120
[0.00]  [] start_kernel+0x97/0x456
[0.00]  [] ? early_idt_handlers+0x120/0x120
[0.00]  [] ? early_idt_handlers+0x120/0x120
[0.00]  [] x86_64_start_reservations+0x2a/0x2c
[0.00]  [] x86_64_start_kernel+0x13e/0x14d
[0.00] RIP 0xba2


The faulting address changes to 10:8167aa1a, and around it the System.map has:

  8167a9d0 t dmi_table
  8167aac0 T dmi_name_in_vendors
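
In other words, 8167aa1a falls between dmi_table (8167a9d0) and
dmi_name_in_vendors (8167aac0), so the faulting instruction appears to land
inside dmi_table(). A hedged sketch of doing that System.map lookup
programmatically (the file path and the full 64-bit address are assumptions
for illustration only):

#include <stdio.h>
#include <string.h>

int main(void)
{
        unsigned long target = 0xffffffff8167aa1aUL;    /* assumed faulting RIP */
        unsigned long addr, best_addr = 0;
        char type, name[256], best[256] = "?";
        FILE *fp = fopen("System.map", "r");            /* assumed path */

        if (!fp)
                return 1;
        /* Pick the last symbol whose address is not greater than the target. */
        while (fscanf(fp, "%lx %c %255s", &addr, &type, name) == 3) {
                if (addr <= target && addr >= best_addr) {
                        best_addr = addr;
                        strcpy(best, name);
                }
        }
        fclose(fp);
        printf("%#lx is %s+%#lx\n", target, best, target - best_addr);
        return 0;
}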

Sorry, I don't 

Re: [LKP] [dmi] PANIC: early exception 0e rip 10:ffffffff81899e6b error 9 cr2 ffffffffff240000

2014-11-07 Thread Yuanhan Liu
On Fri, Nov 07, 2014 at 10:03:55AM +0100, Ard Biesheuvel wrote:
> On 7 November 2014 09:46, Yuanhan Liu  wrote:
> > On Fri, Nov 07, 2014 at 09:23:56AM +0100, Ard Biesheuvel wrote:
> >> On 7 November 2014 09:13, Yuanhan Liu  wrote:
> >> > On Fri, Nov 07, 2014 at 08:44:40AM +0100, Ard Biesheuvel wrote:
> >> >> On 7 November 2014 08:37, Yuanhan Liu  
> >> >> wrote:
> >> >> > On Fri, Nov 07, 2014 at 08:17:36AM +0100, Ard Biesheuvel wrote:
> >> >> >> On 7 November 2014 06:47, LKP  wrote:
> >> >> >> > FYI, we noticed the below changes on
> >> >> >> >
> >> >> >> > https://git.linaro.org/people/ard.biesheuvel/linux-arm efi-for-3.19
> >> >> >> > commit aacdce6e880894acb57d71dcb2e3fc61b4ed4e96 ("dmi: add support 
> >> >> >> > for SMBIOS 3.0 64-bit entry point")
> >> >> >> >
> >> >> >> >
> >> >> >> > +---+++
> >> >> >> > |   | 2fa165a26c | aacdce6e88 |
> >> >> >> > +---+++
> >> >> >> > | boot_successes| 20 | 10 |
> >> >> >> > | early-boot-hang   | 1  ||
> >> >> >> > | boot_failures | 0  | 5  |
> >> >> >> > | PANIC:early_exception | 0  | 5  |
> >> >> >> > +---+++
> >> >> >> >
> >> >> >> >
> >> >> >> > [0.00] BIOS-e820: [mem 
> >> >> >> > 0x0001-0x00036fff] usable
> >> >> >> > [0.00] bootconsole [earlyser0] enabled
> >> >> >> > [0.00] NX (Execute Disable) protection: active
> >> >> >> > PANIC: early exception 0e rip 10:81899e6b error 9 cr2 
> >> >> >> > ff24
> >> >> >> > [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted 
> >> >> >> > 3.18.0-rc2-gc5221e6 #1
> >> >> >> > [0.00]   82203d30 819f0a6e 
> >> >> >> > 03f8
> >> >> >> > [0.00]  ff24 82203e18 823701b0 
> >> >> >> > 82511401
> >> >> >> > [0.00]   0ba3  
> >> >> >> > ff24
> >> >> >> > [0.00] Call Trace:
> >> >> >> > [0.00]  [] dump_stack+0x4e/0x68
> >> >> >> > [0.00]  [] early_idt_handler+0x90/0xb7
> >> >> >> > [0.00]  [] ? 
> >> >> >> > dmi_save_one_device+0x81/0x81
> >> >> >> > [0.00]  [] ? dmi_table+0x3f/0x94
> >> >> >> > [0.00]  [] ? dmi_table+0x16/0x94
> >> >> >> > [0.00]  [] ? 
> >> >> >> > dmi_save_one_device+0x81/0x81
> >> >> >> > [0.00]  [] ? 
> >> >> >> > dmi_save_one_device+0x81/0x81
> >> >> >> > [0.00]  [] dmi_walk_early+0x44/0x69
> >> >> >> > [0.00]  [] dmi_present+0x180/0x1ff
> >> >> >> > [0.00]  [] dmi_scan_machine+0x144/0x191
> >> >> >> > [0.00]  [] ? loglevel+0x31/0x31
> >> >> >> > [0.00]  [] setup_arch+0x490/0xc73
> >> >> >> > [0.00]  [] ? printk+0x4d/0x4f
> >> >> >> > [0.00]  [] start_kernel+0x9c/0x43f
> >> >> >> > [0.00]  [] ? 
> >> >> >> > early_idt_handlers+0x120/0x120
> >> >> >> > [0.00]  [] 
> >> >> >> > x86_64_start_reservations+0x2a/0x2c
> >> >> >> > [0.00]  [] 
> >> >> >> > x86_64_start_kernel+0x13b/0x14a
> >> >> >> > [0.00] RIP 0x4
> >> >> >> >
> >> >> >>
> >> >> >> This is most puzzling. Could anyone decode the exception?
> >> >> >> This looks like the non-EFI path through dmi_scan_machine(), which

Re: [LKP] [dmi] PANIC: early exception 0e rip 10:ffffffff81899e6b error 9 cr2 ffffffffff240000

2014-11-07 Thread Yuanhan Liu
On Fri, Nov 07, 2014 at 09:23:56AM +0100, Ard Biesheuvel wrote:
> On 7 November 2014 09:13, Yuanhan Liu  wrote:
> > On Fri, Nov 07, 2014 at 08:44:40AM +0100, Ard Biesheuvel wrote:
> >> On 7 November 2014 08:37, Yuanhan Liu  wrote:
> >> > On Fri, Nov 07, 2014 at 08:17:36AM +0100, Ard Biesheuvel wrote:
> >> >> On 7 November 2014 06:47, LKP  wrote:
> >> >> > FYI, we noticed the below changes on
> >> >> >
> >> >> > https://git.linaro.org/people/ard.biesheuvel/linux-arm efi-for-3.19
> >> >> > commit aacdce6e880894acb57d71dcb2e3fc61b4ed4e96 ("dmi: add support 
> >> >> > for SMBIOS 3.0 64-bit entry point")
> >> >> >
> >> >> >
> >> >> > +---+++
> >> >> > |   | 2fa165a26c | aacdce6e88 |
> >> >> > +---+++
> >> >> > | boot_successes| 20 | 10 |
> >> >> > | early-boot-hang   | 1  ||
> >> >> > | boot_failures | 0  | 5  |
> >> >> > | PANIC:early_exception | 0  | 5  |
> >> >> > +---+++
> >> >> >
> >> >> >
> >> >> > [0.00] BIOS-e820: [mem 0x0001-0x00036fff] 
> >> >> > usable
> >> >> > [0.00] bootconsole [earlyser0] enabled
> >> >> > [0.00] NX (Execute Disable) protection: active
> >> >> > PANIC: early exception 0e rip 10:81899e6b error 9 cr2 
> >> >> > ff24
> >> >> > [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted 
> >> >> > 3.18.0-rc2-gc5221e6 #1
> >> >> > [0.00]   82203d30 819f0a6e 
> >> >> > 03f8
> >> >> > [0.00]  ff24 82203e18 823701b0 
> >> >> > 82511401
> >> >> > [0.00]   0ba3  
> >> >> > ff24
> >> >> > [0.00] Call Trace:
> >> >> > [0.00]  [] dump_stack+0x4e/0x68
> >> >> > [0.00]  [] early_idt_handler+0x90/0xb7
> >> >> > [0.00]  [] ? dmi_save_one_device+0x81/0x81
> >> >> > [0.00]  [] ? dmi_table+0x3f/0x94
> >> >> > [0.00]  [] ? dmi_table+0x16/0x94
> >> >> > [0.00]  [] ? dmi_save_one_device+0x81/0x81
> >> >> > [0.00]  [] ? dmi_save_one_device+0x81/0x81
> >> >> > [0.00]  [] dmi_walk_early+0x44/0x69
> >> >> > [0.00]  [] dmi_present+0x180/0x1ff
> >> >> > [0.00]  [] dmi_scan_machine+0x144/0x191
> >> >> > [0.00]  [] ? loglevel+0x31/0x31
> >> >> > [0.00]  [] setup_arch+0x490/0xc73
> >> >> > [0.00]  [] ? printk+0x4d/0x4f
> >> >> > [0.00]  [] start_kernel+0x9c/0x43f
> >> >> > [0.00]  [] ? early_idt_handlers+0x120/0x120
> >> >> > [0.00]  [] 
> >> >> > x86_64_start_reservations+0x2a/0x2c
> >> >> > [0.00]  [] x86_64_start_kernel+0x13b/0x14a
> >> >> > [0.00] RIP 0x4
> >> >> >
> >> >>
> >> >> This is most puzzling. Could anyone decode the exception?
> >> >> This looks like the non-EFI path through dmi_scan_machine(), which
> >> >> calls dmi_present() /after/ calling dmi_smbios3_present(), which
> >> >> apparently has not found the _SM3_ header tag. Or could the call stack
> >> >> be inaccurate?
> >> >>
> >> >> Anyway, it would be good to know the exact type of the platform,
> >> >
> >> > It's a Nehalem-EP machine, with 16 CPU and 12G memory.
> >> >
> >> >> and
> >> >> perhaps we could find out if there is an inadvertent _SM3_ tag
> >> >> somewhere in the 0xF - 0xF range?
> >> >
> >> > Sorry, how?
> >> >
> >>
> >> That's not a brand new machine, so I suppose there wouldn't be a
> >> SMBIOS 3.0 header lurking in there.
> >>
> >> Anyway, if 

Re: [LKP] [dmi] PANIC: early exception 0e rip 10:ffffffff81899e6b error 9 cr2 ffffffffff240000

2014-11-07 Thread Yuanhan Liu
On Fri, Nov 07, 2014 at 08:44:40AM +0100, Ard Biesheuvel wrote:
> On 7 November 2014 08:37, Yuanhan Liu  wrote:
> > On Fri, Nov 07, 2014 at 08:17:36AM +0100, Ard Biesheuvel wrote:
> >> On 7 November 2014 06:47, LKP  wrote:
> >> > FYI, we noticed the below changes on
> >> >
> >> > https://git.linaro.org/people/ard.biesheuvel/linux-arm efi-for-3.19
> >> > commit aacdce6e880894acb57d71dcb2e3fc61b4ed4e96 ("dmi: add support for 
> >> > SMBIOS 3.0 64-bit entry point")
> >> >
> >> >
> >> > +---+++
> >> > |   | 2fa165a26c | aacdce6e88 |
> >> > +---+++
> >> > | boot_successes| 20 | 10 |
> >> > | early-boot-hang   | 1  ||
> >> > | boot_failures | 0  | 5  |
> >> > | PANIC:early_exception | 0  | 5  |
> >> > +---+++
> >> >
> >> >
> >> > [0.00] BIOS-e820: [mem 0x0001-0x00036fff] 
> >> > usable
> >> > [0.00] bootconsole [earlyser0] enabled
> >> > [0.00] NX (Execute Disable) protection: active
> >> > PANIC: early exception 0e rip 10:81899e6b error 9 cr2 
> >> > ff24
> >> > [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted 
> >> > 3.18.0-rc2-gc5221e6 #1
> >> > [0.00]   82203d30 819f0a6e 
> >> > 03f8
> >> > [0.00]  ff24 82203e18 823701b0 
> >> > 82511401
> >> > [0.00]   0ba3  
> >> > ff24
> >> > [0.00] Call Trace:
> >> > [0.00]  [] dump_stack+0x4e/0x68
> >> > [0.00]  [] early_idt_handler+0x90/0xb7
> >> > [0.00]  [] ? dmi_save_one_device+0x81/0x81
> >> > [0.00]  [] ? dmi_table+0x3f/0x94
> >> > [0.00]  [] ? dmi_table+0x16/0x94
> >> > [0.00]  [] ? dmi_save_one_device+0x81/0x81
> >> > [0.00]  [] ? dmi_save_one_device+0x81/0x81
> >> > [0.00]  [] dmi_walk_early+0x44/0x69
> >> > [0.00]  [] dmi_present+0x180/0x1ff
> >> > [0.00]  [] dmi_scan_machine+0x144/0x191
> >> > [0.00]  [] ? loglevel+0x31/0x31
> >> > [0.00]  [] setup_arch+0x490/0xc73
> >> > [0.00]  [] ? printk+0x4d/0x4f
> >> > [0.00]  [] start_kernel+0x9c/0x43f
> >> > [0.00]  [] ? early_idt_handlers+0x120/0x120
> >> > [0.00]  [] x86_64_start_reservations+0x2a/0x2c
> >> > [0.00]  [] x86_64_start_kernel+0x13b/0x14a
> >> > [0.00] RIP 0x4
> >> >
> >>
> >> This is most puzzling. Could anyone decode the exception?
> >> This looks like the non-EFI path through dmi_scan_machine(), which
> >> calls dmi_present() /after/ calling dmi_smbios3_present(), which
> >> apparently has not found the _SM3_ header tag. Or could the call stack
> >> be inaccurate?
> >>
> >> Anyway, it would be good to know the exact type of the platform,
> >
> > It's a Nehalem-EP machine, with 16 CPU and 12G memory.
> >
> >> and
> >> perhaps we could find out if there is an inadvertent _SM3_ tag
> >> somewhere in the 0xF - 0xF range?
> >
> > Sorry, how?
> >
> 
> That's not a brand new machine, so I suppose there wouldn't be a
> SMBIOS 3.0 header lurking in there.
> 
> Anyway, if you are in a position to try things, could you apply this
> 
> --- a/drivers/firmware/dmi_scan.c
> +++ b/drivers/firmware/dmi_scan.c
> @@ -617,7 +617,7 @@ void __init dmi_scan_machine(void)
> memset(buf, 0, 16);
> for (q = p; q < p + 0x1; q += 16) {
> memcpy_fromio(buf + 16, q, 16);
> -   if (!dmi_smbios3_present(buf) || !dmi_present(buf)) {
> +   if (!dmi_present(buf)) {
> dmi_available = 1;
> dmi_early_unmap(p, 0x1);
> goto out;
> 
> and try again?
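
For context, here is a minimal user-space sketch of what the quoted
dmi_scan_machine() fallback loop does: walk a BIOS memory region in 16-byte
steps and probe each paragraph for an SMBIOS entry-point anchor. The helper
names, the fake buffer, and the legacy "_SM_" anchor are assumptions for
illustration; only the "_SM3_" anchor and the 16-byte stride come from the
quoted code.

#include <string.h>

/* Probe one 16-byte paragraph for an SMBIOS entry-point anchor string. */
static int smbios3_anchor(const unsigned char *buf)
{
        return memcmp(buf, "_SM3_", 5) == 0;    /* 64-bit entry point */
}

static int smbios_anchor(const unsigned char *buf)
{
        return memcmp(buf, "_SM_", 4) == 0;     /* legacy 32-bit entry point */
}

/* Walk a memory region in 16-byte steps, as the quoted loop does. */
static int scan_for_smbios(const unsigned char *region, size_t len)
{
        for (size_t off = 0; off + 16 <= len; off += 16) {
                if (smbios3_anchor(region + off) || smbios_anchor(region + off))
                        return 1;       /* found a candidate entry point */
        }
        return 0;
}

int main(void)
{
        unsigned char fake[64] = { 0 };

        memcpy(fake + 32, "_SM3_", 5);  /* plant an anchor on a 16-byte boundary */
        return !scan_for_smbios(fake, sizeof(fake));
}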

kernel boots perfectly with this patch applied.

--yliu

> That is the only change that is relevant to the non-EFI
> code path which this machine appears to take, so if this fixes things,
> that would be valuable information even if it doesn't tell us exactly
> what is going wrong.
> 
> Thanks,
> Ard.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [LKP] [dmi] PANIC: early exception 0e rip 10:ffffffff81899e6b error 9 cr2 ffffffffff240000

2014-11-07 Thread Yuanhan Liu
On Fri, Nov 07, 2014 at 08:44:40AM +0100, Ard Biesheuvel wrote:
 On 7 November 2014 08:37, Yuanhan Liu yuanhan@linux.intel.com wrote:
  On Fri, Nov 07, 2014 at 08:17:36AM +0100, Ard Biesheuvel wrote:
  On 7 November 2014 06:47, LKP l...@01.org wrote:
   FYI, we noticed the below changes on
  
   https://git.linaro.org/people/ard.biesheuvel/linux-arm efi-for-3.19
   commit aacdce6e880894acb57d71dcb2e3fc61b4ed4e96 (dmi: add support for 
   SMBIOS 3.0 64-bit entry point)
  
  
   +---+++
   |   | 2fa165a26c | aacdce6e88 |
   +---+++
   | boot_successes| 20 | 10 |
   | early-boot-hang   | 1  ||
   | boot_failures | 0  | 5  |
   | PANIC:early_exception | 0  | 5  |
   +---+++
  
  
   [0.00] BIOS-e820: [mem 0x0001-0x00036fff] 
   usable
   [0.00] bootconsole [earlyser0] enabled
   [0.00] NX (Execute Disable) protection: active
   PANIC: early exception 0e rip 10:81899e6b error 9 cr2 
   ff24
   [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted 
   3.18.0-rc2-gc5221e6 #1
   [0.00]   82203d30 819f0a6e 
   03f8
   [0.00]  ff24 82203e18 823701b0 
   82511401
   [0.00]   0ba3  
   ff24
   [0.00] Call Trace:
   [0.00]  [819f0a6e] dump_stack+0x4e/0x68
   [0.00]  [823701b0] early_idt_handler+0x90/0xb7
   [0.00]  [823c80da] ? dmi_save_one_device+0x81/0x81
   [0.00]  [81899e6b] ? dmi_table+0x3f/0x94
   [0.00]  [81899e42] ? dmi_table+0x16/0x94
   [0.00]  [823c80da] ? dmi_save_one_device+0x81/0x81
   [0.00]  [823c80da] ? dmi_save_one_device+0x81/0x81
   [0.00]  [823c7eff] dmi_walk_early+0x44/0x69
   [0.00]  [823c88a2] dmi_present+0x180/0x1ff
   [0.00]  [823c8ab3] dmi_scan_machine+0x144/0x191
   [0.00]  [82370702] ? loglevel+0x31/0x31
   [0.00]  [82377f52] setup_arch+0x490/0xc73
   [0.00]  [819eef73] ? printk+0x4d/0x4f
   [0.00]  [82370b90] start_kernel+0x9c/0x43f
   [0.00]  [82370120] ? early_idt_handlers+0x120/0x120
   [0.00]  [823704a2] x86_64_start_reservations+0x2a/0x2c
   [0.00]  [823705df] x86_64_start_kernel+0x13b/0x14a
   [0.00] RIP 0x4
  
 
  This is most puzzling. Could anyone decode the exception?
  This looks like the non-EFI path through dmi_scan_machine(), which
  calls dmi_present() /after/ calling dmi_smbios3_present(), which
  apparently has not found the _SM3_ header tag. Or could the call stack
  be inaccurate?
 
  Anyway, it would be good to know the exact type of the platform,
 
  It's a Nehalem-EP machine, with 16 CPU and 12G memory.
 
  and
  perhaps we could find out if there is an inadvertent _SM3_ tag
  somewhere in the 0xF - 0xF range?
 
  Sorry, how?
 
 
 That's not a brand new machine, so I suppose there wouldn't be a
 SMBIOS 3.0 header lurking in there.
 
 Anyway, if you are in a position to try things, could you apply this
 
 --- a/drivers/firmware/dmi_scan.c
 +++ b/drivers/firmware/dmi_scan.c
 @@ -617,7 +617,7 @@ void __init dmi_scan_machine(void)
 memset(buf, 0, 16);
 for (q = p; q  p + 0x1; q += 16) {
 memcpy_fromio(buf + 16, q, 16);
 -   if (!dmi_smbios3_present(buf) || !dmi_present(buf)) {
 +   if (!dmi_present(buf)) {
 dmi_available = 1;
 dmi_early_unmap(p, 0x1);
 goto out;
 
 and try again?

kernel boots perfectly with this patch applied.

--yliu

 That is the only change that is relevant to the non-EFI
 code path which this machine appears to take, so if this fixes things,
 that would be valuable information even if it doesn't tell us exactly
 what is going wrong.
 
 Thanks,
 Ard.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [LKP] [dmi] PANIC: early exception 0e rip 10:ffffffff81899e6b error 9 cr2 ffffffffff240000

2014-11-07 Thread Yuanhan Liu
On Fri, Nov 07, 2014 at 09:23:56AM +0100, Ard Biesheuvel wrote:
 On 7 November 2014 09:13, Yuanhan Liu yuanhan@linux.intel.com wrote:
  On Fri, Nov 07, 2014 at 08:44:40AM +0100, Ard Biesheuvel wrote:
  On 7 November 2014 08:37, Yuanhan Liu yuanhan@linux.intel.com wrote:
   On Fri, Nov 07, 2014 at 08:17:36AM +0100, Ard Biesheuvel wrote:
   On 7 November 2014 06:47, LKP l...@01.org wrote:
FYI, we noticed the below changes on
   
https://git.linaro.org/people/ard.biesheuvel/linux-arm efi-for-3.19
commit aacdce6e880894acb57d71dcb2e3fc61b4ed4e96 (dmi: add support 
for SMBIOS 3.0 64-bit entry point)
   
   
+---+++
|   | 2fa165a26c | aacdce6e88 |
+---+++
| boot_successes| 20 | 10 |
| early-boot-hang   | 1  ||
| boot_failures | 0  | 5  |
| PANIC:early_exception | 0  | 5  |
+---+++
   
   
[0.00] BIOS-e820: [mem 0x0001-0x00036fff] 
usable
[0.00] bootconsole [earlyser0] enabled
[0.00] NX (Execute Disable) protection: active
PANIC: early exception 0e rip 10:81899e6b error 9 cr2 
ff24
[0.00] CPU: 0 PID: 0 Comm: swapper Not tainted 
3.18.0-rc2-gc5221e6 #1
[0.00]   82203d30 819f0a6e 
03f8
[0.00]  ff24 82203e18 823701b0 
82511401
[0.00]   0ba3  
ff24
[0.00] Call Trace:
[0.00]  [819f0a6e] dump_stack+0x4e/0x68
[0.00]  [823701b0] early_idt_handler+0x90/0xb7
[0.00]  [823c80da] ? dmi_save_one_device+0x81/0x81
[0.00]  [81899e6b] ? dmi_table+0x3f/0x94
[0.00]  [81899e42] ? dmi_table+0x16/0x94
[0.00]  [823c80da] ? dmi_save_one_device+0x81/0x81
[0.00]  [823c80da] ? dmi_save_one_device+0x81/0x81
[0.00]  [823c7eff] dmi_walk_early+0x44/0x69
[0.00]  [823c88a2] dmi_present+0x180/0x1ff
[0.00]  [823c8ab3] dmi_scan_machine+0x144/0x191
[0.00]  [82370702] ? loglevel+0x31/0x31
[0.00]  [82377f52] setup_arch+0x490/0xc73
[0.00]  [819eef73] ? printk+0x4d/0x4f
[0.00]  [82370b90] start_kernel+0x9c/0x43f
[0.00]  [82370120] ? early_idt_handlers+0x120/0x120
[0.00]  [823704a2] 
x86_64_start_reservations+0x2a/0x2c
[0.00]  [823705df] x86_64_start_kernel+0x13b/0x14a
[0.00] RIP 0x4
   
  
   This is most puzzling. Could anyone decode the exception?
   This looks like the non-EFI path through dmi_scan_machine(), which
   calls dmi_present() /after/ calling dmi_smbios3_present(), which
   apparently has not found the _SM3_ header tag. Or could the call stack
   be inaccurate?
  
   Anyway, it would be good to know the exact type of the platform,
  
   It's a Nehalem-EP machine, with 16 CPU and 12G memory.
  
   and
   perhaps we could find out if there is an inadvertent _SM3_ tag
   somewhere in the 0xF - 0xF range?
  
   Sorry, how?
  
 
  That's not a brand new machine, so I suppose there wouldn't be a
  SMBIOS 3.0 header lurking in there.
 
  Anyway, if you are in a position to try things, could you apply this
 
  --- a/drivers/firmware/dmi_scan.c
  +++ b/drivers/firmware/dmi_scan.c
  @@ -617,7 +617,7 @@ void __init dmi_scan_machine(void)
  memset(buf, 0, 16);
  for (q = p; q  p + 0x1; q += 16) {
  memcpy_fromio(buf + 16, q, 16);
  -   if (!dmi_smbios3_present(buf) || 
  !dmi_present(buf)) {
  +   if (!dmi_present(buf)) {
  dmi_available = 1;
  dmi_early_unmap(p, 0x1);
  goto out;
 
  and try again?
 
  kernel boots perfectly with this patch applied.
 
  --yliu
 
 
 Thank you! Very useful to know
 

Sigh, I made a silly error: I specified the wrong commit while testing your
patch. Sorry for that.

And I tested it again with your former patch; sorry, the panic still
happens.

--yliu

 Sorry to keep you busy, but could you please apply this on top of the
 previous patch
 
 --- a/drivers/firmware/dmi_scan.c
 +++ b/drivers/firmware/dmi_scan.c
 @@ -617,6 +617,8 @@ void __init dmi_scan_machine(void)
 memset(buf, 0, 16);
 for (q = p; q  p + 0x1; q += 16) {
 memcpy_fromio(buf + 16, q, 16);
 +   if (memcmp(buf, _SM3_, 5) == 0

Re: [LKP] [dmi] PANIC: early exception 0e rip 10:ffffffff81899e6b error 9 cr2 ffffffffff240000

2014-11-07 Thread Yuanhan Liu
On Fri, Nov 07, 2014 at 10:03:55AM +0100, Ard Biesheuvel wrote:
 On 7 November 2014 09:46, Yuanhan Liu yuanhan@linux.intel.com wrote:
  On Fri, Nov 07, 2014 at 09:23:56AM +0100, Ard Biesheuvel wrote:
  On 7 November 2014 09:13, Yuanhan Liu yuanhan@linux.intel.com wrote:
   On Fri, Nov 07, 2014 at 08:44:40AM +0100, Ard Biesheuvel wrote:
   On 7 November 2014 08:37, Yuanhan Liu yuanhan@linux.intel.com 
   wrote:
On Fri, Nov 07, 2014 at 08:17:36AM +0100, Ard Biesheuvel wrote:
On 7 November 2014 06:47, LKP l...@01.org wrote:
 FYI, we noticed the below changes on

 https://git.linaro.org/people/ard.biesheuvel/linux-arm efi-for-3.19
 commit aacdce6e880894acb57d71dcb2e3fc61b4ed4e96 (dmi: add support 
 for SMBIOS 3.0 64-bit entry point)


 +---+++
 |   | 2fa165a26c | aacdce6e88 |
 +---+++
 | boot_successes| 20 | 10 |
 | early-boot-hang   | 1  ||
 | boot_failures | 0  | 5  |
 | PANIC:early_exception | 0  | 5  |
 +---+++


 [0.00] BIOS-e820: [mem 
 0x0001-0x00036fff] usable
 [0.00] bootconsole [earlyser0] enabled
 [0.00] NX (Execute Disable) protection: active
 PANIC: early exception 0e rip 10:81899e6b error 9 cr2 
 ff24
 [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted 
 3.18.0-rc2-gc5221e6 #1
 [0.00]   82203d30 819f0a6e 
 03f8
 [0.00]  ff24 82203e18 823701b0 
 82511401
 [0.00]   0ba3  
 ff24
 [0.00] Call Trace:
 [0.00]  [819f0a6e] dump_stack+0x4e/0x68
 [0.00]  [823701b0] early_idt_handler+0x90/0xb7
 [0.00]  [823c80da] ? 
 dmi_save_one_device+0x81/0x81
 [0.00]  [81899e6b] ? dmi_table+0x3f/0x94
 [0.00]  [81899e42] ? dmi_table+0x16/0x94
 [0.00]  [823c80da] ? 
 dmi_save_one_device+0x81/0x81
 [0.00]  [823c80da] ? 
 dmi_save_one_device+0x81/0x81
 [0.00]  [823c7eff] dmi_walk_early+0x44/0x69
 [0.00]  [823c88a2] dmi_present+0x180/0x1ff
 [0.00]  [823c8ab3] dmi_scan_machine+0x144/0x191
 [0.00]  [82370702] ? loglevel+0x31/0x31
 [0.00]  [82377f52] setup_arch+0x490/0xc73
 [0.00]  [819eef73] ? printk+0x4d/0x4f
 [0.00]  [82370b90] start_kernel+0x9c/0x43f
 [0.00]  [82370120] ? 
 early_idt_handlers+0x120/0x120
 [0.00]  [823704a2] 
 x86_64_start_reservations+0x2a/0x2c
 [0.00]  [823705df] 
 x86_64_start_kernel+0x13b/0x14a
 [0.00] RIP 0x4

   
This is most puzzling. Could anyone decode the exception?
This looks like the non-EFI path through dmi_scan_machine(), which
calls dmi_present() /after/ calling dmi_smbios3_present(), which
apparently has not found the _SM3_ header tag. Or could the call 
stack
be inaccurate?
   
Anyway, it would be good to know the exact type of the platform,
   
 It's a Nehalem-EP machine, with 16 CPU and 12G memory.
   
and
perhaps we could find out if there is an inadvertent _SM3_ tag
somewhere in the 0xF - 0xF range?
   
Sorry, how?
   
  
   That's not a brand new machine, so I suppose there wouldn't be a
   SMBIOS 3.0 header lurking in there.
  
   Anyway, if you are in a position to try things, could you apply this
  
   --- a/drivers/firmware/dmi_scan.c
   +++ b/drivers/firmware/dmi_scan.c
   @@ -617,7 +617,7 @@ void __init dmi_scan_machine(void)
   memset(buf, 0, 16);
   for (q = p; q  p + 0x1; q += 16) {
   memcpy_fromio(buf + 16, q, 16);
   -   if (!dmi_smbios3_present(buf) || 
   !dmi_present(buf)) {
   +   if (!dmi_present(buf)) {
   dmi_available = 1;
   dmi_early_unmap(p, 0x1);
   goto out;
  
   and try again?
  
   kernel boots perfectly with this patch applied.
  
   --yliu
  
 
  Thank you! Very useful to know
 
 
  Sigh, I made a silly error: I specified the wrong commit while testing your
  patch. Sorry for that.
 
  And I tested it again, with your former patch, sorry, the panic still
  happens.
 
  --yliu
 
 
 OK, no worries.
 
 Could you please try the attached patch? On my ARM system, it produces
 something like this
 
  == Decoding

Re: [LKP] [dmi] PANIC: early exception 0e rip 10:ffffffff81899e6b error 9 cr2 ffffffffff240000

2014-11-07 Thread Yuanhan Liu
On Fri, Nov 07, 2014 at 09:16:02AM +, Matt Fleming wrote:
 On Fri, 2014-11-07 at 08:17 +0100, Ard Biesheuvel wrote:
  On 7 November 2014 06:47, LKP l...@01.org wrote:
   FYI, we noticed the below changes on
  
   https://git.linaro.org/people/ard.biesheuvel/linux-arm efi-for-3.19
   commit aacdce6e880894acb57d71dcb2e3fc61b4ed4e96 (dmi: add support for 
   SMBIOS 3.0 64-bit entry point)
  
  
   +---+++
   |   | 2fa165a26c | aacdce6e88 |
   +---+++
   | boot_successes| 20 | 10 |
   | early-boot-hang   | 1  ||
   | boot_failures | 0  | 5  |
   | PANIC:early_exception | 0  | 5  |
   +---+++
  
  
   [0.00] BIOS-e820: [mem 0x0001-0x00036fff] 
   usable
   [0.00] bootconsole [earlyser0] enabled
   [0.00] NX (Execute Disable) protection: active
   PANIC: early exception 0e rip 10:81899e6b error 9 cr2 
   ff24
   [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted 
   3.18.0-rc2-gc5221e6 #1
   [0.00]   82203d30 819f0a6e 
   03f8
   [0.00]  ff24 82203e18 823701b0 
   82511401
   [0.00]   0ba3  
   ff24
   [0.00] Call Trace:
   [0.00]  [819f0a6e] dump_stack+0x4e/0x68
   [0.00]  [823701b0] early_idt_handler+0x90/0xb7
   [0.00]  [823c80da] ? dmi_save_one_device+0x81/0x81
   [0.00]  [81899e6b] ? dmi_table+0x3f/0x94
   [0.00]  [81899e42] ? dmi_table+0x16/0x94
   [0.00]  [823c80da] ? dmi_save_one_device+0x81/0x81
   [0.00]  [823c80da] ? dmi_save_one_device+0x81/0x81
   [0.00]  [823c7eff] dmi_walk_early+0x44/0x69
   [0.00]  [823c88a2] dmi_present+0x180/0x1ff
   [0.00]  [823c8ab3] dmi_scan_machine+0x144/0x191
   [0.00]  [82370702] ? loglevel+0x31/0x31
   [0.00]  [82377f52] setup_arch+0x490/0xc73
   [0.00]  [819eef73] ? printk+0x4d/0x4f
   [0.00]  [82370b90] start_kernel+0x9c/0x43f
   [0.00]  [82370120] ? early_idt_handlers+0x120/0x120
   [0.00]  [823704a2] x86_64_start_reservations+0x2a/0x2c
   [0.00]  [823705df] x86_64_start_kernel+0x13b/0x14a
   [0.00] RIP 0x4
  
  
  This is most puzzling. Could anyone decode the exception?
  This looks like the non-EFI path through dmi_scan_machine(), which
  calls dmi_present() /after/ calling dmi_smbios3_present(), which
  apparently has not found the _SM3_ header tag. Or could the call stack
  be inaccurate?
 
 The code triggered a page fault while trying to access
 0xff24, caused because the reserved bit was set in the page
 table and no page was found. Looks like it jumped through a bogus
 pointer.
 
 And yes, the callstack may definitely be wrong - the stack dumper is
 just scraping addresses from the stack, as indicated by the '?' symbol.
 
 Yuanhan, what symbol does 0x81899e6b (the faulting instruction)
 translate to?

I found no System.map for that kernel, so I switched to another kernel;
here is the new panic dmesg:

PANIC: early exception 0e rip 10:8167aa1a error 9 cr2 ff240001
[0.00] CPU: 0 PID: 0 Comm: swapper Not tainted 
3.18.0-rc2-8-g4d3a0be #66
[0.00]  0ba3 81bcfd10 818010a4 
03f8
[0.00]  003e 81bcfdf8 81d801b0 
617420534f49424d
[0.00]  001f ff24  
ff24
[0.00] Call Trace:
[0.00]  [818010a4] dump_stack+0x46/0x58
[0.00]  [81d801b0] early_idt_handler+0x90/0xb7
[0.00]  [81dd4cfc] ? dmi_format_ids.constprop.9+0x13c/0x13c
[0.00]  [8167aa1a] ? dmi_table+0x4a/0xf0
[0.00]  [817fa71b] ? printk+0x61/0x63
[0.00]  [81dd4cfc] ? dmi_format_ids.constprop.9+0x13c/0x13c
[0.00]  [81dd4cfc] ? dmi_format_ids.constprop.9+0x13c/0x13c
[0.00]  [81dd49dc] dmi_walk_early+0x6b/0x90
[0.00]  [81dd52fc] dmi_present+0x1b4/0x23f
[0.00]  [81dd55ab] dmi_scan_machine+0x1d4/0x23a
[0.00]  [81d80120] ? early_idt_handlers+0x120/0x120
[0.00]  [81d883a2] setup_arch+0x462/0xcc6
[0.00]  [81d80120] ? early_idt_handlers+0x120/0x120
[0.00]  [81d80167] ? early_idt_handler+0x47/0xb7
[0.00]  [81d80120] ? early_idt_handlers+0x120/0x120
[0.00]  [81d80cf0] start_kernel+0x97/0x456
[0.00]  [81d80120] ? 

Re: [LKP] [dmi] PANIC: early exception 0e rip 10:ffffffff81899e6b error 9 cr2 ffffffffff240000

2014-11-07 Thread Yuanhan Liu
On Fri, Nov 07, 2014 at 10:35:44AM +0100, Ard Biesheuvel wrote:
 On 7 November 2014 10:26, Yuanhan Liu yuanhan@linux.intel.com wrote:
  On Fri, Nov 07, 2014 at 10:03:55AM +0100, Ard Biesheuvel wrote:
  On 7 November 2014 09:46, Yuanhan Liu yuanhan@linux.intel.com wrote:
   On Fri, Nov 07, 2014 at 09:23:56AM +0100, Ard Biesheuvel wrote:
   On 7 November 2014 09:13, Yuanhan Liu yuanhan@linux.intel.com 
   wrote:
On Fri, Nov 07, 2014 at 08:44:40AM +0100, Ard Biesheuvel wrote:
On 7 November 2014 08:37, Yuanhan Liu yuanhan@linux.intel.com 
wrote:
 On Fri, Nov 07, 2014 at 08:17:36AM +0100, Ard Biesheuvel wrote:
 On 7 November 2014 06:47, LKP l...@01.org wrote:
  FYI, we noticed the below changes on
 
  https://git.linaro.org/people/ard.biesheuvel/linux-arm 
  efi-for-3.19
  commit aacdce6e880894acb57d71dcb2e3fc61b4ed4e96 (dmi: add 
  support for SMBIOS 3.0 64-bit entry point)
 
 
  +---+++
  |   | 2fa165a26c | aacdce6e88 |
  +---+++
  | boot_successes| 20 | 10 |
  | early-boot-hang   | 1  ||
  | boot_failures | 0  | 5  |
  | PANIC:early_exception | 0  | 5  |
  +---+++
 
 
  [0.00] BIOS-e820: [mem 
  0x0001-0x00036fff] usable
  [0.00] bootconsole [earlyser0] enabled
  [0.00] NX (Execute Disable) protection: active
  PANIC: early exception 0e rip 10:81899e6b error 9 cr2 
  ff24
  [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted 
  3.18.0-rc2-gc5221e6 #1
  [0.00]   82203d30 
  819f0a6e 03f8
  [0.00]  ff24 82203e18 
  823701b0 82511401
  [0.00]   0ba3 
   ff24
  [0.00] Call Trace:
  [0.00]  [819f0a6e] dump_stack+0x4e/0x68
  [0.00]  [823701b0] early_idt_handler+0x90/0xb7
  [0.00]  [823c80da] ? 
  dmi_save_one_device+0x81/0x81
  [0.00]  [81899e6b] ? dmi_table+0x3f/0x94
  [0.00]  [81899e42] ? dmi_table+0x16/0x94
  [0.00]  [823c80da] ? 
  dmi_save_one_device+0x81/0x81
  [0.00]  [823c80da] ? 
  dmi_save_one_device+0x81/0x81
  [0.00]  [823c7eff] dmi_walk_early+0x44/0x69
  [0.00]  [823c88a2] dmi_present+0x180/0x1ff
  [0.00]  [823c8ab3] 
  dmi_scan_machine+0x144/0x191
  [0.00]  [82370702] ? loglevel+0x31/0x31
  [0.00]  [82377f52] setup_arch+0x490/0xc73
  [0.00]  [819eef73] ? printk+0x4d/0x4f
  [0.00]  [82370b90] start_kernel+0x9c/0x43f
  [0.00]  [82370120] ? 
  early_idt_handlers+0x120/0x120
  [0.00]  [823704a2] 
  x86_64_start_reservations+0x2a/0x2c
  [0.00]  [823705df] 
  x86_64_start_kernel+0x13b/0x14a
  [0.00] RIP 0x4
 

 This is most puzzling. Could anyone decode the exception?
 This looks like the non-EFI path through dmi_scan_machine(), which
 calls dmi_present() /after/ calling dmi_smbios3_present(), which
 apparently has not found the _SM3_ header tag. Or could the call 
 stack
 be inaccurate?

 Anyway, it would be good to know the exact type of the platform,

 It's a Nehalem-EP machine, with 16 CPU and 12G memory.

 and
 perhaps we could find out if there is an inadvertent _SM3_ tag
 somewhere in the 0xF - 0xF range?

 Sorry, how?

   
That's not a brand new machine, so I suppose there wouldn't be a
SMBIOS 3.0 header lurking in there.
   
Anyway, if you are in a position to try things, could you apply this
   
--- a/drivers/firmware/dmi_scan.c
+++ b/drivers/firmware/dmi_scan.c
@@ -617,7 +617,7 @@ void __init dmi_scan_machine(void)
memset(buf, 0, 16);
for (q = p; q  p + 0x1; q += 16) {
memcpy_fromio(buf + 16, q, 16);
-   if (!dmi_smbios3_present(buf) || 
!dmi_present(buf)) {
+   if (!dmi_present(buf)) {
dmi_available = 1;
dmi_early_unmap(p, 0x1);
goto out;
   
and try again?
   
kernel boots perfectly with this patch applied.
   
--yliu
   
  
   Thank you! Very useful to know
  
  
   Sigh, I made a silly error: I specified the wrong commit while testing

Re: [LKP] [dmi] PANIC: early exception 0e rip 10:ffffffff81899e6b error 9 cr2 ffffffffff240000

2014-11-06 Thread Yuanhan Liu
On Fri, Nov 07, 2014 at 08:17:36AM +0100, Ard Biesheuvel wrote:
> On 7 November 2014 06:47, LKP  wrote:
> > FYI, we noticed the below changes on
> >
> > https://git.linaro.org/people/ard.biesheuvel/linux-arm efi-for-3.19
> > commit aacdce6e880894acb57d71dcb2e3fc61b4ed4e96 ("dmi: add support for 
> > SMBIOS 3.0 64-bit entry point")
> >
> >
> > +---+++
> > |   | 2fa165a26c | aacdce6e88 |
> > +---+++
> > | boot_successes| 20 | 10 |
> > | early-boot-hang   | 1  ||
> > | boot_failures | 0  | 5  |
> > | PANIC:early_exception | 0  | 5  |
> > +---+++
> >
> >
> > [0.00] BIOS-e820: [mem 0x0001-0x00036fff] usable
> > [0.00] bootconsole [earlyser0] enabled
> > [0.00] NX (Execute Disable) protection: active
> > PANIC: early exception 0e rip 10:81899e6b error 9 cr2 
> > ff24
> > [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted 3.18.0-rc2-gc5221e6 
> > #1
> > [0.00]   82203d30 819f0a6e 
> > 03f8
> > [0.00]  ff24 82203e18 823701b0 
> > 82511401
> > [0.00]   0ba3  
> > ff24
> > [0.00] Call Trace:
> > [0.00]  [] dump_stack+0x4e/0x68
> > [0.00]  [] early_idt_handler+0x90/0xb7
> > [0.00]  [] ? dmi_save_one_device+0x81/0x81
> > [0.00]  [] ? dmi_table+0x3f/0x94
> > [0.00]  [] ? dmi_table+0x16/0x94
> > [0.00]  [] ? dmi_save_one_device+0x81/0x81
> > [0.00]  [] ? dmi_save_one_device+0x81/0x81
> > [0.00]  [] dmi_walk_early+0x44/0x69
> > [0.00]  [] dmi_present+0x180/0x1ff
> > [0.00]  [] dmi_scan_machine+0x144/0x191
> > [0.00]  [] ? loglevel+0x31/0x31
> > [0.00]  [] setup_arch+0x490/0xc73
> > [0.00]  [] ? printk+0x4d/0x4f
> > [0.00]  [] start_kernel+0x9c/0x43f
> > [0.00]  [] ? early_idt_handlers+0x120/0x120
> > [0.00]  [] x86_64_start_reservations+0x2a/0x2c
> > [0.00]  [] x86_64_start_kernel+0x13b/0x14a
> > [0.00] RIP 0x4
> >
> 
> This is most puzzling. Could anyone decode the exception?
> This looks like the non-EFI path through dmi_scan_machine(), which
> calls dmi_present() /after/ calling dmi_smbios3_present(), which
> apparently has not found the _SM3_ header tag. Or could the call stack
> be inaccurate?
> 
> Anyway, it would be good to know the exact type of the platform,

It's a Nehalem-EP machine, with 16 CPU and 12G memory.

> and
> perhaps we could find out if there is an inadvertent _SM3_ tag
> somewhere in the 0xF - 0xF range?

Sorry, how?

--yliu
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [LKP] [dmi] PANIC: early exception 0e rip 10:ffffffff81899e6b error 9 cr2 ffffffffff240000

2014-11-06 Thread Yuanhan Liu
On Fri, Nov 07, 2014 at 08:17:36AM +0100, Ard Biesheuvel wrote:
 On 7 November 2014 06:47, LKP l...@01.org wrote:
  FYI, we noticed the below changes on
 
  https://git.linaro.org/people/ard.biesheuvel/linux-arm efi-for-3.19
  commit aacdce6e880894acb57d71dcb2e3fc61b4ed4e96 (dmi: add support for 
  SMBIOS 3.0 64-bit entry point)
 
 
  +---+++
  |   | 2fa165a26c | aacdce6e88 |
  +---+++
  | boot_successes| 20 | 10 |
  | early-boot-hang   | 1  ||
  | boot_failures | 0  | 5  |
  | PANIC:early_exception | 0  | 5  |
  +---+++
 
 
  [0.00] BIOS-e820: [mem 0x0001-0x00036fff] usable
  [0.00] bootconsole [earlyser0] enabled
  [0.00] NX (Execute Disable) protection: active
  PANIC: early exception 0e rip 10:81899e6b error 9 cr2 
  ff24
  [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted 3.18.0-rc2-gc5221e6 
  #1
  [0.00]   82203d30 819f0a6e 
  03f8
  [0.00]  ff24 82203e18 823701b0 
  82511401
  [0.00]   0ba3  
  ff24
  [0.00] Call Trace:
  [0.00]  [819f0a6e] dump_stack+0x4e/0x68
  [0.00]  [823701b0] early_idt_handler+0x90/0xb7
  [0.00]  [823c80da] ? dmi_save_one_device+0x81/0x81
  [0.00]  [81899e6b] ? dmi_table+0x3f/0x94
  [0.00]  [81899e42] ? dmi_table+0x16/0x94
  [0.00]  [823c80da] ? dmi_save_one_device+0x81/0x81
  [0.00]  [823c80da] ? dmi_save_one_device+0x81/0x81
  [0.00]  [823c7eff] dmi_walk_early+0x44/0x69
  [0.00]  [823c88a2] dmi_present+0x180/0x1ff
  [0.00]  [823c8ab3] dmi_scan_machine+0x144/0x191
  [0.00]  [82370702] ? loglevel+0x31/0x31
  [0.00]  [82377f52] setup_arch+0x490/0xc73
  [0.00]  [819eef73] ? printk+0x4d/0x4f
  [0.00]  [82370b90] start_kernel+0x9c/0x43f
  [0.00]  [82370120] ? early_idt_handlers+0x120/0x120
  [0.00]  [823704a2] x86_64_start_reservations+0x2a/0x2c
  [0.00]  [823705df] x86_64_start_kernel+0x13b/0x14a
  [0.00] RIP 0x4
 
 
 This is most puzzling. Could anyone decode the exception?
 This looks like the non-EFI path through dmi_scan_machine(), which
 calls dmi_present() /after/ calling dmi_smbios3_present(), which
 apparently has not found the _SM3_ header tag. Or could the call stack
 be inaccurate?
 
 Anyway, it would be good to know the exact type of the platform,

It's a Nehalem-EP machine, with 16 CPU and 12G memory.

 and
 perhaps we could find out if there is an inadvertent _SM3_ tag
 somewhere in the 0xF - 0xF range?

Sorry, how?

--yliu
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 0/3] Shrinkers and proportional reclaim

2014-05-22 Thread Yuanhan Liu
On Thu, May 22, 2014 at 05:30:51PM +0100, Mel Gorman wrote:
> On Fri, May 23, 2014 at 12:14:16AM +0800, Yuanhan Liu wrote:
> > On Thu, May 22, 2014 at 10:09:36AM +0100, Mel Gorman wrote:
> > > This series is aimed at regressions noticed during reclaim activity. The
> > > first two patches are shrinker patches that were posted ages ago but never
> > > merged for reasons that are unclear to me. I'm posting them again to see 
> > > if
> > > there was a reason they were dropped or if they just got lost. Dave?  
> > > Time?
> > > The last patch adjusts proportional reclaim. Yuanhan Liu, can you retest
> > > the vm scalability test cases on a larger machine? Hugh, does this work
> > > for you on the memcg test cases?
> > 
> > Sure, and here is the result. I applied these 3 patches on v3.15-rc6,
> > and head commit is 60c10afd. e82e0561 is the old commit that introduced
> > the regression.  The testserver has 512G memory and 120 CPU.
> > 
> > It's a simple result; if you need more data, I can gather them and send
> > it to you tomorrow:
> > 
> > e82e0561        v3.15-rc6       60c10afd
> > 
> > 18560785        12232122        38868453
> >                 -34%            +109%
> > 
> > As you can see, the performance is back, and it is way much better ;)
> > 
> 
> Thanks a lot for that and the quick response. It is much appreciated.

Welcome! And sorry that I made a silly mistake. Those numbers are right,
though; I just set up the wrong compare base. I should compare them with
e82e0561's parent, which is 75485363ce85526 in the table below.

Here are the detailed results to make up for the mistake I made ;)

Legend:
~XX%    - stddev percent  (3 runs for each kernel)
[+-]XX% - change percent
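
(For example, the change percents are relative to the first, 75485363ce85526, column:
in the vm-scalability.throughput row below, 18560785 / 35979244 - 1 ≈ -48.4%.)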


75485363ce85526  e82e0561dae9f3ae5a21fc2d3  v3.15-rc6  60c10afd233f3344479d229dc
---------------  -------------------------  ---------  --------------------------

  35979244 ~ 0%    -48.4%   18560785 ~ 0%    -66.0%   12235090 ~ 0%    +8.0%   38868453 ~ 0%   vm-scalability.throughput

     28138 ~ 0%  +7448.2%    2123943 ~ 0%  +2724.5%     794777 ~ 0%    +1.6%      28598 ~ 0%   proc-vmstat.allocstall

       544 ~ 6%    -95.2%         26 ~ 0%    -96.5%         19 ~21%    -6.9%        506 ~ 6%   numa-vmstat.node2.nr_isolated_file
  12009832 ~11%   +368.1%   56215319 ~ 0%   +312.9%   49589361 ~ 1%    +0.7%   12091235 ~ 5%   numa-numastat.node3.numa_foreign
       560 ~ 5%    -95.7%         24 ~12%    -96.9%         17 ~10%    -8.7%        511 ~ 2%   numa-vmstat.node1.nr_isolated_file
   8740137 ~12%   +574.0%   58910256 ~ 0%   +321.0%   36798827 ~ 0%   +21.0%   10578905 ~13%   numa-vmstat.node0.numa_other
   8734988 ~12%   +574.4%   58904944 ~ 0%   +321.2%   36794158 ~ 0%   +21.0%   10572718 ~13%   numa-vmstat.node0.numa_miss
      1308 ~12%   -100.0%          0 ~ 0%   -100.0%          0         +23.3%       1612 ~18%   proc-vmstat.pgscan_direct_throttle
  12294788 ~11%   +401.2%   61622745 ~ 0%   +332.6%   53190547 ~ 0%   -13.2%   10667387 ~ 5%   numa-numastat.node1.numa_foreign
       576 ~ 6%    -91.2%         50 ~22%    -94.3%         33 ~20%   -18.1%        472 ~ 1%   numa-vmstat.node0.nr_isolated_file
        12 ~24%  +2400.0%        316 ~ 4% +13543.7%       1728 ~ 5%  +155.3%         32 ~29%   proc-vmstat.compact_stall
       572 ~ 2%    -96.4%         20 ~18%    -97.6%         13 ~11%   -17.5%        472 ~ 2%   numa-vmstat.node3.nr_isolated_file
      3012 ~12%  +2388.4%      74959 ~ 0%   +254.7%      10685 ~ 1%   -45.4%       1646 ~ 1%   proc-vmstat.pageoutrun
      2312 ~ 3%    -94.2%        133 ~ 4%    -95.8%         97 ~ 8%   -12.6%       2021 ~ 2%   proc-vmstat.nr_isolated_file
   2575163 ~ 0%  +2779.1%   74141888 ~ 0%   +958.0%   27244229 ~ 0%    -1.3%    2542941 ~ 0%   proc-vmstat.pgscan_direct_dma32
  21916603 ~13%  +2519.8%  5.742e+08 ~ 0%  +2868.9%  6.507e+08 ~ 0%   -16.1%   18397644 ~ 5%   proc-vmstat.pgscan_kswapd_normal
     53306 ~24%  +1077.9%     627895 ~ 0%  +2066.2%    1154741 ~ 0%   +23.5%      65815 ~24%   proc-vmstat.pgscan_kswapd_dma32
   2575163 ~ 0%  +2778.6%   74129497 ~ 0%   +957.8%   27239606 ~ 0%    -1.3%    2542353 ~ 0%   proc-vmstat.pgsteal_direct_dma32
  21907744 ~14%  +2520.8%  5.742e+08 ~ 0%  +2870.0%  6.507e+08 ~ 0%   -16.1%   18386641 ~ 5%   proc-vmstat.pgsteal_kswapd_normal
     53306 ~24%  +1077.7%     627796 ~ 0%  +2065.7%    1154436 ~ 0%   +23.3%      65731 ~24%   proc-vmstat.pgsteal_kswapd_dma32
   2967449 ~ 0%  +2432.7%   75156011 ~ 0%   +869.9%   28781337 ~ 0%    -0.7%    2945933 ~ 0%   proc-vmstat.pgalloc_dma32
  13081172 ~11%   +599.4%   91495653 ~ 0%   +337.1%   57180622 ~ 0%   +12.1%   14668141 ~13%   numa-numastat.node0.other_node
  13073426 ~11%   +599.8%   91489575 ~ 0%   +337.4%   57177129 ~ 0%   +12.1%   14660341 ~13%   numa-numastat.node0.numa_miss

Re: [PATCH 0/3] Shrinkers and proportional reclaim

2014-05-22 Thread Yuanhan Liu
On Thu, May 22, 2014 at 10:09:36AM +0100, Mel Gorman wrote:
> This series is aimed at regressions noticed during reclaim activity. The
> first two patches are shrinker patches that were posted ages ago but never
> merged for reasons that are unclear to me. I'm posting them again to see if
> there was a reason they were dropped or if they just got lost. Dave?  Time?
> The last patch adjusts proportional reclaim. Yuanhan Liu, can you retest
> the vm scalability test cases on a larger machine? Hugh, does this work
> for you on the memcg test cases?

Sure, and here is the result. I applied these 3 patches on v3.15-rc6,
and the head commit is 60c10afd. e82e0561 is the old commit that introduced
the regression. The test server has 512G of memory and 120 CPUs.

It's a simple result; if you need more data, I can gather it and send
it to you tomorrow:

e82e0561        v3.15-rc6       60c10afd

18560785        12232122        38868453
                -34%            +109%

As you can see, the performance is back, and it is way better ;)

--yliu
> 
> Based on ext4, I get the following results but unfortunately my larger test
> machines are all unavailable so this is based on a relatively small machine.
> 
> postmark
>   3.15.0-rc53.15.0-rc5
>  vanilla   proportion-v1r4
> Ops/sec Transactions 21.00 (  0.00%)   25.00 ( 19.05%)
> Ops/sec FilesCreate  39.00 (  0.00%)   45.00 ( 15.38%)
> Ops/sec CreateTransact   10.00 (  0.00%)   12.00 ( 20.00%)
> Ops/sec FilesDeleted   6202.00 (  0.00%) 6202.00 (  0.00%)
> Ops/sec DeleteTransact   11.00 (  0.00%)   12.00 (  9.09%)
> Ops/sec DataRead/MB  25.97 (  0.00%)   30.02 ( 15.59%)
> Ops/sec DataWrite/MB 49.99 (  0.00%)   57.78 ( 15.58%)
> 
> ffsb (mail server simulator)
>  3.15.0-rc5 3.15.0-rc5
> vanillaproportion-v1r4
> Ops/sec readall   9402.63 (  0.00%)  9805.74 (  4.29%)
> Ops/sec create4695.45 (  0.00%)  4781.39 (  1.83%)
> Ops/sec delete 173.72 (  0.00%)   177.23 (  2.02%)
> Ops/sec Transactions 14271.80 (  0.00%) 14764.37 (  3.45%)
> Ops/sec Read37.00 (  0.00%)38.50 (  4.05%)
> Ops/sec Write   18.20 (  0.00%)18.50 (  1.65%)
> 
> dd of a large file
> 3.15.0-rc53.15.0-rc5
>vanilla   proportion-v1r4
> WallTime DownloadTar   75.00 (  0.00%)   61.00 ( 18.67%)
> WallTime DD   423.00 (  0.00%)  401.00 (  5.20%)
> WallTime Delete 2.00 (  0.00%)5.00 (-150.00%)
> 
> stutter (times mmap latency during large amounts of IO)
> 
> 3.15.0-rc53.15.0-rc5
>vanilla   proportion-v1r4
> Unit >5ms Delays  80252. (  0.00%)  81523. ( -1.58%)
> Unit Mmap min 8.2118 (  0.00%)  8.3206 ( -1.33%)
> Unit Mmap mean   17.4614 (  0.00%) 17.2868 (  1.00%)
> Unit Mmap stddev 24.9059 (  0.00%) 34.6771 (-39.23%)
> Unit Mmap max  2811.6433 (  0.00%)   2645.1398 (  5.92%)
> Unit Mmap 90%20.5098 (  0.00%) 18.3105 ( 10.72%)
> Unit Mmap 93%22.9180 (  0.00%) 20.1751 ( 11.97%)
> Unit Mmap 95%25.2114 (  0.00%) 22.4988 ( 10.76%)
> Unit Mmap 99%46.1430 (  0.00%) 43.5952 (  5.52%)
> Unit Ideal  Tput 85.2623 (  0.00%) 78.8906 (  7.47%)
> Unit Tput min44.0666 (  0.00%) 43.9609 (  0.24%)
> Unit Tput mean   45.5646 (  0.00%) 45.2009 (  0.80%)
> Unit Tput stddev  0.9318 (  0.00%)  1.1084 (-18.95%)
> Unit Tput max46.7375 (  0.00%) 46.7539 ( -0.04%)
> 
>  fs/super.c  | 16 +---
>  mm/vmscan.c | 36 +---
>  2 files changed, 34 insertions(+), 18 deletions(-)
> 
> -- 
> 1.8.4.5
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: performance regression due to commit e82e0561("mm: vmscan: obey proportional scanning requirements for kswapd")

2014-03-18 Thread Yuanhan Liu
On Sat, Mar 15, 2014 at 08:56:10PM -0700, Hugh Dickins wrote:
> On Fri, 14 Mar 2014, Mel Gorman wrote:
> > On Thu, Mar 13, 2014 at 05:44:57AM -0700, Hugh Dickins wrote:
> > > On Wed, 12 Mar 2014, Mel Gorman wrote:
> > > > On Tue, Feb 18, 2014 at 04:01:22PM +0800, Yuanhan Liu wrote:
... snip ...

> > > I missed Yuanhan's mail, but seeing your reply reminds me of another
> > > issue with that proportionality patch - or perhaps more thought would
> > > show them to be two sides of the same issue, with just one fix required.
> > > Let me throw our patch into the cauldron.
> > > 
> > > [PATCH] mm: revisit shrink_lruvec's attempt at proportionality
> > > 
> > > We have a memcg reclaim test which exerts a certain amount of pressure,
> > > and expects to see a certain range of page reclaim in response.  It's a
> > > very wide range allowed, but the test repeatably failed on v3.11 onwards,
> > > because reclaim goes wild and frees up almost everything.
> > > 
> > > This wild behaviour bisects to Mel's "scan_adjusted" commit e82e0561dae9
> > > "mm: vmscan: obey proportional scanning requirements for kswapd".  That
> > > attempts to achieve proportionality between anon and file lrus: to the
> > > extent that once one of those is empty, it then tries to empty the other.
> > > Stop that.
> > > 
> > > Signed-off-by: Hugh Dickins 
> > > ---
> > > 
> > > We've been running happily with this for months; but all that time it's
> > > been on my TODO list with a "needs more thought" tag before we could
> > > upstream it, and I never got around to that.  We also have a somewhat
> > > similar, but older and quite independent, fix to get_scan_count() from
> > > Suleiman, which I'd meant to send along at the same time: I'll dig that
> > > one out tomorrow or the day after.
> 
> I've sent that one out now in a new thread
> https://lkml.org/lkml/2014/3/15/168
> and also let's tie these together with Hannes's
> https://lkml.org/lkml/2014/3/14/277
> 
> > > 
> > 
> > I ran a battery of page reclaim related tests against it on top of
> > 3.14-rc6. Workloads showed small improvements in their absolute performance
> > but actual IO behaviour looked much better in some tests.  This is the
> > iostats summary for the test that showed the biggest different -- dd of
> > a large file on ext3.
> > 
> > 3.14.0-rc6  3.14.0-rc6
> >vanilla  proportional-v1r1
> > Meansda-avgqz   1045.64 224.18  
> > Meansda-await   2120.12 506.77  
> > Meansda-r_await 18.61   19.78   
> > Meansda-w_await 11089.602126.35 
> > Max sda-avgqz   2294.39 787.13  
> > Max sda-await   7074.79 2371.67 
> > Max sda-r_await 503.00  414.00  
> > Max sda-w_await 35721.937249.84 
> > 
> > Not all workloads benefitted. The same workload on ext4 showed no useful
> > difference. btrfs looks like
> > 
> >  3.14.0-rc6 3.14.0-rc6
> >vanilla  proportional-v1r1
> > Meansda-avgqz   762.69  650.39  
> > Meansda-await   2438.46 2495.15 
> > Meansda-r_await 44.18   47.20   
> > Meansda-w_await 6109.19 5139.86 
> > Max sda-avgqz   2203.50 1870.78 
> > Max sda-await   7098.26 6847.21 
> > Max sda-r_await 63.02   156.00  
> > Max sda-w_await 19921.7011085.13
> > 
> > Better but not as dramatically so. I didn't analyse why. A workload that
> > had a large anonymous mapping with large amounts of IO in the background
> > did not show any regressions so based on that and the fact the patch looks
> > ok, here goes nothing;
> > 
> > Acked-by: Mel Gorman 
> 
> Big thank you, Mel, for doing so much work on it, and so very quickly.
> I get quite lost in the numbers myself: I'm much more convinced of it
> by your numbers and ack.
> 
> > 
> > You say it's already been tested for months but it would be nice if the
> > workload that generated this thread was also tested.
> 
> Yes indeed: Yuanhan, do you have time to try this patch for your
> testcase?  I'm hoping it will prove at least as effective as your
> own suggested patch, but please let us know what you find - thanks.

Hi Hugh,

Sure, and sorry to tell you that this patch introduced another halving of
performance, from an avg of 60 MB/s to 30 MB/s, in this testcase.

Moreover, the dd throughput for each process was steady before; however,
it's quite bumpy, from 20 MB/s to 40 MB/s, with this patch applied, and thus
got an avg of 30 MB/s:

11327188992 bytes (11 GB) copied, 300.014 s, 37.8 MB/s
1809373+0 records in
1809372+0 records out
7411187712 bytes (7.4 GB) copied, 300.008 s, 24.7 MB/s
3068285+0 records in
3068284+0 records out
12567691264 bytes (13 GB) copied, 300.001 s, 41.9 MB/s
1883877+0 records in
1883876+0 records out
7716356096 bytes (7.7 GB) copied, 300.002 s, 25.7 MB/s
1807674+0 records in
1807673+0 records out
7404228608 bytes (7.4 GB) copied


Re: performance regression due to commit e82e0561("mm: vmscan: obey proportional scanning requirements for kswapd")

2014-03-13 Thread Yuanhan Liu
On Wed, Mar 12, 2014 at 04:54:47PM +, Mel Gorman wrote:
> On Tue, Feb 18, 2014 at 04:01:22PM +0800, Yuanhan Liu wrote:
> > Hi,
> > 
> > Commit e82e0561("mm: vmscan: obey proportional scanning requirements for
> > kswapd") caused a big performance regression(73%) for vm-scalability/
> > lru-file-readonce testcase on a system with 256G memory without swap.
> > 
> > That testcase simply looks like this:
> >  truncate -s 1T /tmp/vm-scalability.img
> >  mkfs.xfs -q /tmp/vm-scalability.img
> >  mount -o loop /tmp/vm-scalability.img /tmp/vm-scalability
> > 
> >  SPARESE_FILE="/tmp/vm-scalability/sparse-lru-file-readonce"
> >  for i in `seq 1 120`; do
> >  truncate $SPARESE_FILE-$i -s 36G
> >  timeout --foreground -s INT 300 dd bs=4k if=$SPARESE_FILE-$i 
> > of=/dev/null
> >  done
> > 
> >  wait
> > 
> 
> The filename implies that it's a sparse file with no IO but does not say
> what the truncate function/program/whatever actually does.

It's actually the /usr/bin/truncate file from coreutils.

> If it's really a
> sparse file then the dd process should be reading zeros and writing them to
> NULL without IO. Where are pages being dirtied?

Sorry, my bad. I was wrong; I meant "the speed of getting new
pages", not "the speed of dirtying pages".

> Does the truncate command
> really create a sparse file or is it something else?
> 
> > Actually, it's not the newlly added code(obey proportional scanning)
> > in that commit caused the regression. But instead, it's the following
> > change:
> > +
> > +   if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
> > +   continue;
> > +
> > 
> > 
> > -   if (nr_reclaimed >= nr_to_reclaim &&
> > -   sc->priority < DEF_PRIORITY)
> > +   if (global_reclaim(sc) && !current_is_kswapd())
> > break;
> > 
> > The difference is that we might reclaim more than requested before
> > in the first round reclaimming(sc->priority == DEF_PRIORITY).
> > 
> > So, for a testcase like lru-file-readonce, the dirty rate is fast, and
> > reclaimming SWAP_CLUSTER_MAX(32 pages) each time is not enough for catching
> > up the dirty rate. And thus page allocation stalls, and performance drops:
> > 
> >O for e82e0561
> >* for parent commit
> > 
> > proc-vmstat.allocstall
> > 
> >  2e+06 ++---+
> >1.8e+06 O+  OO   O   |
> >||
> >1.6e+06 ++   |
> >1.4e+06 ++   |
> >||
> >1.2e+06 ++   |
> >  1e+06 ++   |
> > 80 ++   |
> >||
> > 60 ++   |
> > 40 ++   |
> >||
> > 20 *+..**...*...*
> >  0 ++---+
> > 
> >vm-scalability.throughput
> > 
> >2.2e+07 ++---+
> >||
> >  2e+07 *+..**...*...*
> >1.8e+07 ++   |
> >||
> >1.6e+07 ++


Re: performance regression due to commit e82e0561("mm: vmscan: obey proportional scanning requirements for kswapd")

2014-03-07 Thread Yuanhan Liu
ping...

On Tue, Feb 18, 2014 at 04:01:22PM +0800, Yuanhan Liu wrote:
> Hi,
> 
> Commit e82e0561("mm: vmscan: obey proportional scanning requirements for
> kswapd") caused a big performance regression(73%) for vm-scalability/
> lru-file-readonce testcase on a system with 256G memory without swap.
> 
> That testcase simply looks like this:
>  truncate -s 1T /tmp/vm-scalability.img
>  mkfs.xfs -q /tmp/vm-scalability.img
>  mount -o loop /tmp/vm-scalability.img /tmp/vm-scalability
> 
>  SPARESE_FILE="/tmp/vm-scalability/sparse-lru-file-readonce"
>  for i in `seq 1 120`; do
>  truncate $SPARESE_FILE-$i -s 36G
>  timeout --foreground -s INT 300 dd bs=4k if=$SPARESE_FILE-$i 
> of=/dev/null
>  done
> 
>  wait
> 
> Actually, it's not the newlly added code(obey proportional scanning)
> in that commit caused the regression. But instead, it's the following
> change:
> +
> +   if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
> +   continue;
> +
> 
> 
> -   if (nr_reclaimed >= nr_to_reclaim &&
> -   sc->priority < DEF_PRIORITY)
> +   if (global_reclaim(sc) && !current_is_kswapd())
> break;
> 
> The difference is that we might reclaim more than requested before
> in the first round reclaimming(sc->priority == DEF_PRIORITY).
> 
> So, for a testcase like lru-file-readonce, the dirty rate is fast, and
> reclaimming SWAP_CLUSTER_MAX(32 pages) each time is not enough for catching
> up the dirty rate. And thus page allocation stalls, and performance drops:
> 
>O for e82e0561
>* for parent commit
> 
> proc-vmstat.allocstall
> 
>  2e+06 ++---+
>1.8e+06 O+  OO   O   |
>||
>1.6e+06 ++   |
>1.4e+06 ++   |
>||
>1.2e+06 ++   |
>  1e+06 ++   |
> 80 ++   |
>||
> 60 ++   |
> 40 ++   |
>||
> 20 *+..**...*...*
>  0 ++---+
> 
>vm-scalability.throughput
> 
>2.2e+07 ++---+
>||
>  2e+07 *+..**...*...*
>1.8e+07 ++   |
>||
>1.6e+07 ++   |
>||
>1.4e+07 ++   |
>||
>1.2e+07 ++   |
>  1e+07 ++   |
>||
>  8e+06 ++  OO   O   |
>O|
>  6e+06 ++---+
> 
> I made a patch which simply keeps reclaimming more if sc->priority == 
> DEF_PRIORITY.
> I'm not sure it's the right way to go or not. Anyway, I pasted it here for 
> comments.
> 
> ---
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 26ad67f..37004a8 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1828,7 +1828,16 @@ static void shrink_lruvec(struct lruvec *lruvec, 
> struct scan_control *sc)
>   unsigned long nr_reclaimed = 0;
>   unsigned long nr_to_


performance regression due to commit e82e0561("mm: vmscan: obey proportional scanning requirements for kswapd")

2014-02-18 Thread Yuanhan Liu
Hi,

Commit e82e0561("mm: vmscan: obey proportional scanning requirements for
kswapd") caused a big performance regression(73%) for vm-scalability/
lru-file-readonce testcase on a system with 256G memory without swap.

That testcase simply looks like this:
 truncate -s 1T /tmp/vm-scalability.img
 mkfs.xfs -q /tmp/vm-scalability.img
 mount -o loop /tmp/vm-scalability.img /tmp/vm-scalability

 SPARESE_FILE="/tmp/vm-scalability/sparse-lru-file-readonce"
 for i in `seq 1 120`; do
 truncate $SPARESE_FILE-$i -s 36G
 timeout --foreground -s INT 300 dd bs=4k if=$SPARESE_FILE-$i 
of=/dev/null
 done

 wait

Actually, it's not the newly added code (obey proportional scanning)
in that commit that caused the regression. Instead, it's the following
change:
+
+   if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
+   continue;
+


-   if (nr_reclaimed >= nr_to_reclaim &&
-   sc->priority < DEF_PRIORITY)
+   if (global_reclaim(sc) && !current_is_kswapd())
break;

The difference is that, before, we might reclaim more than requested
in the first reclaim round (sc->priority == DEF_PRIORITY).

So, for a testcase like lru-file-readonce, the dirty rate is fast, and
reclaiming SWAP_CLUSTER_MAX (32 pages) each time is not enough to catch
up with it. Thus page allocation stalls, and performance drops:

   O for e82e0561
   * for parent commit

proc-vmstat.allocstall

 2e+06 ++---+
   1.8e+06 O+  OO   O   |
   ||
   1.6e+06 ++   |
   1.4e+06 ++   |
   ||
   1.2e+06 ++   |
 1e+06 ++   |
80 ++   |
   ||
60 ++   |
40 ++   |
   ||
20 *+..**...*...*
 0 ++---+

   vm-scalability.throughput

   2.2e+07 ++---+
   ||
 2e+07 *+..**...*...*
   1.8e+07 ++   |
   ||
   1.6e+07 ++   |
   ||
   1.4e+07 ++   |
   ||
   1.2e+07 ++   |
 1e+07 ++   |
   ||
 8e+06 ++  OO   O   |
   O|
 6e+06 ++---+
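
(To make the behavioural difference concrete, here is a small stand-alone sketch -
ordinary user-space C with made-up helpers and numbers, not the real mm/vmscan.c - of
how the two exit conditions quoted above change how much a single direct-reclaim pass
frees. Only the two "break" conditions are taken from the diff; everything else is an
assumption for illustration.)

/* shrink_lruvec_sketch.c: illustrative simulation of the loop-exit change */
#include <stdio.h>
#include <stdbool.h>

#define SWAP_CLUSTER_MAX 32		/* 32 pages = 128 kB with 4 kB pages */
#define DEF_PRIORITY     12

struct scan_control { unsigned long nr_to_reclaim; int priority; };

static bool global_reclaim(void)    { return true;  }	/* global (non-memcg) reclaim */
static bool current_is_kswapd(void) { return false; }	/* direct reclaim, not kswapd */

/* pretend one pass over the LRU lists always frees 32 pages */
static unsigned long shrink_lists_once(void) { return SWAP_CLUSTER_MAX; }

static unsigned long shrink_lruvec_sim(struct scan_control *sc, bool old_behaviour)
{
	unsigned long nr_reclaimed = 0;
	unsigned long lru_pages = 4096;		/* pages left on the LRUs in this example */

	while (lru_pages) {
		unsigned long freed = shrink_lists_once();
		nr_reclaimed += freed;
		lru_pages -= freed;

		if (nr_reclaimed < sc->nr_to_reclaim)
			continue;

		if (old_behaviour) {
			/* pre-e82e0561: at DEF_PRIORITY, keep reclaiming past the target */
			if (sc->priority < DEF_PRIORITY)
				break;
		} else {
			/* e82e0561: direct global reclaim stops once the target is met */
			if (global_reclaim() && !current_is_kswapd())
				break;
		}
	}
	return nr_reclaimed;
}

int main(void)
{
	struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, .priority = DEF_PRIORITY };

	printf("old behaviour: %lu pages freed per direct reclaim\n",
	       shrink_lruvec_sim(&sc, true));	/* drains the whole 4096-page example */
	printf("new behaviour: %lu pages freed per direct reclaim\n",
	       shrink_lruvec_sim(&sc, false));	/* stops at 32 pages */
	return 0;
}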

I made a patch which simply keeps reclaiming more if sc->priority ==
DEF_PRIORITY. I'm not sure whether it's the right way to go. Anyway, I
pasted it here for comments.

---
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 26ad67f..37004a8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1828,7 +1828,16 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
unsigned long nr_reclaimed = 0;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
struct blk_plug plug;
-   bool scan_adjusted = false;
+   /*
+* On large memory systems, direct reclamming of SWAP_CLUSTER_MAX
+* each time may not catch up the dirty rate in some cases(say,
+* vm-scalability/lru-file-readonce), which may increase the
+* page allocation stall latency in the end.
+*
+* Here we try to reclaim more than requested for the first round
+* (sc->priority == DEF_PRIORITY) to reduce such latency.
+*/
+   bool scan_adjusted = sc->priority == DEF_PRIORITY;
 
  

Re: changes caused by 0d11e6ac("blk-mq: fix use-after-free of request")

2013-12-19 Thread Yuanhan Liu
On Wed, Dec 18, 2013 at 11:29:30AM +0100, Matias Bjørling wrote:
> On 12/18/2013 09:50 AM, Yuanhan Liu wrote:
> >Hi,
> >
> >FYI, we noticed some changes caused by 0d11e6ac("blk-mq: fix use-after-free 
> >of request"):
> >
> 
> The blk-mq accounting was faulty up to that commit. We should
> compare the blk-mq with the previous block layer.
> 
> Could you try to revert the following patches:
> 
> f02b9ac virtio-blk: virtqueue_kick() must be ordered with other...
> 1cf7e9c virtio_blk: blk-mq support
> 
> and compare the two runs (upto 0d11e6ac applied, and the same, with
> the two patches reverted)

Hi Matias,

You are right. Those counters restore back with the two patches
reverted (d1b4e3825c8848b0ea0f).

959a35f13eb785f982d7   0d11e6aca396e679c07b   d1b4e3825c8848b0ea0f
--------------------   --------------------   --------------------
                0.00            60.02 ~42%                   0.00   vpx/micro/xfstests/4HDD-btrfs-generic-quick
                0.00           367.81 ~27%                   0.00   vpx/micro/xfstests/4HDD-ext4-generic-mid
                0.00           411.64 ~13%                   0.00   vpx/micro/xfstests/4HDD-xfs-generic-mid
                0.00           208.39 ~10%                   0.00   vpx/micro/xfstests/4HDD-xfs-generic-quick
                0.00          1047.86                        0.00   TOTAL iostat.vdd.await

959a35f13eb785f982d7   0d11e6aca396e679c07b   d1b4e3825c8848b0ea0f
--------------------   --------------------   --------------------
                0.00           301.60 ~34%                   0.00   vpx/micro/xfstests/4HDD-btrfs-generic-mid
                0.00           249.16 ~12%                   0.00   vpx/micro/xfstests/4HDD-btrfs-generic-quick
                0.00            51.45 ~26%                   0.00   vpx/micro/xfstests/4HDD-ext4-generic-mid
                0.00            91.51 ~21%                   0.04   vpx/micro/xfstests/4HDD-xfs-generic-127
                0.00          1919.27 ~43%                   0.00   vpx/micro/xfstests/4HDD-xfs-generic-mid
                0.00           121.04 ~11%                   0.00   vpx/micro/xfstests/4HDD-xfs-generic-quick
                0.00          2734.03                        0.04   TOTAL iostat.vda.r_await

959a35f13eb785f982d7   0d11e6aca396e679c07b   d1b4e3825c8848b0ea0f
--------------------   --------------------   --------------------
                0.00           406.12 ~10%                   0.00   vpx/micro/xfstests/4HDD-btrfs-generic-mid
                0.00           433.66 ~ 7%                   0.00   vpx/micro/xfstests/4HDD-btrfs-generic-quick
                0.00           807.79 ~15%                   0.00   vpx/micro/xfstests/4HDD-ext4-generic-mid
                0.00            42.94 ~67%                   0.51   vpx/micro/xfstests/4HDD-xfs-generic-127
                0.00           592.20 ~16%                   0.00   vpx/micro/xfstests/4HDD-xfs-generic-mid
                0.00           401.74 ~12%                   0.00   vpx/micro/xfstests/4HDD-xfs-generic-quick
                0.00          2684.45                        0.51   TOTAL iostat.vda.w_await



--yliu
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/




  1   2   3   4   >