[PATCH] async_tx: replace page_address with kmap_atomic
As a page might belong to highmem. Strictly nested kmap_atomic() order is followed according to doc Documentation/vm/highmem.txt CC: Dan Williams CC: Shaohua Li Signed-off-by: Yuanhan Liu --- crypto/async_tx/async_pq.c | 18 +- crypto/async_tx/async_raid6_recov.c | 31 --- crypto/async_tx/async_xor.c | 17 ++--- 3 files changed, 51 insertions(+), 15 deletions(-) diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c index 5d355e0..a408b7e 100644 --- a/crypto/async_tx/async_pq.c +++ b/crypto/async_tx/async_pq.c @@ -136,7 +136,7 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks, BUG_ON(i > disks - 3); /* P or Q can't be zero */ srcs[i] = (void*)raid6_empty_zero_page; } else { - srcs[i] = page_address(blocks[i]) + offset; + srcs[i] = kmap_atomic(blocks[i]) + offset; if (i < disks - 2) { stop = i; if (start == -1) @@ -150,6 +150,12 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks, raid6_call.xor_syndrome(disks, start, stop, len, srcs); } else raid6_call.gen_syndrome(disks, len, srcs); + + for (i = disks; i--; ) { + if (blocks[i]) + kunmap_atomic(srcs[i]); + } + async_tx_sync_epilog(submit); } @@ -395,14 +401,15 @@ async_syndrome_val(struct page **blocks, unsigned int offset, int disks, */ tx = NULL; *pqres = 0; + s = kmap_atomic(spare) + offset; if (p_src) { init_async_submit(submit, ASYNC_TX_XOR_ZERO_DST, NULL, NULL, NULL, scribble); tx = async_xor(spare, blocks, offset, disks-2, len, submit); async_tx_quiesce(); - p = page_address(p_src) + offset; - s = page_address(spare) + offset; + p = kmap_atomic(p_src) + offset; *pqres |= !!memcmp(p, s, len) << SUM_CHECK_P; + kunmap_atomic(p); } if (q_src) { @@ -411,10 +418,11 @@ async_syndrome_val(struct page **blocks, unsigned int offset, int disks, init_async_submit(submit, 0, NULL, NULL, NULL, scribble); tx = async_gen_syndrome(blocks, offset, disks, len, submit); async_tx_quiesce(); - q = page_address(q_src) + offset; - s = page_address(spare) + offset; + q = 
kmap_atomic(q_src) + offset; *pqres |= !!memcmp(q, s, len) << SUM_CHECK_Q; + kunmap_atomic(q); } + kunmap_atomic(s); /* restore P, Q and submit */ P(blocks, disks) = p_src; diff --git a/crypto/async_tx/async_raid6_recov.c b/crypto/async_tx/async_raid6_recov.c index 934a849..abcacb0 100644 --- a/crypto/async_tx/async_raid6_recov.c +++ b/crypto/async_tx/async_raid6_recov.c @@ -80,9 +80,9 @@ async_sum_product(struct page *dest, struct page **srcs, unsigned char *coef, async_tx_quiesce(>depend_tx); amul = raid6_gfmul[coef[0]]; bmul = raid6_gfmul[coef[1]]; - a = page_address(srcs[0]); - b = page_address(srcs[1]); - c = page_address(dest); + a = kmap_atomic(srcs[0]); + b = kmap_atomic(srcs[1]); + c = kmap_atomic(dest); while (len--) { ax= amul[*a++]; @@ -90,6 +90,10 @@ async_sum_product(struct page *dest, struct page **srcs, unsigned char *coef, *c++ = ax ^ bx; } + kunmap_atomic(c); + kunmap_atomic(b); + kunmap_atomic(a); + return NULL; } @@ -147,12 +151,15 @@ async_mult(struct page *dest, struct page *src, u8 coef, size_t len, */ async_tx_quiesce(>depend_tx); qmul = raid6_gfmul[coef]; - d = page_address(dest); - s = page_address(src); + d = kmap_atomic(dest); + s = kmap_atomic(src); while (len--) *d++ = qmul[*s++]; + kunmap_atomic(s); + kunmap_atomic(d); + return NULL; } @@ -372,10 +379,15 @@ async_raid6_2data_recov(int disks, size_t bytes, int faila, int failb, if (blocks[i] == NULL) ptrs[i] = (void *) raid6_empty_zero_page; else - ptrs[i] = page_address(blocks[i]); + ptrs[i] = kmap_atomic(blocks[i]); raid6_2data_recov(disks, bytes, faila, failb, ptrs); +
[PATCH] async_tx: replace page_address with kmap_atomic
As a page might belong to highmem. Strictly nested kmap_atomic() order is followed according to doc Documentation/vm/highmem.txt CC: Dan Williams dan.j.willi...@intel.com CC: Shaohua Li s...@fb.com Signed-off-by: Yuanhan Liu yuanhan@linux.intel.com --- crypto/async_tx/async_pq.c | 18 +- crypto/async_tx/async_raid6_recov.c | 31 --- crypto/async_tx/async_xor.c | 17 ++--- 3 files changed, 51 insertions(+), 15 deletions(-) diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c index 5d355e0..a408b7e 100644 --- a/crypto/async_tx/async_pq.c +++ b/crypto/async_tx/async_pq.c @@ -136,7 +136,7 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks, BUG_ON(i disks - 3); /* P or Q can't be zero */ srcs[i] = (void*)raid6_empty_zero_page; } else { - srcs[i] = page_address(blocks[i]) + offset; + srcs[i] = kmap_atomic(blocks[i]) + offset; if (i disks - 2) { stop = i; if (start == -1) @@ -150,6 +150,12 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks, raid6_call.xor_syndrome(disks, start, stop, len, srcs); } else raid6_call.gen_syndrome(disks, len, srcs); + + for (i = disks; i--; ) { + if (blocks[i]) + kunmap_atomic(srcs[i]); + } + async_tx_sync_epilog(submit); } @@ -395,14 +401,15 @@ async_syndrome_val(struct page **blocks, unsigned int offset, int disks, */ tx = NULL; *pqres = 0; + s = kmap_atomic(spare) + offset; if (p_src) { init_async_submit(submit, ASYNC_TX_XOR_ZERO_DST, NULL, NULL, NULL, scribble); tx = async_xor(spare, blocks, offset, disks-2, len, submit); async_tx_quiesce(tx); - p = page_address(p_src) + offset; - s = page_address(spare) + offset; + p = kmap_atomic(p_src) + offset; *pqres |= !!memcmp(p, s, len) SUM_CHECK_P; + kunmap_atomic(p); } if (q_src) { @@ -411,10 +418,11 @@ async_syndrome_val(struct page **blocks, unsigned int offset, int disks, init_async_submit(submit, 0, NULL, NULL, NULL, scribble); tx = async_gen_syndrome(blocks, offset, disks, len, submit); async_tx_quiesce(tx); - q = 
page_address(q_src) + offset; - s = page_address(spare) + offset; + q = kmap_atomic(q_src) + offset; *pqres |= !!memcmp(q, s, len) SUM_CHECK_Q; + kunmap_atomic(q); } + kunmap_atomic(s); /* restore P, Q and submit */ P(blocks, disks) = p_src; diff --git a/crypto/async_tx/async_raid6_recov.c b/crypto/async_tx/async_raid6_recov.c index 934a849..abcacb0 100644 --- a/crypto/async_tx/async_raid6_recov.c +++ b/crypto/async_tx/async_raid6_recov.c @@ -80,9 +80,9 @@ async_sum_product(struct page *dest, struct page **srcs, unsigned char *coef, async_tx_quiesce(submit-depend_tx); amul = raid6_gfmul[coef[0]]; bmul = raid6_gfmul[coef[1]]; - a = page_address(srcs[0]); - b = page_address(srcs[1]); - c = page_address(dest); + a = kmap_atomic(srcs[0]); + b = kmap_atomic(srcs[1]); + c = kmap_atomic(dest); while (len--) { ax= amul[*a++]; @@ -90,6 +90,10 @@ async_sum_product(struct page *dest, struct page **srcs, unsigned char *coef, *c++ = ax ^ bx; } + kunmap_atomic(c); + kunmap_atomic(b); + kunmap_atomic(a); + return NULL; } @@ -147,12 +151,15 @@ async_mult(struct page *dest, struct page *src, u8 coef, size_t len, */ async_tx_quiesce(submit-depend_tx); qmul = raid6_gfmul[coef]; - d = page_address(dest); - s = page_address(src); + d = kmap_atomic(dest); + s = kmap_atomic(src); while (len--) *d++ = qmul[*s++]; + kunmap_atomic(s); + kunmap_atomic(d); + return NULL; } @@ -372,10 +379,15 @@ async_raid6_2data_recov(int disks, size_t bytes, int faila, int failb, if (blocks[i] == NULL) ptrs[i] = (void *) raid6_empty_zero_page; else - ptrs[i] = page_address(blocks[i]); + ptrs[i] = kmap_atomic(blocks[i
Re: [PATCH 3/3] md/raid5: per hash value and exclusive wait_for_stripe
On Thu, May 14, 2015 at 03:45:11PM +1000, NeilBrown wrote: > On Wed, 29 Apr 2015 10:48:55 +0800 Yuanhan Liu > wrote: > > > diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c > > index 64d5bea..697d77a 100644 > > --- a/drivers/md/raid5.c > > +++ b/drivers/md/raid5.c > > @@ -344,7 +344,8 @@ static void release_inactive_stripe_list(struct r5conf > > *conf, > > int hash) > > { > > int size; > > - bool do_wakeup = false; > > + unsigned long do_wakeup = 0; > > + int i = 0; > > unsigned long flags; > > > > if (hash == NR_STRIPE_HASH_LOCKS) { > > @@ -365,15 +366,19 @@ static void release_inactive_stripe_list(struct > > r5conf *conf, > > !list_empty(list)) > > atomic_dec(>empty_inactive_list_nr); > > list_splice_tail_init(list, conf->inactive_list + hash); > > - do_wakeup = true; > > + do_wakeup |= 1 << (size - 1); > > spin_unlock_irqrestore(conf->hash_locks + hash, flags); > > } > > size--; > > hash--; > > } > > > > + for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) { > > + if (do_wakeup & (1 << i)) > > + wake_up(>wait_for_stripe[i]); > > + } > > + > > hi, > I've been doing some testing and got a lock-up in resize_stripes, waiting > on wait_for_stripe[]. > > Looking at the above code, I think > do_wakeup |= 1 << (size - 1); > should be > do_wakeup |= 1 << hash; > > do you agree? Or am I missing something? Right. Sorry for the careless mistake. --yliu -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 3/3] md/raid5: per hash value and exclusive wait_for_stripe
On Thu, May 14, 2015 at 03:45:11PM +1000, NeilBrown wrote: On Wed, 29 Apr 2015 10:48:55 +0800 Yuanhan Liu yuanhan@linux.intel.com wrote: diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 64d5bea..697d77a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -344,7 +344,8 @@ static void release_inactive_stripe_list(struct r5conf *conf, int hash) { int size; - bool do_wakeup = false; + unsigned long do_wakeup = 0; + int i = 0; unsigned long flags; if (hash == NR_STRIPE_HASH_LOCKS) { @@ -365,15 +366,19 @@ static void release_inactive_stripe_list(struct r5conf *conf, !list_empty(list)) atomic_dec(conf-empty_inactive_list_nr); list_splice_tail_init(list, conf-inactive_list + hash); - do_wakeup = true; + do_wakeup |= 1 (size - 1); spin_unlock_irqrestore(conf-hash_locks + hash, flags); } size--; hash--; } + for (i = 0; i NR_STRIPE_HASH_LOCKS; i++) { + if (do_wakeup (1 i)) + wake_up(conf-wait_for_stripe[i]); + } + hi, I've been doing some testing and got a lock-up in resize_stripes, waiting on wait_for_stripe[]. Looking at the above code, I think do_wakeup |= 1 (size - 1); should be do_wakeup |= 1 hash; do you agree? Or am I missing something? Right. Sorry for the careless mistake. --yliu -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/2] md/raid5: avoid duplicate code
On Fri, May 08, 2015 at 03:28:00PM +1000, NeilBrown wrote: > On Wed, 6 May 2015 17:45:49 +0800 Yuanhan Liu > wrote: > > > Move the code that put one idle sh(hot in cache, but happens to be > > zero referenced) back to active stage to __find_stripe(). Because > > that's what need to do every time you invoke __find_stripe(). > > > > Moving it there avoids duplicate code, as well as makes a bit more > > sense, IMO, as it tells a whole story now. > > Thanks for this. It is a good cleanup. > > However I don't want to make any new changes to the RAID5 code until I find a > couple of bugs that I'm hunting. So I won't apply it just yet. > Remind me in a couple of weeks if I seem to have forgotten. Got it. Thanks. --yliu > > > > > Signed-off-by: Yuanhan Liu > > --- > > drivers/md/raid5.c | 50 ++ > > 1 file changed, 18 insertions(+), 32 deletions(-) > > > > diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c > > index 77dfd72..e7fa818 100644 > > --- a/drivers/md/raid5.c > > +++ b/drivers/md/raid5.c > > @@ -567,8 +567,25 @@ static struct stripe_head *__find_stripe(struct r5conf > > *conf, sector_t sector, > > > > pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); > > hlist_for_each_entry(sh, stripe_hash(conf, sector), hash) > > - if (sh->sector == sector && sh->generation == generation) > > + if (sh->sector == sector && sh->generation == generation) { > > + if (!atomic_inc_not_zero(>count)) { > > + spin_lock(>device_lock); > > + if (!atomic_read(>count)) { > > + if (!test_bit(STRIPE_HANDLE, > > >state)) > > + > > atomic_inc(>active_stripes); > > + BUG_ON(list_empty(>lru) && > > + !test_bit(STRIPE_EXPANDING, > > >state)); > > + list_del_init(>lru); > > + if (sh->group) { > > + sh->group->stripes_cnt--; > > + sh->group = NULL; > > + } > > + } > > + atomic_inc(>count); > > + spin_unlock(>device_lock); > > + } > > return sh; > > + } > > pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); > > return NULL; > > } > > @@ -698,21 +715,6 @@ 
get_active_stripe(struct r5conf *conf, sector_t sector, > > init_stripe(sh, sector, previous); > > atomic_inc(>count); > > } > > - } else if (!atomic_inc_not_zero(>count)) { > > - spin_lock(>device_lock); > > - if (!atomic_read(>count)) { > > - if (!test_bit(STRIPE_HANDLE, >state)) > > - atomic_inc(>active_stripes); > > - BUG_ON(list_empty(>lru) && > > - !test_bit(STRIPE_EXPANDING, >state)); > > - list_del_init(>lru); > > - if (sh->group) { > > - sh->group->stripes_cnt--; > > - sh->group = NULL; > > - } > > - } > > - atomic_inc(>count); > > - spin_unlock(>device_lock); > > } > > } while (sh == NULL); > > > > @@ -771,22 +773,6 @@ static void stripe_add_to_batch_list(struct r5conf > > *conf, struct stripe_head *sh > > hash = stripe_hash_locks_hash(head_sector); > > spin_lock_irq(conf->hash_locks + hash); > > head = __find_stripe(conf, head_sector, conf->generation); > > - if (head && !atomic_inc_not_zero(>count)) { > > - spin_lock(>device_lock); > > - if (!atomic_read(>count)) { > > - if (!test_bit(STRIPE_HANDLE, >state)) > > - atomic_inc(>active_stripes); > > - BUG_ON(list_empty(>lru) &&a
Re: [PATCH 1/2] md/raid5: avoid duplicate code
On Fri, May 08, 2015 at 03:28:00PM +1000, NeilBrown wrote: On Wed, 6 May 2015 17:45:49 +0800 Yuanhan Liu yuanhan@linux.intel.com wrote: Move the code that put one idle sh(hot in cache, but happens to be zero referenced) back to active stage to __find_stripe(). Because that's what need to do every time you invoke __find_stripe(). Moving it there avoids duplicate code, as well as makes a bit more sense, IMO, as it tells a whole story now. Thanks for this. It is a good cleanup. However I don't want to make any new changes to the RAID5 code until I find a couple of bugs that I'm hunting. So I won't apply it just yet. Remind me in a couple of weeks if I seem to have forgotten. Got it. Thanks. --yliu Signed-off-by: Yuanhan Liu yuanhan@linux.intel.com --- drivers/md/raid5.c | 50 ++ 1 file changed, 18 insertions(+), 32 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 77dfd72..e7fa818 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -567,8 +567,25 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, pr_debug(__find_stripe, sector %llu\n, (unsigned long long)sector); hlist_for_each_entry(sh, stripe_hash(conf, sector), hash) - if (sh-sector == sector sh-generation == generation) + if (sh-sector == sector sh-generation == generation) { + if (!atomic_inc_not_zero(sh-count)) { + spin_lock(conf-device_lock); + if (!atomic_read(sh-count)) { + if (!test_bit(STRIPE_HANDLE, sh-state)) + atomic_inc(conf-active_stripes); + BUG_ON(list_empty(sh-lru) + !test_bit(STRIPE_EXPANDING, sh-state)); + list_del_init(sh-lru); + if (sh-group) { + sh-group-stripes_cnt--; + sh-group = NULL; + } + } + atomic_inc(sh-count); + spin_unlock(conf-device_lock); + } return sh; + } pr_debug(__stripe %llu not in cache\n, (unsigned long long)sector); return NULL; } @@ -698,21 +715,6 @@ get_active_stripe(struct r5conf *conf, sector_t sector, init_stripe(sh, sector, previous); atomic_inc(sh-count); } - } else if (!atomic_inc_not_zero(sh-count)) { - 
spin_lock(conf-device_lock); - if (!atomic_read(sh-count)) { - if (!test_bit(STRIPE_HANDLE, sh-state)) - atomic_inc(conf-active_stripes); - BUG_ON(list_empty(sh-lru) - !test_bit(STRIPE_EXPANDING, sh-state)); - list_del_init(sh-lru); - if (sh-group) { - sh-group-stripes_cnt--; - sh-group = NULL; - } - } - atomic_inc(sh-count); - spin_unlock(conf-device_lock); } } while (sh == NULL); @@ -771,22 +773,6 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh hash = stripe_hash_locks_hash(head_sector); spin_lock_irq(conf-hash_locks + hash); head = __find_stripe(conf, head_sector, conf-generation); - if (head !atomic_inc_not_zero(head-count)) { - spin_lock(conf-device_lock); - if (!atomic_read(head-count)) { - if (!test_bit(STRIPE_HANDLE, head-state)) - atomic_inc(conf-active_stripes); - BUG_ON(list_empty(head-lru) - !test_bit(STRIPE_EXPANDING, head-state)); - list_del_init(head-lru); - if (head-group) { - head-group-stripes_cnt--; - head-group = NULL; - } - } - atomic_inc(head-count); - spin_unlock(conf-device_lock); - } spin_unlock_irq(conf-hash_locks + hash); if (!head) -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/2] md/raid5: remove unnecessary sh->count check
Remove the unnecessary "!atomic_read(>count)" check, as the previous "atomic_inc_not_zero(>count)" check assures sh->count to be 0. The only reason I can think of that we need such check is to consider the lock race issue. First of all, I doubt there is another process could modify an in-cache but zero referenced sh while it's being protected by a hash lock. Hence, I would say sh->count will be consistent to 0 in that "if !atomic_inc_not_zero" block. Secondly, just assume there is a chance that someone outside the lock modifies sh->count(by atomic_inc?). It could lead to some problem. To make it clear, here I paste few lines of key code: if (!atomic_inc_not_zero(>count)) { spin_lock(>device_lock); if (!atomic_read(>count)) { } ... } At the time we enter the first if block, sh->count is zero. And just assume someone increases sh->count from somewhere while acquiring the lock, the following if block will not be executed then, leaving some fileds, such as conf->active_stripes, not being set properly. So, we should execute the second if block whenever we entered the first if block no matter sh->count stays with 0 or not. Signed-off-by: Yuanhan Liu --- Neil, I'm a bit concerned that I missed something in this patch. 
Please kindly correct me if I'm wrong :) --- drivers/md/raid5.c | 18 -- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e7fa818..17ece2a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -570,16 +570,14 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, if (sh->sector == sector && sh->generation == generation) { if (!atomic_inc_not_zero(>count)) { spin_lock(>device_lock); - if (!atomic_read(>count)) { - if (!test_bit(STRIPE_HANDLE, >state)) - atomic_inc(>active_stripes); - BUG_ON(list_empty(>lru) && - !test_bit(STRIPE_EXPANDING, >state)); - list_del_init(>lru); - if (sh->group) { - sh->group->stripes_cnt--; - sh->group = NULL; - } + if (!test_bit(STRIPE_HANDLE, >state)) + atomic_inc(>active_stripes); + BUG_ON(list_empty(>lru) && + !test_bit(STRIPE_EXPANDING, >state)); + list_del_init(>lru); + if (sh->group) { + sh->group->stripes_cnt--; + sh->group = NULL; } atomic_inc(>count); spin_unlock(>device_lock); -- 1.9.0 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/2] md/raid5: avoid duplicate code
Move the code that put one idle sh(hot in cache, but happens to be zero referenced) back to active stage to __find_stripe(). Because that's what need to do every time you invoke __find_stripe(). Moving it there avoids duplicate code, as well as makes a bit more sense, IMO, as it tells a whole story now. Signed-off-by: Yuanhan Liu --- drivers/md/raid5.c | 50 ++ 1 file changed, 18 insertions(+), 32 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 77dfd72..e7fa818 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -567,8 +567,25 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); hlist_for_each_entry(sh, stripe_hash(conf, sector), hash) - if (sh->sector == sector && sh->generation == generation) + if (sh->sector == sector && sh->generation == generation) { + if (!atomic_inc_not_zero(>count)) { + spin_lock(>device_lock); + if (!atomic_read(>count)) { + if (!test_bit(STRIPE_HANDLE, >state)) + atomic_inc(>active_stripes); + BUG_ON(list_empty(>lru) && + !test_bit(STRIPE_EXPANDING, >state)); + list_del_init(>lru); + if (sh->group) { + sh->group->stripes_cnt--; + sh->group = NULL; + } + } + atomic_inc(>count); + spin_unlock(>device_lock); + } return sh; + } pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); return NULL; } @@ -698,21 +715,6 @@ get_active_stripe(struct r5conf *conf, sector_t sector, init_stripe(sh, sector, previous); atomic_inc(>count); } - } else if (!atomic_inc_not_zero(>count)) { - spin_lock(>device_lock); - if (!atomic_read(>count)) { - if (!test_bit(STRIPE_HANDLE, >state)) - atomic_inc(>active_stripes); - BUG_ON(list_empty(>lru) && - !test_bit(STRIPE_EXPANDING, >state)); - list_del_init(>lru); - if (sh->group) { - sh->group->stripes_cnt--; - sh->group = NULL; - } - } - atomic_inc(>count); - spin_unlock(>device_lock); } } while (sh == NULL); @@ -771,22 +773,6 @@ static void stripe_add_to_batch_list(struct 
r5conf *conf, struct stripe_head *sh hash = stripe_hash_locks_hash(head_sector); spin_lock_irq(conf->hash_locks + hash); head = __find_stripe(conf, head_sector, conf->generation); - if (head && !atomic_inc_not_zero(>count)) { - spin_lock(>device_lock); - if (!atomic_read(>count)) { - if (!test_bit(STRIPE_HANDLE, >state)) - atomic_inc(>active_stripes); - BUG_ON(list_empty(>lru) && - !test_bit(STRIPE_EXPANDING, >state)); - list_del_init(>lru); - if (head->group) { - head->group->stripes_cnt--; - head->group = NULL; - } - } - atomic_inc(>count); - spin_unlock(>device_lock); - } spin_unlock_irq(conf->hash_locks + hash); if (!head) -- 1.9.0 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/2] md/raid5: avoid duplicate code
Move the code that put one idle sh(hot in cache, but happens to be zero referenced) back to active stage to __find_stripe(). Because that's what need to do every time you invoke __find_stripe(). Moving it there avoids duplicate code, as well as makes a bit more sense, IMO, as it tells a whole story now. Signed-off-by: Yuanhan Liu yuanhan@linux.intel.com --- drivers/md/raid5.c | 50 ++ 1 file changed, 18 insertions(+), 32 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 77dfd72..e7fa818 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -567,8 +567,25 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, pr_debug(__find_stripe, sector %llu\n, (unsigned long long)sector); hlist_for_each_entry(sh, stripe_hash(conf, sector), hash) - if (sh-sector == sector sh-generation == generation) + if (sh-sector == sector sh-generation == generation) { + if (!atomic_inc_not_zero(sh-count)) { + spin_lock(conf-device_lock); + if (!atomic_read(sh-count)) { + if (!test_bit(STRIPE_HANDLE, sh-state)) + atomic_inc(conf-active_stripes); + BUG_ON(list_empty(sh-lru) + !test_bit(STRIPE_EXPANDING, sh-state)); + list_del_init(sh-lru); + if (sh-group) { + sh-group-stripes_cnt--; + sh-group = NULL; + } + } + atomic_inc(sh-count); + spin_unlock(conf-device_lock); + } return sh; + } pr_debug(__stripe %llu not in cache\n, (unsigned long long)sector); return NULL; } @@ -698,21 +715,6 @@ get_active_stripe(struct r5conf *conf, sector_t sector, init_stripe(sh, sector, previous); atomic_inc(sh-count); } - } else if (!atomic_inc_not_zero(sh-count)) { - spin_lock(conf-device_lock); - if (!atomic_read(sh-count)) { - if (!test_bit(STRIPE_HANDLE, sh-state)) - atomic_inc(conf-active_stripes); - BUG_ON(list_empty(sh-lru) - !test_bit(STRIPE_EXPANDING, sh-state)); - list_del_init(sh-lru); - if (sh-group) { - sh-group-stripes_cnt--; - sh-group = NULL; - } - } - atomic_inc(sh-count); - spin_unlock(conf-device_lock); } } while (sh == NULL); @@ -771,22 
+773,6 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh hash = stripe_hash_locks_hash(head_sector); spin_lock_irq(conf-hash_locks + hash); head = __find_stripe(conf, head_sector, conf-generation); - if (head !atomic_inc_not_zero(head-count)) { - spin_lock(conf-device_lock); - if (!atomic_read(head-count)) { - if (!test_bit(STRIPE_HANDLE, head-state)) - atomic_inc(conf-active_stripes); - BUG_ON(list_empty(head-lru) - !test_bit(STRIPE_EXPANDING, head-state)); - list_del_init(head-lru); - if (head-group) { - head-group-stripes_cnt--; - head-group = NULL; - } - } - atomic_inc(head-count); - spin_unlock(conf-device_lock); - } spin_unlock_irq(conf-hash_locks + hash); if (!head) -- 1.9.0 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/2] md/raid5: remove unnecessary sh->count check
Remove the unnecessary !atomic_read(sh-count) check, as the previous atomic_inc_not_zero(sh-count) check assures sh-count to be 0. The only reason I can think of that we need such check is to consider the lock race issue. First of all, I doubt there is another process could modify an in-cache but zero referenced sh while it's being protected by a hash lock. Hence, I would say sh-count will be consistent to 0 in that if !atomic_inc_not_zero block. Secondly, just assume there is a chance that someone outside the lock modifies sh-count(by atomic_inc?). It could lead to some problem. To make it clear, here I paste few lines of key code: if (!atomic_inc_not_zero(sh-count)) { spin_lock(conf-device_lock); if (!atomic_read(sh-count)) { } ... } At the time we enter the first if block, sh-count is zero. And just assume someone increases sh-count from somewhere while acquiring the lock, the following if block will not be executed then, leaving some fileds, such as conf-active_stripes, not being set properly. So, we should execute the second if block whenever we entered the first if block no matter sh-count stays with 0 or not. Signed-off-by: Yuanhan Liu yuanhan@linux.intel.com --- Neil, I'm a bit concerned that I missed something in this patch. 
Please kindly correct me if I'm wrong :) --- drivers/md/raid5.c | 18 -- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e7fa818..17ece2a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -570,16 +570,14 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, if (sh-sector == sector sh-generation == generation) { if (!atomic_inc_not_zero(sh-count)) { spin_lock(conf-device_lock); - if (!atomic_read(sh-count)) { - if (!test_bit(STRIPE_HANDLE, sh-state)) - atomic_inc(conf-active_stripes); - BUG_ON(list_empty(sh-lru) - !test_bit(STRIPE_EXPANDING, sh-state)); - list_del_init(sh-lru); - if (sh-group) { - sh-group-stripes_cnt--; - sh-group = NULL; - } + if (!test_bit(STRIPE_HANDLE, sh-state)) + atomic_inc(conf-active_stripes); + BUG_ON(list_empty(sh-lru) + !test_bit(STRIPE_EXPANDING, sh-state)); + list_del_init(sh-lru); + if (sh-group) { + sh-group-stripes_cnt--; + sh-group = NULL; } atomic_inc(sh-count); spin_unlock(conf-device_lock); -- 1.9.0 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] md/raid5: init batch_xxx for new sh at resize_stripes
On Mon, May 04, 2015 at 05:24:24PM +1000, NeilBrown wrote: > On Mon, 4 May 2015 13:50:24 +0800 Yuanhan Liu > wrote: > > > This is to fix a kernel NULL dereference oops introduced by commit > > 59fc630b("RAID5: batch adjacent full stripe write"), which introduced > > several batch_xxx fields, and did initiation for them at grow_one_stripes(), > > but forgot to do same at resize_stripes(). > > > > This oops can be easily triggered by following steps: > > > > __create RAID5 /dev/md0 > > __grow /dev/md0 > > mdadm --wait /dev/md0 > > dd if=/dev/zero of=/dev/md0 > > > > Here is the detailed oops log: ... > > > > Cc: Shaohua Li > > Signed-off-by: Yuanhan Liu > > --- > > drivers/md/raid5.c | 4 > > 1 file changed, 4 insertions(+) > > > > diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c > > index 697d77a..7b074f7 100644 > > --- a/drivers/md/raid5.c > > +++ b/drivers/md/raid5.c > > @@ -2217,6 +2217,10 @@ static int resize_stripes(struct r5conf *conf, int > > newsize) > > if (!p) > > err = -ENOMEM; > > } > > + > > + spin_lock_init(>batch_lock); > > + INIT_LIST_HEAD(>batch_list); > > + nsh->batch_head = NULL; > > release_stripe(nsh); > > } > > /* critical section pass, GFP_NOIO no longer needed */ > > Thanks! > > However I already have the following fix queued - though not pushed out Yeah, much cleaner. > you. I probably would have got it into -rc2 except that I was chasing > another raid5 bug. The > BUG_ON(sh->batch_head); > > in handle_stripe_fill() fires when I run the mdadm selftests. I got caught > up chasing that and didn't push the other fix. I am not aware of there is a selftests for raid. I'd like to add it to our 0day kernel testing in near future so that we could catch bugs and bisect it down in first time ;) --yliu > > > From 3dd8ba734349e602fe17d647ce3da5f4a13748aa Mon Sep 17 00:00:00 2001 > From: NeilBrown > Date: Thu, 30 Apr 2015 11:24:28 +1000 > Subject: [PATCH] md/raid5 new alloc_stripe function. 
> > > diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c > index 77dfd720aaa0..91a1e8b26b52 100644 > --- a/drivers/md/raid5.c > +++ b/drivers/md/raid5.c > @@ -1971,17 +1971,30 @@ static void raid_run_ops(struct stripe_head *sh, > unsigned long ops_request) > put_cpu(); > } > > +static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp) > +{ > + struct stripe_head *sh; > + > + sh = kmem_cache_zalloc(sc, gfp); > + if (sh) { > + spin_lock_init(>stripe_lock); > + spin_lock_init(>batch_lock); > + INIT_LIST_HEAD(>batch_list); > + INIT_LIST_HEAD(>lru); > + atomic_set(>count, 1); > + } > + return sh; > +} > static int grow_one_stripe(struct r5conf *conf, gfp_t gfp) > { > struct stripe_head *sh; > - sh = kmem_cache_zalloc(conf->slab_cache, gfp); > + > + sh = alloc_stripe(conf->slab_cache, gfp); > if (!sh) > return 0; > > sh->raid_conf = conf; > > - spin_lock_init(>stripe_lock); > - > if (grow_buffers(sh, gfp)) { > shrink_buffers(sh); > kmem_cache_free(conf->slab_cache, sh); > @@ -1990,13 +2003,8 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t > gfp) > sh->hash_lock_index = > conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; > /* we just created an active stripe so... */ > - atomic_set(>count, 1); > atomic_inc(>active_stripes); > - INIT_LIST_HEAD(>lru); > > - spin_lock_init(>batch_lock); > - INIT_LIST_HEAD(>batch_list); > - sh->batch_head = NULL; > release_stripe(sh); > conf->max_nr_stripes++; > return 1; > @@ -2109,13 +2117,11 @@ static int resize_stripes(struct r5conf *conf, int > newsize) > return -ENOMEM; > > for (i = conf->max_nr_stripes; i; i--) { > - nsh = kmem_cache_zalloc(sc, GFP_KERNEL); > + nsh = alloc_stripe(sc, GFP_KERNEL); > if (!nsh) > break; > > nsh->raid_conf = conf; > - spin_lock_init(>stripe_lock); > - > list_add(>lru, ); > } > if (i) { > @@ -2142,13 +2148,11 @@ static int resize_stripes(struct r5conf *conf, int > n
Re: [PATCH] md/raid5: init batch_xxx for new sh at resize_stripes
On Mon, May 04, 2015 at 05:24:24PM +1000, NeilBrown wrote: On Mon, 4 May 2015 13:50:24 +0800 Yuanhan Liu yuanhan@linux.intel.com wrote: This is to fix a kernel NULL dereference oops introduced by commit 59fc630b(RAID5: batch adjacent full stripe write), which introduced several batch_xxx fields, and did initiation for them at grow_one_stripes(), but forgot to do same at resize_stripes(). This oops can be easily triggered by following steps: __create RAID5 /dev/md0 __grow /dev/md0 mdadm --wait /dev/md0 dd if=/dev/zero of=/dev/md0 Here is the detailed oops log: ... Cc: Shaohua Li s...@kernel.org Signed-off-by: Yuanhan Liu yuanhan@linux.intel.com --- drivers/md/raid5.c | 4 1 file changed, 4 insertions(+) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 697d77a..7b074f7 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2217,6 +2217,10 @@ static int resize_stripes(struct r5conf *conf, int newsize) if (!p) err = -ENOMEM; } + + spin_lock_init(nsh-batch_lock); + INIT_LIST_HEAD(nsh-batch_list); + nsh-batch_head = NULL; release_stripe(nsh); } /* critical section pass, GFP_NOIO no longer needed */ Thanks! However I already have the following fix queued - though not pushed out Yeah, much cleaner. you. I probably would have got it into -rc2 except that I was chasing another raid5 bug. The BUG_ON(sh-batch_head); in handle_stripe_fill() fires when I run the mdadm selftests. I got caught up chasing that and didn't push the other fix. I am not aware of there is a selftests for raid. I'd like to add it to our 0day kernel testing in near future so that we could catch bugs and bisect it down in first time ;) --yliu From 3dd8ba734349e602fe17d647ce3da5f4a13748aa Mon Sep 17 00:00:00 2001 From: NeilBrown ne...@suse.de Date: Thu, 30 Apr 2015 11:24:28 +1000 Subject: [PATCH] md/raid5 new alloc_stripe function. 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 77dfd720aaa0..91a1e8b26b52 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1971,17 +1971,30 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) put_cpu(); } +static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp) +{ + struct stripe_head *sh; + + sh = kmem_cache_zalloc(sc, gfp); + if (sh) { + spin_lock_init(sh-stripe_lock); + spin_lock_init(sh-batch_lock); + INIT_LIST_HEAD(sh-batch_list); + INIT_LIST_HEAD(sh-lru); + atomic_set(sh-count, 1); + } + return sh; +} static int grow_one_stripe(struct r5conf *conf, gfp_t gfp) { struct stripe_head *sh; - sh = kmem_cache_zalloc(conf-slab_cache, gfp); + + sh = alloc_stripe(conf-slab_cache, gfp); if (!sh) return 0; sh-raid_conf = conf; - spin_lock_init(sh-stripe_lock); - if (grow_buffers(sh, gfp)) { shrink_buffers(sh); kmem_cache_free(conf-slab_cache, sh); @@ -1990,13 +2003,8 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp) sh-hash_lock_index = conf-max_nr_stripes % NR_STRIPE_HASH_LOCKS; /* we just created an active stripe so... 
*/ - atomic_set(sh-count, 1); atomic_inc(conf-active_stripes); - INIT_LIST_HEAD(sh-lru); - spin_lock_init(sh-batch_lock); - INIT_LIST_HEAD(sh-batch_list); - sh-batch_head = NULL; release_stripe(sh); conf-max_nr_stripes++; return 1; @@ -2109,13 +2117,11 @@ static int resize_stripes(struct r5conf *conf, int newsize) return -ENOMEM; for (i = conf-max_nr_stripes; i; i--) { - nsh = kmem_cache_zalloc(sc, GFP_KERNEL); + nsh = alloc_stripe(sc, GFP_KERNEL); if (!nsh) break; nsh-raid_conf = conf; - spin_lock_init(nsh-stripe_lock); - list_add(nsh-lru, newstripes); } if (i) { @@ -2142,13 +2148,11 @@ static int resize_stripes(struct r5conf *conf, int newsize) lock_device_hash_lock(conf, hash)); osh = get_free_stripe(conf, hash); unlock_device_hash_lock(conf, hash); - atomic_set(nsh-count, 1); + for(i=0; iconf-pool_size; i++) { nsh-dev[i].page = osh-dev[i].page; nsh-dev[i].orig_page = osh-dev[i].page; } - for( ; inewsize; i++) - nsh-dev[i].page = NULL; nsh-hash_lock_index = hash; kmem_cache_free(conf-slab_cache, osh); cnt++; -- To unsubscribe from this list
[PATCH] md/raid5: init batch_xxx for new sh at resize_stripes
This is to fix a kernel NULL dereference oops introduced by commit 59fc630b("RAID5: batch adjacent full stripe write"), which introduced several batch_xxx fields, and did initiation for them at grow_one_stripes(), but forgot to do same at resize_stripes(). This oops can be easily triggered by following steps: __create RAID5 /dev/md0 __grow /dev/md0 mdadm --wait /dev/md0 dd if=/dev/zero of=/dev/md0 Here is the detailed oops log: [ 32.384499] BUG: unable to handle kernel NULL pointer dereference at (null) [ 32.385366] IP: [] add_stripe_bio+0x48d/0x544 [ 32.385955] PGD 373f3067 PUD 36e34067 PMD 0 [ 32.386404] Oops: 0002 [#1] SMP [ 32.386740] Modules linked in: [ 32.387040] CPU: 0 PID: 1059 Comm: kworker/u2:2 Not tainted 4.0.0-next-20150427+ #107 [ 32.387762] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014 [ 32.388044] Workqueue: writeback bdi_writeback_workfn (flush-9:0) [ 32.388044] task: 88003d038000 ti: 88003d40c000 task.ti: 88003d40c000 [ 32.388044] RIP: 0010:[] [] add_stripe_bio+0x48d/0x544 [ 32.388044] RSP: :88003d40f6f8 EFLAGS: 00010046 [ 32.388044] RAX: RBX: 880037168cd0 RCX: 880037179a28 [ 32.388044] RDX: 880037168d58 RSI: RDI: 880037179a20 [ 32.388044] RBP: 88003d40f738 R08: 0410 R09: 0410 [ 32.388044] R10: 0410 R11: 0002 R12: 8800371799a0 [ 32.388044] R13: 88003c3d0800 R14: 0001 R15: 880037179a08 [ 32.388044] FS: () GS:88003fc0() knlGS: [ 32.388044] CS: 0010 DS: ES: CR0: 8005003b [ 32.388044] CR2: CR3: 36e33000 CR4: 06f0 [ 32.388044] Stack: [ 32.388044] 0002 880037168d38 88003d40f738 88003c3abd00 [ 32.388044] 88003c2df800 88003c3d0800 0408 88003c3d0b54 [ 32.388044] 88003d40f828 8184b9ea 3d40f7e8 0292 [ 32.388044] Call Trace: [ 32.388044] [] make_request+0x7a8/0xaee [ 32.388044] [] ? wait_woken+0x79/0x79 [ 32.388044] [] ? kmem_cache_alloc+0x95/0x1b6 [ 32.388044] [] md_make_request+0xeb/0x1c3 [ 32.388044] [] ? 
mempool_alloc+0x64/0x127 [ 32.388044] [] generic_make_request+0x9c/0xdb [ 32.388044] [] submit_bio+0xf6/0x134 [ 32.388044] [] _submit_bh+0x119/0x141 [ 32.388044] [] submit_bh+0x10/0x12 [ 32.388044] [] __block_write_full_page.constprop.30+0x1a3/0x2a4 [ 32.388044] [] ? I_BDEV+0xd/0xd [ 32.388044] [] block_write_full_page+0xab/0xaf [ 32.388044] [] blkdev_writepage+0x18/0x1a [ 32.388044] [] __writepage+0x14/0x2d [ 32.388044] [] write_cache_pages+0x29a/0x3a7 [ 32.388044] [] ? mapping_tagged+0x14/0x14 [ 32.388044] [] generic_writepages+0x3e/0x56 [ 32.388044] [] do_writepages+0x1e/0x2c [ 32.388044] [] __writeback_single_inode+0x5b/0x27e [ 32.388044] [] writeback_sb_inodes+0x1dc/0x358 [ 32.388044] [] __writeback_inodes_wb+0x7f/0xb8 [ 32.388044] [] wb_writeback+0x11a/0x271 [ 32.388044] [] ? global_dirty_limits+0x1b/0xfd [ 32.388044] [] bdi_writeback_workfn+0x1ae/0x360 [ 32.388044] [] process_one_work+0x1c2/0x340 [ 32.388044] [] worker_thread+0x28b/0x389 [ 32.388044] [] ? cancel_delayed_work_sync+0x15/0x15 [ 32.388044] [] kthread+0xd2/0xda [ 32.388044] [] ? kthread_create_on_node+0x17c/0x17c [ 32.388044] [] ret_from_fork+0x42/0x70 [ 32.388044] [] ? 
kthread_create_on_node+0x17c/0x17c [ 32.388044] Code: 84 24 90 00 00 00 48 8d 93 88 00 00 00 49 8d 8c 24 88 00 00 00 49 89 94 24 90 00 00 00 48 89 8b 88 00 00 00 48 89 83 90 00 00 00 <48> 89 10 66 41 83 84 24 80 00 00 00 01 3e 0f ba 73 48 06 72 02 [ 32.388044] RIP [] add_stripe_bio+0x48d/0x544 [ 32.388044] RSP [ 32.388044] CR2: [ 32.388044] ---[ end trace 2b255d3f55be9eb3 ]--- Cc: Shaohua Li Signed-off-by: Yuanhan Liu --- drivers/md/raid5.c | 4 1 file changed, 4 insertions(+) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 697d77a..7b074f7 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2217,6 +2217,10 @@ static int resize_stripes(struct r5conf *conf, int newsize) if (!p) err = -ENOMEM; } + + spin_lock_init(>batch_lock); + INIT_LIST_HEAD(>batch_list); + nsh->batch_head = NULL; release_stripe(nsh); } /* critical section pass, GFP_NOIO no longer needed */ -- 1.9.0 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] md/raid5: init batch_xxx for new sh at resize_stripes
This is to fix a kernel NULL dereference oops introduced by commit 59fc630b(RAID5: batch adjacent full stripe write), which introduced several batch_xxx fields, and did initiation for them at grow_one_stripes(), but forgot to do same at resize_stripes(). This oops can be easily triggered by following steps: __create RAID5 /dev/md0 __grow /dev/md0 mdadm --wait /dev/md0 dd if=/dev/zero of=/dev/md0 Here is the detailed oops log: [ 32.384499] BUG: unable to handle kernel NULL pointer dereference at (null) [ 32.385366] IP: [81844082] add_stripe_bio+0x48d/0x544 [ 32.385955] PGD 373f3067 PUD 36e34067 PMD 0 [ 32.386404] Oops: 0002 [#1] SMP [ 32.386740] Modules linked in: [ 32.387040] CPU: 0 PID: 1059 Comm: kworker/u2:2 Not tainted 4.0.0-next-20150427+ #107 [ 32.387762] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014 [ 32.388044] Workqueue: writeback bdi_writeback_workfn (flush-9:0) [ 32.388044] task: 88003d038000 ti: 88003d40c000 task.ti: 88003d40c000 [ 32.388044] RIP: 0010:[81844082] [81844082] add_stripe_bio+0x48d/0x544 [ 32.388044] RSP: :88003d40f6f8 EFLAGS: 00010046 [ 32.388044] RAX: RBX: 880037168cd0 RCX: 880037179a28 [ 32.388044] RDX: 880037168d58 RSI: RDI: 880037179a20 [ 32.388044] RBP: 88003d40f738 R08: 0410 R09: 0410 [ 32.388044] R10: 0410 R11: 0002 R12: 8800371799a0 [ 32.388044] R13: 88003c3d0800 R14: 0001 R15: 880037179a08 [ 32.388044] FS: () GS:88003fc0() knlGS: [ 32.388044] CS: 0010 DS: ES: CR0: 8005003b [ 32.388044] CR2: CR3: 36e33000 CR4: 06f0 [ 32.388044] Stack: [ 32.388044] 0002 880037168d38 88003d40f738 88003c3abd00 [ 32.388044] 88003c2df800 88003c3d0800 0408 88003c3d0b54 [ 32.388044] 88003d40f828 8184b9ea 3d40f7e8 0292 [ 32.388044] Call Trace: [ 32.388044] [8184b9ea] make_request+0x7a8/0xaee [ 32.388044] [81120387] ? wait_woken+0x79/0x79 [ 32.388044] [811e9a85] ? kmem_cache_alloc+0x95/0x1b6 [ 32.388044] [8186b944] md_make_request+0xeb/0x1c3 [ 32.388044] [811a3025] ? 
mempool_alloc+0x64/0x127 [ 32.388044] [81481575] generic_make_request+0x9c/0xdb [ 32.388044] [814816aa] submit_bio+0xf6/0x134 [ 32.388044] [8122a1f7] _submit_bh+0x119/0x141 [ 32.388044] [8122a22f] submit_bh+0x10/0x12 [ 32.388044] [8122bbb9] __block_write_full_page.constprop.30+0x1a3/0x2a4 [ 32.388044] [8122bead] ? I_BDEV+0xd/0xd [ 32.388044] [8122bd65] block_write_full_page+0xab/0xaf [ 32.388044] [8122c657] blkdev_writepage+0x18/0x1a [ 32.388044] [811a9853] __writepage+0x14/0x2d [ 32.388044] [811a9ef3] write_cache_pages+0x29a/0x3a7 [ 32.388044] [811a983f] ? mapping_tagged+0x14/0x14 [ 32.388044] [811aa03e] generic_writepages+0x3e/0x56 [ 32.388044] [811ab638] do_writepages+0x1e/0x2c [ 32.388044] [812229ed] __writeback_single_inode+0x5b/0x27e [ 32.388044] [81222ec7] writeback_sb_inodes+0x1dc/0x358 [ 32.388044] [812230c2] __writeback_inodes_wb+0x7f/0xb8 [ 32.388044] [812232b9] wb_writeback+0x11a/0x271 [ 32.388044] [811aa483] ? global_dirty_limits+0x1b/0xfd [ 32.388044] [8122399c] bdi_writeback_workfn+0x1ae/0x360 [ 32.388044] [81101bab] process_one_work+0x1c2/0x340 [ 32.388044] [81102571] worker_thread+0x28b/0x389 [ 32.388044] [811022e6] ? cancel_delayed_work_sync+0x15/0x15 [ 32.388044] [81106936] kthread+0xd2/0xda [ 32.388044] [81106864] ? kthread_create_on_node+0x17c/0x17c [ 32.388044] [81a16682] ret_from_fork+0x42/0x70 [ 32.388044] [81106864] ? 
kthread_create_on_node+0x17c/0x17c [ 32.388044] Code: 84 24 90 00 00 00 48 8d 93 88 00 00 00 49 8d 8c 24 88 00 00 00 49 89 94 24 90 00 00 00 48 89 8b 88 00 00 00 48 89 83 90 00 00 00 48 89 10 66 41 83 84 24 80 00 00 00 01 3e 0f ba 73 48 06 72 02 [ 32.388044] RIP [81844082] add_stripe_bio+0x48d/0x544 [ 32.388044] RSP 88003d40f6f8 [ 32.388044] CR2: [ 32.388044] ---[ end trace 2b255d3f55be9eb3 ]--- Cc: Shaohua Li s...@kernel.org Signed-off-by: Yuanhan Liu yuanhan@linux.intel.com --- drivers/md/raid5.c | 4 1 file changed, 4 insertions(+) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 697d77a..7b074f7 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2217,6 +2217,10 @@ static int resize_stripes(struct r5conf *conf, int newsize
[LKP] [genirq] d5b2eacdbc2: BUG: unable to handle kernel NULL pointer dereference at (null)
FYI, we noticed the below changes on https://github.com/jiangliu/linux.git test/irq_common_data_v2 commit d5b2eacdbc280da7c6dfbe0f52bb293ef227d349 ("genirq: Introduce struct irq_common_data to host shared irq data") +-+++ | | 39fb394021 | d5b2eacdbc | +-+++ | boot_successes | 0 | 0 | | boot_failures | 22 | 20 | | PM:Hibernation_image_not_present_or_could_not_be_loaded | 22 | | | BUG:unable_to_handle_kernel | 0 | 20 | | Oops| 0 | 20 | | Kernel_panic-not_syncing:Fatal_exception_in_interrupt | 0 | 20 | | backtrace:__pci_register_driver | 0 | 6 | | backtrace:e1000_init_module | 0 | 6 | | backtrace:kernel_init_freeable | 0 | 6 | | backtrace:ata_sff_pio_task | 0 | 14 | +-+++ [1.351055] ata2.01: NODEV after polling detection [1.352179] ata2.00: ATAPI: QEMU DVD-ROM, 2.1.2, max UDMA/100 [1.353501] ata2.00: configured for MWDMA2 [1.354423] BUG: unable to handle kernel NULL pointer dereference at (null) [1.356074] IP: [< (null)>] (null) [1.356074] PGD 0 [1.356074] Oops: 0010 [#1] SMP [1.356074] Modules linked in: [1.356074] CPU: 0 PID: 584 Comm: kworker/0:1 Not tainted 4.1.0-rc1-wl-ath-00905-geb3b9ec #1 [1.356074] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140531_083030-gandalf 04/01/2014 [1.356074] Workqueue: ata_sff ata_sff_pio_task [1.356074] task: 880011c2af30 ti: 8800123bc000 task.ti: 8800123bc000 [1.356074] RIP: 0010:[<>] [< (null)>] (null) [1.356074] RSP: :880013803ee0 EFLAGS: 00010046 [1.356074] RAX: 8222b2c0 RBX: 88001349fc80 RCX: 0009 [1.356074] RDX: 88001348f400 RSI: ffc0 RDI: 88001349fc80 [1.356074] RBP: 880013803ef8 R08: R09: 0013 [1.356074] R10: 0006 R11: R12: 88001348f400 [1.356074] R13: 000f R14: 8800123bfc78 R15: [1.356074] FS: () GS:88001380() knlGS: [1.356074] CS: 0010 DS: ES: CR0: 8005003b [1.356074] CR2: CR3: 0220b000 CR4: 06f0 [1.356074] Stack: [1.356074] 8113aa96 88001349fc80 88001348f458 880013803f18 [1.356074] 8106bc49 8222b2c0 88001348f400 880013803f28 [1.356074] 81138421 880013803f48 811380db 000f [1.356074] Call Trace: 
[1.356074] [1.356074] [] ? irq_move_irq+0x34/0x50 [1.356074] [] apic_ack_edge+0x23/0x3b [1.356074] [] irq_chip_ack_parent+0x14/0x16 [1.356074] [] handle_edge_irq+0xa5/0x110 [1.356074] [] handle_irq+0x27/0x2d [1.356074] [] do_IRQ+0x4c/0xcf [1.356074] [] common_interrupt+0x73/0x73 [1.356074] [1.356074] [] ? __ata_qc_complete+0xe1/0xe9 [1.356074] [] ? _raw_spin_unlock_irqrestore+0x32/0x42 [1.356074] [] ata_sff_hsm_move+0x258/0x66a [1.356074] [] ata_sff_pio_task+0x140/0x15e [1.356074] [] process_one_work+0x1c6/0x37b [1.356074] [] worker_thread+0x2ad/0x3b6 [1.356074] [] ? rescuer_thread+0x318/0x318 [1.356074] [] kthread+0xf8/0x100 [1.356074] [] ? kthread_create_on_node+0x184/0x184 [1.356074] [] ret_from_fork+0x42/0x70 [1.356074] [] ? kthread_create_on_node+0x184/0x184 [1.356074] Code: Bad RIP value. [1.356074] RIP [< (null)>] (null) [1.356074] RSP [1.356074] CR2: [1.356074] ---[ end trace d37ae2366ce94eef ]--- [1.356074] Kernel panic - not syncing: Fatal exception in interrupt Thanks, lkp # # Automatically generated file; DO NOT EDIT. # Linux/x86_64 4.0.0 Kernel Configuration # CONFIG_64BIT=y CONFIG_X86_64=y CONFIG_X86=y CONFIG_INSTRUCTION_DECODER=y CONFIG_PERF_EVENTS_INTEL_UNCORE=y CONFIG_OUTPUT_FORMAT="elf64-x86-64" CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig" CONFIG_LOCKDEP_SUPPORT=y CONFIG_STACKTRACE_SUPPORT=y CONFIG_HAVE_LATENCYTOP_SUPPORT=y CONFIG_MMU=y CONFIG_NEED_DMA_MAP_STATE=y CONFIG_NEED_SG_DMA_LENGTH=y CONFIG_GENERIC_ISA_DMA=y CONFIG_GENERIC_BUG=y CONFIG_GENERIC_BUG_RELATIVE_POINTERS=y
Re: [PATCH 2/2] md/raid5: trivial coding style fix
On Thu, Apr 30, 2015 at 05:16:50PM +1000, NeilBrown wrote: > On Thu, 30 Apr 2015 15:01:17 +0800 Yuanhan Liu > wrote: > > > Signed-off-by: Yuanhan Liu > > --- > > drivers/md/raid5.c | 3 +-- > > 1 file changed, 1 insertion(+), 2 deletions(-) > > > > diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c > > index 2651bda..bae3e2c 100644 > > --- a/drivers/md/raid5.c > > +++ b/drivers/md/raid5.c > > @@ -5789,8 +5789,7 @@ static void raid5d(struct md_thread *thread) > > if (released) > > clear_bit(R5_DID_ALLOC, >cache_state); > > > > - if ( > > - !list_empty(>bitmap_list)) { > > + if (!list_empty(>bitmap_list)) { > > /* Now is a good time to flush some bitmap updates */ > > conf->seq_flush++; > > spin_unlock_irq(>device_lock); > > > I'm happy for these sorts of changes when you are fixing up nearby code, or > if the change significantly improves readability. > But I'd rather not bother is one-off trivial fixes like this. Got it. --yliu -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/2] md/raid5: fix typo
On Thu, Apr 30, 2015 at 05:14:26PM +1000, NeilBrown wrote: > On Thu, 30 Apr 2015 15:01:16 +0800 Yuanhan Liu > wrote: > > > bion -> bios > > > > Signed-off-by: Yuanhan Liu > > --- > > drivers/md/raid5.c | 2 +- > > 1 file changed, 1 insertion(+), 1 deletion(-) > > > > diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c > > index 697d77a..2651bda 100644 > > --- a/drivers/md/raid5.c > > +++ b/drivers/md/raid5.c > > @@ -2919,7 +2919,7 @@ schedule_reconstruction(struct stripe_head *sh, > > struct stripe_head_state *s, > > } > > > > /* > > - * Each stripe/dev can have one or more bion attached. > > + * Each stripe/dev can have one or more bios attached. > > * toread/towrite point to the first in a chain. > > * The bi_next chain must be in order. > > */ > > That was intentional. "bios" as a plural looks too much like "BIOS" which is > in the ROM of computers. > > Children and oxen are plurals with an 'n' at the end. So I used 'bion'. > Private joke? Interesting. > > I'd rather leave it as it is. Okay, and sorry for the noise. --yliu -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/2] md/raid5: fix typo
bion -> bios Signed-off-by: Yuanhan Liu --- drivers/md/raid5.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 697d77a..2651bda 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2919,7 +2919,7 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, } /* - * Each stripe/dev can have one or more bion attached. + * Each stripe/dev can have one or more bios attached. * toread/towrite point to the first in a chain. * The bi_next chain must be in order. */ -- 1.9.0 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/2] md/raid5: trivial coding style fix
Signed-off-by: Yuanhan Liu --- drivers/md/raid5.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2651bda..bae3e2c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5789,8 +5789,7 @@ static void raid5d(struct md_thread *thread) if (released) clear_bit(R5_DID_ALLOC, &conf->cache_state); - if ( - !list_empty(&conf->bitmap_list)) { + if (!list_empty(&conf->bitmap_list)) { /* Now is a good time to flush some bitmap updates */ conf->seq_flush++; spin_unlock_irq(&conf->device_lock); -- 1.9.0 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [LKP] [RAID5] 878ee679279: -1.8% vmstat.io.bo, +40.5% perf-stat.LLC-load-misses
On Fri, Apr 24, 2015 at 12:15:59PM +1000, NeilBrown wrote: > On Thu, 23 Apr 2015 14:55:59 +0800 Huang Ying wrote: > > > FYI, we noticed the below changes on > > > > git://neil.brown.name/md for-next > > commit 878ee6792799e2f88bdcac329845efadb205252f ("RAID5: batch adjacent > > full stripe write") > > Hi, > is there any chance that you could explain what some of this means? > There is lots of data and some very pretty graphs, but no explanation. Hi Neil, (Sorry for late response: Ying is on vacation) I guess you can simply ignore this report, as I already reported to you month ago that this patch made fsmark performs better in most cases: https://lists.01.org/pipermail/lkp/2015-March/002411.html > > Which numbers are "good", which are "bad"? Which is "worst". > What do the graphs really show? and what would we like to see in them? > > I think it is really great that you are doing this testing and reporting the > results. It's just so sad that I completely fail to understand them. Sorry, it's our bad to make them hard to understand as well as to report a duplicate one(well, the commit hash is different ;). We might need take some time to make those data understood easier. 
--yliu > > > > > > > testbox/testcase/testparams: > > lkp-st02/dd-write/300-5m-11HDD-RAID5-cfq-xfs-1dd > > > > a87d7f782b47e030 878ee6792799e2f88bdcac3298 > > -- > > %stddev %change %stddev > > \ |\ > > 59035 ± 0% +18.4% 69913 ± 1% softirqs.SCHED > > 1330 ± 10% +17.4% 1561 ± 4% slabinfo.kmalloc-512.num_objs > > 1330 ± 10% +17.4% 1561 ± 4% > > slabinfo.kmalloc-512.active_objs > > 305908 ± 0% -1.8% 300427 ± 0% vmstat.io.bo > > 1 ± 0%+100.0% 2 ± 0% vmstat.procs.r > > 8266 ± 1% -15.7% 6968 ± 0% vmstat.system.cs > > 14819 ± 0% -2.1% 14503 ± 0% vmstat.system.in > > 18.20 ± 6% +10.2% 20.05 ± 4% > > perf-profile.cpu-cycles.raid_run_ops.handle_stripe.handle_active_stripes.raid5d.md_thread > > 1.94 ± 9% +90.6% 3.70 ± 9% > > perf-profile.cpu-cycles.async_xor.raid_run_ops.handle_stripe.handle_active_stripes.raid5d > > 0.00 ± 0% +Inf% 25.18 ± 3% > > perf-profile.cpu-cycles.handle_active_stripes.isra.45.raid5d.md_thread.kthread.ret_from_fork > > 0.00 ± 0% +Inf% 14.14 ± 4% > > perf-profile.cpu-cycles.async_copy_data.isra.42.raid_run_ops.handle_stripe.handle_active_stripes.raid5d > > 1.79 ± 7%+102.9% 3.64 ± 9% > > perf-profile.cpu-cycles.xor_blocks.async_xor.raid_run_ops.handle_stripe.handle_active_stripes > > 3.09 ± 4% -10.8% 2.76 ± 4% > > perf-profile.cpu-cycles.get_active_stripe.make_request.md_make_request.generic_make_request.submit_bio > > 0.80 ± 14% +28.1% 1.02 ± 10% > > perf-profile.cpu-cycles.mutex_lock.xfs_file_buffered_aio_write.xfs_file_write_iter.new_sync_write.vfs_write > > 14.78 ± 6%-100.0% 0.00 ± 0% > > perf-profile.cpu-cycles.async_copy_data.isra.38.raid_run_ops.handle_stripe.handle_active_stripes.raid5d > > 25.68 ± 4%-100.0% 0.00 ± 0% > > perf-profile.cpu-cycles.handle_active_stripes.isra.41.raid5d.md_thread.kthread.ret_from_fork > > 1.23 ± 5%+140.0% 2.96 ± 7% > > perf-profile.cpu-cycles.xor_sse_5_pf64.xor_blocks.async_xor.raid_run_ops.handle_stripe > > 2.62 ± 6% -95.6% 0.12 ± 33% > > 
perf-profile.cpu-cycles.analyse_stripe.handle_stripe.handle_active_stripes.raid5d.md_thread > > 0.96 ± 9% +17.5% 1.12 ± 2% > > perf-profile.cpu-cycles.xfs_ilock.xfs_file_buffered_aio_write.xfs_file_write_iter.new_sync_write.vfs_write > > 1.461e+10 ± 0% -5.3% 1.384e+10 ± 1% > > perf-stat.L1-dcache-load-misses > > 3.688e+11 ± 0% -2.7% 3.59e+11 ± 0% perf-stat.L1-dcache-loads > > 1.124e+09 ± 0% -27.7% 8.125e+08 ± 0% perf-stat.L1-dcache-prefetches > > 2.767e+10 ± 0% -1.8% 2.717e+10 ± 0% > > perf-stat.L1-dcache-store-misses > > 2.352e+11 ± 0% -2.8% 2.287e+11 ± 0% perf-stat.L1-dcache-stores > > 6.774e+09 ± 0% -2.3% 6.62e+09 ± 0% > > perf-stat.L1-icache-load-misses > > 5.571e+08 ± 0% +40.5% 7.826e+08 ± 1% perf-stat.LLC-load-misses > > 6.263e+09 ± 0% -13.7% 5.407e+09 ± 1% perf-stat.LLC-loads > > 1.914e+11 ± 0% -4.2% 1.833e+11 ± 0% perf-stat.branch-instructions > > 1.145e+09 ± 2% -5.6% 1.081e+09 ± 0% perf-stat.branch-load-misses > > 1.911e+11 ± 0% -4.3% 1.829e+11 ± 0% perf-stat.branch-loads > > 1.142e+09 ± 2% -5.1% 1.083e+09 ± 0% perf-stat.branch-misses > > 1.218e+09 ± 0% +19.8% 1.46e+09 ± 0% perf-stat.cache-misses > > 2.118e+10 ± 0% -5.2% 2.007e+10 ± 0% perf-stat.cache-references > >2510308 ± 1% -15.7%2115410 ±
[PATCH 1/2] md/raid5: fix typo
bion -> bios Signed-off-by: Yuanhan Liu yuanhan@linux.intel.com --- drivers/md/raid5.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 697d77a..2651bda 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2919,7 +2919,7 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, } /* - * Each stripe/dev can have one or more bion attached. + * Each stripe/dev can have one or more bios attached. * toread/towrite point to the first in a chain. * The bi_next chain must be in order. */ -- 1.9.0 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/2] md/raid5: trivial coding style fix
Signed-off-by: Yuanhan Liu yuanhan@linux.intel.com --- drivers/md/raid5.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2651bda..bae3e2c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5789,8 +5789,7 @@ static void raid5d(struct md_thread *thread) if (released) clear_bit(R5_DID_ALLOC, &conf->cache_state); - if ( - !list_empty(&conf->bitmap_list)) { + if (!list_empty(&conf->bitmap_list)) { /* Now is a good time to flush some bitmap updates */ conf->seq_flush++; spin_unlock_irq(&conf->device_lock); -- 1.9.0 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/2] md/raid5: fix typo
On Thu, Apr 30, 2015 at 05:14:26PM +1000, NeilBrown wrote: On Thu, 30 Apr 2015 15:01:16 +0800 Yuanhan Liu yuanhan@linux.intel.com wrote: bion - bios Signed-off-by: Yuanhan Liu yuanhan@linux.intel.com --- drivers/md/raid5.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 697d77a..2651bda 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2919,7 +2919,7 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, } /* - * Each stripe/dev can have one or more bion attached. + * Each stripe/dev can have one or more bios attached. * toread/towrite point to the first in a chain. * The bi_next chain must be in order. */ That was intentional. bios as a plural looks too much like BIOS which is in the ROM of computers. Children and oxen are plurals with an 'n' at the end. So I used 'bion'. Private joke? Interesting. I'd rather leave it as it is. Okay, and sorry for the noise. --yliu -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/2] md/raid5: trivial coding style fix
On Thu, Apr 30, 2015 at 05:16:50PM +1000, NeilBrown wrote: On Thu, 30 Apr 2015 15:01:17 +0800 Yuanhan Liu yuanhan@linux.intel.com wrote: Signed-off-by: Yuanhan Liu yuanhan@linux.intel.com --- drivers/md/raid5.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2651bda..bae3e2c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5789,8 +5789,7 @@ static void raid5d(struct md_thread *thread) if (released) clear_bit(R5_DID_ALLOC, conf-cache_state); - if ( - !list_empty(conf-bitmap_list)) { + if (!list_empty(conf-bitmap_list)) { /* Now is a good time to flush some bitmap updates */ conf-seq_flush++; spin_unlock_irq(conf-device_lock); I'm happy for these sorts of changes when you are fixing up nearby code, or if the change significantly improves readability. But I'd rather not bother is one-off trivial fixes like this. Got it. --yliu -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[LKP] [genirq] d5b2eacdbc2: BUG: unable to handle kernel NULL pointer dereference at (null)
FYI, we noticed the below changes on https://github.com/jiangliu/linux.git test/irq_common_data_v2 commit d5b2eacdbc280da7c6dfbe0f52bb293ef227d349 (genirq: Introduce struct irq_common_data to host shared irq data) +-+++ | | 39fb394021 | d5b2eacdbc | +-+++ | boot_successes | 0 | 0 | | boot_failures | 22 | 20 | | PM:Hibernation_image_not_present_or_could_not_be_loaded | 22 | | | BUG:unable_to_handle_kernel | 0 | 20 | | Oops| 0 | 20 | | Kernel_panic-not_syncing:Fatal_exception_in_interrupt | 0 | 20 | | backtrace:__pci_register_driver | 0 | 6 | | backtrace:e1000_init_module | 0 | 6 | | backtrace:kernel_init_freeable | 0 | 6 | | backtrace:ata_sff_pio_task | 0 | 14 | +-+++ [1.351055] ata2.01: NODEV after polling detection [1.352179] ata2.00: ATAPI: QEMU DVD-ROM, 2.1.2, max UDMA/100 [1.353501] ata2.00: configured for MWDMA2 [1.354423] BUG: unable to handle kernel NULL pointer dereference at (null) [1.356074] IP: [ (null)] (null) [1.356074] PGD 0 [1.356074] Oops: 0010 [#1] SMP [1.356074] Modules linked in: [1.356074] CPU: 0 PID: 584 Comm: kworker/0:1 Not tainted 4.1.0-rc1-wl-ath-00905-geb3b9ec #1 [1.356074] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140531_083030-gandalf 04/01/2014 [1.356074] Workqueue: ata_sff ata_sff_pio_task [1.356074] task: 880011c2af30 ti: 8800123bc000 task.ti: 8800123bc000 [1.356074] RIP: 0010:[] [ (null)] (null) [1.356074] RSP: :880013803ee0 EFLAGS: 00010046 [1.356074] RAX: 8222b2c0 RBX: 88001349fc80 RCX: 0009 [1.356074] RDX: 88001348f400 RSI: ffc0 RDI: 88001349fc80 [1.356074] RBP: 880013803ef8 R08: R09: 0013 [1.356074] R10: 0006 R11: R12: 88001348f400 [1.356074] R13: 000f R14: 8800123bfc78 R15: [1.356074] FS: () GS:88001380() knlGS: [1.356074] CS: 0010 DS: ES: CR0: 8005003b [1.356074] CR2: CR3: 0220b000 CR4: 06f0 [1.356074] Stack: [1.356074] 8113aa96 88001349fc80 88001348f458 880013803f18 [1.356074] 8106bc49 8222b2c0 88001348f400 880013803f28 [1.356074] 81138421 880013803f48 811380db 000f [1.356074] Call Trace: [1.356074] 
IRQ [1.356074] [8113aa96] ? irq_move_irq+0x34/0x50 [1.356074] [8106bc49] apic_ack_edge+0x23/0x3b [1.356074] [81138421] irq_chip_ack_parent+0x14/0x16 [1.356074] [811380db] handle_edge_irq+0xa5/0x110 [1.356074] [8103f761] handle_irq+0x27/0x2d [1.356074] [81a3ad3c] do_IRQ+0x4c/0xcf [1.356074] [81a38f33] common_interrupt+0x73/0x73 [1.356074] EOI [1.356074] [816823a3] ? __ata_qc_complete+0xe1/0xe9 [1.356074] [81a37f5e] ? _raw_spin_unlock_irqrestore+0x32/0x42 [1.356074] [8169246a] ata_sff_hsm_move+0x258/0x66a [1.356074] [816929bc] ata_sff_pio_task+0x140/0x15e [1.356074] [81105591] process_one_work+0x1c6/0x37b [1.356074] [81106222] worker_thread+0x2ad/0x3b6 [1.356074] [81105f75] ? rescuer_thread+0x318/0x318 [1.356074] [8110a42c] kthread+0xf8/0x100 [1.356074] [8110a334] ? kthread_create_on_node+0x184/0x184 [1.356074] [81a38802] ret_from_fork+0x42/0x70 [1.356074] [8110a334] ? kthread_create_on_node+0x184/0x184 [1.356074] Code: Bad RIP value. [1.356074] RIP [ (null)] (null) [1.356074] RSP 880013803ee0 [1.356074] CR2: [1.356074] ---[ end trace d37ae2366ce94eef ]--- [1.356074] Kernel panic - not syncing: Fatal exception in interrupt Thanks, lkp # # Automatically generated file; DO NOT EDIT. # Linux/x86_64 4.0.0 Kernel Configuration # CONFIG_64BIT=y CONFIG_X86_64=y CONFIG_X86=y CONFIG_INSTRUCTION_DECODER=y CONFIG_PERF_EVENTS_INTEL_UNCORE=y CONFIG_OUTPUT_FORMAT=elf64-x86-64
Re: [LKP] [RAID5] 878ee679279: -1.8% vmstat.io.bo, +40.5% perf-stat.LLC-load-misses
On Fri, Apr 24, 2015 at 12:15:59PM +1000, NeilBrown wrote: On Thu, 23 Apr 2015 14:55:59 +0800 Huang Ying ying.hu...@intel.com wrote: FYI, we noticed the below changes on git://neil.brown.name/md for-next commit 878ee6792799e2f88bdcac329845efadb205252f (RAID5: batch adjacent full stripe write) Hi, is there any chance that you could explain what some of this means? There is lots of data and some very pretty graphs, but no explanation. Hi Neil, (Sorry for the late response: Ying is on vacation) I guess you can simply ignore this report, as I already reported to you a month ago that this patch made fsmark perform better in most cases: https://lists.01.org/pipermail/lkp/2015-March/002411.html Which numbers are good, which are bad? Which is worst? What do the graphs really show? and what would we like to see in them? I think it is really great that you are doing this testing and reporting the results. It's just so sad that I completely fail to understand them. Sorry, it's our bad to make them hard to understand as well as to report a duplicate one (well, the commit hash is different ;). We might need to take some time to make the data easier to understand. 
--yliu testbox/testcase/testparams: lkp-st02/dd-write/300-5m-11HDD-RAID5-cfq-xfs-1dd a87d7f782b47e030 878ee6792799e2f88bdcac3298 -- %stddev %change %stddev \ |\ 59035 ± 0% +18.4% 69913 ± 1% softirqs.SCHED 1330 ± 10% +17.4% 1561 ± 4% slabinfo.kmalloc-512.num_objs 1330 ± 10% +17.4% 1561 ± 4% slabinfo.kmalloc-512.active_objs 305908 ± 0% -1.8% 300427 ± 0% vmstat.io.bo 1 ± 0%+100.0% 2 ± 0% vmstat.procs.r 8266 ± 1% -15.7% 6968 ± 0% vmstat.system.cs 14819 ± 0% -2.1% 14503 ± 0% vmstat.system.in 18.20 ± 6% +10.2% 20.05 ± 4% perf-profile.cpu-cycles.raid_run_ops.handle_stripe.handle_active_stripes.raid5d.md_thread 1.94 ± 9% +90.6% 3.70 ± 9% perf-profile.cpu-cycles.async_xor.raid_run_ops.handle_stripe.handle_active_stripes.raid5d 0.00 ± 0% +Inf% 25.18 ± 3% perf-profile.cpu-cycles.handle_active_stripes.isra.45.raid5d.md_thread.kthread.ret_from_fork 0.00 ± 0% +Inf% 14.14 ± 4% perf-profile.cpu-cycles.async_copy_data.isra.42.raid_run_ops.handle_stripe.handle_active_stripes.raid5d 1.79 ± 7%+102.9% 3.64 ± 9% perf-profile.cpu-cycles.xor_blocks.async_xor.raid_run_ops.handle_stripe.handle_active_stripes 3.09 ± 4% -10.8% 2.76 ± 4% perf-profile.cpu-cycles.get_active_stripe.make_request.md_make_request.generic_make_request.submit_bio 0.80 ± 14% +28.1% 1.02 ± 10% perf-profile.cpu-cycles.mutex_lock.xfs_file_buffered_aio_write.xfs_file_write_iter.new_sync_write.vfs_write 14.78 ± 6%-100.0% 0.00 ± 0% perf-profile.cpu-cycles.async_copy_data.isra.38.raid_run_ops.handle_stripe.handle_active_stripes.raid5d 25.68 ± 4%-100.0% 0.00 ± 0% perf-profile.cpu-cycles.handle_active_stripes.isra.41.raid5d.md_thread.kthread.ret_from_fork 1.23 ± 5%+140.0% 2.96 ± 7% perf-profile.cpu-cycles.xor_sse_5_pf64.xor_blocks.async_xor.raid_run_ops.handle_stripe 2.62 ± 6% -95.6% 0.12 ± 33% perf-profile.cpu-cycles.analyse_stripe.handle_stripe.handle_active_stripes.raid5d.md_thread 0.96 ± 9% +17.5% 1.12 ± 2% perf-profile.cpu-cycles.xfs_ilock.xfs_file_buffered_aio_write.xfs_file_write_iter.new_sync_write.vfs_write 1.461e+10 ± 
0% -5.3% 1.384e+10 ± 1% perf-stat.L1-dcache-load-misses 3.688e+11 ± 0% -2.7% 3.59e+11 ± 0% perf-stat.L1-dcache-loads 1.124e+09 ± 0% -27.7% 8.125e+08 ± 0% perf-stat.L1-dcache-prefetches 2.767e+10 ± 0% -1.8% 2.717e+10 ± 0% perf-stat.L1-dcache-store-misses 2.352e+11 ± 0% -2.8% 2.287e+11 ± 0% perf-stat.L1-dcache-stores 6.774e+09 ± 0% -2.3% 6.62e+09 ± 0% perf-stat.L1-icache-load-misses 5.571e+08 ± 0% +40.5% 7.826e+08 ± 1% perf-stat.LLC-load-misses 6.263e+09 ± 0% -13.7% 5.407e+09 ± 1% perf-stat.LLC-loads 1.914e+11 ± 0% -4.2% 1.833e+11 ± 0% perf-stat.branch-instructions 1.145e+09 ± 2% -5.6% 1.081e+09 ± 0% perf-stat.branch-load-misses 1.911e+11 ± 0% -4.3% 1.829e+11 ± 0% perf-stat.branch-loads 1.142e+09 ± 2% -5.1% 1.083e+09 ± 0% perf-stat.branch-misses 1.218e+09 ± 0% +19.8% 1.46e+09 ± 0% perf-stat.cache-misses 2.118e+10 ± 0% -5.2% 2.007e+10 ± 0% perf-stat.cache-references 2510308 ± 1% -15.7%2115410 ± 0% perf-stat.context-switches 39623 ± 0% +22.1% 48370 ± 1% perf-stat.cpu-migrations 4.179e+08 ± 40%
[LKP] [block] 5a19fe29ba7: +5.4% boot-slabinfo.num_objs
FYI, we noticed the below changes on git://git.kernel.org/pub/scm/linux/kernel/git/mlin/linux.git block-generic-req commit 5a19fe29ba7d052c0d8fa8a2bf461abc1e4d89bb ("block: make generic_make_request handle arbitrarily sized bios") testbox/testcase/testparams: vm-kbuild-1G/boot/1 v4.1-rc1 5a19fe29ba7d052c0d8fa8a2bf -- %stddev %change %stddev \ |\ 152092 ± 0% +5.4% 160249 ± 0% boot-slabinfo.num_objs 10106 ± 0% +21.6% 12293 ± 0% boot-slabinfo.num_pages 8.30 ± 21% -33.9% 5.48 ± 1% boot-time.boot 7.44 ± 23% -34.9% 4.84 ± 1% boot-time.dhcp 10.01 ± 17% -27.0% 7.31 ± 1% boot-time.idle 35507 ± 2% +17.9% 41856 ± 10% boot-meminfo.DirectMap4k 1558 ± 8%+276.5% 5868 ± 1% boot-meminfo.KernelStack 480717 ± 0% -2.8% 467414 ± 0% boot-meminfo.MemFree 11462 ± 1% +70.0% 19488 ± 0% boot-meminfo.SUnreclaim 40390 ± 0% +21.7% 49146 ± 0% boot-meminfo.Slab vm-kbuild-1G: qemu-system-x86_64 -enable-kvm -cpu Haswell,+smep,+smap Memory: 1G boot-slabinfo.num_objs 162000 ++-+ | O OOO 16 O+ O O OOOO O | |O O O O OOO O O O | | O OOO | 158000 ++ | |O | 156000 ++ | | | 154000 ++ | | | | *. .*.*.. *..*.*.. | 152000 ++. *. *. ..*| * * | 15 ++-+ boot-slabinfo.num_pages 12500 ++--+ O O O O O O O O O O O O O O O O O O O O O OO O O |O O | 12000 ++ | | | | | 11500 ++ | | | 11000 ++ | | | | | 10500 ++ | | | | .*..*. .*..*.*..*..*.* | 1 *+-*--*-+ boot-meminfo.MemFree 484000 ++-+ 482000 ++*| | .*.*.. .*..* : + *.. | 48 *+ * : : + +| 478000 ++: :**| | * | 476000 ++ | 474000 ++ | 472000 ++ | | | 47 ++O O O O O O| 468000 ++ O O O OO O O O OO O | | O O O O O O | 466000 O+ O O O 464000 ++-+ boot-meminfo.Slab 5
[LKP] [block] 5a19fe29ba7: +5.4% boot-slabinfo.num_objs
FYI, we noticed the below changes on git://git.kernel.org/pub/scm/linux/kernel/git/mlin/linux.git block-generic-req commit 5a19fe29ba7d052c0d8fa8a2bf461abc1e4d89bb (block: make generic_make_request handle arbitrarily sized bios) testbox/testcase/testparams: vm-kbuild-1G/boot/1 v4.1-rc1 5a19fe29ba7d052c0d8fa8a2bf -- %stddev %change %stddev \ |\ 152092 ± 0% +5.4% 160249 ± 0% boot-slabinfo.num_objs 10106 ± 0% +21.6% 12293 ± 0% boot-slabinfo.num_pages 8.30 ± 21% -33.9% 5.48 ± 1% boot-time.boot 7.44 ± 23% -34.9% 4.84 ± 1% boot-time.dhcp 10.01 ± 17% -27.0% 7.31 ± 1% boot-time.idle 35507 ± 2% +17.9% 41856 ± 10% boot-meminfo.DirectMap4k 1558 ± 8%+276.5% 5868 ± 1% boot-meminfo.KernelStack 480717 ± 0% -2.8% 467414 ± 0% boot-meminfo.MemFree 11462 ± 1% +70.0% 19488 ± 0% boot-meminfo.SUnreclaim 40390 ± 0% +21.7% 49146 ± 0% boot-meminfo.Slab vm-kbuild-1G: qemu-system-x86_64 -enable-kvm -cpu Haswell,+smep,+smap Memory: 1G boot-slabinfo.num_objs 162000 ++-+ | O OOO 16 O+ O O OOOO O | |O O O O OOO O O O | | O OOO | 158000 ++ | |O | 156000 ++ | | | 154000 ++ | | | | *. .*.*.. *..*.*.. | 152000 ++. *. *. ..*| * * | 15 ++-+ boot-slabinfo.num_pages 12500 ++--+ O O O O O O O O O O O O O O O O O O O O O OO O O |O O | 12000 ++ | | | | | 11500 ++ | | | 11000 ++ | | | | | 10500 ++ | | | | .*..*. .*..*.*..*..*.* | 1 *+-*--*-+ boot-meminfo.MemFree 484000 ++-+ 482000 ++*| | .*.*.. .*..* : + *.. | 48 *+ * : : + +| 478000 ++: :**| | * | 476000 ++ | 474000 ++ | 472000 ++ | | | 47 ++O O O O O O| 468000 ++ O O O OO O O O OO O | | O O O O O O | 466000 O+ O O O 464000 ++-+ boot-meminfo.Slab 5
[PATCH 2/3] md/raid5: split wait_for_stripe and introduce wait_for_quiescent
I noticed heavy spin lock contention at get_active_stripe(), introduced at being wake up stage, where a bunch of processes try to re-hold the spin lock again. After giving some thoughts on this issue, I found the lock could be relieved(and even avoided) if we turn the wait_for_stripe to per waitqueue for each lock hash and make the wake up exclusive: wake up one process each time, which avoids the lock contention naturally. Before go hacking with wait_for_stripe, I found it actually has 2 usages: for the array to enter or leave the quiescent state, and also to wait for an available stripe in each of the hash lists. So this patch splits the first usage off into a separate wait_queue, wait_for_quiescent, and the next patch will turn the second usage into one waitqueue for each hash value, and make it exclusive, to relieve the lock contention. v2: wake_up(wait_for_quiescent) when (active_stripes == 0) Commit log refactor suggestion from Neil. Signed-off-by: Yuanhan Liu --- drivers/md/raid5.c | 15 +-- drivers/md/raid5.h | 1 + 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 77dfd72..64d5bea 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -374,6 +374,8 @@ static void release_inactive_stripe_list(struct r5conf *conf, if (do_wakeup) { wake_up(>wait_for_stripe); + if (atomic_read(>active_stripes) == 0) + wake_up(>wait_for_quiescent); if (conf->retry_read_aligned) md_wakeup_thread(conf->mddev->thread); } @@ -667,7 +669,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector, spin_lock_irq(conf->hash_locks + hash); do { - wait_event_lock_irq(conf->wait_for_stripe, + wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0 || noquiesce, *(conf->hash_locks + hash)); sh = __find_stripe(conf, sector, conf->generation - previous); @@ -4729,7 +4731,7 @@ static void raid5_align_endio(struct bio *bi, int error) raid_bi, 0); bio_endio(raid_bi, 0); if (atomic_dec_and_test(>active_aligned_reads)) - 
wake_up(>wait_for_stripe); + wake_up(>wait_for_quiescent); return; } @@ -4824,7 +4826,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) align_bi->bi_iter.bi_sector += rdev->data_offset; spin_lock_irq(>device_lock); - wait_event_lock_irq(conf->wait_for_stripe, + wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0, conf->device_lock); atomic_inc(>active_aligned_reads); @@ -5668,7 +5670,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) bio_endio(raid_bio, 0); } if (atomic_dec_and_test(>active_aligned_reads)) - wake_up(>wait_for_stripe); + wake_up(>wait_for_quiescent); return handled; } @@ -6399,6 +6401,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) goto abort; spin_lock_init(>device_lock); seqcount_init(>gen_lock); + init_waitqueue_head(>wait_for_quiescent); init_waitqueue_head(>wait_for_stripe); init_waitqueue_head(>wait_for_overlap); INIT_LIST_HEAD(>handle_list); @@ -7422,7 +7425,7 @@ static void raid5_quiesce(struct mddev *mddev, int state) * active stripes can drain */ conf->quiesce = 2; - wait_event_cmd(conf->wait_for_stripe, + wait_event_cmd(conf->wait_for_quiescent, atomic_read(>active_stripes) == 0 && atomic_read(>active_aligned_reads) == 0, unlock_all_device_hash_locks_irq(conf), @@ -7436,7 +7439,7 @@ static void raid5_quiesce(struct mddev *mddev, int state) case 0: /* re-enable writes */ lock_all_device_hash_locks_irq(conf); conf->quiesce = 0; - wake_up(>wait_for_stripe); + wake_up(>wait_for_quiescent); wake_up(>wait_for_overlap); unlock_all_device_hash_locks_irq(conf); break; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 7dc0dd8..4cc05ec 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -508,6 +508,7 @@ struct r5conf { struct list_headinactive_list[NR_STRIPE_HASH_LOCKS]; atomic_t
[PATCH 3/3] md/raid5: per hash value and exclusive wait_for_stripe
ped heavily, up to 97%. And as expected, the performance increased a lot, up to 260%, for fast device(ram disk). v2: use bits instead of array to note down wait queue need to wake up. Signed-off-by: Yuanhan Liu --- drivers/md/raid5.c | 27 +++ drivers/md/raid5.h | 2 +- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 64d5bea..697d77a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -344,7 +344,8 @@ static void release_inactive_stripe_list(struct r5conf *conf, int hash) { int size; - bool do_wakeup = false; + unsigned long do_wakeup = 0; + int i = 0; unsigned long flags; if (hash == NR_STRIPE_HASH_LOCKS) { @@ -365,15 +366,19 @@ static void release_inactive_stripe_list(struct r5conf *conf, !list_empty(list)) atomic_dec(>empty_inactive_list_nr); list_splice_tail_init(list, conf->inactive_list + hash); - do_wakeup = true; + do_wakeup |= 1 << (size - 1); spin_unlock_irqrestore(conf->hash_locks + hash, flags); } size--; hash--; } + for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) { + if (do_wakeup & (1 << i)) + wake_up(>wait_for_stripe[i]); + } + if (do_wakeup) { - wake_up(>wait_for_stripe); if (atomic_read(>active_stripes) == 0) wake_up(>wait_for_quiescent); if (conf->retry_read_aligned) @@ -686,14 +691,15 @@ get_active_stripe(struct r5conf *conf, sector_t sector, if (!sh) { set_bit(R5_INACTIVE_BLOCKED, >cache_state); - wait_event_lock_irq( - conf->wait_for_stripe, + wait_event_exclusive_cmd( + conf->wait_for_stripe[hash], !list_empty(conf->inactive_list + hash) && (atomic_read(>active_stripes) < (conf->max_nr_stripes * 3 / 4) || !test_bit(R5_INACTIVE_BLOCKED, >cache_state)), - *(conf->hash_locks + hash)); + spin_unlock_irq(conf->hash_locks + hash), + spin_lock_irq(conf->hash_locks + hash)); clear_bit(R5_INACTIVE_BLOCKED, >cache_state); } else { @@ -718,6 +724,9 @@ get_active_stripe(struct r5conf *conf, sector_t sector, } } while (sh == NULL); + if (!list_empty(conf->inactive_list + hash)) + 
wake_up(>wait_for_stripe[hash]); + spin_unlock_irq(conf->hash_locks + hash); return sh; } @@ -2138,7 +2147,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) cnt = 0; list_for_each_entry(nsh, , lru) { lock_device_hash_lock(conf, hash); - wait_event_cmd(conf->wait_for_stripe, + wait_event_exclusive_cmd(conf->wait_for_stripe[hash], !list_empty(conf->inactive_list + hash), unlock_device_hash_lock(conf, hash), lock_device_hash_lock(conf, hash)); @@ -6402,7 +6411,9 @@ static struct r5conf *setup_conf(struct mddev *mddev) spin_lock_init(>device_lock); seqcount_init(>gen_lock); init_waitqueue_head(>wait_for_quiescent); - init_waitqueue_head(>wait_for_stripe); + for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) { + init_waitqueue_head(>wait_for_stripe[i]); + } init_waitqueue_head(>wait_for_overlap); INIT_LIST_HEAD(>handle_list); INIT_LIST_HEAD(>hold_list); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 4cc05ec..6307b90 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -509,7 +509,7 @@ struct r5conf { atomic_tempty_inactive_list_nr; struct llist_head released_stripes; wait_queue_head_t wait_for_quiescent; - wait_queue_head_t wait_for_stripe; + wait_queue_head_t wait_for_stripe[NR_STRIPE_HASH_LOCKS]; wait_queue_head_t wait_for_overlap; unsigned long cache_state; #define R5_INACTIVE_BLOCKED1 /*
[PATCH 1/3 v2] wait: introduce wait_event_exclusive_cmd
It's just a variant of wait_event_cmd(), with exclusive flag being set. For cases like RAID5, which puts many processes to sleep until 1/4 resources are free, a wake_up wakes up all processes to run, but there is one process being able to get the resource as it's protected by a spin lock. That ends up introducing heavy lock contentions, and hurts performance badly. Here introduce wait_event_exclusive_cmd to relieve the lock contention naturally by letting wake_up just wake up one process. Cc: Ingo Molnar Cc: Peter Zijlstra v2: its assumed that wait*() and __wait*() have the same arguments - peterz Signed-off-by: Yuanhan Liu --- include/linux/wait.h | 13 + 1 file changed, 13 insertions(+) diff --git a/include/linux/wait.h b/include/linux/wait.h index 2db8334..db78c72 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -358,6 +358,19 @@ do { \ __ret; \ }) +#define __wait_event_exclusive_cmd(wq, condition, cmd1, cmd2) \ + (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 1, 0, \ + cmd1; schedule(); cmd2) +/* + * Just like wait_event_cmd(), except it sets exclusive flag + */ +#define wait_event_exclusive_cmd(wq, condition, cmd1, cmd2)\ +do { \ + if (condition) \ + break; \ + __wait_event_exclusive_cmd(wq, condition, cmd1, cmd2); \ +} while (0) + #define __wait_event_cmd(wq, condition, cmd1, cmd2)\ (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ cmd1; schedule(); cmd2) -- 1.9.0 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/3] wait: introduce wait_event_cmd_exclusive
On Tue, Apr 28, 2015 at 04:13:15PM +0200, Peter Zijlstra wrote: > On Mon, Apr 27, 2015 at 12:51:01PM +0800, Yuanhan Liu wrote: > > It's just a variant of wait_event_cmd, with exclusive flag being set. > > > > For cases like RAID5, which puts many processes to sleep until 1/4 > > resources are free, a wake_up wakes up all processes to run, but > > there is one process being able to get the resource as it's protected > > by a spin lock. That ends up introducing heavy lock contentions, and > > hurts performance badly. > > > > Here introduce wait_event_cmd_exclusive to relieve the lock contention > > naturally by letting wake_up() just wake up one process. > > > > Cc: Ingo Molnar > > Cc: Peter Zijlstra > > Signed-off-by: Yuanhan Liu > > --- > > include/linux/wait.h | 14 +++--- > > 1 file changed, 11 insertions(+), 3 deletions(-) > > > > diff --git a/include/linux/wait.h b/include/linux/wait.h > > index 2db8334..6c3b4de 100644 > > --- a/include/linux/wait.h > > +++ b/include/linux/wait.h > > @@ -358,10 +358,18 @@ do { > > \ > > __ret; \ > > }) > > > > -#define __wait_event_cmd(wq, condition, cmd1, cmd2) > > \ > > - (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ > > +#define __wait_event_cmd(wq, condition, cmd1, cmd2, exclusive) > > \ > > + (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, exclusive, 0, \ > > cmd1; schedule(); cmd2) > > > > + > > +#define wait_event_cmd_exclusive(wq, condition, cmd1, cmd2) > > \ > > +do { > > \ > > + if (condition) \ > > + break; \ > > + __wait_event_cmd(wq, condition, cmd1, cmd2, 1); \ > > +} while (0) > > + > > /** > > * wait_event_cmd - sleep until a condition gets true > > * @wq: the waitqueue to wait on > > @@ -380,7 +388,7 @@ do { > > \ > > do { > > \ > > if (condition) \ > > break; \ > > - __wait_event_cmd(wq, condition, cmd1, cmd2);\ > > + __wait_event_cmd(wq, condition, cmd1, cmd2, 0); \ > > } while (0) > > > > No, that's wrong, its assumed that wait*() and __wait*() have the same > arguments. Thanks. 
Will send an updated patch soon. --yliu -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/3] wait: introduce wait_event_cmd_exclusive
On Tue, Apr 28, 2015 at 04:13:15PM +0200, Peter Zijlstra wrote: On Mon, Apr 27, 2015 at 12:51:01PM +0800, Yuanhan Liu wrote: It's just a variant of wait_event_cmd, with exclusive flag being set. For cases like RAID5, which puts many processes to sleep until 1/4 resources are free, a wake_up wakes up all processes to run, but there is one process being able to get the resource as it's protected by a spin lock. That ends up introducing heavy lock contentions, and hurts performance badly. Here introduce wait_event_cmd_exclusive to relieve the lock contention naturally by letting wake_up() just wake up one process. Cc: Ingo Molnar mi...@redhat.com Cc: Peter Zijlstra pet...@infradead.org Signed-off-by: Yuanhan Liu yuanhan@linux.intel.com --- include/linux/wait.h | 14 +++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/include/linux/wait.h b/include/linux/wait.h index 2db8334..6c3b4de 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -358,10 +358,18 @@ do { \ __ret; \ }) -#define __wait_event_cmd(wq, condition, cmd1, cmd2) \ - (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ +#define __wait_event_cmd(wq, condition, cmd1, cmd2, exclusive) \ + (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, exclusive, 0, \ cmd1; schedule(); cmd2) + +#define wait_event_cmd_exclusive(wq, condition, cmd1, cmd2) \ +do { \ + if (condition) \ + break; \ + __wait_event_cmd(wq, condition, cmd1, cmd2, 1); \ +} while (0) + /** * wait_event_cmd - sleep until a condition gets true * @wq: the waitqueue to wait on @@ -380,7 +388,7 @@ do { \ do { \ if (condition) \ break; \ - __wait_event_cmd(wq, condition, cmd1, cmd2);\ + __wait_event_cmd(wq, condition, cmd1, cmd2, 0); \ } while (0) No, that's wrong, its assumed that wait*() and __wait*() have the same arguments. Thanks. Will send an updated patch soon. 
--yliu -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 3/3] md/raid5: per hash value and exclusive wait_for_stripe
heavily, up to 97%. And as expected, the performance increased a lot, up to 260%, for fast device(ram disk). v2: use bits instead of array to note down wait queue need to wake up. Signed-off-by: Yuanhan Liu yuanhan@linux.intel.com --- drivers/md/raid5.c | 27 +++ drivers/md/raid5.h | 2 +- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 64d5bea..697d77a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -344,7 +344,8 @@ static void release_inactive_stripe_list(struct r5conf *conf, int hash) { int size; - bool do_wakeup = false; + unsigned long do_wakeup = 0; + int i = 0; unsigned long flags; if (hash == NR_STRIPE_HASH_LOCKS) { @@ -365,15 +366,19 @@ static void release_inactive_stripe_list(struct r5conf *conf, !list_empty(list)) atomic_dec(conf-empty_inactive_list_nr); list_splice_tail_init(list, conf-inactive_list + hash); - do_wakeup = true; + do_wakeup |= 1 (size - 1); spin_unlock_irqrestore(conf-hash_locks + hash, flags); } size--; hash--; } + for (i = 0; i NR_STRIPE_HASH_LOCKS; i++) { + if (do_wakeup (1 i)) + wake_up(conf-wait_for_stripe[i]); + } + if (do_wakeup) { - wake_up(conf-wait_for_stripe); if (atomic_read(conf-active_stripes) == 0) wake_up(conf-wait_for_quiescent); if (conf-retry_read_aligned) @@ -686,14 +691,15 @@ get_active_stripe(struct r5conf *conf, sector_t sector, if (!sh) { set_bit(R5_INACTIVE_BLOCKED, conf-cache_state); - wait_event_lock_irq( - conf-wait_for_stripe, + wait_event_exclusive_cmd( + conf-wait_for_stripe[hash], !list_empty(conf-inactive_list + hash) (atomic_read(conf-active_stripes) (conf-max_nr_stripes * 3 / 4) || !test_bit(R5_INACTIVE_BLOCKED, conf-cache_state)), - *(conf-hash_locks + hash)); + spin_unlock_irq(conf-hash_locks + hash), + spin_lock_irq(conf-hash_locks + hash)); clear_bit(R5_INACTIVE_BLOCKED, conf-cache_state); } else { @@ -718,6 +724,9 @@ get_active_stripe(struct r5conf *conf, sector_t sector, } } while (sh == NULL); + if 
(!list_empty(conf-inactive_list + hash)) + wake_up(conf-wait_for_stripe[hash]); + spin_unlock_irq(conf-hash_locks + hash); return sh; } @@ -2138,7 +2147,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) cnt = 0; list_for_each_entry(nsh, newstripes, lru) { lock_device_hash_lock(conf, hash); - wait_event_cmd(conf-wait_for_stripe, + wait_event_exclusive_cmd(conf-wait_for_stripe[hash], !list_empty(conf-inactive_list + hash), unlock_device_hash_lock(conf, hash), lock_device_hash_lock(conf, hash)); @@ -6402,7 +6411,9 @@ static struct r5conf *setup_conf(struct mddev *mddev) spin_lock_init(conf-device_lock); seqcount_init(conf-gen_lock); init_waitqueue_head(conf-wait_for_quiescent); - init_waitqueue_head(conf-wait_for_stripe); + for (i = 0; i NR_STRIPE_HASH_LOCKS; i++) { + init_waitqueue_head(conf-wait_for_stripe[i]); + } init_waitqueue_head(conf-wait_for_overlap); INIT_LIST_HEAD(conf-handle_list); INIT_LIST_HEAD(conf-hold_list); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 4cc05ec..6307b90 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -509,7 +509,7 @@ struct r5conf { atomic_tempty_inactive_list_nr; struct llist_head released_stripes; wait_queue_head_t wait_for_quiescent; - wait_queue_head_t wait_for_stripe; + wait_queue_head_t wait_for_stripe[NR_STRIPE_HASH_LOCKS]; wait_queue_head_t wait_for_overlap; unsigned long cache_state; #define R5_INACTIVE_BLOCKED1 /* release of inactive stripes blocked, -- 1.9.0
[PATCH 1/3 v2] wait: introduce wait_event_exclusive_cmd
It's just a variant of wait_event_cmd(), with exclusive flag being set. For cases like RAID5, which puts many processes to sleep until 1/4 resources are free, a wake_up wakes up all processes to run, but there is one process being able to get the resource as it's protected by a spin lock. That ends up introducing heavy lock contentions, and hurts performance badly. Here introduce wait_event_exclusive_cmd to relieve the lock contention naturally by letting wake_up just wake up one process. Cc: Ingo Molnar mi...@redhat.com Cc: Peter Zijlstra pet...@infradead.org v2: its assumed that wait*() and __wait*() have the same arguments - peterz Signed-off-by: Yuanhan Liu yuanhan@linux.intel.com --- include/linux/wait.h | 13 + 1 file changed, 13 insertions(+) diff --git a/include/linux/wait.h b/include/linux/wait.h index 2db8334..db78c72 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -358,6 +358,19 @@ do { \ __ret; \ }) +#define __wait_event_exclusive_cmd(wq, condition, cmd1, cmd2) \ + (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 1, 0, \ + cmd1; schedule(); cmd2) +/* + * Just like wait_event_cmd(), except it sets exclusive flag + */ +#define wait_event_exclusive_cmd(wq, condition, cmd1, cmd2)\ +do { \ + if (condition) \ + break; \ + __wait_event_exclusive_cmd(wq, condition, cmd1, cmd2); \ +} while (0) + #define __wait_event_cmd(wq, condition, cmd1, cmd2)\ (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ cmd1; schedule(); cmd2) -- 1.9.0 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/3] md/raid5: split wait_for_stripe and introduce wait_for_quiescent
I noticed heavy spin lock contention at get_active_stripe(), introduced at being wake up stage, where a bunch of processes try to re-hold the spin lock again. After giving some thoughts on this issue, I found the lock could be relieved(and even avoided) if we turn the wait_for_stripe to per waitqueue for each lock hash and make the wake up exclusive: wake up one process each time, which avoids the lock contention naturally. Before go hacking with wait_for_stripe, I found it actually has 2 usages: for the array to enter or leave the quiescent state, and also to wait for an available stripe in each of the hash lists. So this patch splits the first usage off into a separate wait_queue, wait_for_quiescent, and the next patch will turn the second usage into one waitqueue for each hash value, and make it exclusive, to relieve the lock contention. v2: wake_up(wait_for_quiescent) when (active_stripes == 0) Commit log refactor suggestion from Neil. Signed-off-by: Yuanhan Liu yuanhan@linux.intel.com --- drivers/md/raid5.c | 15 +-- drivers/md/raid5.h | 1 + 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 77dfd72..64d5bea 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -374,6 +374,8 @@ static void release_inactive_stripe_list(struct r5conf *conf, if (do_wakeup) { wake_up(conf-wait_for_stripe); + if (atomic_read(conf-active_stripes) == 0) + wake_up(conf-wait_for_quiescent); if (conf-retry_read_aligned) md_wakeup_thread(conf-mddev-thread); } @@ -667,7 +669,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector, spin_lock_irq(conf-hash_locks + hash); do { - wait_event_lock_irq(conf-wait_for_stripe, + wait_event_lock_irq(conf-wait_for_quiescent, conf-quiesce == 0 || noquiesce, *(conf-hash_locks + hash)); sh = __find_stripe(conf, sector, conf-generation - previous); @@ -4729,7 +4731,7 @@ static void raid5_align_endio(struct bio *bi, int error) raid_bi, 0); bio_endio(raid_bi, 0); if 
(atomic_dec_and_test(conf-active_aligned_reads)) - wake_up(conf-wait_for_stripe); + wake_up(conf-wait_for_quiescent); return; } @@ -4824,7 +4826,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) align_bi-bi_iter.bi_sector += rdev-data_offset; spin_lock_irq(conf-device_lock); - wait_event_lock_irq(conf-wait_for_stripe, + wait_event_lock_irq(conf-wait_for_quiescent, conf-quiesce == 0, conf-device_lock); atomic_inc(conf-active_aligned_reads); @@ -5668,7 +5670,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) bio_endio(raid_bio, 0); } if (atomic_dec_and_test(conf-active_aligned_reads)) - wake_up(conf-wait_for_stripe); + wake_up(conf-wait_for_quiescent); return handled; } @@ -6399,6 +6401,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) goto abort; spin_lock_init(conf-device_lock); seqcount_init(conf-gen_lock); + init_waitqueue_head(conf-wait_for_quiescent); init_waitqueue_head(conf-wait_for_stripe); init_waitqueue_head(conf-wait_for_overlap); INIT_LIST_HEAD(conf-handle_list); @@ -7422,7 +7425,7 @@ static void raid5_quiesce(struct mddev *mddev, int state) * active stripes can drain */ conf-quiesce = 2; - wait_event_cmd(conf-wait_for_stripe, + wait_event_cmd(conf-wait_for_quiescent, atomic_read(conf-active_stripes) == 0 atomic_read(conf-active_aligned_reads) == 0, unlock_all_device_hash_locks_irq(conf), @@ -7436,7 +7439,7 @@ static void raid5_quiesce(struct mddev *mddev, int state) case 0: /* re-enable writes */ lock_all_device_hash_locks_irq(conf); conf-quiesce = 0; - wake_up(conf-wait_for_stripe); + wake_up(conf-wait_for_quiescent); wake_up(conf-wait_for_overlap); unlock_all_device_hash_locks_irq(conf); break; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 7dc0dd8..4cc05ec 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -508,6 +508,7 @@ struct r5conf { struct list_headinactive_list[NR_STRIPE_HASH_LOCKS]; atomic_tempty_inactive_list_nr; struct
[PATCH 3/3 v2] md/raid5: per hash value and exclusive wait_for_stripe
ped heavily, up to 97%. And as expected, the performance increased a lot, up to 260%, for fast device(ram disk). v2: use bits instead of array to note down wait queue need to wake up. Signed-off-by: Yuanhan Liu --- drivers/md/raid5.c | 27 +++ drivers/md/raid5.h | 2 +- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 64d5bea..1b11bbf 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -344,7 +344,8 @@ static void release_inactive_stripe_list(struct r5conf *conf, int hash) { int size; - bool do_wakeup = false; + unsigned long do_wakeup = 0; + int i = 0; unsigned long flags; if (hash == NR_STRIPE_HASH_LOCKS) { @@ -365,15 +366,19 @@ static void release_inactive_stripe_list(struct r5conf *conf, !list_empty(list)) atomic_dec(>empty_inactive_list_nr); list_splice_tail_init(list, conf->inactive_list + hash); - do_wakeup = true; + do_wakeup |= 1 << (size - 1); spin_unlock_irqrestore(conf->hash_locks + hash, flags); } size--; hash--; } + for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) { + if (do_wakeup & (1 << i)) + wake_up(>wait_for_stripe[i]); + } + if (do_wakeup) { - wake_up(>wait_for_stripe); if (atomic_read(>active_stripes) == 0) wake_up(>wait_for_quiescent); if (conf->retry_read_aligned) @@ -686,14 +691,15 @@ get_active_stripe(struct r5conf *conf, sector_t sector, if (!sh) { set_bit(R5_INACTIVE_BLOCKED, >cache_state); - wait_event_lock_irq( - conf->wait_for_stripe, + wait_event_cmd_exclusive( + conf->wait_for_stripe[hash], !list_empty(conf->inactive_list + hash) && (atomic_read(>active_stripes) < (conf->max_nr_stripes * 3 / 4) || !test_bit(R5_INACTIVE_BLOCKED, >cache_state)), - *(conf->hash_locks + hash)); + spin_unlock_irq(conf->hash_locks + hash), + spin_lock_irq(conf->hash_locks + hash)); clear_bit(R5_INACTIVE_BLOCKED, >cache_state); } else { @@ -718,6 +724,9 @@ get_active_stripe(struct r5conf *conf, sector_t sector, } } while (sh == NULL); + if (!list_empty(conf->inactive_list + hash)) + 
wake_up(&conf->wait_for_stripe[hash]); + spin_unlock_irq(conf->hash_locks + hash); return sh; } @@ -2138,7 +2147,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) cnt = 0; list_for_each_entry(nsh, &newstripes, lru) { lock_device_hash_lock(conf, hash); - wait_event_cmd(conf->wait_for_stripe, + wait_event_cmd_exclusive(conf->wait_for_stripe[hash], !list_empty(conf->inactive_list + hash), unlock_device_hash_lock(conf, hash), lock_device_hash_lock(conf, hash)); @@ -6402,7 +6411,9 @@ static struct r5conf *setup_conf(struct mddev *mddev) spin_lock_init(&conf->device_lock); seqcount_init(&conf->gen_lock); init_waitqueue_head(&conf->wait_for_quiescent); - init_waitqueue_head(&conf->wait_for_stripe); + for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) { + init_waitqueue_head(&conf->wait_for_stripe[i]); + } init_waitqueue_head(&conf->wait_for_overlap); INIT_LIST_HEAD(&conf->handle_list); INIT_LIST_HEAD(&conf->hold_list); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 4cc05ec..6307b90 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -509,7 +509,7 @@ struct r5conf { atomic_t empty_inactive_list_nr; struct llist_head released_stripes; wait_queue_head_t wait_for_quiescent; - wait_queue_head_t wait_for_stripe; + wait_queue_head_t wait_for_stripe[NR_STRIPE_HASH_LOCKS]; wait_queue_head_t wait_for_overlap; unsigned long cache_state; #define R5_INACTIVE_BLOCKED 1 /*
[PATCH 1/3] wait: introduce wait_event_cmd_exclusive
It's just a variant of wait_event_cmd, with exclusive flag being set. For cases like RAID5, which puts many processes to sleep until 1/4 resources are free, a wake_up wakes up all processes to run, but there is one process being able to get the resource as it's protected by a spin lock. That ends up introducing heavy lock contentions, and hurts performance badly. Here introduce wait_event_cmd_exclusive to relieve the lock contention naturally by letting wake_up() just wake up one process. Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Yuanhan Liu --- include/linux/wait.h | 14 +++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/include/linux/wait.h b/include/linux/wait.h index 2db8334..6c3b4de 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -358,10 +358,18 @@ do { \ __ret; \ }) -#define __wait_event_cmd(wq, condition, cmd1, cmd2)\ - (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ +#define __wait_event_cmd(wq, condition, cmd1, cmd2, exclusive) \ + (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, exclusive, 0, \ cmd1; schedule(); cmd2) + +#define wait_event_cmd_exclusive(wq, condition, cmd1, cmd2) \ +do { \ + if (condition) \ + break; \ + __wait_event_cmd(wq, condition, cmd1, cmd2, 1); \ +} while (0) + /** * wait_event_cmd - sleep until a condition gets true * @wq: the waitqueue to wait on @@ -380,7 +388,7 @@ do { \ do { \ if (condition) \ break; \ - __wait_event_cmd(wq, condition, cmd1, cmd2);\ + __wait_event_cmd(wq, condition, cmd1, cmd2, 0); \ } while (0) #define __wait_event_interruptible(wq, condition) \ -- 1.9.0 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/3 v2] md/raid5: split wait_for_stripe and introduce wait_for_quiescent
I noticed heavy spin lock contention at get_active_stripe(), introduced at being wake up stage, where a bunch of processes try to re-hold the spin lock again. After giving some thoughts on this issue, I found the lock could be relieved(and even avoided) if we turn the wait_for_stripe to per waitqueue for each lock hash and make the wake up exclusive: wake up one process each time, which avoids the lock contention naturally. Before go hacking with wait_for_stripe, I found it actually has 2 usages: for the array to enter or leave the quiescent state, and also to wait for an available stripe in each of the hash lists. So this patch splits the first usage off into a separate wait_queue, wait_for_quiescent, and the next patch will turn the second usage into one waitqueue for each hash value, and make it exclusive, to relieve the lock contention. v2: wake_up(wait_for_quiescent) when (active_stripes == 0) Commit log refactor suggestion from Neil. Signed-off-by: Yuanhan Liu --- drivers/md/raid5.c | 15 +-- drivers/md/raid5.h | 1 + 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 77dfd72..64d5bea 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -374,6 +374,8 @@ static void release_inactive_stripe_list(struct r5conf *conf, if (do_wakeup) { wake_up(>wait_for_stripe); + if (atomic_read(>active_stripes) == 0) + wake_up(>wait_for_quiescent); if (conf->retry_read_aligned) md_wakeup_thread(conf->mddev->thread); } @@ -667,7 +669,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector, spin_lock_irq(conf->hash_locks + hash); do { - wait_event_lock_irq(conf->wait_for_stripe, + wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0 || noquiesce, *(conf->hash_locks + hash)); sh = __find_stripe(conf, sector, conf->generation - previous); @@ -4729,7 +4731,7 @@ static void raid5_align_endio(struct bio *bi, int error) raid_bi, 0); bio_endio(raid_bi, 0); if (atomic_dec_and_test(>active_aligned_reads)) - 
wake_up(&conf->wait_for_stripe); + wake_up(&conf->wait_for_quiescent); return; } @@ -4824,7 +4826,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) align_bi->bi_iter.bi_sector += rdev->data_offset; spin_lock_irq(&conf->device_lock); - wait_event_lock_irq(conf->wait_for_stripe, + wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0, conf->device_lock); atomic_inc(&conf->active_aligned_reads); @@ -5668,7 +5670,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) bio_endio(raid_bio, 0); } if (atomic_dec_and_test(&conf->active_aligned_reads)) - wake_up(&conf->wait_for_stripe); + wake_up(&conf->wait_for_quiescent); return handled; } @@ -6399,6 +6401,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) goto abort; spin_lock_init(&conf->device_lock); seqcount_init(&conf->gen_lock); + init_waitqueue_head(&conf->wait_for_quiescent); init_waitqueue_head(&conf->wait_for_stripe); init_waitqueue_head(&conf->wait_for_overlap); INIT_LIST_HEAD(&conf->handle_list); @@ -7422,7 +7425,7 @@ static void raid5_quiesce(struct mddev *mddev, int state) * active stripes can drain */ conf->quiesce = 2; - wait_event_cmd(conf->wait_for_stripe, + wait_event_cmd(conf->wait_for_quiescent, atomic_read(&conf->active_stripes) == 0 && atomic_read(&conf->active_aligned_reads) == 0, unlock_all_device_hash_locks_irq(conf), @@ -7436,7 +7439,7 @@ static void raid5_quiesce(struct mddev *mddev, int state) case 0: /* re-enable writes */ lock_all_device_hash_locks_irq(conf); conf->quiesce = 0; - wake_up(&conf->wait_for_stripe); + wake_up(&conf->wait_for_quiescent); wake_up(&conf->wait_for_overlap); unlock_all_device_hash_locks_irq(conf); break; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 7dc0dd8..4cc05ec 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -508,6 +508,7 @@ struct r5conf { struct list_head inactive_list[NR_STRIPE_HASH_LOCKS]; atomic_t empty_inactive_list_nr;
Re: [PATCH 2/2] md/raid5: exclusive wait_for_stripe
On Mon, Apr 27, 2015 at 10:24:05AM +1000, NeilBrown wrote: > On Fri, 24 Apr 2015 21:39:04 +0800 Yuanhan Liu > wrote: > > > I noticed heavy spin lock contention at get_active_stripe() with fsmark > > multiple thread write workloads. > > > > Here is how this hot contention comes from. We have limited stripes, and > > it's a multiple thread write workload. Hence, those stripes will be taken > > soon, which puts later processes to sleep for waiting free stripes. When > > enough stripes(> 1/4 total stripes) are released, all process are woken, > > trying to get the lock. But there is one only being able to get this lock > > for each hash lock, making other processes spinning out there for acquiring > > the lock. > > > > Thus, it's effectiveless to wakeup all processes and let them battle for > > a lock that permits one to access only each time. Instead, we could make > > it be a exclusive wake up: wake up one process only. That avoids the heavy > > spin lock contention naturally. > > > > Here are some test results I have got with this patch applied(all test run > > 3 times): > > > > `fsmark.files_per_sec' > > = > > > > next-20150317 this patch > > - - > > metric_value ±stddev metric_value ±stddev change > > testbox/benchmark/testcase-params > > - - > > -- > > 25.600 ±0.0 92.700 ±2.5 262.1% > > ivb44/fsmark/1x-64t-4BRD_12G-RAID5-btrfs-4M-30G-fsyncBeforeClose > > 25.600 ±0.0 77.800 ±0.6 203.9% > > ivb44/fsmark/1x-64t-9BRD_6G-RAID5-btrfs-4M-30G-fsyncBeforeClose > > 32.000 ±0.0 93.800 ±1.7 193.1% > > ivb44/fsmark/1x-64t-4BRD_12G-RAID5-ext4-4M-30G-fsyncBeforeClose > > 32.000 ±0.0 81.233 ±1.7 153.9% > > ivb44/fsmark/1x-64t-9BRD_6G-RAID5-ext4-4M-30G-fsyncBeforeClose > > 48.800 ±14.5 99.667 ±2.0 104.2% > > ivb44/fsmark/1x-64t-4BRD_12G-RAID5-xfs-4M-30G-fsyncBeforeClose > >6.400 ±0.0 12.800 ±0.0 100.0% > > ivb44/fsmark/1x-64t-3HDD-RAID5-btrfs-4M-40G-fsyncBeforeClose > > 63.133 ±8.2 82.800 ±0.7 31.2% > > ivb44/fsmark/1x-64t-9BRD_6G-RAID5-xfs-4M-30G-fsyncBeforeClose > > 245.067 
±0.7 306.567 ±7.9 25.1% > > ivb44/fsmark/1x-64t-4BRD_12G-RAID5-f2fs-4M-30G-fsyncBeforeClose > > 17.533 ±0.3 21.000 ±0.8 19.8% > > ivb44/fsmark/1x-1t-3HDD-RAID5-xfs-4M-40G-fsyncBeforeClose > > 188.167 ±1.9 215.033 ±3.1 14.3% > > ivb44/fsmark/1x-1t-4BRD_12G-RAID5-btrfs-4M-30G-NoSync > > 254.500 ±1.8 290.733 ±2.4 14.2% > > ivb44/fsmark/1x-1t-9BRD_6G-RAID5-btrfs-4M-30G-NoSync > > > > `time.system_time' > > = > > > > next-20150317 this patch > > -- > > metric_value ±stddev metric_value ±stddev change > > testbox/benchmark/testcase-params > > -- > > -- > > 7235.603 ±1.2 185.163 ±1.9 -97.4% > > ivb44/fsmark/1x-64t-4BRD_12G-RAID5-btrfs-4M-30G-fsyncBeforeClose > > 7666.883 ±2.9 202.750 ±1.0 -97.4% > > ivb44/fsmark/1x-64t-9BRD_6G-RAID5-btrfs-4M-30G-fsyncBeforeClose > >14567.893 ±0.7 421.230 ±0.4 -97.1% > > ivb44/fsmark/1x-64t-3HDD-RAID5-btrfs-4M-40G-fsyncBeforeClose > > 3697.667 ±14.0148.190 ±1.7 -96.0% > > ivb44/fsmark/1x-64t-4BRD_12G-RAID5-xfs-4M-30G-fsyncBeforeClose > > 5572.867 ±3.8 310.717 ±1.4 -94.4% > > ivb44/fsmark/1x-64t-9BRD_6G-RAID5-ext4-4M-30G-fsyncBeforeClose > > 5565.050 ±0.5 313.277 ±1.5 -94.4% > > ivb44/fsmark/1x-64t-4BRD_12G-RAID5-ext4-4M-30G-fsyncBeforeClose > > 2420.707 ±17.1171.043 ±2.7 -92.9% > > ivb44/fsmark/1x-64t-9BRD_6G-RAID5-xfs-4M-30G-fsyncBeforeClose > > 3743.300 ±4.6 379.827 ±3.5 -89.9% > > ivb44/fsmark/1x-64
Re: [PATCH 1/2] md/raid5: split wait_for_stripe and introduce wait_for_quiesce
On Mon, Apr 27, 2015 at 10:10:24AM +1000, NeilBrown wrote: > On Fri, 24 Apr 2015 21:39:03 +0800 Yuanhan Liu > wrote: > > > If I read code correctly, current wait_for_stripe actually has 2 usage: > > > > - wait for there is enough free stripe cache, triggered when > > get_free_stripe() failed. This is what wait_for_stripe intend > > for literally. > > > > - wait for quiesce == 0 or > >active_aligned_reads == 0 && active_stripes == 0 > > > > It has nothing to do with wait_for_stripe literally, and releasing > > an active stripe won't actually wake them up. On the contrary, wake_up > > from under this case won't actually wake up the process waiting for > > an free stripe being available. > > I disagree. Releasing an active stripe *will* (or *can*) wake up that third > case, as it decrements "active_stripes" which will eventually reach zero. > > I don't think your new code will properly wake up a process which is waiting > for "active_stripes == 0". Right, and thanks for pointing it out. So, is this enough? --- diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2d8fcc1..3f23035 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -383,6 +383,9 @@ static void release_inactive_stripe_list(struct r5conf *conf, } } } + + if (!atomic_read(>active_stripes)) + wake_up(>wait_for_quiesce); } /* should hold conf->device_lock already */ Or, should I put it a bit ahead, trying to invoke wake_up(>wait_for_quiesce) after each atomic_dec(>active_stripes)? if (atomic_dec_return(>active_stripes) == 0) wake_up(>wait_for_quiesce); > > > > > Hence, we'd better split wait_for_stripe, and here I introduce > > wait_for_quiesce for the second usage. The name may not well taken, or > > even taken wrongly. Feel free to correct me then. > > > > This is also a prepare patch for next patch: make wait_for_stripe > > exclusive. 
> > I think you have this commit description upside down :-) > > The real motivation is that you are seeing contention on some spinlock and so > you want to split 'wait_for_stripe' up in to multiple wait_queues so that you > can use exclusive wakeup. As this is the main motivation, it should be > stated first. > > Then explain that 'wait_for_stripe' is used to wait for the array to enter or > leave the quiescent state, and also to wait for an available stripe in each > of the hash lists. > > So this patch splits the first usage off into a separate wait_queue, and the > next patch will split the second usage into one waitqueue for each hash value. > > Then explain just is what is needed for that first step. > > When you put it that way around, the patch makes lots of sense. It does, and thanks! > > So: could you please resubmit with the description the right way around, and To make sure I followed you correctly, my patch order is correct(I mean, split lock first, and make wait_for_stripe per lock hash and exclusive second), and what I need to do is re-writing the commit log as you suggested, and fixing all issues you pointed out. Right? --yliu > with an appropriate wakeup call to ensure raid5_quiesce is woken up when > active_stripes reaches zero? 
> > Thanks, > NeilBrown > > > > > > Signed-off-by: Yuanhan Liu > > --- > > drivers/md/raid5.c | 13 +++-- > > drivers/md/raid5.h | 1 + > > 2 files changed, 8 insertions(+), 6 deletions(-) > > > > diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c > > index 9716319..b7e385f 100644 > > --- a/drivers/md/raid5.c > > +++ b/drivers/md/raid5.c > > @@ -667,7 +667,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector, > > spin_lock_irq(conf->hash_locks + hash); > > > > do { > > - wait_event_lock_irq(conf->wait_for_stripe, > > + wait_event_lock_irq(conf->wait_for_quiesce, > > conf->quiesce == 0 || noquiesce, > > *(conf->hash_locks + hash)); > > sh = __find_stripe(conf, sector, conf->generation - previous); > > @@ -4725,7 +4725,7 @@ static void raid5_align_endio(struct bio *bi, int > > error) > > raid_bi, 0); > > bio_endio(raid_bi, 0); > > if (atomic_dec_and_test(>active_aligned_reads)) > > - wake_up(>wait_for_stripe); > > + wake_up(>wait_for_quiesce); > > return; > > } > > > > @@ -
Re: [PATCH 1/2] md/raid5: split wait_for_stripe and introduce wait_for_quiesce
On Mon, Apr 27, 2015 at 10:10:24AM +1000, NeilBrown wrote: On Fri, 24 Apr 2015 21:39:03 +0800 Yuanhan Liu yuanhan@linux.intel.com wrote: If I read code correctly, current wait_for_stripe actually has 2 usage: - wait for there is enough free stripe cache, triggered when get_free_stripe() failed. This is what wait_for_stripe intend for literally. - wait for quiesce == 0 or active_aligned_reads == 0 active_stripes == 0 It has nothing to do with wait_for_stripe literally, and releasing an active stripe won't actually wake them up. On the contrary, wake_up from under this case won't actually wake up the process waiting for an free stripe being available. I disagree. Releasing an active stripe *will* (or *can*) wake up that third case, as it decrements active_stripes which will eventually reach zero. I don't think your new code will properly wake up a process which is waiting for active_stripes == 0. Right, and thanks for pointing it out. So, is this enough? --- diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2d8fcc1..3f23035 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -383,6 +383,9 @@ static void release_inactive_stripe_list(struct r5conf *conf, } } } + + if (!atomic_read(conf-active_stripes)) + wake_up(conf-wait_for_quiesce); } /* should hold conf-device_lock already */ Or, should I put it a bit ahead, trying to invoke wake_up(conf-wait_for_quiesce) after each atomic_dec(conf-active_stripes)? if (atomic_dec_return(conf-active_stripes) == 0) wake_up(conf-wait_for_quiesce); Hence, we'd better split wait_for_stripe, and here I introduce wait_for_quiesce for the second usage. The name may not well taken, or even taken wrongly. Feel free to correct me then. This is also a prepare patch for next patch: make wait_for_stripe exclusive. 
I think you have this commit description upside down :-) The real motivation is that you are seeing contention on some spinlock and so you want to split 'wait_for_stripe' up in to multiple wait_queues so that you can use exclusive wakeup. As this is the main motivation, it should be stated first. Then explain that 'wait_for_stripe' is used to wait for the array to enter or leave the quiescent state, and also to wait for an available stripe in each of the hash lists. So this patch splits the first usage off into a separate wait_queue, and the next patch will split the second usage into one waitqueue for each hash value. Then explain just is what is needed for that first step. When you put it that way around, the patch makes lots of sense. It does, and thanks! So: could you please resubmit with the description the right way around, and To make sure I followed you correctly, my patch order is correct(I mean, split lock first, and make wait_for_stripe per lock hash and exclusive second), and what I need to do is re-writing the commit log as you suggested, and fixing all issues you pointed out. Right? --yliu with an appropriate wakeup call to ensure raid5_quiesce is woken up when active_stripes reaches zero? 
Thanks, NeilBrown Signed-off-by: Yuanhan Liu yuanhan@linux.intel.com --- drivers/md/raid5.c | 13 +++-- drivers/md/raid5.h | 1 + 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 9716319..b7e385f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -667,7 +667,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector, spin_lock_irq(conf-hash_locks + hash); do { - wait_event_lock_irq(conf-wait_for_stripe, + wait_event_lock_irq(conf-wait_for_quiesce, conf-quiesce == 0 || noquiesce, *(conf-hash_locks + hash)); sh = __find_stripe(conf, sector, conf-generation - previous); @@ -4725,7 +4725,7 @@ static void raid5_align_endio(struct bio *bi, int error) raid_bi, 0); bio_endio(raid_bi, 0); if (atomic_dec_and_test(conf-active_aligned_reads)) - wake_up(conf-wait_for_stripe); + wake_up(conf-wait_for_quiesce); return; } @@ -4820,7 +4820,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) align_bi-bi_iter.bi_sector += rdev-data_offset; spin_lock_irq(conf-device_lock); - wait_event_lock_irq(conf-wait_for_stripe, + wait_event_lock_irq(conf-wait_for_quiesce, conf-quiesce == 0, conf-device_lock); atomic_inc(conf-active_aligned_reads); @@ -5659,7 +5659,7 @@ static int retry_aligned_read(struct r5conf *conf
Re: [PATCH 2/2] md/raid5: exclusive wait_for_stripe
On Mon, Apr 27, 2015 at 10:24:05AM +1000, NeilBrown wrote: On Fri, 24 Apr 2015 21:39:04 +0800 Yuanhan Liu yuanhan@linux.intel.com wrote: I noticed heavy spin lock contention at get_active_stripe() with fsmark multiple thread write workloads. Here is how this hot contention comes from. We have limited stripes, and it's a multiple thread write workload. Hence, those stripes will be taken soon, which puts later processes to sleep for waiting free stripes. When enough stripes( 1/4 total stripes) are released, all process are woken, trying to get the lock. But there is one only being able to get this lock for each hash lock, making other processes spinning out there for acquiring the lock. Thus, it's effectiveless to wakeup all processes and let them battle for a lock that permits one to access only each time. Instead, we could make it be a exclusive wake up: wake up one process only. That avoids the heavy spin lock contention naturally. Here are some test results I have got with this patch applied(all test run 3 times): `fsmark.files_per_sec' = next-20150317 this patch - - metric_value ±stddev metric_value ±stddev change testbox/benchmark/testcase-params - - -- 25.600 ±0.0 92.700 ±2.5 262.1% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-btrfs-4M-30G-fsyncBeforeClose 25.600 ±0.0 77.800 ±0.6 203.9% ivb44/fsmark/1x-64t-9BRD_6G-RAID5-btrfs-4M-30G-fsyncBeforeClose 32.000 ±0.0 93.800 ±1.7 193.1% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-ext4-4M-30G-fsyncBeforeClose 32.000 ±0.0 81.233 ±1.7 153.9% ivb44/fsmark/1x-64t-9BRD_6G-RAID5-ext4-4M-30G-fsyncBeforeClose 48.800 ±14.5 99.667 ±2.0 104.2% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-xfs-4M-30G-fsyncBeforeClose 6.400 ±0.0 12.800 ±0.0 100.0% ivb44/fsmark/1x-64t-3HDD-RAID5-btrfs-4M-40G-fsyncBeforeClose 63.133 ±8.2 82.800 ±0.7 31.2% ivb44/fsmark/1x-64t-9BRD_6G-RAID5-xfs-4M-30G-fsyncBeforeClose 245.067 ±0.7 306.567 ±7.9 25.1% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-f2fs-4M-30G-fsyncBeforeClose 17.533 ±0.3 21.000 ±0.8 19.8% 
ivb44/fsmark/1x-1t-3HDD-RAID5-xfs-4M-40G-fsyncBeforeClose 188.167 ±1.9 215.033 ±3.1 14.3% ivb44/fsmark/1x-1t-4BRD_12G-RAID5-btrfs-4M-30G-NoSync 254.500 ±1.8 290.733 ±2.4 14.2% ivb44/fsmark/1x-1t-9BRD_6G-RAID5-btrfs-4M-30G-NoSync `time.system_time' = next-20150317 this patch -- metric_value ±stddev metric_value ±stddev change testbox/benchmark/testcase-params -- -- 7235.603 ±1.2 185.163 ±1.9 -97.4% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-btrfs-4M-30G-fsyncBeforeClose 7666.883 ±2.9 202.750 ±1.0 -97.4% ivb44/fsmark/1x-64t-9BRD_6G-RAID5-btrfs-4M-30G-fsyncBeforeClose 14567.893 ±0.7 421.230 ±0.4 -97.1% ivb44/fsmark/1x-64t-3HDD-RAID5-btrfs-4M-40G-fsyncBeforeClose 3697.667 ±14.0148.190 ±1.7 -96.0% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-xfs-4M-30G-fsyncBeforeClose 5572.867 ±3.8 310.717 ±1.4 -94.4% ivb44/fsmark/1x-64t-9BRD_6G-RAID5-ext4-4M-30G-fsyncBeforeClose 5565.050 ±0.5 313.277 ±1.5 -94.4% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-ext4-4M-30G-fsyncBeforeClose 2420.707 ±17.1171.043 ±2.7 -92.9% ivb44/fsmark/1x-64t-9BRD_6G-RAID5-xfs-4M-30G-fsyncBeforeClose 3743.300 ±4.6 379.827 ±3.5 -89.9% ivb44/fsmark/1x-64t-3HDD-RAID5-ext4-4M-40G-fsyncBeforeClose 3308.687 ±6.3 363.050 ±2.0 -89.0% ivb44/fsmark/1x-64t-3HDD-RAID5-xfs-4M-40G-fsyncBeforeClose Where, 1x: where 'x' means iterations or loop, corresponding to the 'L' option of fsmark 1t, 64t: where 't' means thread 4M: means the single file size, corresponding to the '-s' option of fsmark 40G, 30G, 120G: means the total test size 4BRD_12G: BRD is the ramdisk, where '4' means 4 ramdisk, and where '12G' means the size of one ramdisk. So
[PATCH 2/3 v2] md/raid5: split wait_for_stripe and introduce wait_for_quiescent
I noticed heavy spin lock contention at get_active_stripe(), introduced at being wake up stage, where a bunch of processes try to re-hold the spin lock again. After giving some thoughts on this issue, I found the lock could be relieved(and even avoided) if we turn the wait_for_stripe to per waitqueue for each lock hash and make the wake up exclusive: wake up one process each time, which avoids the lock contention naturally. Before go hacking with wait_for_stripe, I found it actually has 2 usages: for the array to enter or leave the quiescent state, and also to wait for an available stripe in each of the hash lists. So this patch splits the first usage off into a separate wait_queue, wait_for_quiescent, and the next patch will turn the second usage into one waitqueue for each hash value, and make it exclusive, to relieve the lock contention. v2: wake_up(wait_for_quiescent) when (active_stripes == 0) Commit log refactor suggestion from Neil. Signed-off-by: Yuanhan Liu yuanhan@linux.intel.com --- drivers/md/raid5.c | 15 +-- drivers/md/raid5.h | 1 + 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 77dfd72..64d5bea 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -374,6 +374,8 @@ static void release_inactive_stripe_list(struct r5conf *conf, if (do_wakeup) { wake_up(conf-wait_for_stripe); + if (atomic_read(conf-active_stripes) == 0) + wake_up(conf-wait_for_quiescent); if (conf-retry_read_aligned) md_wakeup_thread(conf-mddev-thread); } @@ -667,7 +669,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector, spin_lock_irq(conf-hash_locks + hash); do { - wait_event_lock_irq(conf-wait_for_stripe, + wait_event_lock_irq(conf-wait_for_quiescent, conf-quiesce == 0 || noquiesce, *(conf-hash_locks + hash)); sh = __find_stripe(conf, sector, conf-generation - previous); @@ -4729,7 +4731,7 @@ static void raid5_align_endio(struct bio *bi, int error) raid_bi, 0); bio_endio(raid_bi, 0); if 
(atomic_dec_and_test(conf-active_aligned_reads)) - wake_up(conf-wait_for_stripe); + wake_up(conf-wait_for_quiescent); return; } @@ -4824,7 +4826,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) align_bi-bi_iter.bi_sector += rdev-data_offset; spin_lock_irq(conf-device_lock); - wait_event_lock_irq(conf-wait_for_stripe, + wait_event_lock_irq(conf-wait_for_quiescent, conf-quiesce == 0, conf-device_lock); atomic_inc(conf-active_aligned_reads); @@ -5668,7 +5670,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) bio_endio(raid_bio, 0); } if (atomic_dec_and_test(conf-active_aligned_reads)) - wake_up(conf-wait_for_stripe); + wake_up(conf-wait_for_quiescent); return handled; } @@ -6399,6 +6401,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) goto abort; spin_lock_init(conf-device_lock); seqcount_init(conf-gen_lock); + init_waitqueue_head(conf-wait_for_quiescent); init_waitqueue_head(conf-wait_for_stripe); init_waitqueue_head(conf-wait_for_overlap); INIT_LIST_HEAD(conf-handle_list); @@ -7422,7 +7425,7 @@ static void raid5_quiesce(struct mddev *mddev, int state) * active stripes can drain */ conf-quiesce = 2; - wait_event_cmd(conf-wait_for_stripe, + wait_event_cmd(conf-wait_for_quiescent, atomic_read(conf-active_stripes) == 0 atomic_read(conf-active_aligned_reads) == 0, unlock_all_device_hash_locks_irq(conf), @@ -7436,7 +7439,7 @@ static void raid5_quiesce(struct mddev *mddev, int state) case 0: /* re-enable writes */ lock_all_device_hash_locks_irq(conf); conf-quiesce = 0; - wake_up(conf-wait_for_stripe); + wake_up(conf-wait_for_quiescent); wake_up(conf-wait_for_overlap); unlock_all_device_hash_locks_irq(conf); break; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 7dc0dd8..4cc05ec 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -508,6 +508,7 @@ struct r5conf { struct list_headinactive_list[NR_STRIPE_HASH_LOCKS]; atomic_tempty_inactive_list_nr; struct
[PATCH 1/3] wait: introduce wait_event_cmd_exclusive
It's just a variant of wait_event_cmd, with exclusive flag being set. For cases like RAID5, which puts many processes to sleep until 1/4 resources are free, a wake_up wakes up all processes to run, but there is one process being able to get the resource as it's protected by a spin lock. That ends up introducing heavy lock contentions, and hurts performance badly. Here introduce wait_event_cmd_exclusive to relieve the lock contention naturally by letting wake_up() just wake up one process. Cc: Ingo Molnar mi...@redhat.com Cc: Peter Zijlstra pet...@infradead.org Signed-off-by: Yuanhan Liu yuanhan@linux.intel.com --- include/linux/wait.h | 14 +++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/include/linux/wait.h b/include/linux/wait.h index 2db8334..6c3b4de 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -358,10 +358,18 @@ do { \ __ret; \ }) -#define __wait_event_cmd(wq, condition, cmd1, cmd2)\ - (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ +#define __wait_event_cmd(wq, condition, cmd1, cmd2, exclusive) \ + (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, exclusive, 0, \ cmd1; schedule(); cmd2) + +#define wait_event_cmd_exclusive(wq, condition, cmd1, cmd2) \ +do { \ + if (condition) \ + break; \ + __wait_event_cmd(wq, condition, cmd1, cmd2, 1); \ +} while (0) + /** * wait_event_cmd - sleep until a condition gets true * @wq: the waitqueue to wait on @@ -380,7 +388,7 @@ do { \ do { \ if (condition) \ break; \ - __wait_event_cmd(wq, condition, cmd1, cmd2);\ + __wait_event_cmd(wq, condition, cmd1, cmd2, 0); \ } while (0) #define __wait_event_interruptible(wq, condition) \ -- 1.9.0 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 3/3 v2] md/raid5: per hash value and exclusive wait_for_stripe
heavily, up to 97%. And as expected, the performance increased a lot, up to 260%, for fast device(ram disk). v2: use bits instead of array to note down wait queue need to wake up. Signed-off-by: Yuanhan Liu yuanhan@linux.intel.com --- drivers/md/raid5.c | 27 +++ drivers/md/raid5.h | 2 +- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 64d5bea..1b11bbf 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -344,7 +344,8 @@ static void release_inactive_stripe_list(struct r5conf *conf, int hash) { int size; - bool do_wakeup = false; + unsigned long do_wakeup = 0; + int i = 0; unsigned long flags; if (hash == NR_STRIPE_HASH_LOCKS) { @@ -365,15 +366,19 @@ static void release_inactive_stripe_list(struct r5conf *conf, !list_empty(list)) atomic_dec(conf-empty_inactive_list_nr); list_splice_tail_init(list, conf-inactive_list + hash); - do_wakeup = true; + do_wakeup |= 1 (size - 1); spin_unlock_irqrestore(conf-hash_locks + hash, flags); } size--; hash--; } + for (i = 0; i NR_STRIPE_HASH_LOCKS; i++) { + if (do_wakeup (1 i)) + wake_up(conf-wait_for_stripe[i]); + } + if (do_wakeup) { - wake_up(conf-wait_for_stripe); if (atomic_read(conf-active_stripes) == 0) wake_up(conf-wait_for_quiescent); if (conf-retry_read_aligned) @@ -686,14 +691,15 @@ get_active_stripe(struct r5conf *conf, sector_t sector, if (!sh) { set_bit(R5_INACTIVE_BLOCKED, conf-cache_state); - wait_event_lock_irq( - conf-wait_for_stripe, + wait_event_cmd_exclusive( + conf-wait_for_stripe[hash], !list_empty(conf-inactive_list + hash) (atomic_read(conf-active_stripes) (conf-max_nr_stripes * 3 / 4) || !test_bit(R5_INACTIVE_BLOCKED, conf-cache_state)), - *(conf-hash_locks + hash)); + spin_unlock_irq(conf-hash_locks + hash), + spin_lock_irq(conf-hash_locks + hash)); clear_bit(R5_INACTIVE_BLOCKED, conf-cache_state); } else { @@ -718,6 +724,9 @@ get_active_stripe(struct r5conf *conf, sector_t sector, } } while (sh == NULL); + if 
(!list_empty(conf-inactive_list + hash)) + wake_up(conf-wait_for_stripe[hash]); + spin_unlock_irq(conf-hash_locks + hash); return sh; } @@ -2138,7 +2147,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) cnt = 0; list_for_each_entry(nsh, newstripes, lru) { lock_device_hash_lock(conf, hash); - wait_event_cmd(conf-wait_for_stripe, + wait_event_cmd_exclusive(conf-wait_for_stripe[hash], !list_empty(conf-inactive_list + hash), unlock_device_hash_lock(conf, hash), lock_device_hash_lock(conf, hash)); @@ -6402,7 +6411,9 @@ static struct r5conf *setup_conf(struct mddev *mddev) spin_lock_init(conf-device_lock); seqcount_init(conf-gen_lock); init_waitqueue_head(conf-wait_for_quiescent); - init_waitqueue_head(conf-wait_for_stripe); + for (i = 0; i NR_STRIPE_HASH_LOCKS; i++) { + init_waitqueue_head(conf-wait_for_stripe[i]); + } init_waitqueue_head(conf-wait_for_overlap); INIT_LIST_HEAD(conf-handle_list); INIT_LIST_HEAD(conf-hold_list); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 4cc05ec..6307b90 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -509,7 +509,7 @@ struct r5conf { atomic_tempty_inactive_list_nr; struct llist_head released_stripes; wait_queue_head_t wait_for_quiescent; - wait_queue_head_t wait_for_stripe; + wait_queue_head_t wait_for_stripe[NR_STRIPE_HASH_LOCKS]; wait_queue_head_t wait_for_overlap; unsigned long cache_state; #define R5_INACTIVE_BLOCKED1 /* release of inactive stripes blocked, -- 1.9.0
[PATCH 1/2] md/raid5: split wait_for_stripe and introduce wait_for_quiesce
If I read the code correctly, the current wait_for_stripe actually has 2 usages: - wait for there being enough free stripe cache, triggered when get_free_stripe() failed. This is what wait_for_stripe is literally intended for. - wait for quiesce == 0 or active_aligned_reads == 0 && active_stripes == 0 It has nothing to do with wait_for_stripe literally, and releasing an active stripe won't actually wake them up. On the contrary, a wake_up from under this case won't actually wake up the process waiting for a free stripe being available. Hence, we'd better split wait_for_stripe, and here I introduce wait_for_quiesce for the second usage. The name may not be well taken, or even taken wrongly. Feel free to correct me then. This is also a preparatory patch for the next patch: make wait_for_stripe exclusive. Signed-off-by: Yuanhan Liu --- drivers/md/raid5.c | 13 +++-- drivers/md/raid5.h | 1 + 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 9716319..b7e385f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -667,7 +667,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector, spin_lock_irq(conf->hash_locks + hash); do { - wait_event_lock_irq(conf->wait_for_stripe, + wait_event_lock_irq(conf->wait_for_quiesce, conf->quiesce == 0 || noquiesce, *(conf->hash_locks + hash)); sh = __find_stripe(conf, sector, conf->generation - previous); @@ -4725,7 +4725,7 @@ static void raid5_align_endio(struct bio *bi, int error) raid_bi, 0); bio_endio(raid_bi, 0); if (atomic_dec_and_test(&conf->active_aligned_reads)) - wake_up(&conf->wait_for_stripe); + wake_up(&conf->wait_for_quiesce); return; } @@ -4820,7 +4820,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) align_bi->bi_iter.bi_sector += rdev->data_offset; spin_lock_irq(&conf->device_lock); - wait_event_lock_irq(conf->wait_for_stripe, + wait_event_lock_irq(conf->wait_for_quiesce, conf->quiesce == 0, conf->device_lock); atomic_inc(&conf->active_aligned_reads); @@ -5659,7 +5659,7 @@ static int
retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) bio_endio(raid_bio, 0); } if (atomic_dec_and_test(&conf->active_aligned_reads)) - wake_up(&conf->wait_for_stripe); + wake_up(&conf->wait_for_quiesce); return handled; } @@ -6390,6 +6390,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) goto abort; spin_lock_init(&conf->device_lock); seqcount_init(&conf->gen_lock); + init_waitqueue_head(&conf->wait_for_quiesce); init_waitqueue_head(&conf->wait_for_stripe); init_waitqueue_head(&conf->wait_for_overlap); INIT_LIST_HEAD(&conf->handle_list); @@ -7413,7 +7414,7 @@ static void raid5_quiesce(struct mddev *mddev, int state) * active stripes can drain */ conf->quiesce = 2; - wait_event_cmd(conf->wait_for_stripe, + wait_event_cmd(conf->wait_for_quiesce, atomic_read(&conf->active_stripes) == 0 && atomic_read(&conf->active_aligned_reads) == 0, unlock_all_device_hash_locks_irq(conf), @@ -7427,7 +7428,7 @@ static void raid5_quiesce(struct mddev *mddev, int state) case 0: /* re-enable writes */ lock_all_device_hash_locks_irq(conf); conf->quiesce = 0; - wake_up(&conf->wait_for_stripe); + wake_up(&conf->wait_for_quiesce); wake_up(&conf->wait_for_overlap); unlock_all_device_hash_locks_irq(conf); break; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 7dc0dd8..fab53a3 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -508,6 +508,7 @@ struct r5conf { struct list_head inactive_list[NR_STRIPE_HASH_LOCKS]; atomic_t empty_inactive_list_nr; struct llist_head released_stripes; + wait_queue_head_t wait_for_quiesce; wait_queue_head_t wait_for_stripe; wait_queue_head_t wait_for_overlap; unsigned long cache_state; -- 1.9.0 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/2] md/raid5: exclusive wait_for_stripe
I noticed heavy spin lock contention at get_active_stripe() with fsmark multiple thread write workloads. Here is how this hot contention comes from. We have limited stripes, and it's a multiple thread write workload. Hence, those stripes will be taken soon, which puts later processes to sleep for waiting free stripes. When enough stripes(> 1/4 total stripes) are released, all process are woken, trying to get the lock. But there is one only being able to get this lock for each hash lock, making other processes spinning out there for acquiring the lock. Thus, it's effectiveless to wakeup all processes and let them battle for a lock that permits one to access only each time. Instead, we could make it be a exclusive wake up: wake up one process only. That avoids the heavy spin lock contention naturally. Here are some test results I have got with this patch applied(all test run 3 times): `fsmark.files_per_sec' = next-20150317 this patch - - metric_value ±stddev metric_value ±stddev change testbox/benchmark/testcase-params - - -- 25.600 ±0.0 92.700 ±2.5 262.1% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-btrfs-4M-30G-fsyncBeforeClose 25.600 ±0.0 77.800 ±0.6 203.9% ivb44/fsmark/1x-64t-9BRD_6G-RAID5-btrfs-4M-30G-fsyncBeforeClose 32.000 ±0.0 93.800 ±1.7 193.1% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-ext4-4M-30G-fsyncBeforeClose 32.000 ±0.0 81.233 ±1.7 153.9% ivb44/fsmark/1x-64t-9BRD_6G-RAID5-ext4-4M-30G-fsyncBeforeClose 48.800 ±14.5 99.667 ±2.0 104.2% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-xfs-4M-30G-fsyncBeforeClose 6.400 ±0.0 12.800 ±0.0 100.0% ivb44/fsmark/1x-64t-3HDD-RAID5-btrfs-4M-40G-fsyncBeforeClose 63.133 ±8.2 82.800 ±0.7 31.2% ivb44/fsmark/1x-64t-9BRD_6G-RAID5-xfs-4M-30G-fsyncBeforeClose 245.067 ±0.7 306.567 ±7.9 25.1% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-f2fs-4M-30G-fsyncBeforeClose 17.533 ±0.3 21.000 ±0.8 19.8% ivb44/fsmark/1x-1t-3HDD-RAID5-xfs-4M-40G-fsyncBeforeClose 188.167 ±1.9 215.033 ±3.1 14.3% ivb44/fsmark/1x-1t-4BRD_12G-RAID5-btrfs-4M-30G-NoSync 254.500 ±1.8 290.733 ±2.4 14.2% 
ivb44/fsmark/1x-1t-9BRD_6G-RAID5-btrfs-4M-30G-NoSync `time.system_time' = next-20150317 this patch -- metric_value ±stddev metric_value ±stddev change testbox/benchmark/testcase-params -- -- 7235.603 ±1.2 185.163 ±1.9 -97.4% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-btrfs-4M-30G-fsyncBeforeClose 7666.883 ±2.9 202.750 ±1.0 -97.4% ivb44/fsmark/1x-64t-9BRD_6G-RAID5-btrfs-4M-30G-fsyncBeforeClose 14567.893 ±0.7 421.230 ±0.4 -97.1% ivb44/fsmark/1x-64t-3HDD-RAID5-btrfs-4M-40G-fsyncBeforeClose 3697.667 ±14.0148.190 ±1.7 -96.0% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-xfs-4M-30G-fsyncBeforeClose 5572.867 ±3.8 310.717 ±1.4 -94.4% ivb44/fsmark/1x-64t-9BRD_6G-RAID5-ext4-4M-30G-fsyncBeforeClose 5565.050 ±0.5 313.277 ±1.5 -94.4% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-ext4-4M-30G-fsyncBeforeClose 2420.707 ±17.1171.043 ±2.7 -92.9% ivb44/fsmark/1x-64t-9BRD_6G-RAID5-xfs-4M-30G-fsyncBeforeClose 3743.300 ±4.6 379.827 ±3.5 -89.9% ivb44/fsmark/1x-64t-3HDD-RAID5-ext4-4M-40G-fsyncBeforeClose 3308.687 ±6.3 363.050 ±2.0 -89.0% ivb44/fsmark/1x-64t-3HDD-RAID5-xfs-4M-40G-fsyncBeforeClose Where, 1x: where 'x' means iterations or loop, corresponding to the 'L' option of fsmark 1t, 64t: where 't' means thread 4M: means the single file size, corresponding to the '-s' option of fsmark 40G, 30G, 120G: means the total test size 4BRD_12G: BRD is the ramdisk, where '4' means 4 ramdisk, and where '12G' means the size of one ramdisk. So, it would be 48G in total. And we made a raid on those ramdisk As you can see, though there are no much performance gain for hard disk workload, the system time is dropped heavily, up to 97%. And as expected, the performance increased a lot, up to 260%, for fast device(ram disk). Signed-off-by: Yuanhan
[PATCH 2/2] md/raid5: exclusive wait_for_stripe
I noticed heavy spin lock contention at get_active_stripe() with fsmark multiple thread write workloads. Here is how this hot contention comes from. We have limited stripes, and it's a multiple thread write workload. Hence, those stripes will be taken soon, which puts later processes to sleep for waiting free stripes. When enough stripes( 1/4 total stripes) are released, all process are woken, trying to get the lock. But there is one only being able to get this lock for each hash lock, making other processes spinning out there for acquiring the lock. Thus, it's effectiveless to wakeup all processes and let them battle for a lock that permits one to access only each time. Instead, we could make it be a exclusive wake up: wake up one process only. That avoids the heavy spin lock contention naturally. Here are some test results I have got with this patch applied(all test run 3 times): `fsmark.files_per_sec' = next-20150317 this patch - - metric_value ±stddev metric_value ±stddev change testbox/benchmark/testcase-params - - -- 25.600 ±0.0 92.700 ±2.5 262.1% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-btrfs-4M-30G-fsyncBeforeClose 25.600 ±0.0 77.800 ±0.6 203.9% ivb44/fsmark/1x-64t-9BRD_6G-RAID5-btrfs-4M-30G-fsyncBeforeClose 32.000 ±0.0 93.800 ±1.7 193.1% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-ext4-4M-30G-fsyncBeforeClose 32.000 ±0.0 81.233 ±1.7 153.9% ivb44/fsmark/1x-64t-9BRD_6G-RAID5-ext4-4M-30G-fsyncBeforeClose 48.800 ±14.5 99.667 ±2.0 104.2% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-xfs-4M-30G-fsyncBeforeClose 6.400 ±0.0 12.800 ±0.0 100.0% ivb44/fsmark/1x-64t-3HDD-RAID5-btrfs-4M-40G-fsyncBeforeClose 63.133 ±8.2 82.800 ±0.7 31.2% ivb44/fsmark/1x-64t-9BRD_6G-RAID5-xfs-4M-30G-fsyncBeforeClose 245.067 ±0.7 306.567 ±7.9 25.1% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-f2fs-4M-30G-fsyncBeforeClose 17.533 ±0.3 21.000 ±0.8 19.8% ivb44/fsmark/1x-1t-3HDD-RAID5-xfs-4M-40G-fsyncBeforeClose 188.167 ±1.9 215.033 ±3.1 14.3% ivb44/fsmark/1x-1t-4BRD_12G-RAID5-btrfs-4M-30G-NoSync 254.500 ±1.8 290.733 ±2.4 14.2% 
ivb44/fsmark/1x-1t-9BRD_6G-RAID5-btrfs-4M-30G-NoSync `time.system_time' = next-20150317 this patch -- metric_value ±stddev metric_value ±stddev change testbox/benchmark/testcase-params -- -- 7235.603 ±1.2 185.163 ±1.9 -97.4% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-btrfs-4M-30G-fsyncBeforeClose 7666.883 ±2.9 202.750 ±1.0 -97.4% ivb44/fsmark/1x-64t-9BRD_6G-RAID5-btrfs-4M-30G-fsyncBeforeClose 14567.893 ±0.7 421.230 ±0.4 -97.1% ivb44/fsmark/1x-64t-3HDD-RAID5-btrfs-4M-40G-fsyncBeforeClose 3697.667 ±14.0148.190 ±1.7 -96.0% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-xfs-4M-30G-fsyncBeforeClose 5572.867 ±3.8 310.717 ±1.4 -94.4% ivb44/fsmark/1x-64t-9BRD_6G-RAID5-ext4-4M-30G-fsyncBeforeClose 5565.050 ±0.5 313.277 ±1.5 -94.4% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-ext4-4M-30G-fsyncBeforeClose 2420.707 ±17.1171.043 ±2.7 -92.9% ivb44/fsmark/1x-64t-9BRD_6G-RAID5-xfs-4M-30G-fsyncBeforeClose 3743.300 ±4.6 379.827 ±3.5 -89.9% ivb44/fsmark/1x-64t-3HDD-RAID5-ext4-4M-40G-fsyncBeforeClose 3308.687 ±6.3 363.050 ±2.0 -89.0% ivb44/fsmark/1x-64t-3HDD-RAID5-xfs-4M-40G-fsyncBeforeClose Where, 1x: where 'x' means iterations or loop, corresponding to the 'L' option of fsmark 1t, 64t: where 't' means thread 4M: means the single file size, corresponding to the '-s' option of fsmark 40G, 30G, 120G: means the total test size 4BRD_12G: BRD is the ramdisk, where '4' means 4 ramdisk, and where '12G' means the size of one ramdisk. So, it would be 48G in total. And we made a raid on those ramdisk As you can see, though there are no much performance gain for hard disk workload, the system time is dropped heavily, up to 97%. And as expected, the performance increased a lot, up to 260%, for fast device(ram disk). Signed-off-by: Yuanhan Liu yuanhan
[PATCH 1/2] md/raid5: split wait_for_stripe and introduce wait_for_quiesce
If I read code correctly, current wait_for_stripe actually has 2 usage: - wait for there is enough free stripe cache, triggered when get_free_stripe() failed. This is what wait_for_stripe intend for literally. - wait for quiesce == 0 or active_aligned_reads == 0 active_stripes == 0 It has nothing to do with wait_for_stripe literally, and releasing an active stripe won't actually wake them up. On the contrary, wake_up from under this case won't actually wake up the process waiting for an free stripe being available. Hence, we'd better split wait_for_stripe, and here I introduce wait_for_quiesce for the second usage. The name may not well taken, or even taken wrongly. Feel free to correct me then. This is also a prepare patch for next patch: make wait_for_stripe exclusive. Signed-off-by: Yuanhan Liu yuanhan@linux.intel.com --- drivers/md/raid5.c | 13 +++-- drivers/md/raid5.h | 1 + 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 9716319..b7e385f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -667,7 +667,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector, spin_lock_irq(conf-hash_locks + hash); do { - wait_event_lock_irq(conf-wait_for_stripe, + wait_event_lock_irq(conf-wait_for_quiesce, conf-quiesce == 0 || noquiesce, *(conf-hash_locks + hash)); sh = __find_stripe(conf, sector, conf-generation - previous); @@ -4725,7 +4725,7 @@ static void raid5_align_endio(struct bio *bi, int error) raid_bi, 0); bio_endio(raid_bi, 0); if (atomic_dec_and_test(conf-active_aligned_reads)) - wake_up(conf-wait_for_stripe); + wake_up(conf-wait_for_quiesce); return; } @@ -4820,7 +4820,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) align_bi-bi_iter.bi_sector += rdev-data_offset; spin_lock_irq(conf-device_lock); - wait_event_lock_irq(conf-wait_for_stripe, + wait_event_lock_irq(conf-wait_for_quiesce, conf-quiesce == 0, conf-device_lock); atomic_inc(conf-active_aligned_reads); @@ 
-5659,7 +5659,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) bio_endio(raid_bio, 0); } if (atomic_dec_and_test(conf-active_aligned_reads)) - wake_up(conf-wait_for_stripe); + wake_up(conf-wait_for_quiesce); return handled; } @@ -6390,6 +6390,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) goto abort; spin_lock_init(conf-device_lock); seqcount_init(conf-gen_lock); + init_waitqueue_head(conf-wait_for_quiesce); init_waitqueue_head(conf-wait_for_stripe); init_waitqueue_head(conf-wait_for_overlap); INIT_LIST_HEAD(conf-handle_list); @@ -7413,7 +7414,7 @@ static void raid5_quiesce(struct mddev *mddev, int state) * active stripes can drain */ conf-quiesce = 2; - wait_event_cmd(conf-wait_for_stripe, + wait_event_cmd(conf-wait_for_quiesce, atomic_read(conf-active_stripes) == 0 atomic_read(conf-active_aligned_reads) == 0, unlock_all_device_hash_locks_irq(conf), @@ -7427,7 +7428,7 @@ static void raid5_quiesce(struct mddev *mddev, int state) case 0: /* re-enable writes */ lock_all_device_hash_locks_irq(conf); conf-quiesce = 0; - wake_up(conf-wait_for_stripe); + wake_up(conf-wait_for_quiesce); wake_up(conf-wait_for_overlap); unlock_all_device_hash_locks_irq(conf); break; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 7dc0dd8..fab53a3 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -508,6 +508,7 @@ struct r5conf { struct list_headinactive_list[NR_STRIPE_HASH_LOCKS]; atomic_tempty_inactive_list_nr; struct llist_head released_stripes; + wait_queue_head_t wait_for_quiesce; wait_queue_head_t wait_for_stripe; wait_queue_head_t wait_for_overlap; unsigned long cache_state; -- 1.9.0 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
performance changes on c9dc4c65: 9.8% fsmark.files_per_sec
FYI, we found performance increasement, which is expected as commit patch says, on `fsmark.files_per_sec' by c9dc4c6578502c2085705347375b82089aad18d0: > commit c9dc4c6578502c2085705347375b82089aad18d0 > Author: Chris Mason > AuthorDate: Sat Apr 4 17:14:42 2015 -0700 > Commit: Chris Mason > CommitDate: Fri Apr 10 14:07:11 2015 -0700 > > Btrfs: two stage dirty block group writeout 4c6d1d85ad89fd8e32dc9204b7f944854399bda9 c9dc4c6578502c2085705347375b82089aad18d0 run time(m) metric_value ±stddev run time(m) metric_value ±stddev change testbox/benchmark/testcase-params --- -- --- -- -- 3 7.3 |35.267|±0.55 6.6 |38.740| ±1.69.8% ivb44/fsmark/1x-1t-1HDD-btrfs-4M-60G-NoSync NOTE: here are some more explanation about those test parameters for you to know what the testcase does better: 1x: where 'x' means iterations or loop, corresponding to the 'L' option of fsmark 1t, 64t: where 't' means thread 4M: means the single file size, corresponding to the '-s' option of fsmark 60G: means the total test size And FYI, here are more changes by the same commit: 4c6d1d85ad89fd8e c9dc4c6578502c208570534737 -- %stddev %change %stddev \ |\ 9864 ± 2%+156.9% 25345 ± 4% fsmark.time.voluntary_context_switches 9 ± 0% +17.8% 10 ± 4% fsmark.time.percent_of_cpu_this_job_got 462211 ± 1% +16.8% 539707 ± 0% fsmark.app_overhead 35.27 ± 0% +9.8% 38.74 ± 1% fsmark.files_per_sec 435 ± 0% -9.0%396 ± 1% fsmark.time.elapsed_time.max 435 ± 0% -9.0%396 ± 1% fsmark.time.elapsed_time 5.20 ± 2% -70.3% 1.54 ± 6% turbostat.Pkg%pc6 2447873 ± 42% -67.9% 785086 ± 33% numa-numastat.node1.numa_hit 2413662 ± 43% -68.1% 771115 ± 31% numa-numastat.node1.local_node 9864 ± 2%+156.9% 25345 ± 4% time.voluntary_context_switches 187680 ± 10%+126.8% 425676 ± 7% numa-vmstat.node1.nr_dirty 747361 ± 9%+127.8%1702809 ± 7% numa-meminfo.node1.Dirty 1787510 ± 1%+117.0%3878984 ± 2% meminfo.Dirty 446861 ± 1%+117.0% 969472 ± 2% proc-vmstat.nr_dirty 1655962 ± 37% -59.3% 673988 ± 29% numa-vmstat.node1.numa_local 1036191 ± 8%+110.3%2179311 ± 
3% numa-meminfo.node0.Dirty 259069 ± 8%+110.3% 544783 ± 3% numa-vmstat.node0.nr_dirty 1687987 ± 37% -58.6% 698626 ± 29% numa-vmstat.node1.numa_hit 1 ± 0%+100.0% 2 ± 0% vmstat.procs.b 0.02 ± 0%+100.0% 0.04 ± 22% turbostat.CPU%c3 6.03 ± 1% +76.9% 10.67 ± 1% turbostat.CPU%c1 5.189e+08 ± 0% +72.6% 8.956e+08 ± 1% cpuidle.C1-IVT.time 2646692 ± 7% +75.0%4630890 ± 23% cpuidle.C3-IVT.time 5301 ± 6% -31.7% 3620 ± 3% slabinfo.btrfs_ordered_extent.active_objs 10549 ± 16% -30.3% 7349 ± 12% numa-vmstat.node1.nr_slab_reclaimable 5353 ± 6% -31.4% 3670 ± 3% slabinfo.btrfs_ordered_extent.num_objs 42169 ± 16% -30.3% 29397 ± 12% numa-meminfo.node1.SReclaimable 1619825 ± 22% +39.4%2258188 ± 4% proc-vmstat.pgfree 4611 ± 7% -28.0% 3318 ± 1% slabinfo.btrfs_delayed_ref_head.num_objs 4471 ± 8% -27.0% 3264 ± 2% slabinfo.btrfs_delayed_ref_head.active_objs 67.93 ± 1% -24.7% 51.15 ± 4% turbostat.Pkg%pc2 2332975 ± 21% +45.6%3396446 ± 4% numa-vmstat.node1.numa_other 2300949 ± 22% +46.5%3371807 ± 4% numa-vmstat.node1.numa_miss 2300941 ± 22% +46.5%3371793 ± 4% numa-vmstat.node0.numa_foreign 2952 ± 8% -23.3% 2263 ± 3% slabinfo.btrfs_delayed_data_ref.num_objs 2570716 ± 3% +25.7%3230157 ± 2% numa-meminfo.node1.Writeback 642367 ± 3% +25.7% 807533 ± 2% numa-vmstat.node1.nr_writeback 95408 ± 13% -17.3% 78910 ± 6% numa-meminfo.node1.Slab 2803 ± 7% -21.1% 2210 ± 3% slabinfo.btrfs_delayed_data_ref.active_objs 240 ± 9% +23.1%295 ± 16% numa-vmstat.node0.nr_page_table_pages 4626942 ± 19% +49.6%6924087 ± 22% cpuidle.C1E-IVT.time 5585235 ± 0% +25.5%7011242 ± 0% meminfo.Writeback 1396232 ± 0% +25.5%1752892 ± 0% proc-vmstat.nr_writeback 962 ± 9% +23.0% 1184 ± 16% numa-meminfo.node0.PageTables 9 ± 0% +17.8% 10 ± 4% time.percent_of_cpu_this_job_got 754027 ± 2% +25.2% 944312
performance changes on c9dc4c65: 9.8% fsmark.files_per_sec
FYI, we found performance increasement, which is expected as commit patch says, on `fsmark.files_per_sec' by c9dc4c6578502c2085705347375b82089aad18d0: commit c9dc4c6578502c2085705347375b82089aad18d0 Author: Chris Mason c...@fb.com AuthorDate: Sat Apr 4 17:14:42 2015 -0700 Commit: Chris Mason c...@fb.com CommitDate: Fri Apr 10 14:07:11 2015 -0700 Btrfs: two stage dirty block group writeout 4c6d1d85ad89fd8e32dc9204b7f944854399bda9 c9dc4c6578502c2085705347375b82089aad18d0 run time(m) metric_value ±stddev run time(m) metric_value ±stddev change testbox/benchmark/testcase-params --- -- --- -- -- 3 7.3 |35.267|±0.55 6.6 |38.740| ±1.69.8% ivb44/fsmark/1x-1t-1HDD-btrfs-4M-60G-NoSync NOTE: here are some more explanation about those test parameters for you to know what the testcase does better: 1x: where 'x' means iterations or loop, corresponding to the 'L' option of fsmark 1t, 64t: where 't' means thread 4M: means the single file size, corresponding to the '-s' option of fsmark 60G: means the total test size And FYI, here are more changes by the same commit: 4c6d1d85ad89fd8e c9dc4c6578502c208570534737 -- %stddev %change %stddev \ |\ 9864 ± 2%+156.9% 25345 ± 4% fsmark.time.voluntary_context_switches 9 ± 0% +17.8% 10 ± 4% fsmark.time.percent_of_cpu_this_job_got 462211 ± 1% +16.8% 539707 ± 0% fsmark.app_overhead 35.27 ± 0% +9.8% 38.74 ± 1% fsmark.files_per_sec 435 ± 0% -9.0%396 ± 1% fsmark.time.elapsed_time.max 435 ± 0% -9.0%396 ± 1% fsmark.time.elapsed_time 5.20 ± 2% -70.3% 1.54 ± 6% turbostat.Pkg%pc6 2447873 ± 42% -67.9% 785086 ± 33% numa-numastat.node1.numa_hit 2413662 ± 43% -68.1% 771115 ± 31% numa-numastat.node1.local_node 9864 ± 2%+156.9% 25345 ± 4% time.voluntary_context_switches 187680 ± 10%+126.8% 425676 ± 7% numa-vmstat.node1.nr_dirty 747361 ± 9%+127.8%1702809 ± 7% numa-meminfo.node1.Dirty 1787510 ± 1%+117.0%3878984 ± 2% meminfo.Dirty 446861 ± 1%+117.0% 969472 ± 2% proc-vmstat.nr_dirty 1655962 ± 37% -59.3% 673988 ± 29% numa-vmstat.node1.numa_local 1036191 ± 
8%+110.3%2179311 ± 3% numa-meminfo.node0.Dirty 259069 ± 8%+110.3% 544783 ± 3% numa-vmstat.node0.nr_dirty 1687987 ± 37% -58.6% 698626 ± 29% numa-vmstat.node1.numa_hit 1 ± 0%+100.0% 2 ± 0% vmstat.procs.b 0.02 ± 0%+100.0% 0.04 ± 22% turbostat.CPU%c3 6.03 ± 1% +76.9% 10.67 ± 1% turbostat.CPU%c1 5.189e+08 ± 0% +72.6% 8.956e+08 ± 1% cpuidle.C1-IVT.time 2646692 ± 7% +75.0%4630890 ± 23% cpuidle.C3-IVT.time 5301 ± 6% -31.7% 3620 ± 3% slabinfo.btrfs_ordered_extent.active_objs 10549 ± 16% -30.3% 7349 ± 12% numa-vmstat.node1.nr_slab_reclaimable 5353 ± 6% -31.4% 3670 ± 3% slabinfo.btrfs_ordered_extent.num_objs 42169 ± 16% -30.3% 29397 ± 12% numa-meminfo.node1.SReclaimable 1619825 ± 22% +39.4%2258188 ± 4% proc-vmstat.pgfree 4611 ± 7% -28.0% 3318 ± 1% slabinfo.btrfs_delayed_ref_head.num_objs 4471 ± 8% -27.0% 3264 ± 2% slabinfo.btrfs_delayed_ref_head.active_objs 67.93 ± 1% -24.7% 51.15 ± 4% turbostat.Pkg%pc2 2332975 ± 21% +45.6%3396446 ± 4% numa-vmstat.node1.numa_other 2300949 ± 22% +46.5%3371807 ± 4% numa-vmstat.node1.numa_miss 2300941 ± 22% +46.5%3371793 ± 4% numa-vmstat.node0.numa_foreign 2952 ± 8% -23.3% 2263 ± 3% slabinfo.btrfs_delayed_data_ref.num_objs 2570716 ± 3% +25.7%3230157 ± 2% numa-meminfo.node1.Writeback 642367 ± 3% +25.7% 807533 ± 2% numa-vmstat.node1.nr_writeback 95408 ± 13% -17.3% 78910 ± 6% numa-meminfo.node1.Slab 2803 ± 7% -21.1% 2210 ± 3% slabinfo.btrfs_delayed_data_ref.active_objs 240 ± 9% +23.1%295 ± 16% numa-vmstat.node0.nr_page_table_pages 4626942 ± 19% +49.6%6924087 ± 22% cpuidle.C1E-IVT.time 5585235 ± 0% +25.5%7011242 ± 0% meminfo.Writeback 1396232 ± 0% +25.5%1752892 ± 0% proc-vmstat.nr_writeback 962 ± 9% +23.0% 1184 ± 16% numa-meminfo.node0.PageTables 9 ± 0% +17.8% 10 ± 4% time.percent_of_cpu_this_job_got 754027 ± 2%
performance changes on 78373b73: -46.6% fsmark.files_per_sec, and few more
FYI, we found changes on `fsmark.files_per_sec' by 78373b7319abdf15050af5b1632c4c8b8b398f33: > commit 78373b7319abdf15050af5b1632c4c8b8b398f33 > Author: Jaegeuk Kim > AuthorDate: Fri Mar 13 21:44:36 2015 -0700 > Commit: Jaegeuk Kim > CommitDate: Fri Apr 10 15:08:45 2015 -0700 > > f2fs: enhance multi-threads performance 3402e87cfb5e762f9c95071bf4a2ad65fd9392a2 78373b7319abdf15050af5b1632c4c8b8b398f33 run time(m) metric_value ±stddev run time(m) metric_value ±stddev change testbox/benchmark/testcase-params --- -- --- -- -- 3 0.3 |490.800|±5.73 0.5 |262.067| ±0.4 -46.6% ivb44/fsmark/1x-64t-4BRD_12G-RAID0-f2fs-4M-30G-fsyncBeforeClose 3 0.3 |468.367|±3.53 0.5 |264.467| ±0.2 -43.5% ivb44/fsmark/1x-64t-9BRD_6G-RAID0-f2fs-4M-30G-fsyncBeforeClose 3 0.6 |211.867|±0.73 0.7 |191.067| ±0.5 -9.8% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-f2fs-4M-30G-fsyncBeforeClose NOTE: here are some more info about those test parameters for you to know what the testcase does better: 1x: where 'x' means iterations or loop, corresponding to the 'L' option of fsmark 1t, 64t: where 't' means thread 4M: means the single file size, corresponding to the '-s' option of fsmark 40G, 30G, 120G: means the total test size 4BRD_12G: BRD is the ramdisk, where '4' means 4 ramdisk, and where '12G' means the size of one ramdisk. So, it would be 48G in total. And we made a raid on those ramdisk The change is a bit interesting as you already stated it clear that this patch is for performance gain. The patch itself is clear, too: remove a mutex lock. So the only reasonable cause, without too much dig, I can think of would be the remove of this lock reduces sleep time, and brings more process to be able run, but somehow increases the context switches and cpu usage in the meantime at somewhere. 
I guess this is what the following changes are trying to tell us: 29708 ± 2% +5720.0%1729051 ± 1% fsmark.time.voluntary_context_switches 302 ± 0%+113.8%647 ± 0% fsmark.time.percent_of_cpu_this_job_got 61.05 ± 0%+214.0% 191.70 ± 0% fsmark.time.system_time FYI, Here I listed all changes for the outstanding change: 3 0.3 |490.800|±5.73 0.5 |262.067| ±0.4 -46.6% ivb44/fsmark/1x-64t-4BRD_12G-RAID0-f2fs-4M-30G-fsyncBeforeClose 3402e87cfb5e762f 78373b7319abdf15050af5b163 -- %stddev %change %stddev \ |\ 29708 ± 2% +5720.0%1729051 ± 1% fsmark.time.voluntary_context_switches 61.05 ± 0%+214.0% 191.70 ± 0% fsmark.time.system_time 302 ± 0%+113.8%647 ± 0% fsmark.time.percent_of_cpu_this_job_got 10476 ± 0% +95.4% 20467 ± 5% fsmark.time.minor_page_faults 490 ± 5% -46.6%262 ± 0% fsmark.files_per_sec 20.21 ± 0% +46.7% 29.65 ± 0% fsmark.time.elapsed_time 20.21 ± 0% +46.7% 29.65 ± 0% fsmark.time.elapsed_time.max 226379 ± 0% +32.5% 299882 ± 0% fsmark.app_overhead 0 ± 0% +Inf% 1045 ± 2% proc-vmstat.numa_pages_migrated 209 ± 26% +3272.3% 7059 ± 3% cpuidle.C1E-IVT.usage 228 ± 42%+686.7% 1799 ± 14% numa-meminfo.node0.Writeback 14633 ± 5% +7573.2%1122849 ± 1% cpuidle.C1-IVT.usage 0 ± 0% +Inf% 1045 ± 2% proc-vmstat.pgmigrate_success 29708 ± 2% +5720.0%1729051 ± 1% time.voluntary_context_switches 55663 ± 0%+776.9% 488081 ± 0% cpuidle.C6-IVT.usage 56 ± 42%+718.8%464 ± 11% numa-vmstat.node0.nr_writeback 535 ± 29%+334.4% 2325 ± 10% meminfo.Writeback 129 ± 30%+295.6%511 ± 4% proc-vmstat.nr_writeback 59.25 ± 5% -74.2% 15.26 ± 3% turbostat.CPU%c6 2.58 ± 8% -74.5% 0.66 ± 11% turbostat.Pkg%pc2 1.551e+08 ± 14%+233.4% 5.171e+08 ± 4% cpuidle.C1-IVT.time 32564 ± 24%+208.1% 100330 ± 5% softirqs.RCU 61.05 ± 0%+214.0% 191.70 ± 0% time.system_time 60 ± 32%+165.7%160 ± 16% numa-vmstat.node1.nr_writeback 2 ± 0%+200.0% 6 ± 0% vmstat.procs.r 3057 ± 2%+166.1% 8136 ± 22% numa-vmstat.node0.nr_mapped 12240 ± 2%+165.9% 32547 ± 22% numa-meminfo.node0.Mapped 6324 ± 3%+148.4% 15709 ± 0% proc-vmstat.nr_mapped
performance changes on 78373b73: -46.6% fsmark.files_per_sec, and few more
FYI, we found changes on `fsmark.files_per_sec' by 78373b7319abdf15050af5b1632c4c8b8b398f33: commit 78373b7319abdf15050af5b1632c4c8b8b398f33 Author: Jaegeuk Kim jaeg...@kernel.org AuthorDate: Fri Mar 13 21:44:36 2015 -0700 Commit: Jaegeuk Kim jaeg...@kernel.org CommitDate: Fri Apr 10 15:08:45 2015 -0700 f2fs: enhance multi-threads performance 3402e87cfb5e762f9c95071bf4a2ad65fd9392a2 78373b7319abdf15050af5b1632c4c8b8b398f33 run time(m) metric_value ±stddev run time(m) metric_value ±stddev change testbox/benchmark/testcase-params --- -- --- -- -- 3 0.3 |490.800|±5.73 0.5 |262.067| ±0.4 -46.6% ivb44/fsmark/1x-64t-4BRD_12G-RAID0-f2fs-4M-30G-fsyncBeforeClose 3 0.3 |468.367|±3.53 0.5 |264.467| ±0.2 -43.5% ivb44/fsmark/1x-64t-9BRD_6G-RAID0-f2fs-4M-30G-fsyncBeforeClose 3 0.6 |211.867|±0.73 0.7 |191.067| ±0.5 -9.8% ivb44/fsmark/1x-64t-4BRD_12G-RAID5-f2fs-4M-30G-fsyncBeforeClose NOTE: here are some more info about those test parameters for you to know what the testcase does better: 1x: where 'x' means iterations or loop, corresponding to the 'L' option of fsmark 1t, 64t: where 't' means thread 4M: means the single file size, corresponding to the '-s' option of fsmark 40G, 30G, 120G: means the total test size 4BRD_12G: BRD is the ramdisk, where '4' means 4 ramdisk, and where '12G' means the size of one ramdisk. So, it would be 48G in total. And we made a raid on those ramdisk The change is a bit interesting as you already stated it clear that this patch is for performance gain. The patch itself is clear, too: remove a mutex lock. So the only reasonable cause, without too much dig, I can think of would be the remove of this lock reduces sleep time, and brings more process to be able run, but somehow increases the context switches and cpu usage in the meantime at somewhere. 
I guess this is what the following changes are trying to tell us: 29708 ± 2% +5720.0%1729051 ± 1% fsmark.time.voluntary_context_switches 302 ± 0%+113.8%647 ± 0% fsmark.time.percent_of_cpu_this_job_got 61.05 ± 0%+214.0% 191.70 ± 0% fsmark.time.system_time FYI, Here I listed all changes for the outstanding change: 3 0.3 |490.800|±5.73 0.5 |262.067| ±0.4 -46.6% ivb44/fsmark/1x-64t-4BRD_12G-RAID0-f2fs-4M-30G-fsyncBeforeClose 3402e87cfb5e762f 78373b7319abdf15050af5b163 -- %stddev %change %stddev \ |\ 29708 ± 2% +5720.0%1729051 ± 1% fsmark.time.voluntary_context_switches 61.05 ± 0%+214.0% 191.70 ± 0% fsmark.time.system_time 302 ± 0%+113.8%647 ± 0% fsmark.time.percent_of_cpu_this_job_got 10476 ± 0% +95.4% 20467 ± 5% fsmark.time.minor_page_faults 490 ± 5% -46.6%262 ± 0% fsmark.files_per_sec 20.21 ± 0% +46.7% 29.65 ± 0% fsmark.time.elapsed_time 20.21 ± 0% +46.7% 29.65 ± 0% fsmark.time.elapsed_time.max 226379 ± 0% +32.5% 299882 ± 0% fsmark.app_overhead 0 ± 0% +Inf% 1045 ± 2% proc-vmstat.numa_pages_migrated 209 ± 26% +3272.3% 7059 ± 3% cpuidle.C1E-IVT.usage 228 ± 42%+686.7% 1799 ± 14% numa-meminfo.node0.Writeback 14633 ± 5% +7573.2%1122849 ± 1% cpuidle.C1-IVT.usage 0 ± 0% +Inf% 1045 ± 2% proc-vmstat.pgmigrate_success 29708 ± 2% +5720.0%1729051 ± 1% time.voluntary_context_switches 55663 ± 0%+776.9% 488081 ± 0% cpuidle.C6-IVT.usage 56 ± 42%+718.8%464 ± 11% numa-vmstat.node0.nr_writeback 535 ± 29%+334.4% 2325 ± 10% meminfo.Writeback 129 ± 30%+295.6%511 ± 4% proc-vmstat.nr_writeback 59.25 ± 5% -74.2% 15.26 ± 3% turbostat.CPU%c6 2.58 ± 8% -74.5% 0.66 ± 11% turbostat.Pkg%pc2 1.551e+08 ± 14%+233.4% 5.171e+08 ± 4% cpuidle.C1-IVT.time 32564 ± 24%+208.1% 100330 ± 5% softirqs.RCU 61.05 ± 0%+214.0% 191.70 ± 0% time.system_time 60 ± 32%+165.7%160 ± 16% numa-vmstat.node1.nr_writeback 2 ± 0%+200.0% 6 ± 0% vmstat.procs.r 3057 ± 2%+166.1% 8136 ± 22% numa-vmstat.node0.nr_mapped 12240 ± 2%+165.9% 32547 ± 22% numa-meminfo.node0.Mapped 6324 ± 3%+148.4% 15709 ±
Re: performance changes on 4400755e: 200.0% fsmark.files_per_sec, -18.1% fsmark.files_per_sec, and few more
On Wed, Mar 25, 2015 at 02:03:59PM +1100, NeilBrown wrote: > On Wed, 18 Mar 2015 13:00:30 +0800 Yuanhan Liu > wrote: > > > Hi, > > > > FYI, we noticed performance changes on `fsmark.files_per_sec' by > > 4400755e356f9a2b0b7ceaa02f57b1c7546c3765: > > > > > commit 4400755e356f9a2b0b7ceaa02f57b1c7546c3765 > > > Author: NeilBrown > > > AuthorDate: Thu Feb 26 12:47:56 2015 +1100 > > > Commit: NeilBrown > > > CommitDate: Wed Mar 4 13:40:19 2015 +1100 > > > > > > md/raid5: allow the stripe_cache to grow and shrink. > > Thanks a lot for this testing!!! I was wondering how I could do some proper > testing of this patch, and you've done it for me :-) Welcome! > > The large number of improvements is very encouraging - that is what I was > hoping for of course. > > The few regressions could be a concern. I note that are all NoSync. > That seems to suggest that they could just be writing more data. It's not a time-based test, but a size-based test: > > 40G, 30G, 120G: means the total test size Hence, I doubt it might be writing more data. > i.e. the data is written a bit earlier (certainly possible) so it happen to > introduce more delay > > I guess I'm not really sure how to interpret NoSync results, and suspect that > poor NoSync result don't really reflect much on the underlying block device. > Could that be right? Sorry, I'm not quite sure I followed you. Poor NoSync results? Do you mean the small numbers like 63.133, 57.600? They are in units of files_per_sec, and the file size is 4M. Hence, it would be 200+ MB/s, which is not that bad in this case, as it's a 3-hard-disk RAID5.
> > 3 8.1 63.133 ±0.5% 3 9.2 55.633 > > ±0.2% -11.9% ivb44/fsmark/1x-1t-3HDD-RAID5-btrfs-4M-120G-NoSync Here are few iostat sample from 26089f4902595a2f64c512066af07af6e82eb096 of above test: avg-cpu: %user %nice %system %iowait %steal %idle 0.000.000.631.670.00 97.70 Device: rrqm/s wrqm/s r/s w/srkB/swkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util sdb 0.00 30353.000.00 240.00 0.00 121860.00 1015.50 1.295.350.005.35 3.50 83.90 sdc 0.00 30353.000.00 241.00 0.00 122372.00 1015.54 0.662.740.002.74 2.53 60.90 sda 0.00 30353.000.00 242.00 0.00 122884.00 1015.57 1.295.360.005.36 3.52 85.20 md0 0.00 0.000.00 956.00 0.00 244736.00 512.00 227231.390.000.000.00 1.05 100.00 avg-cpu: %user %nice %system %iowait %steal %idle 0.020.000.691.690.00 97.60 Device: rrqm/s wrqm/s r/s w/srkB/swkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util sdb 0.00 30988.000.00 247.00 0.00 125444.00 1015.74 1.777.170.007.17 4.02 99.40 sdc 0.00 30988.000.00 245.00 0.00 124420.00 1015.67 1.194.820.004.82 3.67 89.90 sda 0.00 30988.000.00 247.00 0.00 125444.00 1015.74 0.652.650.002.65 2.54 62.70 md0 0.00 0.000.00 976.00 0.00 249856.00 512.00 228206.370.000.000.00 1.02 100.00 avg-cpu: %user %nice %system %iowait %steal %idle 0.000.000.611.670.00 97.72 Device: rrqm/s wrqm/s r/s w/srkB/swkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util sdb 0.00 29718.000.00 235.00 0.00 119300.00 1015.32 1.355.710.005.71 3.71 87.20 sdc 0.00 29718.000.00 236.00 0.00 119812.00 1015.36 1.195.060.005.06 3.43 80.90 sda 0.00 29718.000.00 235.00 0.00 119300.00 1015.32 0.873.690.003.69 2.99 70.20 md0 0.00 0.000.00 936.00 0.00 239616.00 512.00 229157.330.000.000.00 1.07 100.00 And few iostat sample of 4400755e356f9a2b0b7ceaa02f57b1c7546c3765(first bad commit): avg-cpu: %user %nice %system %iowait %steal %idle 0.020.001.091.540.00 97.35 Device: rrqm/s wrqm/s r/s w/srkB/swkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util sdb 1.00 27677.001.00 206.00 8.00 100516.00 971.25 27.40 130.56 196.00 130.24 4.72 
97.70 sdc 0.00 27677.000.00 207.00 0.00 101028.00 976.12 27.05 129.430.00 129.43 4.61 95.50 sda 5.00 27677.001.00 211.0016.00 102984.00 971.70 26.61 127.00 201.00 126.64 4.50 95.50 md0 0.00 0.000.00 824.00
Re: performance changes on 4400755e: 200.0% fsmark.files_per_sec, -18.1% fsmark.files_per_sec, and few more
On Wed, Mar 25, 2015 at 02:03:59PM +1100, NeilBrown wrote: On Wed, 18 Mar 2015 13:00:30 +0800 Yuanhan Liu yuanhan@linux.intel.com wrote: Hi, FYI, we noticed performance changes on `fsmark.files_per_sec' by 4400755e356f9a2b0b7ceaa02f57b1c7546c3765: commit 4400755e356f9a2b0b7ceaa02f57b1c7546c3765 Author: NeilBrown ne...@suse.de AuthorDate: Thu Feb 26 12:47:56 2015 +1100 Commit: NeilBrown ne...@suse.de CommitDate: Wed Mar 4 13:40:19 2015 +1100 md/raid5: allow the stripe_cache to grow and shrink. Thanks a lot for this testing!!! I was wondering how I could do some proper testing of this patch, and you've done it for me :-) Welcome! The large number of improvements is very encouraging - that is what I was hoping for of course. The few regressions could be a concern. I note that they are all NoSync. That seems to suggest that they could just be writing more data. It's not a time based test, but size based test: 40G, 30G, 120G: means the total test size Hence, I doubt it might be writing more data. i.e. the data is written a bit earlier (certainly possible) so it happens to introduce more delay I guess I'm not really sure how to interpret NoSync results, and suspect that poor NoSync results don't really reflect much on the underlying block device. Could that be right? Sorry, I'm not quite sure I followed you. Poor NoSync result? Do you mean the small numbers like 63.133, 57.600? They are in units of files_per_sec, and file size is 4M. Hence, it would be 200+ MB/s, which is not that bad in this case, as it's a 3 hard disk RAID5. 
3 8.1 63.133 ±0.5% 3 9.2 55.633 ±0.2% -11.9% ivb44/fsmark/1x-1t-3HDD-RAID5-btrfs-4M-120G-NoSync Here are few iostat sample from 26089f4902595a2f64c512066af07af6e82eb096 of above test: avg-cpu: %user %nice %system %iowait %steal %idle 0.000.000.631.670.00 97.70 Device: rrqm/s wrqm/s r/s w/srkB/swkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util sdb 0.00 30353.000.00 240.00 0.00 121860.00 1015.50 1.295.350.005.35 3.50 83.90 sdc 0.00 30353.000.00 241.00 0.00 122372.00 1015.54 0.662.740.002.74 2.53 60.90 sda 0.00 30353.000.00 242.00 0.00 122884.00 1015.57 1.295.360.005.36 3.52 85.20 md0 0.00 0.000.00 956.00 0.00 244736.00 512.00 227231.390.000.000.00 1.05 100.00 avg-cpu: %user %nice %system %iowait %steal %idle 0.020.000.691.690.00 97.60 Device: rrqm/s wrqm/s r/s w/srkB/swkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util sdb 0.00 30988.000.00 247.00 0.00 125444.00 1015.74 1.777.170.007.17 4.02 99.40 sdc 0.00 30988.000.00 245.00 0.00 124420.00 1015.67 1.194.820.004.82 3.67 89.90 sda 0.00 30988.000.00 247.00 0.00 125444.00 1015.74 0.652.650.002.65 2.54 62.70 md0 0.00 0.000.00 976.00 0.00 249856.00 512.00 228206.370.000.000.00 1.02 100.00 avg-cpu: %user %nice %system %iowait %steal %idle 0.000.000.611.670.00 97.72 Device: rrqm/s wrqm/s r/s w/srkB/swkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util sdb 0.00 29718.000.00 235.00 0.00 119300.00 1015.32 1.355.710.005.71 3.71 87.20 sdc 0.00 29718.000.00 236.00 0.00 119812.00 1015.36 1.195.060.005.06 3.43 80.90 sda 0.00 29718.000.00 235.00 0.00 119300.00 1015.32 0.873.690.003.69 2.99 70.20 md0 0.00 0.000.00 936.00 0.00 239616.00 512.00 229157.330.000.000.00 1.07 100.00 And few iostat sample of 4400755e356f9a2b0b7ceaa02f57b1c7546c3765(first bad commit): avg-cpu: %user %nice %system %iowait %steal %idle 0.020.001.091.540.00 97.35 Device: rrqm/s wrqm/s r/s w/srkB/swkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util sdb 1.00 27677.001.00 206.00 8.00 100516.00 971.25 27.40 130.56 196.00 130.24 4.72 97.70 
sdc 0.00 27677.000.00 207.00 0.00 101028.00 976.12 27.05 129.430.00 129.43 4.61 95.50 sda 5.00 27677.001.00 211.0016.00 102984.00 971.70 26.61 127.00 201.00 126.64 4.50 95.50 md0 0.00 0.000.00 824.00 0.00
[LKP] [sched] WARNING: CPU: 0 PID: 13608 at kernel/sched/core.c:7323 __might_sleep+0xbd/0xd0()
FYI, we noticed the below changes on git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git master commit 8eb23b9f35aae413140d3fda766a98092c21e9b0 ("sched: Debug nested sleeps") +-+++ | | 26cabd3125 | 8eb23b9f35 | +-+++ | boot_successes | 10 | 15 | | boot_failures | 0 | 25 | | WARNING:at_kernel/sched/core.c:__might_sleep() | 0 | 5 | | backtrace:SyS_read | 0 | 5 | | backtrace:vfs_read | 0 | 5 | | WARNING:at_kernel/sched/core.c:#__might_sleep() | 0 | 20 | | backtrace:SyS_io_getevents | 0 | 10 | | backtrace:read_events | 0 | 7 | | backtrace:kauditd_thread| 0 | 10 | +-+++ <4>[ 839.494114] [ cut here ] <4>[ 839.494131] WARNING: CPU: 0 PID: 13608 at /kbuild/src/lkp/kernel/sched/core.c:7323 __might_sleep+0xbd/0xd0() <4>[ 839.494137] do not call blocking ops when !TASK_RUNNING; state=1 set at [] prepare_to_wait+0x2f/0x90 <4>[ 839.494256] Modules linked in: tun ipmi_watchdog loop btrfs xor raid6_pq sg sd_mod ast snd_pcm syscopyarea sysfillrect snd_timer sysimgblt snd ie6xx_wdt ttm i2c_isch drm_kms_helper soundcore drm ahci libahci pcspkr i2c_ismt lpc_sch ipmi_si libata shpchp ipmi_msghandler acpi_cpufreq <4>[ 839.494264] CPU: 0 PID: 13608 Comm: fanotify01 Not tainted 3.18.0-rc4-next-20141117 #1 <4>[ 839.494266] Hardware name: To be filled by O.E.M. To be filled by O.E.M./Double Cove , BIOS BWDEXT.86B.000.012.D127 10/08/2012 <4>[ 839.494273] 81b5ebb8 88023cf37d18 81892f54 64026402 <4>[ 839.494277] 88023cf37d68 88023cf37d58 8107047a 88023cf37db8 <4>[ 839.494281] 81b5f5e8 0061 6000 <4>[ 839.494285] Call Trace: <4>[ 839.494315] [] dump_stack+0x4c/0x65 <4>[ 839.494323] [] warn_slowpath_common+0x8a/0xc0 <4>[ 839.494327] [] warn_slowpath_fmt+0x46/0x50 <4>[ 839.494333] [] ? prepare_to_wait+0x2f/0x90 <4>[ 839.494337] [] ? prepare_to_wait+0x2f/0x90 <4>[ 839.494341] [] __might_sleep+0xbd/0xd0 <4>[ 839.494348] [] mutex_lock+0x24/0x50 <4>[ 839.494354] [] fanotify_read+0xd5/0x620 <4>[ 839.494370] [] ? selinux_file_permission+0xa6/0x120 <4>[ 839.494374] [] ? 
wait_woken+0xc0/0xc0 <4>[ 839.494381] [] __vfs_read+0x18/0x50 <4>[ 839.494385] [] vfs_read+0x8a/0x140 <4>[ 839.494390] [] SyS_read+0x46/0xb0 <4>[ 839.494403] [] system_call_fastpath+0x12/0x17 <4>[ 839.494409] ---[ end trace 5a2207521429f889 ]--- --yliu ___ LKP mailing list l...@linux.intel.com [0.00] Initializing cgroup subsys cpuset [0.00] Initializing cgroup subsys cpu [0.00] Linux version 3.18.0-rc4-next-20141117 (kbuild@roam) (gcc version 4.9.1 (Debian 4.9.1-19) ) #1 SMP Tue Nov 18 11:46:52 CST 2014 [0.00] Command line: user=lkp job=/lkp/scheduled/lkp-a06/cyclic_ltp-performance-syscalls-x86_64-rhel-HEAD-efefb5ca5da52f7537c7ced03d6e53408f13a26e-0.yaml ARCH=x86_64 BOOT_IMAGE=/kernel/x86_64-rhel/efefb5ca5da52f7537c7ced03d6e53408f13a26e/vmlinuz-3.18.0-rc4-next-20141117 kconfig=x86_64-rhel commit=efefb5ca5da52f7537c7ced03d6e53408f13a26e branch=next/master root=/dev/ram0 max_uptime=3600 RESULT_ROOT=/result/lkp-a06/ltp/performance-syscalls/debian-x86_64.cgz/x86_64-rhel/efefb5ca5da52f7537c7ced03d6e53408f13a26e/0 ip=lkp-a06::dhcp earlyprintk=ttyS0,115200 debug apic=debug sysrq_always_enabled rcupdate.rcu_cpu_stall_timeout=100 panic=-1 softlockup_panic=1 nmi_watchdog=panic oops=panic load_ramdisk=2 prompt_ramdisk=0 console=ttyS0,115200 console=tty0 vga=normal rw [0.00] e820: BIOS-provided physical RAM map: [0.00] BIOS-e820: [mem 0x0100-0x0009e3ff] usable [0.00] BIOS-e820: [mem 0x0009e400-0x0009] reserved [0.00] BIOS-e820: [mem 0x000e-0x000f] reserved [0.00] BIOS-e820: [mem 0x0010-0xbf67afff] usable [0.00] BIOS-e820: [mem 0xbf67b000-0xbfb3dfff] ACPI NVS [0.00] BIOS-e820: [mem 0xbfb3e000-0xbfc50fff] reserved [0.00] BIOS-e820: [mem 0xbfc51000-0xbfc51fff] ACPI NVS [0.00] BIOS-e820: [mem 0xbfc52000-0xbfc62fff] reserved [0.00] BIOS-e820: [mem 0xbfc63000-0xbfc65fff] ACPI NVS [0.00] BIOS-e820: [mem 0xbfc66000-0xbfc83fff] reserved [
[LKP] [drm/fb] f5ef139cbe5: *ERROR* not all connectors configured
FYI, we noticed the below changes on git://people.freedesktop.org/~airlied/linux.git radeon-mst-hacks commit f5ef139cbe5dbd755dab3706022d7147800099a8 ("drm/fb: add support for tiled monitor configurations.") testbox/testcase/testparams: vm-kbuild-1G/xfstests/4HDD-btrfs-generic-113 9cf13203b1fd7cc3 f5ef139cbe5dbd755dab370602 -- fail:runs %reproductionfail:runs | | | :10 100% 10:10 kmsg.drm:drm_setup_crtcs[drm_kms_helper]]*ERROR*not_all_connectors_configured vm-kbuild-1G: qemu-system-x86_64 -enable-kvm -cpu Haswell,+smep,+smap Memory: 1G To reproduce: apt-get install ruby ruby-oj git clone git://git.kernel.org/pub/scm/linux/kernel/git/wfg/lkp-tests.git cd lkp-tests bin/setup-local job.yaml # the job file attached in this email bin/run-local job.yaml Disclaimer: Results have been estimated based on internal Intel analysis and are provided for informational purposes only. Any difference in system hardware or software design or configuration may affect actual performance. --yliu --- testcase: xfstests default_monitors: wait: pre-test vmstat: default_watchdogs: watch-oom: watchdog: cpufreq_governor: model: qemu-system-x86_64 -enable-kvm -cpu Haswell,+smep,+smap nr_vm: 16 nr_cpu: 2 memory: 1G disk_type: virtio-scsi rootfs: debian-x86_64.cgz hdd_partitions: "/dev/sda /dev/sdb /dev/sdc /dev/sdd" swap_partitions: "/dev/sde" disk: 4HDD fs: - btrfs xfstests: test: - generic-113 enqueue_time: 2014-11-26 13:11:19.191840759 +08:00 branch: linux-devel/devel-hourly-2014112611 commit: a7a1168f6b45bb0d29a20c942e1ca300ef54dc6e repeat_to: 2 testbox: vm-kbuild-1G-3 tbox_group: vm-kbuild-1G kconfig: x86_64-rhel kernel: "/kernel/x86_64-rhel/a7a1168f6b45bb0d29a20c942e1ca300ef54dc6e/vmlinuz-3.18.0-rc6-wl-ath-ga7a1168f" user: lkp queue: rand result_root: "/result/vm-kbuild-1G/xfstests/4HDD-btrfs-generic-113/debian-x86_64.cgz/x86_64-rhel/a7a1168f6b45bb0d29a20c942e1ca300ef54dc6e/0" job_file: 
"/lkp/scheduled/vm-kbuild-1G-3/rand_xfstests-4HDD-btrfs-generic-113-debian-x86_64.cgz-x86_64-rhel-a7a1168f6b45bb0d29a20c942e1ca300ef54dc6e-1.yaml" dequeue_time: 2014-11-26 13:25:10.605471464 +08:00 job_state: finished loadavg: 96.37 33.89 12.20 1/593 3339 start_time: '1416979556' end_time: '1416979727' version: "/lkp/lkp/.src-20141126-053142" mkfs -t btrfs /dev/sdd mkfs -t btrfs /dev/sdc mkfs -t btrfs /dev/sdb mkfs -t btrfs /dev/sda mount -t btrfs /dev/sda /fs/sda mount -t btrfs /dev/sdb /fs/sdb mount -t btrfs /dev/sdc /fs/sdc mount -t btrfs /dev/sdd /fs/sdd export TEST_DIR=/fs/sda export TEST_DEV=/dev/sda export FSTYP=btrfs export SCRATCH_MNT=/fs/scratch mkdir /fs/scratch -p export SCRATCH_DEV_POOL="/dev/sdb /dev/sdc /dev/sdd" ./check generic/113
[LKP] [drm/fb] f5ef139cbe5: *ERROR* not all connectors configured
FYI, we noticed the below changes on git://people.freedesktop.org/~airlied/linux.git radeon-mst-hacks commit f5ef139cbe5dbd755dab3706022d7147800099a8 (drm/fb: add support for tiled monitor configurations.) testbox/testcase/testparams: vm-kbuild-1G/xfstests/4HDD-btrfs-generic-113 9cf13203b1fd7cc3 f5ef139cbe5dbd755dab370602 -- fail:runs %reproductionfail:runs | | | :10 100% 10:10 kmsg.drm:drm_setup_crtcs[drm_kms_helper]]*ERROR*not_all_connectors_configured vm-kbuild-1G: qemu-system-x86_64 -enable-kvm -cpu Haswell,+smep,+smap Memory: 1G To reproduce: apt-get install ruby ruby-oj git clone git://git.kernel.org/pub/scm/linux/kernel/git/wfg/lkp-tests.git cd lkp-tests bin/setup-local job.yaml # the job file attached in this email bin/run-local job.yaml Disclaimer: Results have been estimated based on internal Intel analysis and are provided for informational purposes only. Any difference in system hardware or software design or configuration may affect actual performance. --yliu --- testcase: xfstests default_monitors: wait: pre-test vmstat: default_watchdogs: watch-oom: watchdog: cpufreq_governor: model: qemu-system-x86_64 -enable-kvm -cpu Haswell,+smep,+smap nr_vm: 16 nr_cpu: 2 memory: 1G disk_type: virtio-scsi rootfs: debian-x86_64.cgz hdd_partitions: /dev/sda /dev/sdb /dev/sdc /dev/sdd swap_partitions: /dev/sde disk: 4HDD fs: - btrfs xfstests: test: - generic-113 enqueue_time: 2014-11-26 13:11:19.191840759 +08:00 branch: linux-devel/devel-hourly-2014112611 commit: a7a1168f6b45bb0d29a20c942e1ca300ef54dc6e repeat_to: 2 testbox: vm-kbuild-1G-3 tbox_group: vm-kbuild-1G kconfig: x86_64-rhel kernel: /kernel/x86_64-rhel/a7a1168f6b45bb0d29a20c942e1ca300ef54dc6e/vmlinuz-3.18.0-rc6-wl-ath-ga7a1168f user: lkp queue: rand result_root: /result/vm-kbuild-1G/xfstests/4HDD-btrfs-generic-113/debian-x86_64.cgz/x86_64-rhel/a7a1168f6b45bb0d29a20c942e1ca300ef54dc6e/0 job_file: 
/lkp/scheduled/vm-kbuild-1G-3/rand_xfstests-4HDD-btrfs-generic-113-debian-x86_64.cgz-x86_64-rhel-a7a1168f6b45bb0d29a20c942e1ca300ef54dc6e-1.yaml dequeue_time: 2014-11-26 13:25:10.605471464 +08:00 job_state: finished loadavg: 96.37 33.89 12.20 1/593 3339 start_time: '1416979556' end_time: '1416979727' version: /lkp/lkp/.src-20141126-053142 mkfs -t btrfs /dev/sdd mkfs -t btrfs /dev/sdc mkfs -t btrfs /dev/sdb mkfs -t btrfs /dev/sda mount -t btrfs /dev/sda /fs/sda mount -t btrfs /dev/sdb /fs/sdb mount -t btrfs /dev/sdc /fs/sdc mount -t btrfs /dev/sdd /fs/sdd export TEST_DIR=/fs/sda export TEST_DEV=/dev/sda export FSTYP=btrfs export SCRATCH_MNT=/fs/scratch mkdir /fs/scratch -p export SCRATCH_DEV_POOL=/dev/sdb /dev/sdc /dev/sdd ./check generic/113
[LKP] [sched] WARNING: CPU: 0 PID: 13608 at kernel/sched/core.c:7323 __might_sleep+0xbd/0xd0()
FYI, we noticed the below changes on git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git master commit 8eb23b9f35aae413140d3fda766a98092c21e9b0 (sched: Debug nested sleeps) +-+++ | | 26cabd3125 | 8eb23b9f35 | +-+++ | boot_successes | 10 | 15 | | boot_failures | 0 | 25 | | WARNING:at_kernel/sched/core.c:__might_sleep() | 0 | 5 | | backtrace:SyS_read | 0 | 5 | | backtrace:vfs_read | 0 | 5 | | WARNING:at_kernel/sched/core.c:#__might_sleep() | 0 | 20 | | backtrace:SyS_io_getevents | 0 | 10 | | backtrace:read_events | 0 | 7 | | backtrace:kauditd_thread| 0 | 10 | +-+++ 4[ 839.494114] [ cut here ] 4[ 839.494131] WARNING: CPU: 0 PID: 13608 at /kbuild/src/lkp/kernel/sched/core.c:7323 __might_sleep+0xbd/0xd0() 4[ 839.494137] do not call blocking ops when !TASK_RUNNING; state=1 set at [810b3fff] prepare_to_wait+0x2f/0x90 4[ 839.494256] Modules linked in: tun ipmi_watchdog loop btrfs xor raid6_pq sg sd_mod ast snd_pcm syscopyarea sysfillrect snd_timer sysimgblt snd ie6xx_wdt ttm i2c_isch drm_kms_helper soundcore drm ahci libahci pcspkr i2c_ismt lpc_sch ipmi_si libata shpchp ipmi_msghandler acpi_cpufreq 4[ 839.494264] CPU: 0 PID: 13608 Comm: fanotify01 Not tainted 3.18.0-rc4-next-20141117 #1 4[ 839.494266] Hardware name: To be filled by O.E.M. To be filled by O.E.M./Double Cove , BIOS BWDEXT.86B.000.012.D127 10/08/2012 4[ 839.494273] 81b5ebb8 88023cf37d18 81892f54 64026402 4[ 839.494277] 88023cf37d68 88023cf37d58 8107047a 88023cf37db8 4[ 839.494281] 81b5f5e8 0061 6000 4[ 839.494285] Call Trace: 4[ 839.494315] [81892f54] dump_stack+0x4c/0x65 4[ 839.494323] [8107047a] warn_slowpath_common+0x8a/0xc0 4[ 839.494327] [810704f6] warn_slowpath_fmt+0x46/0x50 4[ 839.494333] [810b3fff] ? prepare_to_wait+0x2f/0x90 4[ 839.494337] [810b3fff] ? prepare_to_wait+0x2f/0x90 4[ 839.494341] [810961fd] __might_sleep+0xbd/0xd0 4[ 839.494348] [81898974] mutex_lock+0x24/0x50 4[ 839.494354] [812250f5] fanotify_read+0xd5/0x620 4[ 839.494370] [8139c906] ? 
selinux_file_permission+0xa6/0x120 4[ 839.494374] [810b43e0] ? wait_woken+0xc0/0xc0 4[ 839.494381] [811e14c8] __vfs_read+0x18/0x50 4[ 839.494385] [811e158a] vfs_read+0x8a/0x140 4[ 839.494390] [811e1686] SyS_read+0x46/0xb0 4[ 839.494403] [8189b629] system_call_fastpath+0x12/0x17 4[ 839.494409] ---[ end trace 5a2207521429f889 ]--- --yliu ___ LKP mailing list l...@linux.intel.com [0.00] Initializing cgroup subsys cpuset [0.00] Initializing cgroup subsys cpu [0.00] Linux version 3.18.0-rc4-next-20141117 (kbuild@roam) (gcc version 4.9.1 (Debian 4.9.1-19) ) #1 SMP Tue Nov 18 11:46:52 CST 2014 [0.00] Command line: user=lkp job=/lkp/scheduled/lkp-a06/cyclic_ltp-performance-syscalls-x86_64-rhel-HEAD-efefb5ca5da52f7537c7ced03d6e53408f13a26e-0.yaml ARCH=x86_64 BOOT_IMAGE=/kernel/x86_64-rhel/efefb5ca5da52f7537c7ced03d6e53408f13a26e/vmlinuz-3.18.0-rc4-next-20141117 kconfig=x86_64-rhel commit=efefb5ca5da52f7537c7ced03d6e53408f13a26e branch=next/master root=/dev/ram0 max_uptime=3600 RESULT_ROOT=/result/lkp-a06/ltp/performance-syscalls/debian-x86_64.cgz/x86_64-rhel/efefb5ca5da52f7537c7ced03d6e53408f13a26e/0 ip=lkp-a06::dhcp earlyprintk=ttyS0,115200 debug apic=debug sysrq_always_enabled rcupdate.rcu_cpu_stall_timeout=100 panic=-1 softlockup_panic=1 nmi_watchdog=panic oops=panic load_ramdisk=2 prompt_ramdisk=0 console=ttyS0,115200 console=tty0 vga=normal rw [0.00] e820: BIOS-provided physical RAM map: [0.00] BIOS-e820: [mem 0x0100-0x0009e3ff] usable [0.00] BIOS-e820: [mem 0x0009e400-0x0009] reserved [0.00] BIOS-e820: [mem 0x000e-0x000f] reserved [0.00] BIOS-e820: [mem 0x0010-0xbf67afff] usable [0.00] BIOS-e820: [mem 0xbf67b000-0xbfb3dfff] ACPI NVS [0.00] BIOS-e820: [mem 0xbfb3e000-0xbfc50fff] reserved [0.00] BIOS-e820: [mem 0xbfc51000-0xbfc51fff] ACPI NVS [0.00] BIOS-e820: [mem
[LKP] [net] 4ed2d765dfa:
FYI, we noticed the below changes on commit 4ed2d765dfaccff5ebdac68e2064b59125033a3b ("net-timestamp: TCP timestamping") testbox/testcase/testparams: vm-vp-2G/ltp/syscalls e7fd2885385157d4 4ed2d765dfaccff5ebdac68e20 -- fail:runs %reproductionfail:runs | | | :5 100% 5:5 ltp.recv01.fail :5 100% 5:5 ltp.recvfrom01.fail :5 100% 5:5 ltp.recvmsg01.fail :5 20% 1:5 kmsg.APIC_calibration_not_consistent_with_PM-Timer:#ms_instead_of#ms :5 20% 1:5 kmsg.hrtimer:interrupt_took#ns :5 20% 1:5 kmsg.TINFO:mlock_failed:errno=ENOMEM(#):Cannot_allocate_memory :5 20% 1:5 kmsg.estcases/kernel/syscalls/getgroups/../utils/compat_16.h::#-bit_version_of_getgroups()is_not_supported_on_your_platform testbox/testcase/testparams: nhm-white/ltp/syscalls e7fd2885385157d4 4ed2d765dfaccff5ebdac68e20 -- :10 100% 5:5 ltp.recv01.fail :10 100% 5:5 ltp.recvfrom01.fail :10 100% 5:5 ltp.recvmsg01.fail vm-vp-2G: qemu-system-x86_64 -enable-kvm -cpu Penryn Memory: 2G nhm-white: Nehalem Memory: 6G To reproduce: apt-get install ruby ruby-oj git clone git://git.kernel.org/pub/scm/linux/kernel/git/wfg/lkp-tests.git cd lkp-tests bin/setup-local job.yaml # the job file attached in this email bin/run-local job.yaml --yliu --- testcase: ltp default_monitors: wait: pre-test vmstat: model: qemu-system-x86_64 -enable-kvm -cpu Penryn nr_vm: 4 nr_cpu: 4 memory: 2G rootfs: debian-x86_64.cgz hdd_partitions: "/dev/vdb /dev/vdc /dev/vdd /dev/vde /dev/vdf" swap_partitions: "/dev/vda" ltp: test: - syscalls enqueue_time: 2014-10-02 10:07:25.199207485 +08:00 branch: net/master commit: 0754476419f127eb8c294b17b6fc8b6787ded1e2 testbox: vm-vp-2G-3 kconfig: x86_64-rhel kernel: "/kernel/x86_64-rhel/0754476419f127eb8c294b17b6fc8b6787ded1e2/vmlinuz-3.17.0-rc6-00145-g0754476" user: lkp queue: rand result_root: "/result/vm-vp-2G/ltp/syscalls/debian-x86_64.cgz/x86_64-rhel/0754476419f127eb8c294b17b6fc8b6787ded1e2/0" job_file: 
"/lkp/scheduled/vm-vp-2G-3/rand_ltp-syscalls-debian-x86_64.cgz-x86_64-rhel-0754476419f127eb8c294b17b6fc8b6787ded1e2-0.yaml" dequeue_time: 2014-10-02 11:55:51.761588446 +08:00 job_state: finished loadavg: 4.39 5.61 2.69 1/85 10461 start_time: '141188' end_time: '141759' version: "/lkp/lkp/.src-20141001-203321" ./runltp -f syscalls
[LKP] [net] 4ed2d765dfa:
FYI, we noticed the below changes on commit 4ed2d765dfaccff5ebdac68e2064b59125033a3b (net-timestamp: TCP timestamping) testbox/testcase/testparams: vm-vp-2G/ltp/syscalls e7fd2885385157d4 4ed2d765dfaccff5ebdac68e20 -- fail:runs %reproductionfail:runs | | | :5 100% 5:5 ltp.recv01.fail :5 100% 5:5 ltp.recvfrom01.fail :5 100% 5:5 ltp.recvmsg01.fail :5 20% 1:5 kmsg.APIC_calibration_not_consistent_with_PM-Timer:#ms_instead_of#ms :5 20% 1:5 kmsg.hrtimer:interrupt_took#ns :5 20% 1:5 kmsg.TINFO:mlock_failed:errno=ENOMEM(#):Cannot_allocate_memory :5 20% 1:5 kmsg.estcases/kernel/syscalls/getgroups/../utils/compat_16.h::#-bit_version_of_getgroups()is_not_supported_on_your_platform testbox/testcase/testparams: nhm-white/ltp/syscalls e7fd2885385157d4 4ed2d765dfaccff5ebdac68e20 -- :10 100% 5:5 ltp.recv01.fail :10 100% 5:5 ltp.recvfrom01.fail :10 100% 5:5 ltp.recvmsg01.fail vm-vp-2G: qemu-system-x86_64 -enable-kvm -cpu Penryn Memory: 2G nhm-white: Nehalem Memory: 6G To reproduce: apt-get install ruby ruby-oj git clone git://git.kernel.org/pub/scm/linux/kernel/git/wfg/lkp-tests.git cd lkp-tests bin/setup-local job.yaml # the job file attached in this email bin/run-local job.yaml --yliu --- testcase: ltp default_monitors: wait: pre-test vmstat: model: qemu-system-x86_64 -enable-kvm -cpu Penryn nr_vm: 4 nr_cpu: 4 memory: 2G rootfs: debian-x86_64.cgz hdd_partitions: /dev/vdb /dev/vdc /dev/vdd /dev/vde /dev/vdf swap_partitions: /dev/vda ltp: test: - syscalls enqueue_time: 2014-10-02 10:07:25.199207485 +08:00 branch: net/master commit: 0754476419f127eb8c294b17b6fc8b6787ded1e2 testbox: vm-vp-2G-3 kconfig: x86_64-rhel kernel: /kernel/x86_64-rhel/0754476419f127eb8c294b17b6fc8b6787ded1e2/vmlinuz-3.17.0-rc6-00145-g0754476 user: lkp queue: rand result_root: /result/vm-vp-2G/ltp/syscalls/debian-x86_64.cgz/x86_64-rhel/0754476419f127eb8c294b17b6fc8b6787ded1e2/0 job_file: /lkp/scheduled/vm-vp-2G-3/rand_ltp-syscalls-debian-x86_64.cgz-x86_64-rhel-0754476419f127eb8c294b17b6fc8b6787ded1e2-0.yaml 
dequeue_time: 2014-10-02 11:55:51.761588446 +08:00 job_state: finished loadavg: 4.39 5.61 2.69 1/85 10461 start_time: '141188' end_time: '141759' version: /lkp/lkp/.src-20141001-203321 ./runltp -f syscalls
[LKP] [nohz] 2a16fc93d2c:
FYI, we noticed the below changes on commit 2a16fc93d2c9568e16d45db77c7b5f15e1921cf1 ("nohz: Avoid tick's double reprogramming in highres mode") testbox/testcase/testparams: snb-drag/piglit/performance-igt-001 b5e995e671d8e4d7 2a16fc93d2c9568e16d45db77c -- fail:runs %reproductionfail:runs | | | :5 100% 5:5 kmsg.drm:__gen6_gt_force_wake_get]*ERROR*Timed_out_waiting_for_forcewake_to_ack_request :5 100% 5:5 piglit.igt/gem_ctx_exec/reset-pin-leak.dmesg-warn snb-drag: Sandy Bridge Memory: 6G <3>[ 90.915459] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for forcewake to ack request. <3>[ 90.925094] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for forcewake to ack request. <3>[ 90.934725] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for forcewake to ack request. <3>[ 90.944347] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for forcewake to ack request. <3>[ 90.953956] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for forcewake to ack request. <3>[ 90.963559] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for forcewake to ack request. <3>[ 90.973173] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for forcewake to ack request. <3>[ 90.982793] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for forcewake to ack request. <3>[ 90.992405] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for forcewake to ack request. <3>[ 91.002008] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for forcewake to ack request. <3>[ 91.011618] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for forcewake to ack request. <3>[ 91.021222] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for forcewake to ack request. <3>[ 91.030825] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for forcewake to ack request. <3>[ 91.040430] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for forcewake to ack request. 
<3>[ 91.050016] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for forcewake to ack request. <3>[ 91.059593] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for forcewake to ack request. <3>[ 91.069152] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for forcewake to ack request. To reproduce: apt-get install ruby ruby-oj git clone git://git.kernel.org/pub/scm/linux/kernel/git/wfg/lkp-tests.git cd lkp-tests bin/setup-local job.yaml # the job file attached in this email bin/run-local job.yaml Disclaimer: Results have been estimated based on internal Intel analysis and are provided for informational purposes only. Any difference in system hardware or software design or configuration may affect actual performance. --yliu --- testcase: piglit default_monitors: wait: pre-test vmstat: default_watchdogs: watch-oom: watchdog: cpufreq_governor: - performance commit: 9bdebfefe1de2b6fa7e193c10411ef209b0ebc96 model: Sandy Bridge memory: 6G hdd_partitions: "/dev/disk/by-id/ata-ST3750528AS_6VP2W0PA-part5 /dev/disk/by-id/ata-ST3750528AS_6VP2W0PA-part6 /dev/disk/by-id/ata-ST3750528AS_6VP2W0PA-part7 /dev/disk/by-id/ata-ST3750528AS_6VP2W0PA-part8 /dev/disk/by-id/ata-ST3750528AS_6VP2W0PA-part9 /dev/disk/by-id/ata-ST3750528AS_6VP2W0PA-part10" swap_partitions: rootfs_partition: "/dev/disk/by-id/ata-ST3750528AS_6VP2W0PA-part2" timeout: 30m piglit: group: - igt-001 enqueue_time: 2014-10-27 03:51:37.871425766 +08:00 testbox: snb-drag tbox_group: snb-drag kconfig: x86_64-rhel head_commit: 9bdebfefe1de2b6fa7e193c10411ef209b0ebc96 base_commit: cac7f2429872d3733dc3f9915857b1691da2eb2f branch: linux-devel/devel-hourly-2014103002 kernel: "/kernel/x86_64-rhel/9bdebfefe1de2b6fa7e193c10411ef209b0ebc96/vmlinuz-3.18.0-rc2-g9bdebfe" user: lkp queue: cyclic rootfs: debian-x86_64.cgz result_root: "/result/snb-drag/piglit/performance-igt-001/debian-x86_64.cgz/x86_64-rhel/9bdebfefe1de2b6fa7e193c10411ef209b0ebc96/0" job_file: 
"/lkp/scheduled/snb-drag/cyclic_piglit-performance-igt-001-x86_64-rhel-HEAD-9bdebfefe1de2b6fa7e193c10411ef209b0ebc96-0.yaml" dequeue_time: 2014-10-30 03:46:50.534182476 +08:00 job_state: finished loadavg: 0.62 0.46 0.25 1/96 9645 start_time: '1414612069' end_time: '1414612536' version: "/lkp/lkp/.src-20141029-214343" echo performance > /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor echo performance > /sys/devices/system/cpu/cpu1/cpufreq/scaling_governor echo performance > /sys/devices/system/cpu/cpu2/cpufreq/scaling_governor echo performance > /sys/devices/system/cpu/cpu3/cpufreq/scaling_governor piglit run igt -t igt/drv_hangman/error-state-capture-bsd /lkp/lkp/src/tmp/piglit-results-0 piglit summary console /lkp/lkp/src/tmp/piglit-results-0 piglit run igt -t
[LKP] [x86, irq, ACPI] 5fcb864ef90: -3.3%(vm-scalability.throughput) +12.9%(turbostat.%c0)
Hi, We noticed the below changes on(NOTE: I'm not sure the bisect is correct or not, here I report it out JFYI). git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git master commit 5fcb864ef90df093d964171539c87ffa0ab49f0f ("x86, irq, ACPI: Implement interfaces to support ACPI based IOAPIC hot-removal") testbox/testcase/testparams: lkp-nex06/vm-scalability/performance-300s-small-allocs-mt ff6213974cd90e1e 5fcb864ef90df093d964171539 -- fail:runs %reproductionfail:runs | | | :5 20% 1:5 kmsg.CE:hpet_increased_min_delta_ns_to#nsec %stddev %change %stddev \ |\ 315326 ± 0% -3.3% 304841 ± 0% vm-scalability.throughput 11.82 ± 0% +12.9% 13.34 ± 0% turbostat.%c0 1.34 ± 0% +9.4% 1.46 ± 0% turbostat.GHz 12 ± 47% +78.7% 21 ± 32% sched_debug.cfs_rq[29]:/.load 113 ± 26% +86.3%212 ± 28% sched_debug.cfs_rq[39]:/.tg_load_contrib 106 ± 28% +89.5%202 ± 30% sched_debug.cfs_rq[39]:/.blocked_load_avg 66 ± 23%+120.6%145 ± 29% sched_debug.cfs_rq[40]:/.blocked_load_avg 70 ± 23%+113.0%150 ± 29% sched_debug.cfs_rq[40]:/.tg_load_contrib 10145 ± 23% -38.3% 6255 ± 35% numa-meminfo.node1.AnonPages 2535 ± 23% -38.3% 1564 ± 35% numa-vmstat.node1.nr_anon_pages 605 ± 16% -22.0%471 ± 5% sched_debug.cpu#58.nr_uninterruptible 58904 ± 7% -13.8% 50762 ± 7% sched_debug.cfs_rq[0]:/.min_vruntime 481299 ± 8% -13.4% 416975 ± 7% sched_debug.cpu#0.sched_count 409009 ± 11% -15.7% 344638 ± 2% sched_debug.cpu#4.sched_count 52022 ± 10% -16.1% 43623 ± 2% sched_debug.cfs_rq[4]:/.min_vruntime 68 ± 3% -12.2% 60 ± 3% sched_debug.cfs_rq[4]:/.tg_runnable_contrib 3175 ± 3% -12.1% 2791 ± 3% sched_debug.cfs_rq[4]:/.avg->runnable_avg_sum 50060 ± 6% -12.3% 43914 ± 4% sched_debug.cfs_rq[29]:/.min_vruntime 1751 ± 12% -15.5% 1480 ± 6% sched_debug.cpu#63.nr_uninterruptible 2967 ± 6% -13.7% 2562 ± 4% sched_debug.cfs_rq[37]:/.avg->runnable_avg_sum 63 ± 6% -13.8% 55 ± 4% sched_debug.cfs_rq[37]:/.tg_runnable_contrib 1.07 ± 2% -10.9% 0.95 ± 3% 
perf-profile.cpu-cycles.tick_nohz_restart.tick_nohz_idle_exit.cpu_startup_entry.start_secondary 1.64 ± 2% -8.4% 1.50 ± 4% perf-profile.cpu-cycles.__tick_nohz_idle_enter.tick_nohz_idle_enter.cpu_startup_entry.start_secondary 35173 ± 5% -9.1% 31983 ± 3% sched_debug.cfs_rq[56]:/.min_vruntime 1.41 ± 2% -8.3% 1.29 ± 4% perf-profile.cpu-cycles.tick_nohz_stop_sched_tick.__tick_nohz_idle_enter.tick_nohz_idle_enter.cpu_startup_entry.start_secondary 1.63 ± 1% -9.3% 1.48 ± 3% perf-profile.cpu-cycles.tick_nohz_idle_exit.cpu_startup_entry.start_secondary 45161 ± 11% -12.8% 39358 ± 4% sched_debug.cfs_rq[25]:/.min_vruntime 39201 ± 5% +17.3% 45969 ± 18% sched_debug.cfs_rq[8]:/.min_vruntime 21071502 ± 0% -3.3% 20379730 ± 0% time.minor_page_faults 299 ± 0% -3.1%290 ± 0% time.user_time 21763267 ± 0% -3.3% 21055329 ± 0% time.voluntary_context_switches 142199 ± 0% -3.1% 137732 ± 0% vmstat.system.cs 737 ± 0% -2.1%721 ± 1% time.system_time 341 ± 0% -2.5%333 ± 0% time.percent_of_cpu_this_job_got lkp-nex06: Nehalem-EX Memory: 64G turbostat.%c0 14 ++---+ |O | | O O O O O| 13.5 O+ O O O O O O O | | O OO O O | | O OO 13 ++ | || 12.5 ++ | || || 12 *+.*...*.. .*... | | *..*. *.. .*...*..*..*..*... .*..* | | *. *.| 11.5
[LKP] [x86, irq, ACPI] 5fcb864ef90: -3.3%(vm-scalability.throughput) +12.9%(turbostat.%c0)
Hi, We noticed the below changes on(NOTE: I'm not sure the bisect is correct or not, here I report it out JFYI). git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git master commit 5fcb864ef90df093d964171539c87ffa0ab49f0f (x86, irq, ACPI: Implement interfaces to support ACPI based IOAPIC hot-removal) testbox/testcase/testparams: lkp-nex06/vm-scalability/performance-300s-small-allocs-mt ff6213974cd90e1e 5fcb864ef90df093d964171539 -- fail:runs %reproductionfail:runs | | | :5 20% 1:5 kmsg.CE:hpet_increased_min_delta_ns_to#nsec %stddev %change %stddev \ |\ 315326 ± 0% -3.3% 304841 ± 0% vm-scalability.throughput 11.82 ± 0% +12.9% 13.34 ± 0% turbostat.%c0 1.34 ± 0% +9.4% 1.46 ± 0% turbostat.GHz 12 ± 47% +78.7% 21 ± 32% sched_debug.cfs_rq[29]:/.load 113 ± 26% +86.3%212 ± 28% sched_debug.cfs_rq[39]:/.tg_load_contrib 106 ± 28% +89.5%202 ± 30% sched_debug.cfs_rq[39]:/.blocked_load_avg 66 ± 23%+120.6%145 ± 29% sched_debug.cfs_rq[40]:/.blocked_load_avg 70 ± 23%+113.0%150 ± 29% sched_debug.cfs_rq[40]:/.tg_load_contrib 10145 ± 23% -38.3% 6255 ± 35% numa-meminfo.node1.AnonPages 2535 ± 23% -38.3% 1564 ± 35% numa-vmstat.node1.nr_anon_pages 605 ± 16% -22.0%471 ± 5% sched_debug.cpu#58.nr_uninterruptible 58904 ± 7% -13.8% 50762 ± 7% sched_debug.cfs_rq[0]:/.min_vruntime 481299 ± 8% -13.4% 416975 ± 7% sched_debug.cpu#0.sched_count 409009 ± 11% -15.7% 344638 ± 2% sched_debug.cpu#4.sched_count 52022 ± 10% -16.1% 43623 ± 2% sched_debug.cfs_rq[4]:/.min_vruntime 68 ± 3% -12.2% 60 ± 3% sched_debug.cfs_rq[4]:/.tg_runnable_contrib 3175 ± 3% -12.1% 2791 ± 3% sched_debug.cfs_rq[4]:/.avg-runnable_avg_sum 50060 ± 6% -12.3% 43914 ± 4% sched_debug.cfs_rq[29]:/.min_vruntime 1751 ± 12% -15.5% 1480 ± 6% sched_debug.cpu#63.nr_uninterruptible 2967 ± 6% -13.7% 2562 ± 4% sched_debug.cfs_rq[37]:/.avg-runnable_avg_sum 63 ± 6% -13.8% 55 ± 4% sched_debug.cfs_rq[37]:/.tg_runnable_contrib 1.07 ± 2% -10.9% 0.95 ± 3% 
perf-profile.cpu-cycles.tick_nohz_restart.tick_nohz_idle_exit.cpu_startup_entry.start_secondary 1.64 ± 2% -8.4% 1.50 ± 4% perf-profile.cpu-cycles.__tick_nohz_idle_enter.tick_nohz_idle_enter.cpu_startup_entry.start_secondary 35173 ± 5% -9.1% 31983 ± 3% sched_debug.cfs_rq[56]:/.min_vruntime 1.41 ± 2% -8.3% 1.29 ± 4% perf-profile.cpu-cycles.tick_nohz_stop_sched_tick.__tick_nohz_idle_enter.tick_nohz_idle_enter.cpu_startup_entry.start_secondary 1.63 ± 1% -9.3% 1.48 ± 3% perf-profile.cpu-cycles.tick_nohz_idle_exit.cpu_startup_entry.start_secondary 45161 ± 11% -12.8% 39358 ± 4% sched_debug.cfs_rq[25]:/.min_vruntime 39201 ± 5% +17.3% 45969 ± 18% sched_debug.cfs_rq[8]:/.min_vruntime 21071502 ± 0% -3.3% 20379730 ± 0% time.minor_page_faults 299 ± 0% -3.1%290 ± 0% time.user_time 21763267 ± 0% -3.3% 21055329 ± 0% time.voluntary_context_switches 142199 ± 0% -3.1% 137732 ± 0% vmstat.system.cs 737 ± 0% -2.1%721 ± 1% time.system_time 341 ± 0% -2.5%333 ± 0% time.percent_of_cpu_this_job_got lkp-nex06: Nehalem-EX Memory: 64G turbostat.%c0 14 ++---+ |O | | O O O O O| 13.5 O+ O O O O O O O | | O OO O O | | O OO 13 ++ | || 12.5 ++ | || || 12 *+.*...*.. .*... | | *..*. *.. .*...*..*..*..*... .*..* | | *. *.| 11.5
[LKP] [nohz] 2a16fc93d2c: piglit.igt/gem_ctx_exec/reset-pin-leak.dmesg-warn
FYI, we noticed the below changes on commit 2a16fc93d2c9568e16d45db77c7b5f15e1921cf1 (nohz: Avoid tick's double reprogramming in highres mode) testbox/testcase/testparams: snb-drag/piglit/performance-igt-001 b5e995e671d8e4d7 2a16fc93d2c9568e16d45db77c -- fail:runs %reproductionfail:runs | | | :5 100% 5:5 kmsg.drm:__gen6_gt_force_wake_get]*ERROR*Timed_out_waiting_for_forcewake_to_ack_request :5 100% 5:5 piglit.igt/gem_ctx_exec/reset-pin-leak.dmesg-warn snb-drag: Sandy Bridge Memory: 6G 3[ 90.915459] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for forcewake to ack request. 3[ 90.925094] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for forcewake to ack request. 3[ 90.934725] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for forcewake to ack request. 3[ 90.944347] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for forcewake to ack request. 3[ 90.953956] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for forcewake to ack request. 3[ 90.963559] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for forcewake to ack request. 3[ 90.973173] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for forcewake to ack request. 3[ 90.982793] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for forcewake to ack request. 3[ 90.992405] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for forcewake to ack request. 3[ 91.002008] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for forcewake to ack request. 3[ 91.011618] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for forcewake to ack request. 3[ 91.021222] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for forcewake to ack request. 3[ 91.030825] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for forcewake to ack request. 3[ 91.040430] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for forcewake to ack request. 
3[ 91.050016] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for forcewake to ack request. 3[ 91.059593] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for forcewake to ack request. 3[ 91.069152] [drm:__gen6_gt_force_wake_get] *ERROR* Timed out waiting for forcewake to ack request. To reproduce: apt-get install ruby ruby-oj git clone git://git.kernel.org/pub/scm/linux/kernel/git/wfg/lkp-tests.git cd lkp-tests bin/setup-local job.yaml # the job file attached in this email bin/run-local job.yaml Disclaimer: Results have been estimated based on internal Intel analysis and are provided for informational purposes only. Any difference in system hardware or software design or configuration may affect actual performance. --yliu --- testcase: piglit default_monitors: wait: pre-test vmstat: default_watchdogs: watch-oom: watchdog: cpufreq_governor: - performance commit: 9bdebfefe1de2b6fa7e193c10411ef209b0ebc96 model: Sandy Bridge memory: 6G hdd_partitions: /dev/disk/by-id/ata-ST3750528AS_6VP2W0PA-part5 /dev/disk/by-id/ata-ST3750528AS_6VP2W0PA-part6 /dev/disk/by-id/ata-ST3750528AS_6VP2W0PA-part7 /dev/disk/by-id/ata-ST3750528AS_6VP2W0PA-part8 /dev/disk/by-id/ata-ST3750528AS_6VP2W0PA-part9 /dev/disk/by-id/ata-ST3750528AS_6VP2W0PA-part10 swap_partitions: rootfs_partition: /dev/disk/by-id/ata-ST3750528AS_6VP2W0PA-part2 timeout: 30m piglit: group: - igt-001 enqueue_time: 2014-10-27 03:51:37.871425766 +08:00 testbox: snb-drag tbox_group: snb-drag kconfig: x86_64-rhel head_commit: 9bdebfefe1de2b6fa7e193c10411ef209b0ebc96 base_commit: cac7f2429872d3733dc3f9915857b1691da2eb2f branch: linux-devel/devel-hourly-2014103002 kernel: /kernel/x86_64-rhel/9bdebfefe1de2b6fa7e193c10411ef209b0ebc96/vmlinuz-3.18.0-rc2-g9bdebfe user: lkp queue: cyclic rootfs: debian-x86_64.cgz result_root: /result/snb-drag/piglit/performance-igt-001/debian-x86_64.cgz/x86_64-rhel/9bdebfefe1de2b6fa7e193c10411ef209b0ebc96/0 job_file: 
/lkp/scheduled/snb-drag/cyclic_piglit-performance-igt-001-x86_64-rhel-HEAD-9bdebfefe1de2b6fa7e193c10411ef209b0ebc96-0.yaml dequeue_time: 2014-10-30 03:46:50.534182476 +08:00 job_state: finished loadavg: 0.62 0.46 0.25 1/96 9645 start_time: '1414612069' end_time: '1414612536' version: /lkp/lkp/.src-20141029-214343 echo performance /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor echo performance /sys/devices/system/cpu/cpu1/cpufreq/scaling_governor echo performance /sys/devices/system/cpu/cpu2/cpufreq/scaling_governor echo performance /sys/devices/system/cpu/cpu3/cpufreq/scaling_governor piglit run igt -t igt/drv_hangman/error-state-capture-bsd /lkp/lkp/src/tmp/piglit-results-0 piglit summary console /lkp/lkp/src/tmp/piglit-results-0 piglit run igt -t igt/gem_reset_stats/reset-count-ctx-vebox /lkp/lkp/src/tmp/piglit-results-1
[LKP] [x86, PCI, MSI] BUG: unable to handle kernel NULL pointer dereference at 0000000000000002
FYI, we noticed the below changes on https://github.com/jiangliu/linux.git irqdomain/p2v7 commit 515b463a5a4c2bac0593c6d88a475a32d65f4bcc ("x86, PCI, MSI: Use hierarchy irqdomain to manage MSI interrupts") +--+++ | | dadb7cd295 | 515b463a5a | +--+++ | boot_successes | 6 | 1 | | early-boot-hang | 1 || | boot_failures| 0 | 4 | | BUG:unable_to_handle_kernel | 0 | 4 | | Oops | 0 | 4 | | RIP:init_irq_alloc_info | 0 | 4 | | Kernel_panic-not_syncing:Fatal_exception | 0 | 4 | | backtrace:init_irq_alloc_info| 0 | 4 | | backtrace:vp_find_vqs| 0 | 4 | | backtrace:init_vq| 0 | 4 | | backtrace:init | 0 | 4 | | backtrace:kernel_init_freeable | 0 | 4 | +--+++ [ 20.962013] BUG: unable to handle kernel NULL pointer dereference at 0002 [ 20.964023] IP: [] init_irq_alloc_info+0x13/0x1b [ 20.964023] PGD 0 [ 20.964023] Oops: 0002 [#1] SMP DEBUG_PAGEALLOC [ 20.964023] Modules linked in: [ 20.964023] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.18.0-rc4-g4ae16b6 #1457 [ 20.964023] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 20.964023] task: 8801289c0010 ti: 8801289c4000 task.ti: 8801289c4000 [ 20.964023] RIP: 0010:[] [] init_irq_alloc_info+0x13/0x1b [ 20.964023] RSP: :8801289c7928 EFLAGS: 00010246 [ 20.964023] RAX: RBX: 0002 RCX: 000a [ 20.964023] RDX: 0002 RSI: RDI: 0002 [ 20.964023] RBP: 8801289c7928 R08: 0008 R09: [ 20.964023] R10: 8800b8399f80 R11: 0023 R12: 8800db055000 [ 20.964023] R13: 8800d1ee8f98 R14: 880129cc3f80 R15: 83e36800 [ 20.964023] FS: () GS:88012a20() knlGS: [ 20.964023] CS: 0010 DS: ES: CR0: 8005003b [ 20.964023] CR2: 0002 CR3: 03e1a000 CR4: 06f0 [ 20.964023] Stack: [ 20.964023] 8801289c7958 810770be 8801289c7980 0002 [ 20.964023] 83e36840 8800db055098 8801289c79d8 8110fd29 [ 20.964023] 8800db055000 0011 [ 20.964023] Call Trace: [ 20.964023] [] pci_msi_prepare+0x2d/0x54 [ 20.964023] [] msi_domain_alloc_irqs+0x4a/0x162 [ 20.964023] [] ? 
dmar_find_matched_drhd_unit+0xf7/0x10b [ 20.964023] [] pci_msi_domain_alloc_irqs+0x15/0x17 [ 20.964023] [] native_setup_msi_irqs+0x61/0x6c [ 20.964023] [] arch_setup_msi_irqs+0xf/0x11 [ 20.964023] [] pci_msi_setup_msi_irqs+0x45/0x4c [ 20.964023] [] pci_enable_msix+0x1d8/0x2d0 [ 20.964023] [] pci_enable_msix_range+0x31/0x50 [ 20.964023] [] vp_request_msix_vectors+0xb6/0x1f8 [ 20.964023] [] vp_try_to_find_vqs+0xae/0x43e [ 20.964023] [] ? vsnprintf+0x374/0x3ad [ 20.964023] [] vp_find_vqs+0x32/0x8d [ 20.964023] [] init_vq+0x14f/0x1f8 [ 20.964023] [] virtblk_probe+0xf3/0x501 [ 20.964023] [] ? sysfs_do_create_link_sd+0x78/0xa8 [ 20.964023] [] ? vp_set_status+0x25/0x27 [ 20.964023] [] virtio_dev_probe+0xbd/0x104 [ 20.964023] [] driver_probe_device+0xb0/0x1d7 [ 20.964023] [] __driver_attach+0x62/0x85 [ 20.964023] [] ? __device_attach+0x3d/0x3d [ 20.964023] [] bus_for_each_dev+0x6f/0x89 [ 20.964023] [] driver_attach+0x1e/0x20 [ 20.964023] [] bus_add_driver+0x110/0x1cf [ 20.964023] [] ? nbd_init+0x39c/0x39c [ 20.964023] [] driver_register+0x8f/0xcc [ 20.964023] [] ? nbd_init+0x39c/0x39c [ 20.964023] [] register_virtio_driver+0x2b/0x2d [ 20.964023] [] init+0x5d/0x8b [ 20.964023] [] do_one_initcall+0xee/0x17e [ 20.964023] [] kernel_init_freeable+0x1ec/0x274 [ 20.964023] [] ? rest_init+0xcc/0xcc [ 20.964023] [] kernel_init+0xe/0xdf [ 20.964023] [] ret_from_fork+0x7c/0xb0 [ 20.964023] [] ? rest_init+0xcc/0xcc [ 20.964023] Code: eb 05 bb da ff ff ff 48 83 c4 28 89 d8 5b 41 5c 41 5d 41 5e 41 5f 5d c3 0f 1f 44 00 00 55 48 89 fa b9 0a 00 00 00 31 c0 48 89 e5 ab 5d 48 89 72 08 c3 0f 1f 44 00 00 55 48 85 f6 b9 0a 00 00 [ 20.964023] RIP [] init_irq_alloc_info+0x13/0x1b [ 20.964023] RSP [ 20.964023] CR2: 0002 [ 20.964023] ---[ end trace 21200aca189fb8f5 ]--- [ 20.964023] Kernel panic - not syncing: Fatal exception [ 20.964023] Kernel
[LKP] [LSM] Kernel panic - not syncing: No working init found.
FYI, we noticed the below changes on(TBH, I don't know the bisect is correct or not; sorry for the noise if not) git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git lsm/stacking commit 58c4f9e3be81a85839ea229b1dd36bf55232d440 ("LSM: Refactor existing LSM stacking") ++++ || c9979f3c6e | 58c4f9e3be | ++++ | boot_successes | 15 | 0 | | early-boot-hang| 1 || | boot_failures | 0 | 15 | | Kernel_panic-not_syncing:No_working_init_found | 0 | 15 | | backtrace:panic| 0 | 15 | ++++ [3.437279] Starting init: /sbin/init exists but couldn't execute it (error -12) [3.438655] Starting init: /etc/init exists but couldn't execute it (error -13) [3.440136] Starting init: /bin/sh exists but couldn't execute it (error -12) [3.441487] Kernel panic - not syncing: No working init found. Try passing init= option to kernel. See Linux Documentation/init.txt for guidance. [3.443352] CPU: 0 PID: 1 Comm: swapper Not tainted 3.18.0-rc4-g49aba53 #1949 [3.443352] f783d540 80017f88 8138c3bd 80017fa0 8138b30b 815e1f40 f783d540 [3.443352] 815e1f40 80017fac 81389523 8152ab4d 80016000 813918e0 81389474 [3.443352] 007b 007b [3.443352] Call Trace: [3.443352] [<8138c3bd>] dump_stack+0x16/0x18 [3.443352] [<8138b30b>] panic+0x86/0x19e [3.443352] [<81389523>] kernel_init+0xaf/0xb3 [3.443352] [<813918e0>] ret_from_kernel_thread+0x20/0x30 [3.443352] [<81389474>] ? rest_init+0xa2/0xa2 [3.443352] Kernel Offset: 0x0 from 0x8100 (relocation range: 0x8000-0x947fdfff) Elapsed time: 10 --yliu early console in setup code Probing EDD (edd=off to disable)... 
ok [0.00] Linux version 3.18.0-rc4-g49aba53 (kbuild@lkp-hsx01) (gcc version 4.9.1 (Debian 4.9.1-19) ) #1949 Sat Nov 15 06:21:52 CST 2014 [0.00] e820: BIOS-provided physical RAM map: [0.00] BIOS-e820: [mem 0x-0x0009fbff] usable [0.00] BIOS-e820: [mem 0x0009fc00-0x0009] reserved [0.00] BIOS-e820: [mem 0x000f-0x000f] reserved [0.00] BIOS-e820: [mem 0x0010-0x13ffdfff] usable [0.00] BIOS-e820: [mem 0x13ffe000-0x13ff] reserved [0.00] BIOS-e820: [mem 0xfeffc000-0xfeff] reserved [0.00] BIOS-e820: [mem 0xfffc-0x] reserved [0.00] Notice: NX (Execute Disable) protection missing in CPU! [0.00] Hypervisor detected: KVM [0.00] e820: update [mem 0x-0x0fff] usable ==> reserved [0.00] e820: remove [mem 0x000a-0x000f] usable [0.00] e820: last_pfn = 0x13ffe max_arch_pfn = 0x100 [0.00] initial memory mapped: [mem 0x-0x027f] [0.00] Base memory trampoline at [8009b000] 9b000 size 16384 [0.00] init_memory_mapping: [mem 0x-0x000f] [0.00] [mem 0x-0x000f] page 4k [0.00] init_memory_mapping: [mem 0x1320-0x133f] [0.00] [mem 0x1320-0x133f] page 2M [0.00] init_memory_mapping: [mem 0x1000-0x131f] [0.00] [mem 0x1000-0x131f] page 2M [0.00] init_memory_mapping: [mem 0x0010-0x0fff] [0.00] [mem 0x0010-0x001f] page 4k [0.00] [mem 0x0020-0x0fff] page 2M [0.00] init_memory_mapping: [mem 0x1340-0x13ffdfff] [0.00] [mem 0x1340-0x13df] page 2M [0.00] [mem 0x13e0-0x13ffdfff] page 4k [0.00] BRK [0x01f22000, 0x01f22fff] PGTABLE [0.00] BRK [0x01f23000, 0x01f23fff] PGTABLE [0.00] RAMDISK: [mem 0x135e9000-0x13fe] [0.00] ACPI: Early table checksum verification disabled [0.00] ACPI: RSDP 0x000FD950 14 (v00 BOCHS ) [0.00] ACPI: RSDT 0x13FFE450 34 (v01 BOCHS BXPCRSDT 0001 BXPC 0001) [0.00] ACPI: FACP 0x1380 74 (v01 BOCHS BXPCFACP 0001 BXPC 0001) [0.00] ACPI: DSDT 0x13FFE490 0011A9 (v01 BXPC BXDSDT 0001 INTL 20100528) [0.00] ACPI: FACS 0x1340 40 [0.00] ACPI: SSDT 0x13FFF7A0 000796 (v01 BOCHS BXPCSSDT 0001 BXPC 0001) [0.00] ACPI: APIC 0x13FFF680 80 (v01 BOCHS BXPCAPIC 0001 BXPC 0001) [0.00] ACPI: HPET 0x13FFF640 38 
(v01 BOCHS BXPCHPET 0001 BXPC 0001) [0.00] ACPI: Local APIC address 0xfee0 [
[LKP] [x86, mm] BUG: Bad page state in process swapper/0 pfn:02500
FYI, we noticed the below changes on git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git x86/pmd-nx commit 3622dcc2b4f4eaf23bae2511a30fc449d0e5f0d9 ("x86, mm: set NX across entire PMD at boot") +--+++ | | b23dc5a7cc | 3622dcc2b4 | +--+++ | boot_successes | 4 | 0 | | boot_failures| 0 | 19 | | BUG:Bad_page_state_in_process| 0 | 19 | | BUG:Bad_page_map_in_process | 0 | 14 | | BUG:Bad_rss-counter_state_mm:#idx:val| 0 | 2 | | backtrace:free_reserved_area | 0 | 19 | | backtrace:free_init_pages| 0 | 19 | | backtrace:mark_rodata_ro | 0 | 19 | | backtrace:vm_munmap | 0 | 2 | | backtrace:SyS_munmap | 0 | 2 | | backtrace:do_execve | 0 | 12 | | backtrace:SyS_execve | 0 | 12 | | backtrace:do_group_exit | 0 | 10 | | backtrace:SyS_exit_group | 0 | 10 | | backtrace:vfs_read | 0 | 3 | | backtrace:SyS_read | 0 | 3 | | general_protection_fault | 0 | 3 | | RIP:release_pages| 0 | 3 | | Kernel_panic-not_syncing:Fatal_exception | 0 | 3 | +--+++ [5.435374] PM: Hibernation image not present or could not be loaded. 
[5.437869] Freeing unused kernel memory: 1448K (8215b000 - 822c5000) [5.439558] Write protecting the kernel read-only data: 16384k [5.441103] BUG: Bad page state in process swapper/0 pfn:02500 [5.442204] page:ea094000 count:0 mapcount:-127 mapping: (null) index:0x2 [5.443939] flags: 0x180() [5.444891] page dumped because: nonzero mapcount [5.445861] Modules linked in: [5.446711] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 3.18.0-rc4-00185-g3622dcc #1 [5.448369] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [5.449450] 81cf3ba4 880037a33d78 819ea6b0 10ac [5.451360] ea094000 880037a33da8 8119b29c [5.453289] ea094000 0001 880037a33de8 [5.455234] Call Trace: [5.455942] [] dump_stack+0x4e/0x68 [5.456971] [] bad_page+0xf5/0x113 [5.457972] [] free_pages_prepare+0xbf/0x13f [5.459067] [] free_hot_cold_page+0x35/0x1a0 [5.460178] [] __free_pages+0x1b/0x24 [5.461219] [] free_reserved_area+0xaf/0x10b [5.462339] [] free_init_pages+0x8d/0x99 [5.463407] [] mark_rodata_ro+0xb6/0x11c [5.464522] [] ? rest_init+0x89/0x89 [5.465533] [] kernel_init+0x1d/0xdf [5.466596] [] ret_from_fork+0x7c/0xb0 [5.467633] [] ? rest_init+0x89/0x89 [5.468711] Disabling lock debugging due to kernel taint [5.470302] Freeing unused kernel memory: 1488K (8248c000 - 8260) [5.472182] Freeing unused kernel memory: 20K (8800019fb000 - 880001a0) [5.477823] Freeing unused kernel memory: 1812K (880001e3b000 - 88000200) [5.582078] BUG: Bad page state in process udevd pfn:0248c [5.582103] BUG: Bad page state in process udevd pfn:024a0 [5.582104] page:ea092800 count:2 mapcount:0 mapping:88003ec8ea69 index:0x2 [5.582107] flags: 0x1880068(uptodate|lru|active|swapbacked) --yliu early console in setup code Probing EDD (edd=off to disable)... ok early console in decompress_kernel Decompressing Linux... Parsing ELF... done. Booting the kernel. 
[0.00] Initializing cgroup subsys cpuset [0.00] Initializing cgroup subsys cpu [0.00] Linux version 3.18.0-rc4-00185-g3622dcc (kbuild@roam) (gcc version 4.9.1 (Debian 4.9.1-19) ) #1 SMP Sat Nov 15 17:25:59 CST 2014 [0.00] Command line: user=lkp job=/lkp/scheduled/vm-vp-1G-6/rand_boot-1-debian-x86_64.cgz-x86_64-lkp-3622dcc2b4f4eaf23bae2511a30fc449d0e5f0d9-1.yaml ARCH=x86_64 BOOT_IMAGE=/kernel/x86_64-lkp/3622dcc2b4f4eaf23bae2511a30fc449d0e5f0d9/vmlinuz-3.18.0-rc4-00185-g3622dcc kconfig=x86_64-lkp commit=3622dcc2b4f4eaf23bae2511a30fc449d0e5f0d9 branch=kees/x86/pmd-nx root=/dev/ram0 max_uptime=3600 RESULT_ROOT=/result/vm-vp-1G/boot/1/debian-x86_64.cgz/x86_64-lkp/3622dcc2b4f4eaf23bae2511a30fc449d0e5f0d9/0 ip=vm-vp-1G-6::dhcp earlyprintk=ttyS0,115200 debug apic=debug sysrq_always_enabled rcupdate.rcu_cpu_stall_timeout=100
[LKP] [x86, mm] BUG: Bad page state in process swapper/0 pfn:02500
FYI, we noticed the below changes on git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git x86/pmd-nx commit 3622dcc2b4f4eaf23bae2511a30fc449d0e5f0d9 (x86, mm: set NX across entire PMD at boot) +--+++ | | b23dc5a7cc | 3622dcc2b4 | +--+++ | boot_successes | 4 | 0 | | boot_failures| 0 | 19 | | BUG:Bad_page_state_in_process| 0 | 19 | | BUG:Bad_page_map_in_process | 0 | 14 | | BUG:Bad_rss-counter_state_mm:#idx:val| 0 | 2 | | backtrace:free_reserved_area | 0 | 19 | | backtrace:free_init_pages| 0 | 19 | | backtrace:mark_rodata_ro | 0 | 19 | | backtrace:vm_munmap | 0 | 2 | | backtrace:SyS_munmap | 0 | 2 | | backtrace:do_execve | 0 | 12 | | backtrace:SyS_execve | 0 | 12 | | backtrace:do_group_exit | 0 | 10 | | backtrace:SyS_exit_group | 0 | 10 | | backtrace:vfs_read | 0 | 3 | | backtrace:SyS_read | 0 | 3 | | general_protection_fault | 0 | 3 | | RIP:release_pages| 0 | 3 | | Kernel_panic-not_syncing:Fatal_exception | 0 | 3 | +--+++ [5.435374] PM: Hibernation image not present or could not be loaded. 
[5.437869] Freeing unused kernel memory: 1448K (8215b000 - 822c5000) [5.439558] Write protecting the kernel read-only data: 16384k [5.441103] BUG: Bad page state in process swapper/0 pfn:02500 [5.442204] page:ea094000 count:0 mapcount:-127 mapping: (null) index:0x2 [5.443939] flags: 0x180() [5.444891] page dumped because: nonzero mapcount [5.445861] Modules linked in: [5.446711] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 3.18.0-rc4-00185-g3622dcc #1 [5.448369] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [5.449450] 81cf3ba4 880037a33d78 819ea6b0 10ac [5.451360] ea094000 880037a33da8 8119b29c [5.453289] ea094000 0001 880037a33de8 [5.455234] Call Trace: [5.455942] [819ea6b0] dump_stack+0x4e/0x68 [5.456971] [8119b29c] bad_page+0xf5/0x113 [5.457972] [8119b379] free_pages_prepare+0xbf/0x13f [5.459067] [8119d65e] free_hot_cold_page+0x35/0x1a0 [5.460178] [8119d878] __free_pages+0x1b/0x24 [5.461219] [8119d930] free_reserved_area+0xaf/0x10b [5.462339] [8106c126] free_init_pages+0x8d/0x99 [5.463407] [8106cb1a] mark_rodata_ro+0xb6/0x11c [5.464522] [819e15b5] ? rest_init+0x89/0x89 [5.465533] [819e15d2] kernel_init+0x1d/0xdf [5.466596] [819f15bc] ret_from_fork+0x7c/0xb0 [5.467633] [819e15b5] ? rest_init+0x89/0x89 [5.468711] Disabling lock debugging due to kernel taint [5.470302] Freeing unused kernel memory: 1488K (8248c000 - 8260) [5.472182] Freeing unused kernel memory: 20K (8800019fb000 - 880001a0) [5.477823] Freeing unused kernel memory: 1812K (880001e3b000 - 88000200) [5.582078] BUG: Bad page state in process udevd pfn:0248c [5.582103] BUG: Bad page state in process udevd pfn:024a0 [5.582104] page:ea092800 count:2 mapcount:0 mapping:88003ec8ea69 index:0x2 [5.582107] flags: 0x1880068(uptodate|lru|active|swapbacked) --yliu early console in setup code Probing EDD (edd=off to disable)... ok early console in decompress_kernel Decompressing Linux... Parsing ELF... done. Booting the kernel. 
[0.00] Initializing cgroup subsys cpuset [0.00] Initializing cgroup subsys cpu [0.00] Linux version 3.18.0-rc4-00185-g3622dcc (kbuild@roam) (gcc version 4.9.1 (Debian 4.9.1-19) ) #1 SMP Sat Nov 15 17:25:59 CST 2014 [0.00] Command line: user=lkp job=/lkp/scheduled/vm-vp-1G-6/rand_boot-1-debian-x86_64.cgz-x86_64-lkp-3622dcc2b4f4eaf23bae2511a30fc449d0e5f0d9-1.yaml ARCH=x86_64 BOOT_IMAGE=/kernel/x86_64-lkp/3622dcc2b4f4eaf23bae2511a30fc449d0e5f0d9/vmlinuz-3.18.0-rc4-00185-g3622dcc kconfig=x86_64-lkp commit=3622dcc2b4f4eaf23bae2511a30fc449d0e5f0d9 branch=kees/x86/pmd-nx root=/dev/ram0 max_uptime=3600
[LKP] [LSM] Kernel panic - not syncing: No working init found.
FYI, we noticed the below changes on(TBH, I don't know the bisect is correct or not; sorry for the noise if not) git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git lsm/stacking commit 58c4f9e3be81a85839ea229b1dd36bf55232d440 (LSM: Refactor existing LSM stacking) ++++ || c9979f3c6e | 58c4f9e3be | ++++ | boot_successes | 15 | 0 | | early-boot-hang| 1 || | boot_failures | 0 | 15 | | Kernel_panic-not_syncing:No_working_init_found | 0 | 15 | | backtrace:panic| 0 | 15 | ++++ [3.437279] Starting init: /sbin/init exists but couldn't execute it (error -12) [3.438655] Starting init: /etc/init exists but couldn't execute it (error -13) [3.440136] Starting init: /bin/sh exists but couldn't execute it (error -12) [3.441487] Kernel panic - not syncing: No working init found. Try passing init= option to kernel. See Linux Documentation/init.txt for guidance. [3.443352] CPU: 0 PID: 1 Comm: swapper Not tainted 3.18.0-rc4-g49aba53 #1949 [3.443352] f783d540 80017f88 8138c3bd 80017fa0 8138b30b 815e1f40 f783d540 [3.443352] 815e1f40 80017fac 81389523 8152ab4d 80016000 813918e0 81389474 [3.443352] 007b 007b [3.443352] Call Trace: [3.443352] [8138c3bd] dump_stack+0x16/0x18 [3.443352] [8138b30b] panic+0x86/0x19e [3.443352] [81389523] kernel_init+0xaf/0xb3 [3.443352] [813918e0] ret_from_kernel_thread+0x20/0x30 [3.443352] [81389474] ? rest_init+0xa2/0xa2 [3.443352] Kernel Offset: 0x0 from 0x8100 (relocation range: 0x8000-0x947fdfff) Elapsed time: 10 --yliu early console in setup code Probing EDD (edd=off to disable)... 
ok [0.00] Linux version 3.18.0-rc4-g49aba53 (kbuild@lkp-hsx01) (gcc version 4.9.1 (Debian 4.9.1-19) ) #1949 Sat Nov 15 06:21:52 CST 2014 [0.00] e820: BIOS-provided physical RAM map: [0.00] BIOS-e820: [mem 0x-0x0009fbff] usable [0.00] BIOS-e820: [mem 0x0009fc00-0x0009] reserved [0.00] BIOS-e820: [mem 0x000f-0x000f] reserved [0.00] BIOS-e820: [mem 0x0010-0x13ffdfff] usable [0.00] BIOS-e820: [mem 0x13ffe000-0x13ff] reserved [0.00] BIOS-e820: [mem 0xfeffc000-0xfeff] reserved [0.00] BIOS-e820: [mem 0xfffc-0x] reserved [0.00] Notice: NX (Execute Disable) protection missing in CPU! [0.00] Hypervisor detected: KVM [0.00] e820: update [mem 0x-0x0fff] usable == reserved [0.00] e820: remove [mem 0x000a-0x000f] usable [0.00] e820: last_pfn = 0x13ffe max_arch_pfn = 0x100 [0.00] initial memory mapped: [mem 0x-0x027f] [0.00] Base memory trampoline at [8009b000] 9b000 size 16384 [0.00] init_memory_mapping: [mem 0x-0x000f] [0.00] [mem 0x-0x000f] page 4k [0.00] init_memory_mapping: [mem 0x1320-0x133f] [0.00] [mem 0x1320-0x133f] page 2M [0.00] init_memory_mapping: [mem 0x1000-0x131f] [0.00] [mem 0x1000-0x131f] page 2M [0.00] init_memory_mapping: [mem 0x0010-0x0fff] [0.00] [mem 0x0010-0x001f] page 4k [0.00] [mem 0x0020-0x0fff] page 2M [0.00] init_memory_mapping: [mem 0x1340-0x13ffdfff] [0.00] [mem 0x1340-0x13df] page 2M [0.00] [mem 0x13e0-0x13ffdfff] page 4k [0.00] BRK [0x01f22000, 0x01f22fff] PGTABLE [0.00] BRK [0x01f23000, 0x01f23fff] PGTABLE [0.00] RAMDISK: [mem 0x135e9000-0x13fe] [0.00] ACPI: Early table checksum verification disabled [0.00] ACPI: RSDP 0x000FD950 14 (v00 BOCHS ) [0.00] ACPI: RSDT 0x13FFE450 34 (v01 BOCHS BXPCRSDT 0001 BXPC 0001) [0.00] ACPI: FACP 0x1380 74 (v01 BOCHS BXPCFACP 0001 BXPC 0001) [0.00] ACPI: DSDT 0x13FFE490 0011A9 (v01 BXPC BXDSDT 0001 INTL 20100528) [0.00] ACPI: FACS 0x1340 40 [0.00] ACPI: SSDT 0x13FFF7A0 000796 (v01 BOCHS BXPCSSDT 0001 BXPC 0001) [0.00] ACPI: APIC 0x13FFF680 80 (v01 BOCHS BXPCAPIC 0001 BXPC 0001) [0.00] ACPI: HPET 0x13FFF640 38 (v01 
BOCHS BXPCHPET 0001 BXPC 0001) [0.00] ACPI: Local APIC address 0xfee0 [0.00]
[LKP] [x86, PCI, MSI] BUG: unable to handle kernel NULL pointer dereference at 0000000000000002
FYI, we noticed the below changes on https://github.com/jiangliu/linux.git irqdomain/p2v7 commit 515b463a5a4c2bac0593c6d88a475a32d65f4bcc (x86, PCI, MSI: Use hierarchy irqdomain to manage MSI interrupts) +--+++ | | dadb7cd295 | 515b463a5a | +--+++ | boot_successes | 6 | 1 | | early-boot-hang | 1 || | boot_failures| 0 | 4 | | BUG:unable_to_handle_kernel | 0 | 4 | | Oops | 0 | 4 | | RIP:init_irq_alloc_info | 0 | 4 | | Kernel_panic-not_syncing:Fatal_exception | 0 | 4 | | backtrace:init_irq_alloc_info| 0 | 4 | | backtrace:vp_find_vqs| 0 | 4 | | backtrace:init_vq| 0 | 4 | | backtrace:init | 0 | 4 | | backtrace:kernel_init_freeable | 0 | 4 | +--+++ [ 20.962013] BUG: unable to handle kernel NULL pointer dereference at 0002 [ 20.964023] IP: [81074795] init_irq_alloc_info+0x13/0x1b [ 20.964023] PGD 0 [ 20.964023] Oops: 0002 [#1] SMP DEBUG_PAGEALLOC [ 20.964023] Modules linked in: [ 20.964023] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.18.0-rc4-g4ae16b6 #1457 [ 20.964023] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 20.964023] task: 8801289c0010 ti: 8801289c4000 task.ti: 8801289c4000 [ 20.964023] RIP: 0010:[81074795] [81074795] init_irq_alloc_info+0x13/0x1b [ 20.964023] RSP: :8801289c7928 EFLAGS: 00010246 [ 20.964023] RAX: RBX: 0002 RCX: 000a [ 20.964023] RDX: 0002 RSI: RDI: 0002 [ 20.964023] RBP: 8801289c7928 R08: 0008 R09: [ 20.964023] R10: 8800b8399f80 R11: 0023 R12: 8800db055000 [ 20.964023] R13: 8800d1ee8f98 R14: 880129cc3f80 R15: 83e36800 [ 20.964023] FS: () GS:88012a20() knlGS: [ 20.964023] CS: 0010 DS: ES: CR0: 8005003b [ 20.964023] CR2: 0002 CR3: 03e1a000 CR4: 06f0 [ 20.964023] Stack: [ 20.964023] 8801289c7958 810770be 8801289c7980 0002 [ 20.964023] 83e36840 8800db055098 8801289c79d8 8110fd29 [ 20.964023] 8800db055000 0011 [ 20.964023] Call Trace: [ 20.964023] [810770be] pci_msi_prepare+0x2d/0x54 [ 20.964023] [8110fd29] msi_domain_alloc_irqs+0x4a/0x162 [ 20.964023] [8285063a] ? 
dmar_find_matched_drhd_unit+0xf7/0x10b [ 20.964023] [8177e2ee] pci_msi_domain_alloc_irqs+0x15/0x17 [ 20.964023] [8107727c] native_setup_msi_irqs+0x61/0x6c [ 20.964023] [8104f786] arch_setup_msi_irqs+0xf/0x11 [ 20.964023] [8177d3e0] pci_msi_setup_msi_irqs+0x45/0x4c [ 20.964023] [8177daf7] pci_enable_msix+0x1d8/0x2d0 [ 20.964023] [8177dc20] pci_enable_msix_range+0x31/0x50 [ 20.964023] [8185dfa6] vp_request_msix_vectors+0xb6/0x1f8 [ 20.964023] [8185e196] vp_try_to_find_vqs+0xae/0x43e [ 20.964023] [8172fbc5] ? vsnprintf+0x374/0x3ad [ 20.964023] [8185e558] vp_find_vqs+0x32/0x8d [ 20.964023] [81b416aa] init_vq+0x14f/0x1f8 [ 20.964023] [81b41896] virtblk_probe+0xf3/0x501 [ 20.964023] [81238727] ? sysfs_do_create_link_sd+0x78/0xa8 [ 20.964023] [8185dba0] ? vp_set_status+0x25/0x27 [ 20.964023] [8185c2ec] virtio_dev_probe+0xbd/0x104 [ 20.964023] [81b09a19] driver_probe_device+0xb0/0x1d7 [ 20.964023] [81b09bdf] __driver_attach+0x62/0x85 [ 20.964023] [81b09b7d] ? __device_attach+0x3d/0x3d [ 20.964023] [81b08009] bus_for_each_dev+0x6f/0x89 [ 20.964023] [81b0957d] driver_attach+0x1e/0x20 [ 20.964023] [81b09229] bus_add_driver+0x110/0x1cf [ 20.964023] [84452673] ? nbd_init+0x39c/0x39c [ 20.964023] [81b0a235] driver_register+0x8f/0xcc [ 20.964023] [84452673] ? nbd_init+0x39c/0x39c [ 20.964023] [8185c5fd] register_virtio_driver+0x2b/0x2d [ 20.964023] [844526d0] init+0x5d/0x8b [ 20.964023] [8100216d] do_one_initcall+0xee/0x17e [ 20.964023] [843e60ef] kernel_init_freeable+0x1ec/0x274 [ 20.964023] [82d3c238] ? rest_init+0xcc/0xcc [ 20.964023]
Re: [LKP] [sched] 9597d64116d: -16.1% hackbench.throughput
On Wed, Nov 12, 2014 at 03:44:34PM +0100, Vincent Guittot wrote: > On 10 November 2014 06:54, wrote: > > FYI, we noticed the below changes on > > > > https://git.linaro.org/people/mturquette/linux.git eas-next > > commit 9597d64116d0d441dea32e7f5f05fa135d16f44b ("sched: replace > > capacity_factor by usage") > > > > > > b57a1e0afff2cbac 9597d64116d0d441dea32e7f5f testbox/testcase/testparams > > -- --- > > %stddev %change %stddev > > \ |\ > > 104249 ą 0% -16.1% 87436 ą 0% > > ivb42/hackbench/performance-50%-threads-socket > > 104249 -16.1% 87436GEO-MEAN hackbench.throughput > > Hi yuanhan, > > i understand this email as a 16% drop in hackbench performance when > the number of group is half the number of CPUs. Is it the only test > for which you have seen some decreases ? where can i find the list of > tests that you have passed ? Sorry, the list is not accessed outside, plus, you have to run some commands to generate the list on fly. Anyway, I checked it for you, and we have run hackbench/performance-50%-threads-socket only on that commit, which is reasonable in our system as we bisected once on this issue. But I can run more tests(say, with 100% and 1600% cpu) on that commit if you like, and it also would be good if you can name some of benchmarks you care most so that we can run it for you. 
--yliu > > I'm going to try to reproduce the test in my local setup > > Regards, > Vincent > > > > > > b57a1e0afff2cbac 9597d64116d0d441dea32e7f5f > > -- > > 0.88 ą 25%+209.7% 2.74 ą 5% > > ivb42/hackbench/performance-50%-threads-socket > > 0.88 +209.7% 2.74GEO-MEAN > > perf-profile.cpu-cycles.ttwu_do_activate.constprop.87.sched_ttwu_pending.scheduler_ipi.smp_reschedule_interrupt.reschedule_interrupt > > > > b57a1e0afff2cbac 9597d64116d0d441dea32e7f5f > > -- > > 0.76 ą 26%+209.2% 2.36 ą 5% > > ivb42/hackbench/performance-50%-threads-socket > > 0.76 +209.2% 2.36GEO-MEAN > > perf-profile.cpu-cycles.activate_task.ttwu_do_activate.sched_ttwu_pending.scheduler_ipi.smp_reschedule_interrupt > > > > b57a1e0afff2cbac 9597d64116d0d441dea32e7f5f > > -- > > 0.76 ą 26%+210.6% 2.35 ą 5% > > ivb42/hackbench/performance-50%-threads-socket > > 0.76 +210.6% 2.35GEO-MEAN > > perf-profile.cpu-cycles.enqueue_task.activate_task.ttwu_do_activate.sched_ttwu_pending.scheduler_ipi > > > > b57a1e0afff2cbac 9597d64116d0d441dea32e7f5f > > -- > > 0.70 ą 25%+203.1% 2.13 ą 6% > > ivb42/hackbench/performance-50%-threads-socket > > 0.70 +203.1% 2.13GEO-MEAN > > perf-profile.cpu-cycles.enqueue_task_fair.enqueue_task.activate_task.ttwu_do_activate.sched_ttwu_pending > > > > b57a1e0afff2cbac 9597d64116d0d441dea32e7f5f > > -- > > 243252 ą 46%+242.5% 833240 ą 42% > > ivb42/hackbench/performance-50%-threads-socket > > 243252 +242.5% 833240GEO-MEAN > > sched_debug.cfs_rq[2]:/.spread0 > > > > b57a1e0afff2cbac 9597d64116d0d441dea32e7f5f > > -- > > 98 ą 36% -49.1% 50 ą 34% > > ivb42/hackbench/performance-50%-threads-socket > > 98 -49.1% 50GEO-MEAN > > sched_debug.cfs_rq[18]:/.blocked_load_avg > > > > b57a1e0afff2cbac 9597d64116d0d441dea32e7f5f > > -- > >1067752 ą 25% +65.3%1764542 ą 11% > > ivb42/hackbench/performance-50%-threads-socket > >1067752 +65.3%1764542GEO-MEAN > > sched_debug.cfs_rq[16]:/.spread0 > > > > b57a1e0afff2cbac 9597d64116d0d441dea32e7f5f > > -- > > 923375 ą 22% +96.3%1812750 ą 21% > > 
ivb42/hackbench/performance-50%-threads-socket > > 923375 +96.3%1812750GEO-MEAN > > sched_debug.cfs_rq[14]:/.spread0 > > > > b57a1e0afff2cbac 9597d64116d0d441dea32e7f5f > > -- > >1008818 ą 20% +70.9%1724167 ą 14% > > ivb42/hackbench/performance-50%-threads-socket > >1008818 +70.9%1724167GEO-MEAN > > sched_debug.cfs_rq[6]:/.spread0 > > > > b57a1e0afff2cbac 9597d64116d0d441dea32e7f5f > > -- > >1109100 ą 25% +53.9%1707190 ą 16% > > ivb42/hackbench/performance-50%-threads-socket > >1109100 +53.9%1707190GEO-MEAN > > sched_debug.cfs_rq[15]:/.spread0 > > > > b57a1e0afff2cbac 9597d64116d0d441dea32e7f5f > >
Re: [LKP] [sched] 9597d64116d: -16.1% hackbench.throughput
On Wed, Nov 12, 2014 at 03:44:34PM +0100, Vincent Guittot wrote: On 10 November 2014 06:54, l...@01.org wrote: FYI, we noticed the below changes on https://git.linaro.org/people/mturquette/linux.git eas-next commit 9597d64116d0d441dea32e7f5f05fa135d16f44b (sched: replace capacity_factor by usage) b57a1e0afff2cbac 9597d64116d0d441dea32e7f5f testbox/testcase/testparams -- --- %stddev %change %stddev \ |\ 104249 ą 0% -16.1% 87436 ą 0% ivb42/hackbench/performance-50%-threads-socket 104249 -16.1% 87436GEO-MEAN hackbench.throughput Hi yuanhan, i understand this email as a 16% drop in hackbench performance when the number of group is half the number of CPUs. Is it the only test for which you have seen some decreases ? where can i find the list of tests that you have passed ? Sorry, the list is not accessed outside, plus, you have to run some commands to generate the list on fly. Anyway, I checked it for you, and we have run hackbench/performance-50%-threads-socket only on that commit, which is reasonable in our system as we bisected once on this issue. But I can run more tests(say, with 100% and 1600% cpu) on that commit if you like, and it also would be good if you can name some of benchmarks you care most so that we can run it for you. 
--yliu I'm going to try to reproduce the test in my local setup Regards, Vincent b57a1e0afff2cbac 9597d64116d0d441dea32e7f5f -- 0.88 ą 25%+209.7% 2.74 ą 5% ivb42/hackbench/performance-50%-threads-socket 0.88 +209.7% 2.74GEO-MEAN perf-profile.cpu-cycles.ttwu_do_activate.constprop.87.sched_ttwu_pending.scheduler_ipi.smp_reschedule_interrupt.reschedule_interrupt b57a1e0afff2cbac 9597d64116d0d441dea32e7f5f -- 0.76 ą 26%+209.2% 2.36 ą 5% ivb42/hackbench/performance-50%-threads-socket 0.76 +209.2% 2.36GEO-MEAN perf-profile.cpu-cycles.activate_task.ttwu_do_activate.sched_ttwu_pending.scheduler_ipi.smp_reschedule_interrupt b57a1e0afff2cbac 9597d64116d0d441dea32e7f5f -- 0.76 ą 26%+210.6% 2.35 ą 5% ivb42/hackbench/performance-50%-threads-socket 0.76 +210.6% 2.35GEO-MEAN perf-profile.cpu-cycles.enqueue_task.activate_task.ttwu_do_activate.sched_ttwu_pending.scheduler_ipi b57a1e0afff2cbac 9597d64116d0d441dea32e7f5f -- 0.70 ą 25%+203.1% 2.13 ą 6% ivb42/hackbench/performance-50%-threads-socket 0.70 +203.1% 2.13GEO-MEAN perf-profile.cpu-cycles.enqueue_task_fair.enqueue_task.activate_task.ttwu_do_activate.sched_ttwu_pending b57a1e0afff2cbac 9597d64116d0d441dea32e7f5f -- 243252 ą 46%+242.5% 833240 ą 42% ivb42/hackbench/performance-50%-threads-socket 243252 +242.5% 833240GEO-MEAN sched_debug.cfs_rq[2]:/.spread0 b57a1e0afff2cbac 9597d64116d0d441dea32e7f5f -- 98 ą 36% -49.1% 50 ą 34% ivb42/hackbench/performance-50%-threads-socket 98 -49.1% 50GEO-MEAN sched_debug.cfs_rq[18]:/.blocked_load_avg b57a1e0afff2cbac 9597d64116d0d441dea32e7f5f -- 1067752 ą 25% +65.3%1764542 ą 11% ivb42/hackbench/performance-50%-threads-socket 1067752 +65.3%1764542GEO-MEAN sched_debug.cfs_rq[16]:/.spread0 b57a1e0afff2cbac 9597d64116d0d441dea32e7f5f -- 923375 ą 22% +96.3%1812750 ą 21% ivb42/hackbench/performance-50%-threads-socket 923375 +96.3%1812750GEO-MEAN sched_debug.cfs_rq[14]:/.spread0 b57a1e0afff2cbac 9597d64116d0d441dea32e7f5f -- 1008818 ą 20% +70.9%1724167 ą 14% 
ivb42/hackbench/performance-50%-threads-socket 1008818 +70.9%1724167GEO-MEAN sched_debug.cfs_rq[6]:/.spread0 b57a1e0afff2cbac 9597d64116d0d441dea32e7f5f -- 1109100 ą 25% +53.9%1707190 ą 16% ivb42/hackbench/performance-50%-threads-socket 1109100 +53.9%1707190GEO-MEAN sched_debug.cfs_rq[15]:/.spread0 b57a1e0afff2cbac 9597d64116d0d441dea32e7f5f -- 1006499 ą 33% +67.8%1688436 ą 22% ivb42/hackbench/performance-50%-threads-socket 1006499 +67.8%1688436
Re: [LKP] [dmi] PANIC: early exception 0e rip 10:ffffffff81899e6b error 9 cr2 ffffffffff240000
On Fri, Nov 07, 2014 at 10:35:44AM +0100, Ard Biesheuvel wrote: > On 7 November 2014 10:26, Yuanhan Liu wrote: > > On Fri, Nov 07, 2014 at 10:03:55AM +0100, Ard Biesheuvel wrote: > >> On 7 November 2014 09:46, Yuanhan Liu wrote: > >> > On Fri, Nov 07, 2014 at 09:23:56AM +0100, Ard Biesheuvel wrote: > >> >> On 7 November 2014 09:13, Yuanhan Liu > >> >> wrote: > >> >> > On Fri, Nov 07, 2014 at 08:44:40AM +0100, Ard Biesheuvel wrote: > >> >> >> On 7 November 2014 08:37, Yuanhan Liu > >> >> >> wrote: > >> >> >> > On Fri, Nov 07, 2014 at 08:17:36AM +0100, Ard Biesheuvel wrote: > >> >> >> >> On 7 November 2014 06:47, LKP wrote: > >> >> >> >> > FYI, we noticed the below changes on > >> >> >> >> > > >> >> >> >> > https://git.linaro.org/people/ard.biesheuvel/linux-arm > >> >> >> >> > efi-for-3.19 > >> >> >> >> > commit aacdce6e880894acb57d71dcb2e3fc61b4ed4e96 ("dmi: add > >> >> >> >> > support for SMBIOS 3.0 64-bit entry point") > >> >> >> >> > > >> >> >> >> > > >> >> >> >> > +---+++ > >> >> >> >> > | | 2fa165a26c | aacdce6e88 | > >> >> >> >> > +---+++ > >> >> >> >> > | boot_successes| 20 | 10 | > >> >> >> >> > | early-boot-hang | 1 || > >> >> >> >> > | boot_failures | 0 | 5 | > >> >> >> >> > | PANIC:early_exception | 0 | 5 | > >> >> >> >> > +---+++ > >> >> >> >> > > >> >> >> >> > > >> >> >> >> > [0.00] BIOS-e820: [mem > >> >> >> >> > 0x0001-0x00036fff] usable > >> >> >> >> > [0.00] bootconsole [earlyser0] enabled > >> >> >> >> > [0.00] NX (Execute Disable) protection: active > >> >> >> >> > PANIC: early exception 0e rip 10:81899e6b error 9 cr2 > >> >> >> >> > ff24 > >> >> >> >> > [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted > >> >> >> >> > 3.18.0-rc2-gc5221e6 #1 > >> >> >> >> > [0.00] 82203d30 > >> >> >> >> > 819f0a6e 03f8 > >> >> >> >> > [0.00] ff24 82203e18 > >> >> >> >> > 823701b0 82511401 > >> >> >> >> > [0.00] 0ba3 > >> >> >> >> > ff24 > >> >> >> >> > [0.00] Call Trace: > >> >> >> >> > [0.00] [] dump_stack+0x4e/0x68 > >> >> >> >> > [0.00] [] 
early_idt_handler+0x90/0xb7 > >> >> >> >> > [0.00] [] ? > >> >> >> >> > dmi_save_one_device+0x81/0x81 > >> >> >> >> > [0.00] [] ? dmi_table+0x3f/0x94 > >> >> >> >> > [0.00] [] ? dmi_table+0x16/0x94 > >> >> >> >> > [0.00] [] ? > >> >> >> >> > dmi_save_one_device+0x81/0x81 > >> >> >> >> > [0.00] [] ? > >> >> >> >> > dmi_save_one_device+0x81/0x81 > >> >> >> >> > [0.00] [] dmi_walk_early+0x44/0x69 > >> >> >> >> > [0.00] [] dmi_present+0x180/0x1ff > >> >> >> >> > [0.00] [] > >> >> >> >> > dmi_scan_machine+0x144/0x191 > >> >> >> >> > [0.00] [] ? loglevel+0x31/0x31 > >> >> >> >> > [0.00] [] setup_arch+0x490/0xc73 > >> >> >> >> > [0.00] [] ? printk+0x4d/0x4f > >> >>
Re: [LKP] [dmi] PANIC: early exception 0e rip 10:ffffffff81899e6b error 9 cr2 ffffffffff240000
On Fri, Nov 07, 2014 at 09:16:02AM +, Matt Fleming wrote: > On Fri, 2014-11-07 at 08:17 +0100, Ard Biesheuvel wrote: > > On 7 November 2014 06:47, LKP wrote: > > > FYI, we noticed the below changes on > > > > > > https://git.linaro.org/people/ard.biesheuvel/linux-arm efi-for-3.19 > > > commit aacdce6e880894acb57d71dcb2e3fc61b4ed4e96 ("dmi: add support for > > > SMBIOS 3.0 64-bit entry point") > > > > > > > > > +---+++ > > > | | 2fa165a26c | aacdce6e88 | > > > +---+++ > > > | boot_successes| 20 | 10 | > > > | early-boot-hang | 1 || > > > | boot_failures | 0 | 5 | > > > | PANIC:early_exception | 0 | 5 | > > > +---+++ > > > > > > > > > [0.00] BIOS-e820: [mem 0x0001-0x00036fff] > > > usable > > > [0.00] bootconsole [earlyser0] enabled > > > [0.00] NX (Execute Disable) protection: active > > > PANIC: early exception 0e rip 10:81899e6b error 9 cr2 > > > ff24 > > > [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted > > > 3.18.0-rc2-gc5221e6 #1 > > > [0.00] 82203d30 819f0a6e > > > 03f8 > > > [0.00] ff24 82203e18 823701b0 > > > 82511401 > > > [0.00] 0ba3 > > > ff24 > > > [0.00] Call Trace: > > > [0.00] [] dump_stack+0x4e/0x68 > > > [0.00] [] early_idt_handler+0x90/0xb7 > > > [0.00] [] ? dmi_save_one_device+0x81/0x81 > > > [0.00] [] ? dmi_table+0x3f/0x94 > > > [0.00] [] ? dmi_table+0x16/0x94 > > > [0.00] [] ? dmi_save_one_device+0x81/0x81 > > > [0.00] [] ? dmi_save_one_device+0x81/0x81 > > > [0.00] [] dmi_walk_early+0x44/0x69 > > > [0.00] [] dmi_present+0x180/0x1ff > > > [0.00] [] dmi_scan_machine+0x144/0x191 > > > [0.00] [] ? loglevel+0x31/0x31 > > > [0.00] [] setup_arch+0x490/0xc73 > > > [0.00] [] ? printk+0x4d/0x4f > > > [0.00] [] start_kernel+0x9c/0x43f > > > [0.00] [] ? early_idt_handlers+0x120/0x120 > > > [0.00] [] x86_64_start_reservations+0x2a/0x2c > > > [0.00] [] x86_64_start_kernel+0x13b/0x14a > > > [0.00] RIP 0x4 > > > > > > > This is most puzzling. Could anyone decode the exception? 
> > This looks like the non-EFI path through dmi_scan_machine(), which > > calls dmi_present() /after/ calling dmi_smbios3_present(), which > > apparently has not found the _SM3_ header tag. Or could the call stack > > be inaccurate? > > The code triggered a page fault while trying to access > 0xff24, caused because the reserved bit was set in the page > table and no page was found. Looks like it jumped through a bogus > pointer. > > And yes, the callstack may definitely be wrong - the stack dumper is > just scraping addresses from the stack, as indicated by the '?' symbol. > > Yuanhan, what symbol does 0x81899e6b (the faulting instruction) > translate to? I found no System.map for that kernel, I then changed to another kernel, and here is the new panic dmesg: PANIC: early exception 0e rip 10:8167aa1a error 9 cr2 ff240001 [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted 3.18.0-rc2-8-g4d3a0be #66 [0.00] 0ba3 81bcfd10 818010a4 03f8 [0.00] 003e 81bcfdf8 81d801b0 617420534f49424d [0.00] 001f ff24 ff24 [0.00] Call Trace: [0.00] [] dump_stack+0x46/0x58 [0.00] [] early_idt_handler+0x90/0xb7 [0.00] [] ? dmi_format_ids.constprop.9+0x13c/0x13c [0.00] [] ? dmi_table+0x4a/0xf0 [0.00] [] ? printk+0x61/0x63 [0.00] [] ? dmi_format_ids.constprop.9+0x13c/0x13c [0.00] [] ? dmi_format_ids.constprop.9+0x13c/0x13c [0.00] [] dmi_walk_early+0x6b/0x90 [0.00] [] dmi_present+0x1b4/0x23f [0.00] [] dmi_scan_machine+0x1d4/0x23a [0.00] [] ? early_idt_handlers+0x120/0x120 [0.00] [] setup_arch+0x462/0xcc6 [0.00] [] ? early_idt_handlers+0x120/0x120 [0.00] [] ? early_idt_handler+0x47/0xb7 [0.00] [] ? early_idt_handlers+0x120/0x120 [0.00] [] start_kernel+0x97/0x456 [0.00] [] ? early_idt_handlers+0x120/0x120 [0.00] [] ? early_idt_handlers+0x120/0x120 [0.00] [] x86_64_start_reservations+0x2a/0x2c [0.00] [] x86_64_start_kernel+0x13e/0x14d [0.00] RIP 0xba2 The address changes to 10:8167aa1a, and in the System.map, it has: 8167a9d0 t dmi_table 8167aac0 T dmi_name_in_vendors Sorry, I don't
Re: [LKP] [dmi] PANIC: early exception 0e rip 10:ffffffff81899e6b error 9 cr2 ffffffffff240000
On Fri, Nov 07, 2014 at 10:03:55AM +0100, Ard Biesheuvel wrote: > On 7 November 2014 09:46, Yuanhan Liu wrote: > > On Fri, Nov 07, 2014 at 09:23:56AM +0100, Ard Biesheuvel wrote: > >> On 7 November 2014 09:13, Yuanhan Liu wrote: > >> > On Fri, Nov 07, 2014 at 08:44:40AM +0100, Ard Biesheuvel wrote: > >> >> On 7 November 2014 08:37, Yuanhan Liu > >> >> wrote: > >> >> > On Fri, Nov 07, 2014 at 08:17:36AM +0100, Ard Biesheuvel wrote: > >> >> >> On 7 November 2014 06:47, LKP wrote: > >> >> >> > FYI, we noticed the below changes on > >> >> >> > > >> >> >> > https://git.linaro.org/people/ard.biesheuvel/linux-arm efi-for-3.19 > >> >> >> > commit aacdce6e880894acb57d71dcb2e3fc61b4ed4e96 ("dmi: add support > >> >> >> > for SMBIOS 3.0 64-bit entry point") > >> >> >> > > >> >> >> > > >> >> >> > +---+++ > >> >> >> > | | 2fa165a26c | aacdce6e88 | > >> >> >> > +---+++ > >> >> >> > | boot_successes| 20 | 10 | > >> >> >> > | early-boot-hang | 1 || > >> >> >> > | boot_failures | 0 | 5 | > >> >> >> > | PANIC:early_exception | 0 | 5 | > >> >> >> > +---+++ > >> >> >> > > >> >> >> > > >> >> >> > [0.00] BIOS-e820: [mem > >> >> >> > 0x0001-0x00036fff] usable > >> >> >> > [0.00] bootconsole [earlyser0] enabled > >> >> >> > [0.00] NX (Execute Disable) protection: active > >> >> >> > PANIC: early exception 0e rip 10:81899e6b error 9 cr2 > >> >> >> > ff24 > >> >> >> > [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted > >> >> >> > 3.18.0-rc2-gc5221e6 #1 > >> >> >> > [0.00] 82203d30 819f0a6e > >> >> >> > 03f8 > >> >> >> > [0.00] ff24 82203e18 823701b0 > >> >> >> > 82511401 > >> >> >> > [0.00] 0ba3 > >> >> >> > ff24 > >> >> >> > [0.00] Call Trace: > >> >> >> > [0.00] [] dump_stack+0x4e/0x68 > >> >> >> > [0.00] [] early_idt_handler+0x90/0xb7 > >> >> >> > [0.00] [] ? > >> >> >> > dmi_save_one_device+0x81/0x81 > >> >> >> > [0.00] [] ? dmi_table+0x3f/0x94 > >> >> >> > [0.00] [] ? dmi_table+0x16/0x94 > >> >> >> > [0.00] [] ? > >> >> >> > dmi_save_one_device+0x81/0x81 > >> >> >> > [0.00] [] ? 
> >> >> >> > dmi_save_one_device+0x81/0x81 > >> >> >> > [0.00] [] dmi_walk_early+0x44/0x69 > >> >> >> > [0.00] [] dmi_present+0x180/0x1ff > >> >> >> > [0.00] [] dmi_scan_machine+0x144/0x191 > >> >> >> > [0.00] [] ? loglevel+0x31/0x31 > >> >> >> > [0.00] [] setup_arch+0x490/0xc73 > >> >> >> > [0.00] [] ? printk+0x4d/0x4f > >> >> >> > [0.00] [] start_kernel+0x9c/0x43f > >> >> >> > [0.00] [] ? > >> >> >> > early_idt_handlers+0x120/0x120 > >> >> >> > [0.00] [] > >> >> >> > x86_64_start_reservations+0x2a/0x2c > >> >> >> > [0.00] [] > >> >> >> > x86_64_start_kernel+0x13b/0x14a > >> >> >> > [0.00] RIP 0x4 > >> >> >> > > >> >> >> > >> >> >> This is most puzzling. Could anyone decode the exception? > >> >> >> This looks like the non-EFI path through dmi_scan_machine(), which
Re: [LKP] [dmi] PANIC: early exception 0e rip 10:ffffffff81899e6b error 9 cr2 ffffffffff240000
On Fri, Nov 07, 2014 at 09:23:56AM +0100, Ard Biesheuvel wrote: > On 7 November 2014 09:13, Yuanhan Liu wrote: > > On Fri, Nov 07, 2014 at 08:44:40AM +0100, Ard Biesheuvel wrote: > >> On 7 November 2014 08:37, Yuanhan Liu wrote: > >> > On Fri, Nov 07, 2014 at 08:17:36AM +0100, Ard Biesheuvel wrote: > >> >> On 7 November 2014 06:47, LKP wrote: > >> >> > FYI, we noticed the below changes on > >> >> > > >> >> > https://git.linaro.org/people/ard.biesheuvel/linux-arm efi-for-3.19 > >> >> > commit aacdce6e880894acb57d71dcb2e3fc61b4ed4e96 ("dmi: add support > >> >> > for SMBIOS 3.0 64-bit entry point") > >> >> > > >> >> > > >> >> > +---+++ > >> >> > | | 2fa165a26c | aacdce6e88 | > >> >> > +---+++ > >> >> > | boot_successes| 20 | 10 | > >> >> > | early-boot-hang | 1 || > >> >> > | boot_failures | 0 | 5 | > >> >> > | PANIC:early_exception | 0 | 5 | > >> >> > +---+++ > >> >> > > >> >> > > >> >> > [0.00] BIOS-e820: [mem 0x0001-0x00036fff] > >> >> > usable > >> >> > [0.00] bootconsole [earlyser0] enabled > >> >> > [0.00] NX (Execute Disable) protection: active > >> >> > PANIC: early exception 0e rip 10:81899e6b error 9 cr2 > >> >> > ff24 > >> >> > [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted > >> >> > 3.18.0-rc2-gc5221e6 #1 > >> >> > [0.00] 82203d30 819f0a6e > >> >> > 03f8 > >> >> > [0.00] ff24 82203e18 823701b0 > >> >> > 82511401 > >> >> > [0.00] 0ba3 > >> >> > ff24 > >> >> > [0.00] Call Trace: > >> >> > [0.00] [] dump_stack+0x4e/0x68 > >> >> > [0.00] [] early_idt_handler+0x90/0xb7 > >> >> > [0.00] [] ? dmi_save_one_device+0x81/0x81 > >> >> > [0.00] [] ? dmi_table+0x3f/0x94 > >> >> > [0.00] [] ? dmi_table+0x16/0x94 > >> >> > [0.00] [] ? dmi_save_one_device+0x81/0x81 > >> >> > [0.00] [] ? dmi_save_one_device+0x81/0x81 > >> >> > [0.00] [] dmi_walk_early+0x44/0x69 > >> >> > [0.00] [] dmi_present+0x180/0x1ff > >> >> > [0.00] [] dmi_scan_machine+0x144/0x191 > >> >> > [0.00] [] ? loglevel+0x31/0x31 > >> >> > [0.00] [] setup_arch+0x490/0xc73 > >> >> > [0.00] [] ? 
printk+0x4d/0x4f > >> >> > [0.00] [] start_kernel+0x9c/0x43f > >> >> > [0.00] [] ? early_idt_handlers+0x120/0x120 > >> >> > [0.00] [] > >> >> > x86_64_start_reservations+0x2a/0x2c > >> >> > [0.00] [] x86_64_start_kernel+0x13b/0x14a > >> >> > [0.00] RIP 0x4 > >> >> > > >> >> > >> >> This is most puzzling. Could anyone decode the exception? > >> >> This looks like the non-EFI path through dmi_scan_machine(), which > >> >> calls dmi_present() /after/ calling dmi_smbios3_present(), which > >> >> apparently has not found the _SM3_ header tag. Or could the call stack > >> >> be inaccurate? > >> >> > >> >> Anyway, it would be good to know the exact type of the platform, > >> > > >> > It's a Nehalem-EP machine, with 16 CPUs and 12G memory. > >> > > >> >> and > >> >> perhaps we could find out if there is an inadvertent _SM3_ tag > >> >> somewhere in the 0xF0000 - 0xFFFFF range? > >> > > >> > Sorry, how? > >> > > >> > >> That's not a brand new machine, so I suppose there wouldn't be a > >> SMBIOS 3.0 header lurking in there. > >> > >> Anyway, if
Re: [LKP] [dmi] PANIC: early exception 0e rip 10:ffffffff81899e6b error 9 cr2 ffffffffff240000
On Fri, Nov 07, 2014 at 08:44:40AM +0100, Ard Biesheuvel wrote: > On 7 November 2014 08:37, Yuanhan Liu wrote: > > On Fri, Nov 07, 2014 at 08:17:36AM +0100, Ard Biesheuvel wrote: > >> On 7 November 2014 06:47, LKP wrote: > >> > FYI, we noticed the below changes on > >> > > >> > https://git.linaro.org/people/ard.biesheuvel/linux-arm efi-for-3.19 > >> > commit aacdce6e880894acb57d71dcb2e3fc61b4ed4e96 ("dmi: add support for > >> > SMBIOS 3.0 64-bit entry point") > >> > > >> > > >> > +---+++ > >> > | | 2fa165a26c | aacdce6e88 | > >> > +---+++ > >> > | boot_successes| 20 | 10 | > >> > | early-boot-hang | 1 || > >> > | boot_failures | 0 | 5 | > >> > | PANIC:early_exception | 0 | 5 | > >> > +---+++ > >> > > >> > > >> > [0.00] BIOS-e820: [mem 0x0001-0x00036fff] > >> > usable > >> > [0.00] bootconsole [earlyser0] enabled > >> > [0.00] NX (Execute Disable) protection: active > >> > PANIC: early exception 0e rip 10:81899e6b error 9 cr2 > >> > ff24 > >> > [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted > >> > 3.18.0-rc2-gc5221e6 #1 > >> > [0.00] 82203d30 819f0a6e > >> > 03f8 > >> > [0.00] ff24 82203e18 823701b0 > >> > 82511401 > >> > [0.00] 0ba3 > >> > ff24 > >> > [0.00] Call Trace: > >> > [0.00] [] dump_stack+0x4e/0x68 > >> > [0.00] [] early_idt_handler+0x90/0xb7 > >> > [0.00] [] ? dmi_save_one_device+0x81/0x81 > >> > [0.00] [] ? dmi_table+0x3f/0x94 > >> > [0.00] [] ? dmi_table+0x16/0x94 > >> > [0.00] [] ? dmi_save_one_device+0x81/0x81 > >> > [0.00] [] ? dmi_save_one_device+0x81/0x81 > >> > [0.00] [] dmi_walk_early+0x44/0x69 > >> > [0.00] [] dmi_present+0x180/0x1ff > >> > [0.00] [] dmi_scan_machine+0x144/0x191 > >> > [0.00] [] ? loglevel+0x31/0x31 > >> > [0.00] [] setup_arch+0x490/0xc73 > >> > [0.00] [] ? printk+0x4d/0x4f > >> > [0.00] [] start_kernel+0x9c/0x43f > >> > [0.00] [] ? early_idt_handlers+0x120/0x120 > >> > [0.00] [] x86_64_start_reservations+0x2a/0x2c > >> > [0.00] [] x86_64_start_kernel+0x13b/0x14a > >> > [0.00] RIP 0x4 > >> > > >> > >> This is most puzzling. 
Could anyone decode the exception? > >> This looks like the non-EFI path through dmi_scan_machine(), which > >> calls dmi_present() /after/ calling dmi_smbios3_present(), which > >> apparently has not found the _SM3_ header tag. Or could the call stack > >> be inaccurate? > >> > >> Anyway, it would be good to know the exact type of the platform, > > > > It's a Nehalem-EP machine, wht 16 CPU and 12G memory. > > > >> and > >> perhaps we could find out if there is an inadvertent _SM3_ tag > >> somewhere in the 0xF - 0xF range? > > > > Sorry, how? > > > > That's not a brand new machine, so I suppose there wouldn't be a > SMBIOS 3.0 header lurking in there. > > Anyway, if you are in a position to try things, could you apply this > > --- a/drivers/firmware/dmi_scan.c > +++ b/drivers/firmware/dmi_scan.c > @@ -617,7 +617,7 @@ void __init dmi_scan_machine(void) > memset(buf, 0, 16); > for (q = p; q < p + 0x1; q += 16) { > memcpy_fromio(buf + 16, q, 16); > - if (!dmi_smbios3_present(buf) || !dmi_present(buf)) { > + if (!dmi_present(buf)) { > dmi_available = 1; > dmi_early_unmap(p, 0x1); > goto out; > > and try again? kernel boots perfectly with this patch applied. --yliu > That is the only change that is relevant to the non-EFI > code path which this machine appears to take, so if this fixes things, > that would be valuable information even if it doesn't tell us exactly > what is going wrong. > > Thanks, > Ard. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [LKP] [dmi] PANIC: early exception 0e rip 10:ffffffff81899e6b error 9 cr2 ffffffffff240000
On Fri, Nov 07, 2014 at 08:44:40AM +0100, Ard Biesheuvel wrote: On 7 November 2014 08:37, Yuanhan Liu yuanhan@linux.intel.com wrote: On Fri, Nov 07, 2014 at 08:17:36AM +0100, Ard Biesheuvel wrote: On 7 November 2014 06:47, LKP l...@01.org wrote: FYI, we noticed the below changes on https://git.linaro.org/people/ard.biesheuvel/linux-arm efi-for-3.19 commit aacdce6e880894acb57d71dcb2e3fc61b4ed4e96 (dmi: add support for SMBIOS 3.0 64-bit entry point) +---+++ | | 2fa165a26c | aacdce6e88 | +---+++ | boot_successes| 20 | 10 | | early-boot-hang | 1 || | boot_failures | 0 | 5 | | PANIC:early_exception | 0 | 5 | +---+++ [0.00] BIOS-e820: [mem 0x0001-0x00036fff] usable [0.00] bootconsole [earlyser0] enabled [0.00] NX (Execute Disable) protection: active PANIC: early exception 0e rip 10:81899e6b error 9 cr2 ff24 [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted 3.18.0-rc2-gc5221e6 #1 [0.00] 82203d30 819f0a6e 03f8 [0.00] ff24 82203e18 823701b0 82511401 [0.00] 0ba3 ff24 [0.00] Call Trace: [0.00] [819f0a6e] dump_stack+0x4e/0x68 [0.00] [823701b0] early_idt_handler+0x90/0xb7 [0.00] [823c80da] ? dmi_save_one_device+0x81/0x81 [0.00] [81899e6b] ? dmi_table+0x3f/0x94 [0.00] [81899e42] ? dmi_table+0x16/0x94 [0.00] [823c80da] ? dmi_save_one_device+0x81/0x81 [0.00] [823c80da] ? dmi_save_one_device+0x81/0x81 [0.00] [823c7eff] dmi_walk_early+0x44/0x69 [0.00] [823c88a2] dmi_present+0x180/0x1ff [0.00] [823c8ab3] dmi_scan_machine+0x144/0x191 [0.00] [82370702] ? loglevel+0x31/0x31 [0.00] [82377f52] setup_arch+0x490/0xc73 [0.00] [819eef73] ? printk+0x4d/0x4f [0.00] [82370b90] start_kernel+0x9c/0x43f [0.00] [82370120] ? early_idt_handlers+0x120/0x120 [0.00] [823704a2] x86_64_start_reservations+0x2a/0x2c [0.00] [823705df] x86_64_start_kernel+0x13b/0x14a [0.00] RIP 0x4 This is most puzzling. Could anyone decode the exception? 
This looks like the non-EFI path through dmi_scan_machine(), which calls dmi_present() /after/ calling dmi_smbios3_present(), which apparently has not found the _SM3_ header tag. Or could the call stack be inaccurate? Anyway, it would be good to know the exact type of the platform, It's a Nehalem-EP machine, wht 16 CPU and 12G memory. and perhaps we could find out if there is an inadvertent _SM3_ tag somewhere in the 0xF - 0xF range? Sorry, how? That's not a brand new machine, so I suppose there wouldn't be a SMBIOS 3.0 header lurking in there. Anyway, if you are in a position to try things, could you apply this --- a/drivers/firmware/dmi_scan.c +++ b/drivers/firmware/dmi_scan.c @@ -617,7 +617,7 @@ void __init dmi_scan_machine(void) memset(buf, 0, 16); for (q = p; q p + 0x1; q += 16) { memcpy_fromio(buf + 16, q, 16); - if (!dmi_smbios3_present(buf) || !dmi_present(buf)) { + if (!dmi_present(buf)) { dmi_available = 1; dmi_early_unmap(p, 0x1); goto out; and try again? kernel boots perfectly with this patch applied. --yliu That is the only change that is relevant to the non-EFI code path which this machine appears to take, so if this fixes things, that would be valuable information even if it doesn't tell us exactly what is going wrong. Thanks, Ard. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [LKP] [dmi] PANIC: early exception 0e rip 10:ffffffff81899e6b error 9 cr2 ffffffffff240000
On Fri, Nov 07, 2014 at 09:23:56AM +0100, Ard Biesheuvel wrote: On 7 November 2014 09:13, Yuanhan Liu yuanhan@linux.intel.com wrote: On Fri, Nov 07, 2014 at 08:44:40AM +0100, Ard Biesheuvel wrote: On 7 November 2014 08:37, Yuanhan Liu yuanhan@linux.intel.com wrote: On Fri, Nov 07, 2014 at 08:17:36AM +0100, Ard Biesheuvel wrote: On 7 November 2014 06:47, LKP l...@01.org wrote: FYI, we noticed the below changes on https://git.linaro.org/people/ard.biesheuvel/linux-arm efi-for-3.19 commit aacdce6e880894acb57d71dcb2e3fc61b4ed4e96 (dmi: add support for SMBIOS 3.0 64-bit entry point) +---+++ | | 2fa165a26c | aacdce6e88 | +---+++ | boot_successes| 20 | 10 | | early-boot-hang | 1 || | boot_failures | 0 | 5 | | PANIC:early_exception | 0 | 5 | +---+++ [0.00] BIOS-e820: [mem 0x0001-0x00036fff] usable [0.00] bootconsole [earlyser0] enabled [0.00] NX (Execute Disable) protection: active PANIC: early exception 0e rip 10:81899e6b error 9 cr2 ff24 [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted 3.18.0-rc2-gc5221e6 #1 [0.00] 82203d30 819f0a6e 03f8 [0.00] ff24 82203e18 823701b0 82511401 [0.00] 0ba3 ff24 [0.00] Call Trace: [0.00] [819f0a6e] dump_stack+0x4e/0x68 [0.00] [823701b0] early_idt_handler+0x90/0xb7 [0.00] [823c80da] ? dmi_save_one_device+0x81/0x81 [0.00] [81899e6b] ? dmi_table+0x3f/0x94 [0.00] [81899e42] ? dmi_table+0x16/0x94 [0.00] [823c80da] ? dmi_save_one_device+0x81/0x81 [0.00] [823c80da] ? dmi_save_one_device+0x81/0x81 [0.00] [823c7eff] dmi_walk_early+0x44/0x69 [0.00] [823c88a2] dmi_present+0x180/0x1ff [0.00] [823c8ab3] dmi_scan_machine+0x144/0x191 [0.00] [82370702] ? loglevel+0x31/0x31 [0.00] [82377f52] setup_arch+0x490/0xc73 [0.00] [819eef73] ? printk+0x4d/0x4f [0.00] [82370b90] start_kernel+0x9c/0x43f [0.00] [82370120] ? early_idt_handlers+0x120/0x120 [0.00] [823704a2] x86_64_start_reservations+0x2a/0x2c [0.00] [823705df] x86_64_start_kernel+0x13b/0x14a [0.00] RIP 0x4 This is most puzzling. Could anyone decode the exception? 
This looks like the non-EFI path through dmi_scan_machine(), which calls dmi_present() /after/ calling dmi_smbios3_present(), which apparently has not found the _SM3_ header tag. Or could the call stack be inaccurate? Anyway, it would be good to know the exact type of the platform, It's a Nehalem-EP machine, with 16 CPUs and 12G memory. and perhaps we could find out if there is an inadvertent _SM3_ tag somewhere in the 0xF0000 - 0xFFFFF range? Sorry, how? That's not a brand new machine, so I suppose there wouldn't be a SMBIOS 3.0 header lurking in there. Anyway, if you are in a position to try things, could you apply this --- a/drivers/firmware/dmi_scan.c +++ b/drivers/firmware/dmi_scan.c @@ -617,7 +617,7 @@ void __init dmi_scan_machine(void) memset(buf, 0, 16); for (q = p; q < p + 0x10000; q += 16) { memcpy_fromio(buf + 16, q, 16); - if (!dmi_smbios3_present(buf) || !dmi_present(buf)) { + if (!dmi_present(buf)) { dmi_available = 1; dmi_early_unmap(p, 0x10000); goto out; and try again? kernel boots perfectly with this patch applied. --yliu Thank you! Very useful to know Sigh, I made a silly error, I specified the wrong commit while testing your patch. Sorry for that. And I tested it again, with your former patch, sorry, the panic still happens. --yliu Sorry to keep you busy, but could you please apply this on top of the previous patch --- a/drivers/firmware/dmi_scan.c +++ b/drivers/firmware/dmi_scan.c @@ -617,6 +617,8 @@ void __init dmi_scan_machine(void) memset(buf, 0, 16); for (q = p; q < p + 0x10000; q += 16) { memcpy_fromio(buf + 16, q, 16); + if (memcmp(buf, "_SM3_", 5) == 0
Re: [LKP] [dmi] PANIC: early exception 0e rip 10:ffffffff81899e6b error 9 cr2 ffffffffff240000
On Fri, Nov 07, 2014 at 10:03:55AM +0100, Ard Biesheuvel wrote: On 7 November 2014 09:46, Yuanhan Liu yuanhan@linux.intel.com wrote: On Fri, Nov 07, 2014 at 09:23:56AM +0100, Ard Biesheuvel wrote: On 7 November 2014 09:13, Yuanhan Liu yuanhan@linux.intel.com wrote: On Fri, Nov 07, 2014 at 08:44:40AM +0100, Ard Biesheuvel wrote: On 7 November 2014 08:37, Yuanhan Liu yuanhan@linux.intel.com wrote: On Fri, Nov 07, 2014 at 08:17:36AM +0100, Ard Biesheuvel wrote: On 7 November 2014 06:47, LKP l...@01.org wrote: FYI, we noticed the below changes on https://git.linaro.org/people/ard.biesheuvel/linux-arm efi-for-3.19 commit aacdce6e880894acb57d71dcb2e3fc61b4ed4e96 (dmi: add support for SMBIOS 3.0 64-bit entry point) +---+++ | | 2fa165a26c | aacdce6e88 | +---+++ | boot_successes| 20 | 10 | | early-boot-hang | 1 || | boot_failures | 0 | 5 | | PANIC:early_exception | 0 | 5 | +---+++ [0.00] BIOS-e820: [mem 0x0001-0x00036fff] usable [0.00] bootconsole [earlyser0] enabled [0.00] NX (Execute Disable) protection: active PANIC: early exception 0e rip 10:81899e6b error 9 cr2 ff24 [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted 3.18.0-rc2-gc5221e6 #1 [0.00] 82203d30 819f0a6e 03f8 [0.00] ff24 82203e18 823701b0 82511401 [0.00] 0ba3 ff24 [0.00] Call Trace: [0.00] [819f0a6e] dump_stack+0x4e/0x68 [0.00] [823701b0] early_idt_handler+0x90/0xb7 [0.00] [823c80da] ? dmi_save_one_device+0x81/0x81 [0.00] [81899e6b] ? dmi_table+0x3f/0x94 [0.00] [81899e42] ? dmi_table+0x16/0x94 [0.00] [823c80da] ? dmi_save_one_device+0x81/0x81 [0.00] [823c80da] ? dmi_save_one_device+0x81/0x81 [0.00] [823c7eff] dmi_walk_early+0x44/0x69 [0.00] [823c88a2] dmi_present+0x180/0x1ff [0.00] [823c8ab3] dmi_scan_machine+0x144/0x191 [0.00] [82370702] ? loglevel+0x31/0x31 [0.00] [82377f52] setup_arch+0x490/0xc73 [0.00] [819eef73] ? printk+0x4d/0x4f [0.00] [82370b90] start_kernel+0x9c/0x43f [0.00] [82370120] ? 
early_idt_handlers+0x120/0x120 [0.00] [823704a2] x86_64_start_reservations+0x2a/0x2c [0.00] [823705df] x86_64_start_kernel+0x13b/0x14a [0.00] RIP 0x4 This is most puzzling. Could anyone decode the exception? This looks like the non-EFI path through dmi_scan_machine(), which calls dmi_present() /after/ calling dmi_smbios3_present(), which apparently has not found the _SM3_ header tag. Or could the call stack be inaccurate? Anyway, it would be good to know the exact type of the platform, It's a Nehalem-EP machine, wht 16 CPU and 12G memory. and perhaps we could find out if there is an inadvertent _SM3_ tag somewhere in the 0xF - 0xF range? Sorry, how? That's not a brand new machine, so I suppose there wouldn't be a SMBIOS 3.0 header lurking in there. Anyway, if you are in a position to try things, could you apply this --- a/drivers/firmware/dmi_scan.c +++ b/drivers/firmware/dmi_scan.c @@ -617,7 +617,7 @@ void __init dmi_scan_machine(void) memset(buf, 0, 16); for (q = p; q p + 0x1; q += 16) { memcpy_fromio(buf + 16, q, 16); - if (!dmi_smbios3_present(buf) || !dmi_present(buf)) { + if (!dmi_present(buf)) { dmi_available = 1; dmi_early_unmap(p, 0x1); goto out; and try again? kernel boots perfectly with this patch applied. --yliu Thank you! Very useful to know Sigh, I made a silly error, I speicified wrong commit while testing your patch. Sorry for that. And I tested it again, with your former patch, sorry, the panic still happens. --yliu OK, no worries. Could you please try the attached patch? On my ARM system, it produces something like this == Decoding
Re: [LKP] [dmi] PANIC: early exception 0e rip 10:ffffffff81899e6b error 9 cr2 ffffffffff240000
On Fri, Nov 07, 2014 at 09:16:02AM +, Matt Fleming wrote: On Fri, 2014-11-07 at 08:17 +0100, Ard Biesheuvel wrote: On 7 November 2014 06:47, LKP l...@01.org wrote: FYI, we noticed the below changes on https://git.linaro.org/people/ard.biesheuvel/linux-arm efi-for-3.19 commit aacdce6e880894acb57d71dcb2e3fc61b4ed4e96 (dmi: add support for SMBIOS 3.0 64-bit entry point) +---+++ | | 2fa165a26c | aacdce6e88 | +---+++ | boot_successes| 20 | 10 | | early-boot-hang | 1 || | boot_failures | 0 | 5 | | PANIC:early_exception | 0 | 5 | +---+++ [0.00] BIOS-e820: [mem 0x0001-0x00036fff] usable [0.00] bootconsole [earlyser0] enabled [0.00] NX (Execute Disable) protection: active PANIC: early exception 0e rip 10:81899e6b error 9 cr2 ff24 [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted 3.18.0-rc2-gc5221e6 #1 [0.00] 82203d30 819f0a6e 03f8 [0.00] ff24 82203e18 823701b0 82511401 [0.00] 0ba3 ff24 [0.00] Call Trace: [0.00] [819f0a6e] dump_stack+0x4e/0x68 [0.00] [823701b0] early_idt_handler+0x90/0xb7 [0.00] [823c80da] ? dmi_save_one_device+0x81/0x81 [0.00] [81899e6b] ? dmi_table+0x3f/0x94 [0.00] [81899e42] ? dmi_table+0x16/0x94 [0.00] [823c80da] ? dmi_save_one_device+0x81/0x81 [0.00] [823c80da] ? dmi_save_one_device+0x81/0x81 [0.00] [823c7eff] dmi_walk_early+0x44/0x69 [0.00] [823c88a2] dmi_present+0x180/0x1ff [0.00] [823c8ab3] dmi_scan_machine+0x144/0x191 [0.00] [82370702] ? loglevel+0x31/0x31 [0.00] [82377f52] setup_arch+0x490/0xc73 [0.00] [819eef73] ? printk+0x4d/0x4f [0.00] [82370b90] start_kernel+0x9c/0x43f [0.00] [82370120] ? early_idt_handlers+0x120/0x120 [0.00] [823704a2] x86_64_start_reservations+0x2a/0x2c [0.00] [823705df] x86_64_start_kernel+0x13b/0x14a [0.00] RIP 0x4 This is most puzzling. Could anyone decode the exception? This looks like the non-EFI path through dmi_scan_machine(), which calls dmi_present() /after/ calling dmi_smbios3_present(), which apparently has not found the _SM3_ header tag. Or could the call stack be inaccurate? 
The code triggered a page fault while trying to access 0xff24, caused because the reserved bit was set in the page table and no page was found. Looks like it jumped through a bogus pointer. And yes, the callstack may definitely be wrong - the stack dumper is just scraping addresses from the stack, as indicated by the '?' symbol. Yuanhan, what symbol does 0x81899e6b (the faulting instruction) translate to? I found no System.map for that kernel, I then changed to another kernel, and here is the new panic dmesg: PANIC: early exception 0e rip 10:8167aa1a error 9 cr2 ff240001 [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted 3.18.0-rc2-8-g4d3a0be #66 [0.00] 0ba3 81bcfd10 818010a4 03f8 [0.00] 003e 81bcfdf8 81d801b0 617420534f49424d [0.00] 001f ff24 ff24 [0.00] Call Trace: [0.00] [818010a4] dump_stack+0x46/0x58 [0.00] [81d801b0] early_idt_handler+0x90/0xb7 [0.00] [81dd4cfc] ? dmi_format_ids.constprop.9+0x13c/0x13c [0.00] [8167aa1a] ? dmi_table+0x4a/0xf0 [0.00] [817fa71b] ? printk+0x61/0x63 [0.00] [81dd4cfc] ? dmi_format_ids.constprop.9+0x13c/0x13c [0.00] [81dd4cfc] ? dmi_format_ids.constprop.9+0x13c/0x13c [0.00] [81dd49dc] dmi_walk_early+0x6b/0x90 [0.00] [81dd52fc] dmi_present+0x1b4/0x23f [0.00] [81dd55ab] dmi_scan_machine+0x1d4/0x23a [0.00] [81d80120] ? early_idt_handlers+0x120/0x120 [0.00] [81d883a2] setup_arch+0x462/0xcc6 [0.00] [81d80120] ? early_idt_handlers+0x120/0x120 [0.00] [81d80167] ? early_idt_handler+0x47/0xb7 [0.00] [81d80120] ? early_idt_handlers+0x120/0x120 [0.00] [81d80cf0] start_kernel+0x97/0x456 [0.00] [81d80120] ?
Re: [LKP] [dmi] PANIC: early exception 0e rip 10:ffffffff81899e6b error 9 cr2 ffffffffff240000
On Fri, Nov 07, 2014 at 10:35:44AM +0100, Ard Biesheuvel wrote: On 7 November 2014 10:26, Yuanhan Liu yuanhan@linux.intel.com wrote: On Fri, Nov 07, 2014 at 10:03:55AM +0100, Ard Biesheuvel wrote: On 7 November 2014 09:46, Yuanhan Liu yuanhan@linux.intel.com wrote: On Fri, Nov 07, 2014 at 09:23:56AM +0100, Ard Biesheuvel wrote: On 7 November 2014 09:13, Yuanhan Liu yuanhan@linux.intel.com wrote: On Fri, Nov 07, 2014 at 08:44:40AM +0100, Ard Biesheuvel wrote: On 7 November 2014 08:37, Yuanhan Liu yuanhan@linux.intel.com wrote: On Fri, Nov 07, 2014 at 08:17:36AM +0100, Ard Biesheuvel wrote: On 7 November 2014 06:47, LKP l...@01.org wrote: FYI, we noticed the below changes on https://git.linaro.org/people/ard.biesheuvel/linux-arm efi-for-3.19 commit aacdce6e880894acb57d71dcb2e3fc61b4ed4e96 (dmi: add support for SMBIOS 3.0 64-bit entry point) +---+++ | | 2fa165a26c | aacdce6e88 | +---+++ | boot_successes| 20 | 10 | | early-boot-hang | 1 || | boot_failures | 0 | 5 | | PANIC:early_exception | 0 | 5 | +---+++ [0.00] BIOS-e820: [mem 0x0001-0x00036fff] usable [0.00] bootconsole [earlyser0] enabled [0.00] NX (Execute Disable) protection: active PANIC: early exception 0e rip 10:81899e6b error 9 cr2 ff24 [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted 3.18.0-rc2-gc5221e6 #1 [0.00] 82203d30 819f0a6e 03f8 [0.00] ff24 82203e18 823701b0 82511401 [0.00] 0ba3 ff24 [0.00] Call Trace: [0.00] [819f0a6e] dump_stack+0x4e/0x68 [0.00] [823701b0] early_idt_handler+0x90/0xb7 [0.00] [823c80da] ? dmi_save_one_device+0x81/0x81 [0.00] [81899e6b] ? dmi_table+0x3f/0x94 [0.00] [81899e42] ? dmi_table+0x16/0x94 [0.00] [823c80da] ? dmi_save_one_device+0x81/0x81 [0.00] [823c80da] ? dmi_save_one_device+0x81/0x81 [0.00] [823c7eff] dmi_walk_early+0x44/0x69 [0.00] [823c88a2] dmi_present+0x180/0x1ff [0.00] [823c8ab3] dmi_scan_machine+0x144/0x191 [0.00] [82370702] ? loglevel+0x31/0x31 [0.00] [82377f52] setup_arch+0x490/0xc73 [0.00] [819eef73] ? 
printk+0x4d/0x4f [0.00] [82370b90] start_kernel+0x9c/0x43f [0.00] [82370120] ? early_idt_handlers+0x120/0x120 [0.00] [823704a2] x86_64_start_reservations+0x2a/0x2c [0.00] [823705df] x86_64_start_kernel+0x13b/0x14a [0.00] RIP 0x4 This is most puzzling. Could anyone decode the exception? This looks like the non-EFI path through dmi_scan_machine(), which calls dmi_present() /after/ calling dmi_smbios3_present(), which apparently has not found the _SM3_ header tag. Or could the call stack be inaccurate? Anyway, it would be good to know the exact type of the platform, It's a Nehalem-EP machine, with 16 CPU and 12G memory. and perhaps we could find out if there is an inadvertent _SM3_ tag somewhere in the 0xF - 0xF range? Sorry, how? That's not a brand new machine, so I suppose there wouldn't be a SMBIOS 3.0 header lurking in there. Anyway, if you are in a position to try things, could you apply this --- a/drivers/firmware/dmi_scan.c +++ b/drivers/firmware/dmi_scan.c @@ -617,7 +617,7 @@ void __init dmi_scan_machine(void) memset(buf, 0, 16); for (q = p; q p + 0x1; q += 16) { memcpy_fromio(buf + 16, q, 16); - if (!dmi_smbios3_present(buf) || !dmi_present(buf)) { + if (!dmi_present(buf)) { dmi_available = 1; dmi_early_unmap(p, 0x1); goto out; and try again? kernel boots perfectly with this patch applied. --yliu Thank you! Very useful to know Sigh, I made a silly error, I specified wrong commit while testing
Re: [LKP] [dmi] PANIC: early exception 0e rip 10:ffffffff81899e6b error 9 cr2 ffffffffff240000
On Fri, Nov 07, 2014 at 08:17:36AM +0100, Ard Biesheuvel wrote: > On 7 November 2014 06:47, LKP wrote: > > FYI, we noticed the below changes on > > > > https://git.linaro.org/people/ard.biesheuvel/linux-arm efi-for-3.19 > > commit aacdce6e880894acb57d71dcb2e3fc61b4ed4e96 ("dmi: add support for > > SMBIOS 3.0 64-bit entry point") > > > > > > +---+++ > > | | 2fa165a26c | aacdce6e88 | > > +---+++ > > | boot_successes| 20 | 10 | > > | early-boot-hang | 1 || > > | boot_failures | 0 | 5 | > > | PANIC:early_exception | 0 | 5 | > > +---+++ > > > > > > [0.00] BIOS-e820: [mem 0x0001-0x00036fff] usable > > [0.00] bootconsole [earlyser0] enabled > > [0.00] NX (Execute Disable) protection: active > > PANIC: early exception 0e rip 10:81899e6b error 9 cr2 > > ff24 > > [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted 3.18.0-rc2-gc5221e6 > > #1 > > [0.00] 82203d30 819f0a6e > > 03f8 > > [0.00] ff24 82203e18 823701b0 > > 82511401 > > [0.00] 0ba3 > > ff24 > > [0.00] Call Trace: > > [0.00] [] dump_stack+0x4e/0x68 > > [0.00] [] early_idt_handler+0x90/0xb7 > > [0.00] [] ? dmi_save_one_device+0x81/0x81 > > [0.00] [] ? dmi_table+0x3f/0x94 > > [0.00] [] ? dmi_table+0x16/0x94 > > [0.00] [] ? dmi_save_one_device+0x81/0x81 > > [0.00] [] ? dmi_save_one_device+0x81/0x81 > > [0.00] [] dmi_walk_early+0x44/0x69 > > [0.00] [] dmi_present+0x180/0x1ff > > [0.00] [] dmi_scan_machine+0x144/0x191 > > [0.00] [] ? loglevel+0x31/0x31 > > [0.00] [] setup_arch+0x490/0xc73 > > [0.00] [] ? printk+0x4d/0x4f > > [0.00] [] start_kernel+0x9c/0x43f > > [0.00] [] ? early_idt_handlers+0x120/0x120 > > [0.00] [] x86_64_start_reservations+0x2a/0x2c > > [0.00] [] x86_64_start_kernel+0x13b/0x14a > > [0.00] RIP 0x4 > > > > This is most puzzling. Could anyone decode the exception? > This looks like the non-EFI path through dmi_scan_machine(), which > calls dmi_present() /after/ calling dmi_smbios3_present(), which > apparently has not found the _SM3_ header tag. Or could the call stack > be inaccurate? 
> > Anyway, it would be good to know the exact type of the platform, It's a Nehalem-EP machine, with 16 CPU and 12G memory. > and > perhaps we could find out if there is an inadvertent _SM3_ tag > somewhere in the 0xF - 0xF range? Sorry, how? --yliu -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [LKP] [dmi] PANIC: early exception 0e rip 10:ffffffff81899e6b error 9 cr2 ffffffffff240000
On Fri, Nov 07, 2014 at 08:17:36AM +0100, Ard Biesheuvel wrote: On 7 November 2014 06:47, LKP l...@01.org wrote: FYI, we noticed the below changes on https://git.linaro.org/people/ard.biesheuvel/linux-arm efi-for-3.19 commit aacdce6e880894acb57d71dcb2e3fc61b4ed4e96 (dmi: add support for SMBIOS 3.0 64-bit entry point) +---+++ | | 2fa165a26c | aacdce6e88 | +---+++ | boot_successes| 20 | 10 | | early-boot-hang | 1 || | boot_failures | 0 | 5 | | PANIC:early_exception | 0 | 5 | +---+++ [0.00] BIOS-e820: [mem 0x0001-0x00036fff] usable [0.00] bootconsole [earlyser0] enabled [0.00] NX (Execute Disable) protection: active PANIC: early exception 0e rip 10:81899e6b error 9 cr2 ff24 [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted 3.18.0-rc2-gc5221e6 #1 [0.00] 82203d30 819f0a6e 03f8 [0.00] ff24 82203e18 823701b0 82511401 [0.00] 0ba3 ff24 [0.00] Call Trace: [0.00] [819f0a6e] dump_stack+0x4e/0x68 [0.00] [823701b0] early_idt_handler+0x90/0xb7 [0.00] [823c80da] ? dmi_save_one_device+0x81/0x81 [0.00] [81899e6b] ? dmi_table+0x3f/0x94 [0.00] [81899e42] ? dmi_table+0x16/0x94 [0.00] [823c80da] ? dmi_save_one_device+0x81/0x81 [0.00] [823c80da] ? dmi_save_one_device+0x81/0x81 [0.00] [823c7eff] dmi_walk_early+0x44/0x69 [0.00] [823c88a2] dmi_present+0x180/0x1ff [0.00] [823c8ab3] dmi_scan_machine+0x144/0x191 [0.00] [82370702] ? loglevel+0x31/0x31 [0.00] [82377f52] setup_arch+0x490/0xc73 [0.00] [819eef73] ? printk+0x4d/0x4f [0.00] [82370b90] start_kernel+0x9c/0x43f [0.00] [82370120] ? early_idt_handlers+0x120/0x120 [0.00] [823704a2] x86_64_start_reservations+0x2a/0x2c [0.00] [823705df] x86_64_start_kernel+0x13b/0x14a [0.00] RIP 0x4 This is most puzzling. Could anyone decode the exception? This looks like the non-EFI path through dmi_scan_machine(), which calls dmi_present() /after/ calling dmi_smbios3_present(), which apparently has not found the _SM3_ header tag. Or could the call stack be inaccurate? 
Anyway, it would be good to know the exact type of the platform, It's a Nehalem-EP machine, with 16 CPU and 12G memory. and perhaps we could find out if there is an inadvertent _SM3_ tag somewhere in the 0xF - 0xF range? Sorry, how? --yliu -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/3] Shrinkers and proportional reclaim
On Thu, May 22, 2014 at 05:30:51PM +0100, Mel Gorman wrote: > On Fri, May 23, 2014 at 12:14:16AM +0800, Yuanhan Liu wrote: > > On Thu, May 22, 2014 at 10:09:36AM +0100, Mel Gorman wrote: > > > This series is aimed at regressions noticed during reclaim activity. The > > > first two patches are shrinker patches that were posted ages ago but never > > > merged for reasons that are unclear to me. I'm posting them again to see > > > if > > > there was a reason they were dropped or if they just got lost. Dave? > > > Time? > > > The last patch adjusts proportional reclaim. Yuanhan Liu, can you retest > > > the vm scalability test cases on a larger machine? Hugh, does this work > > > for you on the memcg test cases? > > > > Sure, and here is the result. I applied these 3 patches on v3.15-rc6, > > and head commit is 60c10afd. e82e0561 is the old commit that introduced > > the regression. The testserver has 512G memory and 120 CPU. > > > > It's a simple result; if you need more data, I can gather them and send > > it to you tomorrow: > > > > e82e0561v3.15-rc6 60c10afd > > > > 185607851223212238868453 > > -34%+109 > > > > As you can see, the performance is back, and it is way much better ;) > > > > Thanks a lot for that and the quick response. It is much appreciated. Welcome! And sorry that I made a silly mistake. Those numbers are right though, I just setup wrong compare base; I should compare them with e82e0561's parent, which is 75485363ce85526 at below table. 
Here is the detailed results to compensate the mistake I made ;) Legend: ~XX%- stddev percent (3 runs for each kernel) [+-]XX% - change percent 75485363ce85526 e82e0561dae9f3ae5a21fc2d3 v3.15-rc6 60c10afd233f3344479d229dc --- - - - 35979244 ~ 0% -48.4% 18560785 ~ 0% -66.0% 12235090 ~ 0% +8.0% 38868453 ~ 0% vm-scalability.throughput 28138 ~ 0% +7448.2%2123943 ~ 0% +2724.5% 794777 ~ 0% +1.6% 28598 ~ 0% proc-vmstat.allocstall 544 ~ 6% -95.2% 26 ~ 0% -96.5% 19 ~21% -6.9%506 ~ 6% numa-vmstat.node2.nr_isolated_file 12009832 ~11%+368.1% 56215319 ~ 0%+312.9% 49589361 ~ 1% +0.7% 12091235 ~ 5% numa-numastat.node3.numa_foreign 560 ~ 5% -95.7% 24 ~12% -96.9% 17 ~10% -8.7%511 ~ 2% numa-vmstat.node1.nr_isolated_file 8740137 ~12%+574.0% 58910256 ~ 0%+321.0% 36798827 ~ 0% +21.0% 10578905 ~13% numa-vmstat.node0.numa_other 8734988 ~12%+574.4% 58904944 ~ 0%+321.2% 36794158 ~ 0% +21.0% 10572718 ~13% numa-vmstat.node0.numa_miss 1308 ~12%-100.0% 0 ~ 0%-100.0% 0 +23.3% 1612 ~18% proc-vmstat.pgscan_direct_throttle 12294788 ~11%+401.2% 61622745 ~ 0%+332.6% 53190547 ~ 0% -13.2% 10667387 ~ 5% numa-numastat.node1.numa_foreign 576 ~ 6% -91.2% 50 ~22% -94.3% 33 ~20% -18.1%472 ~ 1% numa-vmstat.node0.nr_isolated_file 12 ~24% +2400.0%316 ~ 4% +13543.7% 1728 ~ 5% +155.3% 32 ~29% proc-vmstat.compact_stall 572 ~ 2% -96.4% 20 ~18% -97.6% 13 ~11% -17.5%472 ~ 2% numa-vmstat.node3.nr_isolated_file 3012 ~12% +2388.4% 74959 ~ 0%+254.7% 10685 ~ 1% -45.4% 1646 ~ 1% proc-vmstat.pageoutrun 2312 ~ 3% -94.2%133 ~ 4% -95.8% 97 ~ 8% -12.6% 2021 ~ 2% proc-vmstat.nr_isolated_file 2575163 ~ 0% +2779.1% 74141888 ~ 0%+958.0% 27244229 ~ 0% -1.3%2542941 ~ 0% proc-vmstat.pgscan_direct_dma32 21916603 ~13% +2519.8% 5.742e+08 ~ 0% +2868.9% 6.507e+08 ~ 0% -16.1% 18397644 ~ 5% proc-vmstat.pgscan_kswapd_normal 53306 ~24% +1077.9% 627895 ~ 0% +2066.2%1154741 ~ 0% +23.5% 65815 ~24% proc-vmstat.pgscan_kswapd_dma32 2575163 ~ 0% +2778.6% 74129497 ~ 0%+957.8% 27239606 ~ 0% -1.3%2542353 ~ 0% proc-vmstat.pgsteal_direct_dma32 21907744 
~14% +2520.8% 5.742e+08 ~ 0% +2870.0% 6.507e+08 ~ 0% -16.1% 18386641 ~ 5% proc-vmstat.pgsteal_kswapd_normal 53306 ~24% +1077.7% 627796 ~ 0% +2065.7%1154436 ~ 0% +23.3% 65731 ~24% proc-vmstat.pgsteal_kswapd_dma32 2967449 ~ 0% +2432.7% 75156011 ~ 0%+869.9% 28781337 ~ 0% -0.7%2945933 ~ 0% proc-vmstat.pgalloc_dma32 13081172 ~11%+599.4% 91495653 ~ 0%+337.1% 57180622 ~ 0% +12.1% 14668
Re: [PATCH 0/3] Shrinkers and proportional reclaim
On Thu, May 22, 2014 at 10:09:36AM +0100, Mel Gorman wrote: > This series is aimed at regressions noticed during reclaim activity. The > first two patches are shrinker patches that were posted ages ago but never > merged for reasons that are unclear to me. I'm posting them again to see if > there was a reason they were dropped or if they just got lost. Dave? Time? > The last patch adjusts proportional reclaim. Yuanhan Liu, can you retest > the vm scalability test cases on a larger machine? Hugh, does this work > for you on the memcg test cases? Sure, and here is the result. I applied these 3 patches on v3.15-rc6, and head commit is 60c10afd. e82e0561 is the old commit that introduced the regression. The testserver has 512G memory and 120 CPU. It's a simple result; if you need more data, I can gather them and send it to you tomorrow: e82e0561v3.15-rc6 60c10afd 185607851223212238868453 -34%+109 As you can see, the performance is back, and it is way much better ;) --yliu > > Based on ext4, I get the following results but unfortunately my larger test > machines are all unavailable so this is based on a relatively small machine. 
> > postmark > 3.15.0-rc53.15.0-rc5 > vanilla proportion-v1r4 > Ops/sec Transactions 21.00 ( 0.00%) 25.00 ( 19.05%) > Ops/sec FilesCreate 39.00 ( 0.00%) 45.00 ( 15.38%) > Ops/sec CreateTransact 10.00 ( 0.00%) 12.00 ( 20.00%) > Ops/sec FilesDeleted 6202.00 ( 0.00%) 6202.00 ( 0.00%) > Ops/sec DeleteTransact 11.00 ( 0.00%) 12.00 ( 9.09%) > Ops/sec DataRead/MB 25.97 ( 0.00%) 30.02 ( 15.59%) > Ops/sec DataWrite/MB 49.99 ( 0.00%) 57.78 ( 15.58%) > > ffsb (mail server simulator) > 3.15.0-rc5 3.15.0-rc5 > vanillaproportion-v1r4 > Ops/sec readall 9402.63 ( 0.00%) 9805.74 ( 4.29%) > Ops/sec create4695.45 ( 0.00%) 4781.39 ( 1.83%) > Ops/sec delete 173.72 ( 0.00%) 177.23 ( 2.02%) > Ops/sec Transactions 14271.80 ( 0.00%) 14764.37 ( 3.45%) > Ops/sec Read37.00 ( 0.00%)38.50 ( 4.05%) > Ops/sec Write 18.20 ( 0.00%)18.50 ( 1.65%) > > dd of a large file > 3.15.0-rc53.15.0-rc5 >vanilla proportion-v1r4 > WallTime DownloadTar 75.00 ( 0.00%) 61.00 ( 18.67%) > WallTime DD 423.00 ( 0.00%) 401.00 ( 5.20%) > WallTime Delete 2.00 ( 0.00%)5.00 (-150.00%) > > stutter (times mmap latency during large amounts of IO) > > 3.15.0-rc53.15.0-rc5 >vanilla proportion-v1r4 > Unit >5ms Delays 80252. ( 0.00%) 81523. 
( -1.58%) > Unit Mmap min 8.2118 ( 0.00%) 8.3206 ( -1.33%) > Unit Mmap mean 17.4614 ( 0.00%) 17.2868 ( 1.00%) > Unit Mmap stddev 24.9059 ( 0.00%) 34.6771 (-39.23%) > Unit Mmap max 2811.6433 ( 0.00%) 2645.1398 ( 5.92%) > Unit Mmap 90%20.5098 ( 0.00%) 18.3105 ( 10.72%) > Unit Mmap 93%22.9180 ( 0.00%) 20.1751 ( 11.97%) > Unit Mmap 95%25.2114 ( 0.00%) 22.4988 ( 10.76%) > Unit Mmap 99%46.1430 ( 0.00%) 43.5952 ( 5.52%) > Unit Ideal Tput 85.2623 ( 0.00%) 78.8906 ( 7.47%) > Unit Tput min44.0666 ( 0.00%) 43.9609 ( 0.24%) > Unit Tput mean 45.5646 ( 0.00%) 45.2009 ( 0.80%) > Unit Tput stddev 0.9318 ( 0.00%) 1.1084 (-18.95%) > Unit Tput max46.7375 ( 0.00%) 46.7539 ( -0.04%) > > fs/super.c | 16 +--- > mm/vmscan.c | 36 +--- > 2 files changed, 34 insertions(+), 18 deletions(-) > > -- > 1.8.4.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/3] Shrinkers and proportional reclaim
On Thu, May 22, 2014 at 10:09:36AM +0100, Mel Gorman wrote: This series is aimed at regressions noticed during reclaim activity. The first two patches are shrinker patches that were posted ages ago but never merged for reasons that are unclear to me. I'm posting them again to see if there was a reason they were dropped or if they just got lost. Dave? Time? The last patch adjusts proportional reclaim. Yuanhan Liu, can you retest the vm scalability test cases on a larger machine? Hugh, does this work for you on the memcg test cases? Sure, and here is the result. I applied these 3 patches on v3.15-rc6, and head commit is 60c10afd. e82e0561 is the old commit that introduced the regression. The testserver has 512G memory and 120 CPU. It's a simple result; if you need more data, I can gather them and send it to you tomorrow: e82e0561v3.15-rc6 60c10afd 185607851223212238868453 -34%+109 As you can see, the performance is back, and it is way much better ;) --yliu Based on ext4, I get the following results but unfortunately my larger test machines are all unavailable so this is based on a relatively small machine. 
postmark 3.15.0-rc53.15.0-rc5 vanilla proportion-v1r4 Ops/sec Transactions 21.00 ( 0.00%) 25.00 ( 19.05%) Ops/sec FilesCreate 39.00 ( 0.00%) 45.00 ( 15.38%) Ops/sec CreateTransact 10.00 ( 0.00%) 12.00 ( 20.00%) Ops/sec FilesDeleted 6202.00 ( 0.00%) 6202.00 ( 0.00%) Ops/sec DeleteTransact 11.00 ( 0.00%) 12.00 ( 9.09%) Ops/sec DataRead/MB 25.97 ( 0.00%) 30.02 ( 15.59%) Ops/sec DataWrite/MB 49.99 ( 0.00%) 57.78 ( 15.58%) ffsb (mail server simulator) 3.15.0-rc5 3.15.0-rc5 vanillaproportion-v1r4 Ops/sec readall 9402.63 ( 0.00%) 9805.74 ( 4.29%) Ops/sec create4695.45 ( 0.00%) 4781.39 ( 1.83%) Ops/sec delete 173.72 ( 0.00%) 177.23 ( 2.02%) Ops/sec Transactions 14271.80 ( 0.00%) 14764.37 ( 3.45%) Ops/sec Read37.00 ( 0.00%)38.50 ( 4.05%) Ops/sec Write 18.20 ( 0.00%)18.50 ( 1.65%) dd of a large file 3.15.0-rc53.15.0-rc5 vanilla proportion-v1r4 WallTime DownloadTar 75.00 ( 0.00%) 61.00 ( 18.67%) WallTime DD 423.00 ( 0.00%) 401.00 ( 5.20%) WallTime Delete 2.00 ( 0.00%)5.00 (-150.00%) stutter (times mmap latency during large amounts of IO) 3.15.0-rc53.15.0-rc5 vanilla proportion-v1r4 Unit 5ms Delays 80252. ( 0.00%) 81523. 
( -1.58%) Unit Mmap min 8.2118 ( 0.00%) 8.3206 ( -1.33%) Unit Mmap mean 17.4614 ( 0.00%) 17.2868 ( 1.00%) Unit Mmap stddev 24.9059 ( 0.00%) 34.6771 (-39.23%) Unit Mmap max 2811.6433 ( 0.00%) 2645.1398 ( 5.92%) Unit Mmap 90%20.5098 ( 0.00%) 18.3105 ( 10.72%) Unit Mmap 93%22.9180 ( 0.00%) 20.1751 ( 11.97%) Unit Mmap 95%25.2114 ( 0.00%) 22.4988 ( 10.76%) Unit Mmap 99%46.1430 ( 0.00%) 43.5952 ( 5.52%) Unit Ideal Tput 85.2623 ( 0.00%) 78.8906 ( 7.47%) Unit Tput min44.0666 ( 0.00%) 43.9609 ( 0.24%) Unit Tput mean 45.5646 ( 0.00%) 45.2009 ( 0.80%) Unit Tput stddev 0.9318 ( 0.00%) 1.1084 (-18.95%) Unit Tput max46.7375 ( 0.00%) 46.7539 ( -0.04%) fs/super.c | 16 +--- mm/vmscan.c | 36 +--- 2 files changed, 34 insertions(+), 18 deletions(-) -- 1.8.4.5 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/3] Shrinkers and proportional reclaim
On Thu, May 22, 2014 at 05:30:51PM +0100, Mel Gorman wrote: On Fri, May 23, 2014 at 12:14:16AM +0800, Yuanhan Liu wrote: On Thu, May 22, 2014 at 10:09:36AM +0100, Mel Gorman wrote: This series is aimed at regressions noticed during reclaim activity. The first two patches are shrinker patches that were posted ages ago but never merged for reasons that are unclear to me. I'm posting them again to see if there was a reason they were dropped or if they just got lost. Dave? Time? The last patch adjusts proportional reclaim. Yuanhan Liu, can you retest the vm scalability test cases on a larger machine? Hugh, does this work for you on the memcg test cases? Sure, and here is the result. I applied these 3 patches on v3.15-rc6, and head commit is 60c10afd. e82e0561 is the old commit that introduced the regression. The testserver has 512G memory and 120 CPU. It's a simple result; if you need more data, I can gather them and send it to you tomorrow: e82e0561v3.15-rc6 60c10afd 185607851223212238868453 -34%+109 As you can see, the performance is back, and it is way much better ;) Thanks a lot for that and the quick response. It is much appreciated. Welcome! And sorry that I made a silly mistake. Those numbers are right though, I just setup wrong compare base; I should compare them with e82e0561's parent, which is 75485363ce85526 at below table. 
Here is the detailed results to compensate the mistake I made ;) Legend: ~XX%- stddev percent (3 runs for each kernel) [+-]XX% - change percent 75485363ce85526 e82e0561dae9f3ae5a21fc2d3 v3.15-rc6 60c10afd233f3344479d229dc --- - - - 35979244 ~ 0% -48.4% 18560785 ~ 0% -66.0% 12235090 ~ 0% +8.0% 38868453 ~ 0% vm-scalability.throughput 28138 ~ 0% +7448.2%2123943 ~ 0% +2724.5% 794777 ~ 0% +1.6% 28598 ~ 0% proc-vmstat.allocstall 544 ~ 6% -95.2% 26 ~ 0% -96.5% 19 ~21% -6.9%506 ~ 6% numa-vmstat.node2.nr_isolated_file 12009832 ~11%+368.1% 56215319 ~ 0%+312.9% 49589361 ~ 1% +0.7% 12091235 ~ 5% numa-numastat.node3.numa_foreign 560 ~ 5% -95.7% 24 ~12% -96.9% 17 ~10% -8.7%511 ~ 2% numa-vmstat.node1.nr_isolated_file 8740137 ~12%+574.0% 58910256 ~ 0%+321.0% 36798827 ~ 0% +21.0% 10578905 ~13% numa-vmstat.node0.numa_other 8734988 ~12%+574.4% 58904944 ~ 0%+321.2% 36794158 ~ 0% +21.0% 10572718 ~13% numa-vmstat.node0.numa_miss 1308 ~12%-100.0% 0 ~ 0%-100.0% 0 +23.3% 1612 ~18% proc-vmstat.pgscan_direct_throttle 12294788 ~11%+401.2% 61622745 ~ 0%+332.6% 53190547 ~ 0% -13.2% 10667387 ~ 5% numa-numastat.node1.numa_foreign 576 ~ 6% -91.2% 50 ~22% -94.3% 33 ~20% -18.1%472 ~ 1% numa-vmstat.node0.nr_isolated_file 12 ~24% +2400.0%316 ~ 4% +13543.7% 1728 ~ 5% +155.3% 32 ~29% proc-vmstat.compact_stall 572 ~ 2% -96.4% 20 ~18% -97.6% 13 ~11% -17.5%472 ~ 2% numa-vmstat.node3.nr_isolated_file 3012 ~12% +2388.4% 74959 ~ 0%+254.7% 10685 ~ 1% -45.4% 1646 ~ 1% proc-vmstat.pageoutrun 2312 ~ 3% -94.2%133 ~ 4% -95.8% 97 ~ 8% -12.6% 2021 ~ 2% proc-vmstat.nr_isolated_file 2575163 ~ 0% +2779.1% 74141888 ~ 0%+958.0% 27244229 ~ 0% -1.3%2542941 ~ 0% proc-vmstat.pgscan_direct_dma32 21916603 ~13% +2519.8% 5.742e+08 ~ 0% +2868.9% 6.507e+08 ~ 0% -16.1% 18397644 ~ 5% proc-vmstat.pgscan_kswapd_normal 53306 ~24% +1077.9% 627895 ~ 0% +2066.2%1154741 ~ 0% +23.5% 65815 ~24% proc-vmstat.pgscan_kswapd_dma32 2575163 ~ 0% +2778.6% 74129497 ~ 0%+957.8% 27239606 ~ 0% -1.3%2542353 ~ 0% proc-vmstat.pgsteal_direct_dma32 21907744 
~14% +2520.8% 5.742e+08 ~ 0% +2870.0% 6.507e+08 ~ 0% -16.1% 18386641 ~ 5% proc-vmstat.pgsteal_kswapd_normal 53306 ~24% +1077.7% 627796 ~ 0% +2065.7%1154436 ~ 0% +23.3% 65731 ~24% proc-vmstat.pgsteal_kswapd_dma32 2967449 ~ 0% +2432.7% 75156011 ~ 0%+869.9% 28781337 ~ 0% -0.7%2945933 ~ 0% proc-vmstat.pgalloc_dma32 13081172 ~11%+599.4% 91495653 ~ 0%+337.1% 57180622 ~ 0% +12.1% 14668141 ~13% numa-numastat.node0.other_node 13073426 ~11%+599.8% 91489575 ~ 0%+337.4% 57177129 ~ 0% +12.1% 14660341 ~13% numa-numastat.node0.numa_miss 281 ~23% +1969.4% 5822 ~ 1% +3321.4% 9625 ~ 2% -26.9
Re: performance regression due to commit e82e0561("mm: vmscan: obey proportional scanning requirements for kswapd")
On Sat, Mar 15, 2014 at 08:56:10PM -0700, Hugh Dickins wrote: > On Fri, 14 Mar 2014, Mel Gorman wrote: > > On Thu, Mar 13, 2014 at 05:44:57AM -0700, Hugh Dickins wrote: > > > On Wed, 12 Mar 2014, Mel Gorman wrote: > > > > On Tue, Feb 18, 2014 at 04:01:22PM +0800, Yuanhan Liu wrote: ... snip ... > > > I missed Yuanhan's mail, but seeing your reply reminds me of another > > > issue with that proportionality patch - or perhaps more thought would > > > show them to be two sides of the same issue, with just one fix required. > > > Let me throw our patch into the cauldron. > > > > > > [PATCH] mm: revisit shrink_lruvec's attempt at proportionality > > > > > > We have a memcg reclaim test which exerts a certain amount of pressure, > > > and expects to see a certain range of page reclaim in response. It's a > > > very wide range allowed, but the test repeatably failed on v3.11 onwards, > > > because reclaim goes wild and frees up almost everything. > > > > > > This wild behaviour bisects to Mel's "scan_adjusted" commit e82e0561dae9 > > > "mm: vmscan: obey proportional scanning requirements for kswapd". That > > > attempts to achieve proportionality between anon and file lrus: to the > > > extent that once one of those is empty, it then tries to empty the other. > > > Stop that. > > > > > > Signed-off-by: Hugh Dickins > > > --- > > > > > > We've been running happily with this for months; but all that time it's > > > been on my TODO list with a "needs more thought" tag before we could > > > upstream it, and I never got around to that. We also have a somewhat > > > similar, but older and quite independent, fix to get_scan_count() from > > > Suleiman, which I'd meant to send along at the same time: I'll dig that > > > one out tomorrow or the day after. 
> > I've sent that one out now in a new thread > https://lkml.org/lkml/2014/3/15/168 > and also let's tie these together with Hannes's > https://lkml.org/lkml/2014/3/14/277 > > > > > > > > I ran a battery of page reclaim related tests against it on top of > > 3.14-rc6. Workloads showed small improvements in their absolute performance > > but actual IO behaviour looked much better in some tests. This is the > > iostats summary for the test that showed the biggest different -- dd of > > a large file on ext3. > > > > 3.14.0-rc6 3.14.0-rc6 > >vanilla proportional-v1r1 > > Meansda-avgqz 1045.64 224.18 > > Meansda-await 2120.12 506.77 > > Meansda-r_await 18.61 19.78 > > Meansda-w_await 11089.602126.35 > > Max sda-avgqz 2294.39 787.13 > > Max sda-await 7074.79 2371.67 > > Max sda-r_await 503.00 414.00 > > Max sda-w_await 35721.937249.84 > > > > Not all workloads benefitted. The same workload on ext4 showed no useful > > difference. btrfs looks like > > > > 3.14.0-rc6 3.14.0-rc6 > >vanilla proportional-v1r1 > > Meansda-avgqz 762.69 650.39 > > Meansda-await 2438.46 2495.15 > > Meansda-r_await 44.18 47.20 > > Meansda-w_await 6109.19 5139.86 > > Max sda-avgqz 2203.50 1870.78 > > Max sda-await 7098.26 6847.21 > > Max sda-r_await 63.02 156.00 > > Max sda-w_await 19921.7011085.13 > > > > Better but not as dramatically so. I didn't analyse why. A workload that > > had a large anonymous mapping with large amounts of IO in the background > > did not show any regressions so based on that and the fact the patch looks > > ok, here goes nothing; > > > > Acked-by: Mel Gorman > > Big thank you, Mel, for doing so much work on it, and so very quickly. > I get quite lost in the numbers myself: I'm much more convinced of it > by your numbers and ack. > > > > > You say it's already been tested for months but it would be nice if the > > workload that generated this thread was also tested. > > Yes indeed: Yuanhan, do you have time to try this patch for your > testcase? 
I'm hoping it will prove at least as effective as your > own suggested patch, but please let us know what you find - thanks. Hi Hugh, Sure, and sorry to t
Re: performance regression due to commit e82e0561(mm: vmscan: obey proportional scanning requirements for kswapd)
On Sat, Mar 15, 2014 at 08:56:10PM -0700, Hugh Dickins wrote: On Fri, 14 Mar 2014, Mel Gorman wrote: On Thu, Mar 13, 2014 at 05:44:57AM -0700, Hugh Dickins wrote: On Wed, 12 Mar 2014, Mel Gorman wrote: On Tue, Feb 18, 2014 at 04:01:22PM +0800, Yuanhan Liu wrote: ... snip ... I missed Yuanhan's mail, but seeing your reply reminds me of another issue with that proportionality patch - or perhaps more thought would show them to be two sides of the same issue, with just one fix required. Let me throw our patch into the cauldron. [PATCH] mm: revisit shrink_lruvec's attempt at proportionality We have a memcg reclaim test which exerts a certain amount of pressure, and expects to see a certain range of page reclaim in response. It's a very wide range allowed, but the test repeatably failed on v3.11 onwards, because reclaim goes wild and frees up almost everything. This wild behaviour bisects to Mel's scan_adjusted commit e82e0561dae9 mm: vmscan: obey proportional scanning requirements for kswapd. That attempts to achieve proportionality between anon and file lrus: to the extent that once one of those is empty, it then tries to empty the other. Stop that. Signed-off-by: Hugh Dickins hu...@google.com --- We've been running happily with this for months; but all that time it's been on my TODO list with a needs more thought tag before we could upstream it, and I never got around to that. We also have a somewhat similar, but older and quite independent, fix to get_scan_count() from Suleiman, which I'd meant to send along at the same time: I'll dig that one out tomorrow or the day after. I've sent that one out now in a new thread https://lkml.org/lkml/2014/3/15/168 and also let's tie these together with Hannes's https://lkml.org/lkml/2014/3/14/277 I ran a battery of page reclaim related tests against it on top of 3.14-rc6. Workloads showed small improvements in their absolute performance but actual IO behaviour looked much better in some tests. 
This is the iostats summary for the test that showed the biggest different -- dd of a large file on ext3. 3.14.0-rc6 3.14.0-rc6 vanilla proportional-v1r1 Meansda-avgqz 1045.64 224.18 Meansda-await 2120.12 506.77 Meansda-r_await 18.61 19.78 Meansda-w_await 11089.602126.35 Max sda-avgqz 2294.39 787.13 Max sda-await 7074.79 2371.67 Max sda-r_await 503.00 414.00 Max sda-w_await 35721.937249.84 Not all workloads benefitted. The same workload on ext4 showed no useful difference. btrfs looks like 3.14.0-rc6 3.14.0-rc6 vanilla proportional-v1r1 Meansda-avgqz 762.69 650.39 Meansda-await 2438.46 2495.15 Meansda-r_await 44.18 47.20 Meansda-w_await 6109.19 5139.86 Max sda-avgqz 2203.50 1870.78 Max sda-await 7098.26 6847.21 Max sda-r_await 63.02 156.00 Max sda-w_await 19921.7011085.13 Better but not as dramatically so. I didn't analyse why. A workload that had a large anonymous mapping with large amounts of IO in the background did not show any regressions so based on that and the fact the patch looks ok, here goes nothing; Acked-by: Mel Gorman mgor...@suse.de Big thank you, Mel, for doing so much work on it, and so very quickly. I get quite lost in the numbers myself: I'm much more convinced of it by your numbers and ack. You say it's already been tested for months but it would be nice if the workload that generated this thread was also tested. Yes indeed: Yuanhan, do you have time to try this patch for your testcase? I'm hoping it will prove at least as effective as your own suggested patch, but please let us know what you find - thanks. Hi Hugh, Sure, and sorry to tell you that this patch introduced another half performance descrease from avg 60 MB/s to 30 MB/s in this testcase. 
Moreover, the dd throughput for each process was steady before, however, it's quite bumpy from 20 MB/s to 40 MB/s w/ this patch applied, and thus got a avg of 30 MB/s: 11327188992 bytes (11 GB) copied, 300.014 s, 37.8 MB/s 1809373+0 records in 1809372+0 records out 7411187712 bytes (7.4 GB) copied, 300.008 s, 24.7 MB/s 3068285+0 records in 3068284+0 records out 12567691264 bytes (13 GB) copied, 300.001 s, 41.9 MB/s 1883877+0 records in 1883876+0 records out 7716356096 bytes (7.7 GB) copied, 300.002 s, 25.7 MB/s 1807674+0 records in 1807673+0 records out 7404228608 bytes (7.4 GB) copied
Re: performance regression due to commit e82e0561("mm: vmscan: obey proportional scanning requirements for kswapd")
On Wed, Mar 12, 2014 at 04:54:47PM +, Mel Gorman wrote: > On Tue, Feb 18, 2014 at 04:01:22PM +0800, Yuanhan Liu wrote: > > Hi, > > > > Commit e82e0561("mm: vmscan: obey proportional scanning requirements for > > kswapd") caused a big performance regression(73%) for vm-scalability/ > > lru-file-readonce testcase on a system with 256G memory without swap. > > > > That testcase simply looks like this: > > truncate -s 1T /tmp/vm-scalability.img > > mkfs.xfs -q /tmp/vm-scalability.img > > mount -o loop /tmp/vm-scalability.img /tmp/vm-scalability > > > > SPARESE_FILE="/tmp/vm-scalability/sparse-lru-file-readonce" > > for i in `seq 1 120`; do > > truncate $SPARESE_FILE-$i -s 36G > > timeout --foreground -s INT 300 dd bs=4k if=$SPARESE_FILE-$i > > of=/dev/null > > done > > > > wait > > > > The filename implies that it's a sparse file with no IO but does not say > what the truncate function/program/whatever actually does. It's actually the /usr/bin/truncate file from coreutils. > If it's really a > sparse file then the dd process should be reading zeros and writing them to > NULL without IO. Where are pages being dirtied? Sorry, my bad. I was wrong and I meant to "the speed of getting new pages", but not "the speed of dirtying pages". > Does the truncate command > really create a sparse file or is it something else? > > > Actually, it's not the newlly added code(obey proportional scanning) > > in that commit caused the regression. But instead, it's the following > > change: > > + > > + if (nr_reclaimed < nr_to_reclaim || scan_adjusted) > > + continue; > > + > > > > > > - if (nr_reclaimed >= nr_to_reclaim && > > - sc->priority < DEF_PRIORITY) > > + if (global_reclaim(sc) && !current_is_kswapd()) > > break; > > > > The difference is that we might reclaim more than requested before > > in the first round reclaimming(sc->priority == DEF_PRIORITY). 
> > > > So, for a testcase like lru-file-readonce, the dirty rate is fast, and > > reclaimming SWAP_CLUSTER_MAX(32 pages) each time is not enough for catching > > up the dirty rate. And thus page allocation stalls, and performance drops: > > > >O for e82e0561 > >* for parent commit > > > > proc-vmstat.allocstall > > > > 2e+06 > > ++---+ > >1.8e+06 O+ OO O > > | > >| > > | > >1.6e+06 ++ > > | > >1.4e+06 ++ > > | > >| > > | > >1.2e+06 ++ > > | > > 1e+06 ++ > > | > > 80 ++ > > | > >| > > | > > 60 ++ > > | > > 40 ++ > > | > >| > > | > > 20 > > *+..**...*...* > > 0 > > ++---+ > > > >vm-scalability.throughput > > > >2.2e+07 > > ++---+ > >| > > | > > 2e+07 > > *+..**...*...* > >1.8e+07 ++ > > | > >| > > | > >1.6e+07 ++
Re: performance regression due to commit e82e0561(mm: vmscan: obey proportional scanning requirements for kswapd)
On Wed, Mar 12, 2014 at 04:54:47PM +, Mel Gorman wrote: On Tue, Feb 18, 2014 at 04:01:22PM +0800, Yuanhan Liu wrote: Hi, Commit e82e0561(mm: vmscan: obey proportional scanning requirements for kswapd) caused a big performance regression(73%) for vm-scalability/ lru-file-readonce testcase on a system with 256G memory without swap. That testcase simply looks like this: truncate -s 1T /tmp/vm-scalability.img mkfs.xfs -q /tmp/vm-scalability.img mount -o loop /tmp/vm-scalability.img /tmp/vm-scalability SPARESE_FILE=/tmp/vm-scalability/sparse-lru-file-readonce for i in `seq 1 120`; do truncate $SPARESE_FILE-$i -s 36G timeout --foreground -s INT 300 dd bs=4k if=$SPARESE_FILE-$i of=/dev/null done wait The filename implies that it's a sparse file with no IO but does not say what the truncate function/program/whatever actually does. It's actually the /usr/bin/truncate file from coreutils. If it's really a sparse file then the dd process should be reading zeros and writing them to NULL without IO. Where are pages being dirtied? Sorry, my bad. I was wrong and I meant to the speed of getting new pages, but not the speed of dirtying pages. Does the truncate command really create a sparse file or is it something else? Actually, it's not the newlly added code(obey proportional scanning) in that commit caused the regression. But instead, it's the following change: + + if (nr_reclaimed nr_to_reclaim || scan_adjusted) + continue; + - if (nr_reclaimed = nr_to_reclaim - sc-priority DEF_PRIORITY) + if (global_reclaim(sc) !current_is_kswapd()) break; The difference is that we might reclaim more than requested before in the first round reclaimming(sc-priority == DEF_PRIORITY). So, for a testcase like lru-file-readonce, the dirty rate is fast, and reclaimming SWAP_CLUSTER_MAX(32 pages) each time is not enough for catching up the dirty rate. 
And thus page allocation stalls, and performance drops: O for e82e0561 * for parent commit proc-vmstat.allocstall 2e+06 ++---+ 1.8e+06 O+ OO O | | | 1.6e+06 ++ | 1.4e+06 ++ | | | 1.2e+06 ++ | 1e+06 ++ | 80 ++ | | | 60 ++ | 40 ++ | | | 20 *+..**...*...* 0 ++---+ vm-scalability.throughput 2.2e+07 ++---+ | | 2e+07 *+..**...*...* 1.8e+07 ++ | | | 1.6e+07 ++ | | | 1.4e+07 ++ | | | 1.2e+07 ++ | 1e+07 ++ | | | 8e+06 ++ OO O | O | 6e+06 ++---+ I made a patch which simply keeps reclaimming more if sc-priority == DEF_PRIORITY. I'm not sure it's the right way to go or not. Anyway, I pasted
Re: performance regression due to commit e82e0561("mm: vmscan: obey proportional scanning requirements for kswapd")
ping... On Tue, Feb 18, 2014 at 04:01:22PM +0800, Yuanhan Liu wrote: > Hi, > > Commit e82e0561("mm: vmscan: obey proportional scanning requirements for > kswapd") caused a big performance regression(73%) for vm-scalability/ > lru-file-readonce testcase on a system with 256G memory without swap. > > That testcase simply looks like this: > truncate -s 1T /tmp/vm-scalability.img > mkfs.xfs -q /tmp/vm-scalability.img > mount -o loop /tmp/vm-scalability.img /tmp/vm-scalability > > SPARESE_FILE="/tmp/vm-scalability/sparse-lru-file-readonce" > for i in `seq 1 120`; do > truncate $SPARESE_FILE-$i -s 36G > timeout --foreground -s INT 300 dd bs=4k if=$SPARESE_FILE-$i > of=/dev/null > done > > wait > > Actually, it's not the newlly added code(obey proportional scanning) > in that commit caused the regression. But instead, it's the following > change: > + > + if (nr_reclaimed < nr_to_reclaim || scan_adjusted) > + continue; > + > > > - if (nr_reclaimed >= nr_to_reclaim && > - sc->priority < DEF_PRIORITY) > + if (global_reclaim(sc) && !current_is_kswapd()) > break; > > The difference is that we might reclaim more than requested before > in the first round reclaimming(sc->priority == DEF_PRIORITY). > > So, for a testcase like lru-file-readonce, the dirty rate is fast, and > reclaimming SWAP_CLUSTER_MAX(32 pages) each time is not enough for catching > up the dirty rate. And thus page allocation stalls, and performance drops: > >O for e82e0561 >* for parent commit > > proc-vmstat.allocstall > > 2e+06 ++---+ >1.8e+06 O+ OO O | >|| >1.6e+06 ++ | >1.4e+06 ++ | >|| >1.2e+06 ++ | > 1e+06 ++ | > 80 ++ | >|| > 60 ++ | > 40 ++ | >|| > 20 *+..**...*...* > 0 ++---+ > >vm-scalability.throughput > >2.2e+07 ++---+ >|| > 2e+07 *+..**...*...* >1.8e+07 ++ | >|| >1.6e+07 ++ | >|| >1.4e+07 ++ | >|| >1.2e+07 ++ | > 1e+07 ++ | >|| > 8e+06 ++ OO O | >O| > 6e+06 ++---+ > > I made a patch which simply keeps reclaimming more if sc->priority == > DEF_PRIORITY. 
> I'm not sure it's the right way to go or not. Anyway, I pasted it here for > comments. > > --- > diff --git a/mm/vmscan.c b/mm/vmscan.c > index 26ad67f..37004a8 100644 > --- a/mm/vmscan.c > +++ b/mm/vmscan.c > @@ -1828,7 +1828,16 @@ static void shrink_lruvec(struct lruvec *lruvec, > struct scan_control *sc) > unsigned long nr_reclaimed = 0; > unsigned long nr_to_
Re: performance regression due to commit e82e0561(mm: vmscan: obey proportional scanning requirements for kswapd)
ping... On Tue, Feb 18, 2014 at 04:01:22PM +0800, Yuanhan Liu wrote: Hi, Commit e82e0561(mm: vmscan: obey proportional scanning requirements for kswapd) caused a big performance regression(73%) for vm-scalability/ lru-file-readonce testcase on a system with 256G memory without swap. That testcase simply looks like this: truncate -s 1T /tmp/vm-scalability.img mkfs.xfs -q /tmp/vm-scalability.img mount -o loop /tmp/vm-scalability.img /tmp/vm-scalability SPARESE_FILE=/tmp/vm-scalability/sparse-lru-file-readonce for i in `seq 1 120`; do truncate $SPARESE_FILE-$i -s 36G timeout --foreground -s INT 300 dd bs=4k if=$SPARESE_FILE-$i of=/dev/null done wait Actually, it's not the newlly added code(obey proportional scanning) in that commit caused the regression. But instead, it's the following change: + + if (nr_reclaimed nr_to_reclaim || scan_adjusted) + continue; + - if (nr_reclaimed = nr_to_reclaim - sc-priority DEF_PRIORITY) + if (global_reclaim(sc) !current_is_kswapd()) break; The difference is that we might reclaim more than requested before in the first round reclaimming(sc-priority == DEF_PRIORITY). So, for a testcase like lru-file-readonce, the dirty rate is fast, and reclaimming SWAP_CLUSTER_MAX(32 pages) each time is not enough for catching up the dirty rate. And thus page allocation stalls, and performance drops: O for e82e0561 * for parent commit proc-vmstat.allocstall 2e+06 ++---+ 1.8e+06 O+ OO O | || 1.6e+06 ++ | 1.4e+06 ++ | || 1.2e+06 ++ | 1e+06 ++ | 80 ++ | || 60 ++ | 40 ++ | || 20 *+..**...*...* 0 ++---+ vm-scalability.throughput 2.2e+07 ++---+ || 2e+07 *+..**...*...* 1.8e+07 ++ | || 1.6e+07 ++ | || 1.4e+07 ++ | || 1.2e+07 ++ | 1e+07 ++ | || 8e+06 ++ OO O | O| 6e+06 ++---+ I made a patch which simply keeps reclaimming more if sc-priority == DEF_PRIORITY. I'm not sure it's the right way to go or not. Anyway, I pasted it here for comments. 
--- diff --git a/mm/vmscan.c b/mm/vmscan.c index 26ad67f..37004a8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1828,7 +1828,16 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) unsigned long nr_reclaimed = 0; unsigned long nr_to_reclaim = sc->nr_to_reclaim; struct blk_plug plug; - bool scan_adjusted = false; + /* + * On large memory systems, direct reclaiming of SWAP_CLUSTER_MAX + * each time may not catch up the dirty rate in some cases(say, + * vm-scalability/lru-file-readonce), which may increase the + * page allocation stall latency in the end. + * + * Here we try to reclaim more than requested for the first round
performance regression due to commit e82e0561("mm: vmscan: obey proportional scanning requirements for kswapd")
Hi, Commit e82e0561("mm: vmscan: obey proportional scanning requirements for kswapd") caused a big performance regression(73%) for vm-scalability/ lru-file-readonce testcase on a system with 256G memory without swap. That testcase simply looks like this: truncate -s 1T /tmp/vm-scalability.img mkfs.xfs -q /tmp/vm-scalability.img mount -o loop /tmp/vm-scalability.img /tmp/vm-scalability SPARESE_FILE="/tmp/vm-scalability/sparse-lru-file-readonce" for i in `seq 1 120`; do truncate $SPARESE_FILE-$i -s 36G timeout --foreground -s INT 300 dd bs=4k if=$SPARESE_FILE-$i of=/dev/null done wait Actually, it's not the newlly added code(obey proportional scanning) in that commit caused the regression. But instead, it's the following change: + + if (nr_reclaimed < nr_to_reclaim || scan_adjusted) + continue; + - if (nr_reclaimed >= nr_to_reclaim && - sc->priority < DEF_PRIORITY) + if (global_reclaim(sc) && !current_is_kswapd()) break; The difference is that we might reclaim more than requested before in the first round reclaimming(sc->priority == DEF_PRIORITY). So, for a testcase like lru-file-readonce, the dirty rate is fast, and reclaimming SWAP_CLUSTER_MAX(32 pages) each time is not enough for catching up the dirty rate. And thus page allocation stalls, and performance drops: O for e82e0561 * for parent commit proc-vmstat.allocstall 2e+06 ++---+ 1.8e+06 O+ OO O | || 1.6e+06 ++ | 1.4e+06 ++ | || 1.2e+06 ++ | 1e+06 ++ | 80 ++ | || 60 ++ | 40 ++ | || 20 *+..**...*...* 0 ++---+ vm-scalability.throughput 2.2e+07 ++---+ || 2e+07 *+..**...*...* 1.8e+07 ++ | || 1.6e+07 ++ | || 1.4e+07 ++ | || 1.2e+07 ++ | 1e+07 ++ | || 8e+06 ++ OO O | O| 6e+06 ++---+ I made a patch which simply keeps reclaimming more if sc->priority == DEF_PRIORITY. I'm not sure it's the right way to go or not. Anyway, I pasted it here for comments. 
--- diff --git a/mm/vmscan.c b/mm/vmscan.c index 26ad67f..37004a8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1828,7 +1828,16 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) unsigned long nr_reclaimed = 0; unsigned long nr_to_reclaim = sc->nr_to_reclaim; struct blk_plug plug; - bool scan_adjusted = false; + /* +* On large memory systems, direct reclaiming of SWAP_CLUSTER_MAX +* each time may not catch up the dirty rate in some cases(say, +* vm-scalability/lru-file-readonce), which may increase the +* page allocation stall latency in the end. +* +* Here we try to reclaim more than requested for the first round +* (sc->priority == DEF_PRIORITY) to reduce such latency. +*/ + bool scan_adjusted = sc->priority == DEF_PRIORITY;
performance regression due to commit e82e0561(mm: vmscan: obey proportional scanning requirements for kswapd)
Hi, Commit e82e0561(mm: vmscan: obey proportional scanning requirements for kswapd) caused a big performance regression(73%) for vm-scalability/ lru-file-readonce testcase on a system with 256G memory without swap. That testcase simply looks like this: truncate -s 1T /tmp/vm-scalability.img mkfs.xfs -q /tmp/vm-scalability.img mount -o loop /tmp/vm-scalability.img /tmp/vm-scalability SPARESE_FILE=/tmp/vm-scalability/sparse-lru-file-readonce for i in `seq 1 120`; do truncate $SPARESE_FILE-$i -s 36G timeout --foreground -s INT 300 dd bs=4k if=$SPARESE_FILE-$i of=/dev/null done wait Actually, it's not the newlly added code(obey proportional scanning) in that commit caused the regression. But instead, it's the following change: + + if (nr_reclaimed nr_to_reclaim || scan_adjusted) + continue; + - if (nr_reclaimed = nr_to_reclaim - sc-priority DEF_PRIORITY) + if (global_reclaim(sc) !current_is_kswapd()) break; The difference is that we might reclaim more than requested before in the first round reclaimming(sc-priority == DEF_PRIORITY). So, for a testcase like lru-file-readonce, the dirty rate is fast, and reclaimming SWAP_CLUSTER_MAX(32 pages) each time is not enough for catching up the dirty rate. And thus page allocation stalls, and performance drops: O for e82e0561 * for parent commit proc-vmstat.allocstall 2e+06 ++---+ 1.8e+06 O+ OO O | || 1.6e+06 ++ | 1.4e+06 ++ | || 1.2e+06 ++ | 1e+06 ++ | 80 ++ | || 60 ++ | 40 ++ | || 20 *+..**...*...* 0 ++---+ vm-scalability.throughput 2.2e+07 ++---+ || 2e+07 *+..**...*...* 1.8e+07 ++ | || 1.6e+07 ++ | || 1.4e+07 ++ | || 1.2e+07 ++ | 1e+07 ++ | || 8e+06 ++ OO O | O| 6e+06 ++---+ I made a patch which simply keeps reclaimming more if sc-priority == DEF_PRIORITY. I'm not sure it's the right way to go or not. Anyway, I pasted it here for comments. 
--- diff --git a/mm/vmscan.c b/mm/vmscan.c index 26ad67f..37004a8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1828,7 +1828,16 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) unsigned long nr_reclaimed = 0; unsigned long nr_to_reclaim = sc->nr_to_reclaim; struct blk_plug plug; - bool scan_adjusted = false; + /* +* On large memory systems, direct reclaiming of SWAP_CLUSTER_MAX +* each time may not catch up the dirty rate in some cases(say, +* vm-scalability/lru-file-readonce), which may increase the +* page allocation stall latency in the end. +* +* Here we try to reclaim more than requested for the first round +* (sc->priority == DEF_PRIORITY) to reduce such latency. +*/ + bool scan_adjusted = sc->priority == DEF_PRIORITY;
Re: changes caused by 0d11e6ac("blk-mq: fix use-after-free of request")
On Wed, Dec 18, 2013 at 11:29:30AM +0100, Matias Bjørling wrote: > On 12/18/2013 09:50 AM, Yuanhan Liu wrote: > >Hi, > > > >FYI, we noticed some changes caused by 0d11e6ac("blk-mq: fix use-after-free > >of request"): > > > > The blk-mq accounting was faulty up to that commit. We should > compare the blk-mq with the previous block layer. > > Could you try to revert the following patches: > > f02b9ac virtio-blk: virtqueue_kick() must be ordered with other... > 1cf7e9c virtio_blk: blk-mq support > > and compare the two runs (upto 0d11e6ac applied, and the same, with > the two patches reverted) Hi Matias, You are right. Those counter restore back with the two patches reverted(d1b4e3825c8848b0ea0f). 959a35f13eb785f982d7 0d11e6aca396e679c07b d1b4e3825c8848b0ea0f ---- - 0.00 60.02 ~42% 0.00 vpx/micro/xfstests/4HDD-btrfs-generic-quick 0.00 367.81 ~27% 0.00 vpx/micro/xfstests/4HDD-ext4-generic-mid 0.00 411.64 ~13% 0.00 vpx/micro/xfstests/4HDD-xfs-generic-mid 0.00 208.39 ~10% 0.00 vpx/micro/xfstests/4HDD-xfs-generic-quick 0.001047.86 0.00 TOTAL iostat.vdd.await 959a35f13eb785f982d7 0d11e6aca396e679c07b d1b4e3825c8848b0ea0f ---- - 0.00 301.60 ~34% 0.00 vpx/micro/xfstests/4HDD-btrfs-generic-mid 0.00 249.16 ~12% 0.00 vpx/micro/xfstests/4HDD-btrfs-generic-quick 0.00 51.45 ~26% 0.00 vpx/micro/xfstests/4HDD-ext4-generic-mid 0.00 91.51 ~21% 0.04 vpx/micro/xfstests/4HDD-xfs-generic-127 0.001919.27 ~43% 0.00 vpx/micro/xfstests/4HDD-xfs-generic-mid 0.00 121.04 ~11% 0.00 vpx/micro/xfstests/4HDD-xfs-generic-quick 0.002734.03 0.04 TOTAL iostat.vda.r_await 959a35f13eb785f982d7 0d11e6aca396e679c07b d1b4e3825c8848b0ea0f ---- - 0.00 406.12 ~10% 0.00 vpx/micro/xfstests/4HDD-btrfs-generic-mid 0.00 433.66 ~ 7% 0.00 vpx/micro/xfstests/4HDD-btrfs-generic-quick 0.00 807.79 ~15% 0.00 vpx/micro/xfstests/4HDD-ext4-generic-mid 0.00 42.94 ~67% 0.51 vpx/micro/xfstests/4HDD-xfs-generic-127 0.00 592.20 ~16% 0.00 vpx/micro/xfstests/4HDD-xfs-generic-mid 0.00 401.74 ~12% 0.00 
vpx/micro/xfstests/4HDD-xfs-generic-quick 0.002684.45 0.51 TOTAL iostat.vda.w_await --yliu -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: changes caused by 0d11e6ac(blk-mq: fix use-after-free of request)
On Wed, Dec 18, 2013 at 11:29:30AM +0100, Matias Bjørling wrote: On 12/18/2013 09:50 AM, Yuanhan Liu wrote: Hi, FYI, we noticed some changes caused by 0d11e6ac(blk-mq: fix use-after-free of request): The blk-mq accounting was faulty up to that commit. We should compare the blk-mq with the previous block layer. Could you try to revert the following patches: f02b9ac virtio-blk: virtqueue_kick() must be ordered with other... 1cf7e9c virtio_blk: blk-mq support and compare the two runs (upto 0d11e6ac applied, and the same, with the two patches reverted) Hi Matias, You are right. Those counter restore back with the two patches reverted(d1b4e3825c8848b0ea0f). 959a35f13eb785f982d7 0d11e6aca396e679c07b d1b4e3825c8848b0ea0f ---- - 0.00 60.02 ~42% 0.00 vpx/micro/xfstests/4HDD-btrfs-generic-quick 0.00 367.81 ~27% 0.00 vpx/micro/xfstests/4HDD-ext4-generic-mid 0.00 411.64 ~13% 0.00 vpx/micro/xfstests/4HDD-xfs-generic-mid 0.00 208.39 ~10% 0.00 vpx/micro/xfstests/4HDD-xfs-generic-quick 0.001047.86 0.00 TOTAL iostat.vdd.await 959a35f13eb785f982d7 0d11e6aca396e679c07b d1b4e3825c8848b0ea0f ---- - 0.00 301.60 ~34% 0.00 vpx/micro/xfstests/4HDD-btrfs-generic-mid 0.00 249.16 ~12% 0.00 vpx/micro/xfstests/4HDD-btrfs-generic-quick 0.00 51.45 ~26% 0.00 vpx/micro/xfstests/4HDD-ext4-generic-mid 0.00 91.51 ~21% 0.04 vpx/micro/xfstests/4HDD-xfs-generic-127 0.001919.27 ~43% 0.00 vpx/micro/xfstests/4HDD-xfs-generic-mid 0.00 121.04 ~11% 0.00 vpx/micro/xfstests/4HDD-xfs-generic-quick 0.002734.03 0.04 TOTAL iostat.vda.r_await 959a35f13eb785f982d7 0d11e6aca396e679c07b d1b4e3825c8848b0ea0f ---- - 0.00 406.12 ~10% 0.00 vpx/micro/xfstests/4HDD-btrfs-generic-mid 0.00 433.66 ~ 7% 0.00 vpx/micro/xfstests/4HDD-btrfs-generic-quick 0.00 807.79 ~15% 0.00 vpx/micro/xfstests/4HDD-ext4-generic-mid 0.00 42.94 ~67% 0.51 vpx/micro/xfstests/4HDD-xfs-generic-127 0.00 592.20 ~16% 0.00 vpx/micro/xfstests/4HDD-xfs-generic-mid 0.00 401.74 ~12% 0.00 vpx/micro/xfstests/4HDD-xfs-generic-quick 0.002684.45 0.51 TOTAL 
iostat.vda.w_await --yliu -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/