Re: [PATCH 1/2] block: Implement global tagset

2017-04-07 Thread Arun Easi
On Thu, 6 Apr 2017, 1:49am, Hannes Reinecke wrote:

> On 04/06/2017 08:27 AM, Arun Easi wrote:
> > Hi Hannes,
> > 
> > Thanks for taking a crack at the issue. My comments below..
> > 
> > On Tue, 4 Apr 2017, 5:07am, Hannes Reinecke wrote:
> > 
> >> Most legacy HBAs have a tagset per HBA, not per queue. To map
> >> these devices onto block-mq this patch implements a new tagset
> >> flag BLK_MQ_F_GLOBAL_TAGS, which will cause the tag allocator
> >> to use just one tagset for all hardware queues.
> >>
> >> Signed-off-by: Hannes Reinecke 
> >> ---
> >>  block/blk-mq-tag.c | 12 
> >>  block/blk-mq.c | 10 --
> >>  include/linux/blk-mq.h |  1 +
> >>  3 files changed, 17 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> >> index e48bc2c..a14e76c 100644
> >> --- a/block/blk-mq-tag.c
> >> +++ b/block/blk-mq-tag.c
> >> @@ -276,9 +276,11 @@ static void blk_mq_all_tag_busy_iter(struct 
> >> blk_mq_tags *tags,
> >>  void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
> >>busy_tag_iter_fn *fn, void *priv)
> >>  {
> >> -  int i;
> >> +  int i, lim = tagset->nr_hw_queues;
> >>  
> >> -  for (i = 0; i < tagset->nr_hw_queues; i++) {
> >> +  if (tagset->flags & BLK_MQ_F_GLOBAL_TAGS)
> >> +  lim = 1;
> >> +  for (i = 0; i < lim; i++) {
> >>if (tagset->tags && tagset->tags[i])
> >>blk_mq_all_tag_busy_iter(tagset->tags[i], fn, priv);
> >>}
> >> @@ -287,12 +289,14 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set 
> >> *tagset,
> >>  
> >>  int blk_mq_reinit_tagset(struct blk_mq_tag_set *set)
> >>  {
> >> -  int i, j, ret = 0;
> >> +  int i, j, ret = 0, lim = set->nr_hw_queues;
> >>  
> >>if (!set->ops->reinit_request)
> >>goto out;
> >>  
> >> -  for (i = 0; i < set->nr_hw_queues; i++) {
> >> +  if (set->flags & BLK_MQ_F_GLOBAL_TAGS)
> >> +  lim = 1;
> >> +  for (i = 0; i < lim; i++) {
> >>struct blk_mq_tags *tags = set->tags[i];
> >>  
> >>for (j = 0; j < tags->nr_tags; j++) {
> >> diff --git a/block/blk-mq.c b/block/blk-mq.c
> >> index 159187a..db96ed0 100644
> >> --- a/block/blk-mq.c
> >> +++ b/block/blk-mq.c
> >> @@ -2061,6 +2061,10 @@ static bool __blk_mq_alloc_rq_map(struct 
> >> blk_mq_tag_set *set, int hctx_idx)
> >>  {
> >>int ret = 0;
> >>  
> >> +  if ((set->flags & BLK_MQ_F_GLOBAL_TAGS) && hctx_idx != 0) {
> >> +  set->tags[hctx_idx] = set->tags[0];
> >> +  return true;
> >> +  }
> > 
:
> 
> > BTW, if you would like me to try out this patch on my setup, please let me 
> > know.
> > 
> Oh, yes. Please do.
> 

Ran the tests on my setup (Dell R730, 2 Node). This change did not drop 
any IOPs (got ~2M 512b). The cache miss percentage was varying based on if 
the tests were running on one node or both (latter performed worse). All 
interrupts were directed to only 1 node. Interestingly, the cache miss 
percentage was lowest when MQ was off.

I hit an fdisk hang (open path), btw; not sure if it has anything to do with 
this change, though.

Notes and hang stack attached.

Let me know if you are interested in any specific perf event/command-line.

Regards,
-Arun

perf stat, ran on a short 10-second load.

---1port-1node-new-mq
 Performance counter stats for 'CPU(s) 2':

 188,642,696  LLC-loads(66.66%)
   3,615,142  LLC-load-misses  #1.92% of all LL-cache hits (66.67%)
  86,488,341  LLC-stores   (33.34%)
  10,820,977  LLC-store-misses (33.33%)
 391,370,104  cache-references (49.99%)
  14,498,491  cache-misses #3.705 % of all cache refs  (66.66%)

---1port-1node-mq---
 Performance counter stats for 'CPU(s) 2':

 145,025,999  LLC-loads(66.67%)
   3,793,427  LLC-load-misses  #2.62% of all LL-cache hits (66.67%)
  60,878,939  LLC-stores   (33.33%)
   8,044,714  LLC-store-misses (33.33%)
 294,713,070  cache-references (50.00%)
  11,923,354  cache-misses #4.046 % of all cache refs  (66.66%)

---1port-1node-nomq---
 Performance counter stats for 'CPU(s) 2':

 157,375,709  LLC-loads(66.66%)
 476,117  LLC-load-misses  #0.30% of all LL-cache hits (66.66%)
  76,046,098  LLC-stores   (33.34%)
 840,756  LLC-store-misses (33.34%)
 326,230,969  cache-references (50.00%)
   1,332,398  cache-misses #0.408 % of all cache refs  (66.67%)

==

--2port-allnodes-new-mq--
 Performance counter stats for 

Re: [PATCH 1/2] block: Implement global tagset

2017-04-06 Thread Hannes Reinecke
On 04/06/2017 08:27 AM, Arun Easi wrote:
> Hi Hannes,
> 
> Thanks for taking a crack at the issue. My comments below..
> 
> On Tue, 4 Apr 2017, 5:07am, Hannes Reinecke wrote:
> 
>> Most legacy HBAs have a tagset per HBA, not per queue. To map
>> these devices onto block-mq this patch implements a new tagset
>> flag BLK_MQ_F_GLOBAL_TAGS, which will cause the tag allocator
>> to use just one tagset for all hardware queues.
>>
>> Signed-off-by: Hannes Reinecke 
>> ---
>>  block/blk-mq-tag.c | 12 
>>  block/blk-mq.c | 10 --
>>  include/linux/blk-mq.h |  1 +
>>  3 files changed, 17 insertions(+), 6 deletions(-)
>>
>> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
>> index e48bc2c..a14e76c 100644
>> --- a/block/blk-mq-tag.c
>> +++ b/block/blk-mq-tag.c
>> @@ -276,9 +276,11 @@ static void blk_mq_all_tag_busy_iter(struct blk_mq_tags 
>> *tags,
>>  void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
>>  busy_tag_iter_fn *fn, void *priv)
>>  {
>> -int i;
>> +int i, lim = tagset->nr_hw_queues;
>>  
>> -for (i = 0; i < tagset->nr_hw_queues; i++) {
>> +if (tagset->flags & BLK_MQ_F_GLOBAL_TAGS)
>> +lim = 1;
>> +for (i = 0; i < lim; i++) {
>>  if (tagset->tags && tagset->tags[i])
>>  blk_mq_all_tag_busy_iter(tagset->tags[i], fn, priv);
>>  }
>> @@ -287,12 +289,14 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set 
>> *tagset,
>>  
>>  int blk_mq_reinit_tagset(struct blk_mq_tag_set *set)
>>  {
>> -int i, j, ret = 0;
>> +int i, j, ret = 0, lim = set->nr_hw_queues;
>>  
>>  if (!set->ops->reinit_request)
>>  goto out;
>>  
>> -for (i = 0; i < set->nr_hw_queues; i++) {
>> +if (set->flags & BLK_MQ_F_GLOBAL_TAGS)
>> +lim = 1;
>> +for (i = 0; i < lim; i++) {
>>  struct blk_mq_tags *tags = set->tags[i];
>>  
>>  for (j = 0; j < tags->nr_tags; j++) {
>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>> index 159187a..db96ed0 100644
>> --- a/block/blk-mq.c
>> +++ b/block/blk-mq.c
>> @@ -2061,6 +2061,10 @@ static bool __blk_mq_alloc_rq_map(struct 
>> blk_mq_tag_set *set, int hctx_idx)
>>  {
>>  int ret = 0;
>>  
>> +if ((set->flags & BLK_MQ_F_GLOBAL_TAGS) && hctx_idx != 0) {
>> +set->tags[hctx_idx] = set->tags[0];
>> +return true;
>> +}
> 
> So, this effectively make all request allocations to the same NUMA node 
> locality of the hctx_idx 0, correct? Is the performance hit you were 
> talking about in the cover letter?
> 
Yes. It does make the request allocations local to NUMA node 0, but then
this will only affect LLDDs which are actually _using_ NUMA locality
when allocating the request nodes.
However, SCSI doesn't set a NUMA node locality to begin with, so this
doesn't affect us.

No, what I meant is this:
the 'sbitmap' allocator already splits up the bitmap into several words,
which then should provide a better NUMA locality per map.
When we're using a shared global map it's unclear whether the individual
words of the sbitmap can and will be moved to the various NUMA nodes, or
whether we suffer from non-locality.

My tests so far have been inconclusive; but then I'm not happy with the
testcase anyway (using null_blk I only get 250k/250k r/w IOPs, which I
found rather disappointing).

> Do you have any other alternatives in mind? Dynamic growing/shrinking 
> tags/request-pool in hctx with a fixed base as start?
> 
> One alternative that comes to my mind is to move the divvy up logic to 
> SCSI (instead of LLD doing it), IOW:
> 
> 1. Have SCSI set tag_set.queue_depth to can_queue/nr_hw_queues
> 2. Have blk_mq_unique_tag() (or a new i/f) returning "hwq * nr_hw_queue + 
>rq->tag"
> 
> That would make the tags linear in the can_queue space, but could result 
> in poor use of LLD resource if a given hctx has used up all its tags.
> 
Exactly. This is the method I used for implementing mq support for lpfc
and mpt3sas; however the complaint there indeed was that we might be
running into a tag starvation scenario with a large number of LUNs and
single-threaded I/O submission.

> On a related note, would not the current use of can_queue in SCSI lead to 
> poor resource utilization in MQ cases? Like, block layer allocating 
> nr_hw_queues * tags+request+driver_data.etc * can_queue, but SCSI limiting 
> the number of requests to can_queue.
> 
Yes, indeed. That's another problem which we should be looking at.
However, it's only ever relevant if we indeed implement some divvying
logic; if we move to the shared tags approach it should work as designed.

> BTW, if you would like me to try out this patch on my setup, please let me 
> know.
> 
Oh, yes. Please do.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   zSeries & Storage
h...@suse.com  +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, D. 

Re: [PATCH 1/2] block: Implement global tagset

2017-04-06 Thread Arun Easi
Hi Hannes,

Thanks for taking a crack at the issue. My comments below..

On Tue, 4 Apr 2017, 5:07am, Hannes Reinecke wrote:

> Most legacy HBAs have a tagset per HBA, not per queue. To map
> these devices onto block-mq this patch implements a new tagset
> flag BLK_MQ_F_GLOBAL_TAGS, which will cause the tag allocator
> to use just one tagset for all hardware queues.
> 
> Signed-off-by: Hannes Reinecke 
> ---
>  block/blk-mq-tag.c | 12 
>  block/blk-mq.c | 10 --
>  include/linux/blk-mq.h |  1 +
>  3 files changed, 17 insertions(+), 6 deletions(-)
> 
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index e48bc2c..a14e76c 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -276,9 +276,11 @@ static void blk_mq_all_tag_busy_iter(struct blk_mq_tags 
> *tags,
>  void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
>   busy_tag_iter_fn *fn, void *priv)
>  {
> - int i;
> + int i, lim = tagset->nr_hw_queues;
>  
> - for (i = 0; i < tagset->nr_hw_queues; i++) {
> + if (tagset->flags & BLK_MQ_F_GLOBAL_TAGS)
> + lim = 1;
> + for (i = 0; i < lim; i++) {
>   if (tagset->tags && tagset->tags[i])
>   blk_mq_all_tag_busy_iter(tagset->tags[i], fn, priv);
>   }
> @@ -287,12 +289,14 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set 
> *tagset,
>  
>  int blk_mq_reinit_tagset(struct blk_mq_tag_set *set)
>  {
> - int i, j, ret = 0;
> + int i, j, ret = 0, lim = set->nr_hw_queues;
>  
>   if (!set->ops->reinit_request)
>   goto out;
>  
> - for (i = 0; i < set->nr_hw_queues; i++) {
> + if (set->flags & BLK_MQ_F_GLOBAL_TAGS)
> + lim = 1;
> + for (i = 0; i < lim; i++) {
>   struct blk_mq_tags *tags = set->tags[i];
>  
>   for (j = 0; j < tags->nr_tags; j++) {
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 159187a..db96ed0 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2061,6 +2061,10 @@ static bool __blk_mq_alloc_rq_map(struct 
> blk_mq_tag_set *set, int hctx_idx)
>  {
>   int ret = 0;
>  
> + if ((set->flags & BLK_MQ_F_GLOBAL_TAGS) && hctx_idx != 0) {
> + set->tags[hctx_idx] = set->tags[0];
> + return true;
> + }

So, this effectively make all request allocations to the same NUMA node 
locality of the hctx_idx 0, correct? Is the performance hit you were 
talking about in the cover letter?

Do you have any other alternatives in mind? Dynamic growing/shrinking 
tags/request-pool in hctx with a fixed base as start?

One alternative that comes to my mind is to move the divvy up logic to 
SCSI (instead of LLD doing it), IOW:

1. Have SCSI set tag_set.queue_depth to can_queue/nr_hw_queues
2. Have blk_mq_unique_tag() (or a new i/f) returning "hwq * nr_hw_queue + 
   rq->tag"

That would make the tags linear in the can_queue space, but could result 
in poor use of LLD resource if a given hctx has used up all its tags.

On a related note, would not the current use of can_queue in SCSI lead to 
poor resource utilization in MQ cases? Like, block layer allocating 
nr_hw_queues * tags+request+driver_data.etc * can_queue, but SCSI limiting 
the number of requests to can_queue.

BTW, if you would like me to try out this patch on my setup, please let me 
know.

Regards,
-Arun

>   set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
>   set->queue_depth, set->reserved_tags);
>   if (!set->tags[hctx_idx])
> @@ -2080,8 +2084,10 @@ static void blk_mq_free_map_and_requests(struct 
> blk_mq_tag_set *set,
>unsigned int hctx_idx)
>  {
>   if (set->tags[hctx_idx]) {
> - blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
> - blk_mq_free_rq_map(set->tags[hctx_idx]);
> + if (!(set->flags & BLK_MQ_F_GLOBAL_TAGS) || hctx_idx == 0) {
> + blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
> + blk_mq_free_rq_map(set->tags[hctx_idx]);
> + }
>   set->tags[hctx_idx] = NULL;
>   }
>  }
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index b296a90..eee27b016 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -155,6 +155,7 @@ enum {
>   BLK_MQ_F_DEFER_ISSUE= 1 << 4,
>   BLK_MQ_F_BLOCKING   = 1 << 5,
>   BLK_MQ_F_NO_SCHED   = 1 << 6,
> + BLK_MQ_F_GLOBAL_TAGS= 1 << 7,
>   BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
>   BLK_MQ_F_ALLOC_POLICY_BITS = 1,
>  
> 


[PATCH 1/2] block: Implement global tagset

2017-04-04 Thread Hannes Reinecke
Most legacy HBAs have a tagset per HBA, not per queue. To map
these devices onto block-mq this patch implements a new tagset
flag BLK_MQ_F_GLOBAL_TAGS, which will cause the tag allocator
to use just one tagset for all hardware queues.

Signed-off-by: Hannes Reinecke 
---
 block/blk-mq-tag.c | 12 
 block/blk-mq.c | 10 --
 include/linux/blk-mq.h |  1 +
 3 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index e48bc2c..a14e76c 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -276,9 +276,11 @@ static void blk_mq_all_tag_busy_iter(struct blk_mq_tags 
*tags,
 void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
busy_tag_iter_fn *fn, void *priv)
 {
-   int i;
+   int i, lim = tagset->nr_hw_queues;
 
-   for (i = 0; i < tagset->nr_hw_queues; i++) {
+   if (tagset->flags & BLK_MQ_F_GLOBAL_TAGS)
+   lim = 1;
+   for (i = 0; i < lim; i++) {
if (tagset->tags && tagset->tags[i])
blk_mq_all_tag_busy_iter(tagset->tags[i], fn, priv);
}
@@ -287,12 +289,14 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set 
*tagset,
 
 int blk_mq_reinit_tagset(struct blk_mq_tag_set *set)
 {
-   int i, j, ret = 0;
+   int i, j, ret = 0, lim = set->nr_hw_queues;
 
if (!set->ops->reinit_request)
goto out;
 
-   for (i = 0; i < set->nr_hw_queues; i++) {
+   if (set->flags & BLK_MQ_F_GLOBAL_TAGS)
+   lim = 1;
+   for (i = 0; i < lim; i++) {
struct blk_mq_tags *tags = set->tags[i];
 
for (j = 0; j < tags->nr_tags; j++) {
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 159187a..db96ed0 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2061,6 +2061,10 @@ static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set 
*set, int hctx_idx)
 {
int ret = 0;
 
+   if ((set->flags & BLK_MQ_F_GLOBAL_TAGS) && hctx_idx != 0) {
+   set->tags[hctx_idx] = set->tags[0];
+   return true;
+   }
set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
set->queue_depth, set->reserved_tags);
if (!set->tags[hctx_idx])
@@ -2080,8 +2084,10 @@ static void blk_mq_free_map_and_requests(struct 
blk_mq_tag_set *set,
 unsigned int hctx_idx)
 {
if (set->tags[hctx_idx]) {
-   blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
-   blk_mq_free_rq_map(set->tags[hctx_idx]);
+   if (!(set->flags & BLK_MQ_F_GLOBAL_TAGS) || hctx_idx == 0) {
+   blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
+   blk_mq_free_rq_map(set->tags[hctx_idx]);
+   }
set->tags[hctx_idx] = NULL;
}
 }
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index b296a90..eee27b016 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -155,6 +155,7 @@ enum {
BLK_MQ_F_DEFER_ISSUE= 1 << 4,
BLK_MQ_F_BLOCKING   = 1 << 5,
BLK_MQ_F_NO_SCHED   = 1 << 6,
+   BLK_MQ_F_GLOBAL_TAGS= 1 << 7,
BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
BLK_MQ_F_ALLOC_POLICY_BITS = 1,
 
-- 
1.8.5.6