Re: question about request merge

2018-04-22 Thread Zhengyuan Liu
2018-04-22 0:23 GMT+08:00 Jens Axboe <ax...@kernel.dk>:
> On 4/21/18 8:07 AM, Zhengyuan Liu wrote:
>> 2018-04-20 22:34 GMT+08:00 Jens Axboe <ax...@kernel.dk>:
>>> On 4/19/18 9:51 PM, Zhengyuan Liu wrote:
>>>> Hi, Shaohua
>>>>
>>>> I found it indeed doesn't do front merge when two threads flush plug list  
>>>> concurrently.   To
>>>> reappear , I prepared two IO threads , named a0.io and a1.io .
>>>> Thread a1.io  uses libaio to write 5 requests :
>>>> sectors: 16 + 8, 40 + 8, 64 + 8, 88 + 8, 112 + 8
>>>> Thread a0.io  uses libaio to write other 5 requests :
>>>> sectors: 8+ 8, 32 + 8, 56 + 8, 80 + 8, 104 + 8
>>>
>>> I'm cutting some of the below.
>>>
>>> Thanks for the detailed email. It's mostly on purpose that we don't
>>> spend cycles and memory on maintaining a separate front merge hash,
>>> since it's generally not something that happens very often. If you have
>>> a thread pool doing IO and split sequential IO such that you would
>>> benefit a lot from front merging, then I would generally claim that
>>> you're not writing your app in the most optimal manner.
>>>
>>
>> Thanks for explanation, I only consider the problem through the code's
>> perspective and ignore the reality situation of app.
>
> That's quite by design and not accidental.
>
>>> So I'm curious, what's the big interest in front merging?
>>
>> If it's not something that happens so much often, I think it's not worth to
>> support front merging too.
>>
>> By the way, I got another question that why not  blktrace tracing the back
>> merging of requests while flushing plugged requests to queue,  if it does
>> we may get a more clear view about IO merging.
>
> Not sure I follow, exactly where is a back merge trace missing?
>

I mean blktrace only traces bio merging, not request merging. Let
me give an example: I use thread a.out to write three bios, as shown below:

a.out:  0 + 8, 16 + 8, 8 + 8

The blktrace result is shown below:

  8,16   17 0.292069180  1222  Q  WS 0 + 8 [a.out]
  8,16   18 0.292073960  1222  G  WS 0 + 8 [a.out]
  8,16   19 0.292074440  1222  P   N [a.out]
  8,16   1   10 0.292079380  1222  Q  WS 16 + 8 [a.out]
  8,16   1   11 0.292081840  1222  G  WS 16 + 8 [a.out]
  8,16   1   12 0.292085860  1222  Q  WS 8 + 8 [a.out]
  8,16   1   13 0.292087240  1222  F  WS 8 + 8 [a.out]
  8,16   1   14 0.292089100  1222  I  WS 0 + 8 [a.out]
  8,16   1   15 0.292095200  1222  I  WS 8 + 16 [a.out]
  8,16   1   16 0.295931920  1222  U   N [a.out] 2
  8,16   1   17 0.298528980  1222  D  WS 0 + 24 [a.out]
  8,16   03 0.302617360 3  C  WS 0 + 24 [0]
  
  Total (8,16):
 Reads Queued:           0,        0KiB  Writes Queued:           3,       12KiB
 Read Dispatches:        0,        0KiB  Write Dispatches:        1,       12KiB
 Reads Requeued:         0                Writes Requeued:         0
 Reads Completed:        0,        0KiB  Writes Completed:        1,       12KiB
 Read Merges:            0,        0KiB  Write Merges:            1,        4KiB
 PC Reads Queued:        0,        0KiB  PC Writes Queued:        0,        0KiB
 PC Read Disp.:          3,        0KiB  PC Write Disp.:          0,        0KiB
 PC Reads Req.:          0                PC Writes Req.:          0
 PC Reads Compl.:        3                PC Writes Compl.:        0
 IO unplugs:             1                Timer unplugs:           0

We merge bio(8 + 8) into request(16 + 8) at the plug stage, and that is well
traced as F; but when it comes to the unplug stage, request(0 + 8) and
request(8 + 16) get merged into a single request(0 + 24), yet there is no
trace information about that operation.

 So I'm just a bit curious and please forgive my ignorance.

Thanks.
> --
> Jens Axboe
>

Re: question about request merge

2018-04-21 Thread Zhengyuan Liu
2018-04-20 22:34 GMT+08:00 Jens Axboe <ax...@kernel.dk>:
> On 4/19/18 9:51 PM, Zhengyuan Liu wrote:
>> Hi, Shaohua
>>
>> I found it indeed doesn't do front merge when two threads flush plug list  
>> concurrently.   To
>> reappear , I prepared two IO threads , named a0.io and a1.io .
>> Thread a1.io  uses libaio to write 5 requests :
>> sectors: 16 + 8, 40 + 8, 64 + 8, 88 + 8, 112 + 8
>> Thread a0.io  uses libaio to write other 5 requests :
>> sectors: 8+ 8, 32 + 8, 56 + 8, 80 + 8, 104 + 8
>
> I'm cutting some of the below.
>
> Thanks for the detailed email. It's mostly on purpose that we don't
> spend cycles and memory on maintaining a separate front merge hash,
> since it's generally not something that happens very often. If you have
> a thread pool doing IO and split sequential IO such that you would
> benefit a lot from front merging, then I would generally claim that
> you're not writing your app in the most optimal manner.
>

Thanks for the explanation. I only considered the problem from the code's
perspective and ignored the practical situation of applications.

> So I'm curious, what's the big interest in front merging?

If it's not something that happens very often, I agree it's not worth
supporting front merging either.

By the way, I have another question: why doesn't blktrace trace the back
merging of requests while flushing plugged requests to the queue? If it did,
we might get a clearer view of IO merging.

>
> --
> Jens Axboe
>

Re:question about request merge

2018-04-19 Thread Zhengyuan Liu
  8,16   2  259   471.608957780  1473  G  WS 112 + 8 [a1.io]
  8,16   2  260   471.608959880  1473  I  WS 16 + 8 [a1.io]
  8,16   2  261   471.608975880  1473  I  WS 40 + 8 [a1.io]
  8,16   2  262   471.608986420  1473  I  WS 64 + 8 [a1.io]
  8,16   2  263   471.608995480  1473  I  WS 88 + 8 [a1.io]
  8,16   2  264   471.609003720  1473  I  WS 112 + 8 [a1.io]
  8,16   3  267   471.610200680  1474  Q  WS 8 + 8 [a0.io]
  8,16   3  268   471.610204700  1474  G  WS 8 + 8 [a0.io]
  8,16   3  270   471.610210280  1474  Q  WS 32 + 8 [a0.io]
  8,16   3  271   471.610211840  1474  G  WS 32 + 8 [a0.io]
  8,16   3  272   471.610215660  1474  Q  WS 56 + 8 [a0.io]
  8,16   3  273   471.610217200  1474  G  WS 56 + 8 [a0.io]
  8,16   3  274   471.610220860  1474  Q  WS 80 + 8 [a0.io]
  8,16   3  275   471.610222300  1474  G  WS 80 + 8 [a0.io]
  8,16   3  276   471.610225720  1474  Q  WS 104 + 8 [a0.io]
  8,16   3  277   471.610227540  1474  G  WS 104 + 8 [a0.io]
  8,16   3  278   471.610229100  1474  I  WS 8 + 8 [a0.io]
  8,16   3  279   471.615291420  1474  I  WS 32 + 8 [a0.io]
  8,16   3  280   471.620311200  1474  I  WS 56 + 8 [a0.io]
  8,16   3  281   471.625327620  1474  I  WS 80 + 8 [a0.io]
  8,16   3  282   471.630343080  1474  I  WS 104 + 8 [a0.io]
  8,16   3  284   471.637881600  1474  D  WS 104 + 16 [a0.io]
  8,16   3  285   471.640429120  1474  D  WS 80 + 16 [a0.io]
  8,16   0  397   471.644573100 3  C  WS 104 + 16 [0]
  8,16   0  398   471.647159640 3  D  WS 56 + 16 [ksoftirqd/0]
  8,16   1   49   471.64982594013  C  WS 80 + 16 [0]
  8,16   1   50   471.65239154013  D  WS 32 + 16 [ksoftirqd/1]
  8,16   0  399   471.654446580 3  C  WS 56 + 16 [0]
  8,16   0  400   471.657000820 3  D  WS 8 + 16 [ksoftirqd/0]
  8,16   0  401   471.659445000 3  C  WS 32 + 16 [0]
  8,16   0  402   471.663946360     3  C  WS 8 + 16 [0]

Now those adjacent request get merged.

-- Original --
From:  "Zhengyuan Liu"<liuzhengy...@kylinos.cn>;
Date:  Fri, Apr 13, 2018 04:16 PM
To:  "shli"<s...@fb.com>;
Subject:  question about request merge
 
Hi, Shaohua

A question came up while I was reading the block layer code, so I'm writing
to ask for your help.

When a request from the plug list gets flushed to the queue, it tries to merge
into an existing request by calling elv_attempt_insert_merge before being
added directly to the queue, as shown below:

/*
 * Attempt to do an insertion back merge. Only check for the case where
 * we can append 'rq' to an existing request, so we can throw 'rq' away
 * afterwards.
 *
 * Returns true if we merged, false otherwise
 */
bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq)
{
	struct request *__rq;
	bool ret;

	if (blk_queue_nomerges(q))
		return false;

	/*
	 * First try one-hit cache.
	 */
	if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq))
		return true;

	if (blk_queue_noxmerges(q))
		return false;

	ret = false;
	/*
	 * See if our hash lookup can find a potential backmerge.
	 */
	while (1) {
		__rq = elv_rqhash_find(q, blk_rq_pos(rq));
		if (!__rq || !blk_attempt_req_merge(q, __rq, rq))
			break;

		/* The merged request could be merged with others, try again */
		ret = true;
		rq = __rq;
	}

	return ret;
}

From the comment and code we can see it only does back merging. I think that's
enough when there is only one thread doing IO, since we have sorted the
requests in the plug list before unplugging — see the patches from Jianpeng Ma,
422765c ("block: Remove should_sort judgement when flush blk_plug") and
975927b ("block: Add blk_rq_pos(rq) to sort rq when plushing"). However, when
it comes to multiple threads, let's imagine the IO scenario below, where
threads A and B each hold three requests in their plug lists.

threadA: a+1, a+4, a+7
threadB: a+2, a+5, a+8

If threadA has flushed all its requests to the queue before threadB does, then
requests a+1 and a+2 and other adjacent pairs have a chance to get merged
through back merging; but if threadB flushes all its requests to the queue
before threadA, how could those adjacent requests get merged? Or does it not
matter at all? I know about your patch bee0393 ("block: recursive merge
requests")

[PATCH] block/brd: add a numa_node module parameter

2017-07-20 Thread Zhengyuan Liu
Provide users a way to control which NUMA node the memory for the ram
device is allocated from. By default the memory allocation can fall
into more than one NUMA node for one ram device; with this patch you
can make one device allocate from just one node (provided it does not
exceed that node's total memory) and another device from a different
node, as the parameter is writable.

Signed-off-by: Zhengyuan Liu <liuzhengy...@kylinos.cn>
---
 drivers/block/brd.c | 32 +++-
 1 file changed, 27 insertions(+), 5 deletions(-)

diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 17723fd5..a9e5867 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -56,10 +56,28 @@ struct brd_device {
struct radix_tree_root  brd_pages;
 };
 
+static DEFINE_MUTEX(brd_mutex);
+
+static int rd_numa_node = NUMA_NO_NODE;
+
+static int set_rd_numa_node(const char *val, struct kernel_param *kp)
+{
+   int ret = param_set_int(val, kp);
+   if (ret)
+   return ret;
+   mutex_lock(&brd_mutex);
+   if (rd_numa_node < NUMA_NO_NODE)
+   rd_numa_node = NUMA_NO_NODE;
+   else if (rd_numa_node > num_online_nodes() - 1)
+   rd_numa_node = num_online_nodes() - 1;
+   pr_info("setting rd_numa_node to (%d)\n", rd_numa_node);
+   mutex_unlock(&brd_mutex);
+   return 0;
+}
+
 /*
  * Look up and return a brd's page for a given sector.
  */
-static DEFINE_MUTEX(brd_mutex);
 static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
 {
pgoff_t idx;
@@ -114,7 +132,7 @@ static struct page *brd_insert_page(struct brd_device *brd, 
sector_t sector)
 #ifndef CONFIG_BLK_DEV_RAM_DAX
gfp_flags |= __GFP_HIGHMEM;
 #endif
-   page = alloc_page(gfp_flags);
+   page = alloc_pages_node(rd_numa_node, gfp_flags, 0);
if (!page)
return NULL;
 
@@ -413,14 +431,14 @@ static struct brd_device *brd_alloc(int i)
struct brd_device *brd;
struct gendisk *disk;
 
-   brd = kzalloc(sizeof(*brd), GFP_KERNEL);
+   brd = kzalloc_node(sizeof(*brd), GFP_KERNEL, rd_numa_node);
if (!brd)
goto out;
brd->brd_number = i;
	spin_lock_init(&brd->brd_lock);
	INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC);
 
-   brd->brd_queue = blk_alloc_queue(GFP_KERNEL);
+   brd->brd_queue = blk_alloc_queue_node(GFP_KERNEL, rd_numa_node);
if (!brd->brd_queue)
goto out_free_dev;
 
@@ -434,7 +452,7 @@ static struct brd_device *brd_alloc(int i)
 *  is harmless)
 */
blk_queue_physical_block_size(brd->brd_queue, PAGE_SIZE);
-   disk = brd->brd_disk = alloc_disk(max_part);
+   disk = brd->brd_disk = alloc_disk_node(max_part, rd_numa_node);
if (!disk)
goto out_free_queue;
disk->major = RAMDISK_MAJOR;
@@ -593,6 +611,10 @@ static void __exit brd_exit(void)
pr_info("brd: module unloaded\n");
 }
 
+module_param_call(rd_numa_node, set_rd_numa_node, param_get_int,
+   &rd_numa_node, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(rd_numa_node, "NUMA node to allocate memory from");
+
 module_init(brd_init);
 module_exit(brd_exit);
 
-- 
2.7.4