Hi, Shaohua
I found it indeed doesn't do front merge when two threads flush plug list
concurrently. To
reappear , I prepared two IO threads , named a0.io and a1.io .
Thread a1.io uses libaio to write 5 requests :
sectors: 16 + 8, 40 + 8, 64 + 8, 88 + 8, 112 + 8
Thread a0.io uses libaio to write other 5 requests :
sectors: 8+ 8, 32 + 8, 56 + 8, 80 + 8, 104 + 8
To meet the condition that thread a1.io flush all its requests to queuee before
thread a0.io
does, I delay thread a1.io to run queue after it adds all pluged requests to
queue, seeing
bellow patch:
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -3324,6 +3324,21 @@ void blk_flush_plug_list(struct blk_plug *plug, bool
from_schedule)
/*
* This drops the queue lock
*/
+
+ if (strcmp(current->comm, "a1.io") == 0)
+ {
+ spin_unlock_irq(q->queue_lock);
+ mdelay(2000);
+ spin_lock_irq(q->queue_lock);
+ }
Then I used " ./a1.io & sleep 1 && ./a0.io " to dispatch the IO operation and
used blktrace to
trace the IO processing, the trace result was showed bellow:
8,16 1 174 176.733880160 1217 Q WS 16 + 8 [a1.io]
8,16 1 175 176.733885300 1217 G WS 16 + 8 [a1.io]
8,16 1 177 176.733890820 1217 Q WS 40 + 8 [a1.io]
8,16 1 178 176.733892460 1217 G WS 40 + 8 [a1.io]
8,16 1 179 176.733896380 1217 Q WS 64 + 8 [a1.io]
8,16 1 180 176.733897900 1217 G WS 64 + 8 [a1.io]
8,16 1 181 176.733901260 1217 Q WS 88 + 8 [a1.io]
8,16 1 182 176.733902760 1217 G WS 88 + 8 [a1.io]
8,16 1 183 176.733906200 1217 Q WS 112 + 8 [a1.io]
8,16 1 184 176.733907640 1217 G WS 112 + 8 [a1.io]
8,16 1 185 176.733909560 1217 I WS 16 + 8 [a1.io]
8,16 1 186 176.733925480 1217 I WS 40 + 8 [a1.io]
8,16 1 187 176.733936560 1217 I WS 64 + 8 [a1.io]
8,16 1 188 176.733945920 1217 I WS 88 + 8 [a1.io]
8,16 1 189 176.733953880 1217 I WS 112 + 8 [a1.io]
8,16 2 103 176.734317080 1218 Q WS 8 + 8 [a0.io]
8,16 2 104 176.734320600 1218 G WS 8 + 8 [a0.io]
8,16 2 106 176.734325520 1218 Q WS 32 + 8 [a0.io]
8,16 2 107 176.734327140 1218 G WS 32 + 8 [a0.io]
8,16 2 108 176.734330620 1218 Q WS 56 + 8 [a0.io]
8,16 2 109 176.734332140 1218 G WS 56 + 8 [a0.io]
8,16 2 110 176.734335380 1218 Q WS 80 + 8 [a0.io]
8,16 2 111 176.734336760 1218 G WS 80 + 8 [a0.io]
8,16 2 112 176.734340300 1218 Q WS 104 + 8 [a0.io]
8,16 2 113 176.734341900 1218 G WS 104 + 8 [a0.io]
8,16 2 114 176.734343520 1218 I WS 8 + 8 [a0.io]
8,16 2 115 176.734357280 1218 I WS 32 + 8 [a0.io]
8,16 2 116 176.734366560 1218 I WS 56 + 8 [a0.io]
8,16 2 117 176.734374800 1218 I WS 80 + 8 [a0.io]
8,16 2 118 176.734382460 1218 I WS 104 + 8 [a0.io]
8,16 2 120 176.738012960 1218 D WS 112 + 8 [a0.io]
8,16 0 151 176.740923200 3 C WS 112 + 8 [0]
8,16 2 121 176.741804960 1218 D WS 88 + 8 [a0.io]
8,16 2 122 176.741826060 1218 R WS 88 + 8 [0]
8,16 2 123 176.741827740 1218 I WS 88 + 8 [a0.io]
8,16 0 152 176.745259080 3 D WS 88 + 8 [ksoftirqd/0]
8,16 0 153 176.747824380 3 D WS 64 + 8 [ksoftirqd/0]
8,16 0 154 176.748921220 3 C WS 88 + 8 [0]
8,16 0 155 176.751472640 3 D WS 40 + 8 [ksoftirqd/0]
8,16 0 156 176.753172500 3 C WS 64 + 8 [0]
8,16 0 157 176.755720740 3 D WS 16 + 8 [ksoftirqd/0]
8,16 0 158 176.757920780 3 C WS 40 + 8 [0]
8,16 0 159 176.760471320 3 D WS 104 + 8 [ksoftirqd/0]
8,16 0 160 176.763172180 3 C WS 16 + 8 [0]
8,16 0 161 176.765717560 3 D WS 80 + 8 [ksoftirqd/0]
8,16 0 162 176.766795400 3 C WS 104 + 8 [0]
8,16 0 163 176.769341020 3 D WS 56 + 8 [ksoftirqd/0]
8,16 2 124 176.770926380 18 C WS 80 + 8 [0]
8,16 0 164 176.774335640 924 C WS 56 + 8 [0]
8,16 2 125 176.774642140 18 D WS 32 + 8 [ksoftirqd/2]
8,16 2 126 176.774651300 18 R WS 32 + 8 [0]
8,16 2 127 176.774652900 18 I WS 32 + 8 [ksoftirqd/2]
8,16 0 165 176.778546360 924 D WS 32 + 8 [sshd]
8,16 0 166 176.782498120 924 D WS 8 + 8 [sshd]
8,16 0 167 176.785298980 3 C WS 32 + 8 [0]
8,16 0 168 176.788671240 3 C WS 8 + 8 [0]
We can see from the result that the adjacent requests didn't get merged.
To make those adjacent requests to get merged, I try to write a simple patch:
diff --git a/block/elevator.c b/block/elevator.c
index c3555c9c6..40acd0e84 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -245,6 +245,7 @@ EXPORT_SYMBOL(elevator_exit);
static inline void __elv_rqhash_del(struct request *rq)
{
hash_del(&rq->hash);
+ hash_del(&rq->front_hash);
rq->cmd_flags &= ~REQ_HASHED;
}
@@ -252,6 +253,7 @@ static void elv_rqhash_del(struct request_queue *q, struct
request *rq)
{
if (ELV_ON_HASH(rq))
__elv_rqhash_del(rq);
+
}
static void elv_rqhash_add(struct request_queue *q, struct request *rq)
@@ -260,6 +262,7 @@ static void elv_rqhash_add(struct request_queue *q, struct
request *rq)
BUG_ON(ELV_ON_HASH(rq));
hash_add(e->hash, &rq->hash, rq_hash_key(rq));
+ hash_add(e->front_hash, &rq->front_hash, blk_rq_pos(rq));
rq->cmd_flags |= REQ_HASHED;
}
@@ -290,6 +293,28 @@ static struct request *elv_rqhash_find(struct
request_queue *q, sector_t offset)
return NULL;
}
+static struct request *elv_fronthash_find(struct request_queue *q, sector_t
offset)
+{
+ struct elevator_queue *e = q->elevator;
+ struct hlist_node *next;
+ struct request *rq;
+
+ hash_for_each_possible_safe(e->front_hash, rq, next, front_hash,
offset) {
+ BUG_ON(!ELV_ON_HASH(rq));
+
+ if (unlikely(!rq_mergeable(rq))) {
+ __elv_rqhash_del(rq);
+ continue;
+ }
+
+ if (blk_rq_pos(rq) == offset)
+ return rq;
+ }
+
+ return NULL;
+}
+
+
/*
* RB-tree support functions for inserting/lookup/removal of requests
* in a sorted RB tree.
@@ -493,6 +518,39 @@ static bool elv_attempt_insert_merge(struct request_queue
*q,
return ret;
}
+/*
+ * Attempt to do an insertion front merge.
+ * Returns true if we merged, false otherwise
+ */
+static bool elv_attempt_front_merge(struct request_queue *q,
+ struct request *rq)
+{
+ struct request *__rq;
+ bool ret;
+
+ if (blk_queue_nomerges(q))
+ return false;
+
+ /*
+ * First try one-hit cache.
+ */
+ if (q->last_merge && blk_attempt_req_merge(q, rq, q->last_merge))
+ return true;
+
+ if (blk_queue_noxmerges(q))
+ return false;
+
+ ret = false;
+ /*
+ * See if our hash lookup can find a potential frontmerge.
+ */
+
+ __rq = elv_fronthash_find(q, rq_hash_key(rq));
+ if (__rq && blk_attempt_req_merge(q, rq, __rq)) {
+ ret = true;
+ }
+ return ret;
+}
void elv_merged_request(struct request_queue *q, struct request *rq, int type)
{
@@ -657,6 +715,8 @@ void __elv_add_request(struct request_queue *q, struct
request *rq, int where)
* elevator_add_req_fn.
*/
q->elevator->type->ops.elevator_add_req_fn(q, rq);
+ /* do a front merge after we added it to elevator */
+ elv_attempt_front_merge(q, rq);
break;
case ELEVATOR_INSERT_FLUSH:
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 085a03f67..9fe25be1b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -120,6 +120,8 @@ struct request {
struct list_head ipi_list;
};
+ struct hlist_node front_hash; /* front merge hash */
+
/*
* The rb_node is only used inside the io scheduler, requests
* are pruned when moved to the dispatch queue. So let the
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 638b324f0..1567e5412 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -114,6 +114,7 @@ struct elevator_queue
struct mutex sysfs_lock;
unsigned int registered:1;
DECLARE_HASHTABLE(hash, ELV_HASH_BITS);
+ DECLARE_HASHTABLE(front_hash, ELV_HASH_BITS);
};
/*
With the patch I restart "./a1.io & sleep 1 && ./a0.io" and blktrace result
showed bellow:
8,16 2 249 471.608930240 1473 Q WS 16 + 8 [a1.io]
8,16 2 250 471.608934940 1473 G WS 16 + 8 [a1.io]
8,16 2 252 471.608940680 1473 Q WS 40 + 8 [a1.io]
8,16 2 253 471.608942840 1473 G WS 40 + 8 [a1.io]
8,16 2 254 471.608946500 1473 Q WS 64 + 8 [a1.io]
8,16 2 255 471.608948040 1473 G WS 64 + 8 [a1.io]
8,16 2 256 471.608951460 1473 Q WS 88 + 8 [a1.io]
8,16 2 257 471.608952960 1473 G WS 88 + 8 [a1.io]
8,16 2 258 471.608956340 1473 Q WS 112 + 8 [a1.io]
8,16 2 259 471.608957780 1473 G WS 112 + 8 [a1.io]
8,16 2 260 471.608959880 1473 I WS 16 + 8 [a1.io]
8,16 2 261 471.608975880 1473 I WS 40 + 8 [a1.io]
8,16 2 262 471.608986420 1473 I WS 64 + 8 [a1.io]
8,16 2 263 471.608995480 1473 I WS 88 + 8 [a1.io]
8,16 2 264 471.609003720 1473 I WS 112 + 8 [a1.io]
8,16 3 267 471.610200680 1474 Q WS 8 + 8 [a0.io]
8,16 3 268 471.610204700 1474 G WS 8 + 8 [a0.io]
8,16 3 270 471.610210280 1474 Q WS 32 + 8 [a0.io]
8,16 3 271 471.610211840 1474 G WS 32 + 8 [a0.io]
8,16 3 272 471.610215660 1474 Q WS 56 + 8 [a0.io]
8,16 3 273 471.610217200 1474 G WS 56 + 8 [a0.io]
8,16 3 274 471.610220860 1474 Q WS 80 + 8 [a0.io]
8,16 3 275 471.610222300 1474 G WS 80 + 8 [a0.io]
8,16 3 276 471.610225720 1474 Q WS 104 + 8 [a0.io]
8,16 3 277 471.610227540 1474 G WS 104 + 8 [a0.io]
8,16 3 278 471.610229100 1474 I WS 8 + 8 [a0.io]
8,16 3 279 471.615291420 1474 I WS 32 + 8 [a0.io]
8,16 3 280 471.620311200 1474 I WS 56 + 8 [a0.io]
8,16 3 281 471.625327620 1474 I WS 80 + 8 [a0.io]
8,16 3 282 471.630343080 1474 I WS 104 + 8 [a0.io]
8,16 3 284 471.637881600 1474 D WS 104 + 16 [a0.io]
8,16 3 285 471.640429120 1474 D WS 80 + 16 [a0.io]
8,16 0 397 471.644573100 3 C WS 104 + 16 [0]
8,16 0 398 471.647159640 3 D WS 56 + 16 [ksoftirqd/0]
8,16 1 49 471.649825940 13 C WS 80 + 16 [0]
8,16 1 50 471.652391540 13 D WS 32 + 16 [ksoftirqd/1]
8,16 0 399 471.654446580 3 C WS 56 + 16 [0]
8,16 0 400 471.657000820 3 D WS 8 + 16 [ksoftirqd/0]
8,16 0 401 471.659445000 3 C WS 32 + 16 [0]
8,16 0 402 471.663946360 3 C WS 8 + 16 [0]
Now those adjacent request get merged.
------------------ Original ------------------
From: "Zhengyuan Liu"<[email protected]>;
Date: Fri, Apr 13, 2018 04:16 PM
To: "shli"<[email protected]>;
Subject: question about request merge
Hi, Shaohua
There is a question around me while viewing block layer code, so I'm writing to
you to get help.
When a request from plug list get flushed to queue, it try to merge into a
existing request by
calling function elv_attempt_insert_merge before added directly to the queue,
seeing bellow:
/*
* Attempt to do an insertion back merge. Only check for the case where
* we can append 'rq' to an existing request, so we can throw 'rq' away
* afterwards.
*
* Returns true if we merged, false otherwise
*/
bool elv_attempt_insert_merge(struct request_queue *q, struct request
*rq)
{
struct request *__rq;
bool ret;
if (blk_queue_nomerges(q))
return false;
/*
* First try one-hit cache.
*/
if (q->last_merge && blk_attempt_req_merge(q, q->last_merge,
rq))
return true;
if (blk_queue_noxmerges(q))
return false;
ret = false;
/*
* See if our hash lookup can find a potential backmerge.
*/
while (1) {
__rq = elv_rqhash_find(q, blk_rq_pos(rq));
if (!__rq || !blk_attempt_req_merge(q, __rq, rq))
break;
/* The merged request could be merged with others, try
again */
ret = true;
rq = __rq;
}
return ret;
}
From the comment and code we can see it only do backmerge, I think its enough
when there is only
one thread doing IO operation since we have sorted the requests in plug list
before unplug, seeing the
patch from Jianpeng Ma 422765c("block: Remove should_sort judgement when flush
blk_plug") and
975927b("block: Add blk_rq_pos(rq) to sort rq when plushing"). However when
comes to multiple
threads, lets image bellow IO scenario, thread A and B both hold three requests
in plug list.
threadA: a+1, a+4, a+7
threadB: a+2, a+5, a+8
if threadA has flushed all its request to queue before threadB does, then
request a+1 and a+2 and
other adjacent pairs have the chance to get merged through backmerge, but if
threadB flushs all its
requests to queue before threadA how could those adjacent requests get merged?
or it doesn't matter
at all? I know your patch bee0393("block: recursive merge requests") has
solved a multi-thread merge
problem but not mines.
Its hard to simulate such a IO scenario from userspace and I cann't fingure out
other useful methods
to verify whether the requests in such IO scenario really cann't get merged.
All my thoughts are from
speculation, so I hope you could give me some answers. Any reply would be
thankful.
Zhengyuan