From: Liu Yuan <[email protected]> This deadlock can be reproduced by 026.
We should always service CREATE_AND_WRITE requests instead of queueing them on wait queues while in recovery. Recovery can finish without any objects in the list (rw->count == 0 in some special cases), in which case no one calls resume_wait_recovery_requests() or any other flusher on rw_list or obj_list, so a queued request would never be resumed. Signed-off-by: Liu Yuan <[email protected]> --- sheep/request.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/sheep/request.c b/sheep/request.c index 5981e14..fd210d3 100644 --- a/sheep/request.c +++ b/sheep/request.c @@ -141,6 +141,12 @@ static int check_request_epoch(struct request *req) static bool request_in_recovery(struct request *req) { + + /* For CREATE request, we simply service it */ + if (req->rq.opcode == SD_OP_CREATE_AND_WRITE_PEER || + req->rq.opcode == SD_OP_CREATE_AND_WRITE_OBJ) + return false; + /* * Request from recovery should go down the Farm even if * oid_in_recovery() returns true because we should also try snap @@ -152,10 +158,12 @@ static bool request_in_recovery(struct request *req) * Put request on wait queues of local node */ if (is_recovery_init()) { + dprintf("%"PRIx64" on rw_queue\n", req->local_oid); req->rp.result = SD_RES_OBJ_RECOVERING; list_add_tail(&req->request_list, &sys->wait_rw_queue); } else { + dprintf("%"PRIx64" on obj_queue\n", req->local_oid); list_add_tail(&req->request_list, &sys->wait_obj_queue); } @@ -328,7 +336,7 @@ static void queue_request(struct request *req) goto done; } - dprintf("%s\n", op_name(req->op)); + dprintf("%s, %d\n", op_name(req->op), sys->status); switch (sys->status) { case SD_STATUS_KILLED: -- 1.7.12.84.gefa6462 -- sheepdog mailing list [email protected] http://lists.wpkg.org/mailman/listinfo/sheepdog
