On Thu, May 21, 2015 at 12:19 PM, Li Wang <[email protected]> wrote:
> From: Min Chen <[email protected]>
>
> Signed-off-by: Min Chen <[email protected]>
> Signed-off-by: Li Wang <[email protected]>
> Signed-off-by: Yunchuan Wen <[email protected]>
> ---
> drivers/block/rbd.c | 186
> +++++++++++++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 183 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
> index 99a3a556..51d8398 100644
> --- a/drivers/block/rbd.c
> +++ b/drivers/block/rbd.c
> @@ -1851,12 +1851,16 @@ static void rbd_osd_read_callback(struct
> rbd_obj_request *obj_request)
> obj_request, img_request, obj_request->result,
> obj_request->xferred, obj_request->length);
> if (layered && obj_request->result == -ENOENT &&
> - obj_request->img_offset < rbd_dev->parent_overlap)
> + obj_request->img_offset < rbd_dev->parent_overlap) {
> rbd_img_parent_read(obj_request);
> - else if (img_request)
> + rbd_assert(obj_request->img_request);
> + if(is_copy_on_read(obj_request->img_request->rbd_dev))
> + rbd_img_copyup_start(obj_request->img_request,
> obj_request->object_name);
> + } else if (img_request) {
> rbd_img_obj_request_read_callback(obj_request);
> - else
> + } else {
> obj_request_done_set(obj_request);
> + }
> }
>
> static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
> @@ -2915,6 +2919,182 @@ out_err:
> return result;
> }
>
> +static void rbd_img_copyup_end(struct rbd_copyup_request *copyup_request)
> +{
> + struct rbd_img_request *img_request = NULL;
> + rbd_assert(copyup_request);
> + img_request = copyup_request->img_request;
> + rbd_img_copyup_request_del(img_request, copyup_request);
> + rbd_copyup_request_destroy(©up_request->kref);
> + rbd_img_request_put(img_request);
> +}
> +
> +static void rbd_osd_req_copyup_callback(struct ceph_osd_request *osd_req,
> + struct ceph_msg *msg)
> +{
> + struct rbd_copyup_request *copyup_request = NULL;
> + rbd_assert(osd_req);
> + copyup_request = osd_req->r_priv;
> + copyup_request->result = osd_req->r_result;
> + if(copyup_request->callback)
> + copyup_request->callback(copyup_request);
> + else
> + complete_all(©up_request->completion);
> +}
> +
> +static void rbd_img_copyup_write_async(struct rbd_copyup_request
> *copyup_request)
> +{
> + struct rbd_img_request *img_request = NULL;
> + struct ceph_snap_context *snapc = NULL;
> + struct ceph_osd_request *osd_req = NULL;
> + struct ceph_osd_client *osdc = NULL;
> + struct rbd_device *rbd_dev = NULL;
> + struct page **pages = NULL;
> + struct timespec mtime = CURRENT_TIME;
> + u32 page_count = 0;
> + u64 object_size = 0;
> + int result = 0;
> +
> + /* if copyup_request read from parent failed, just end it */
> + if (copyup_request->result < 0) {
> + rbd_img_copyup_end(copyup_request);
> + goto out;
> + }
> +
> + img_request = copyup_request->img_request;
> + rbd_assert(img_request);
> + rbd_dev = img_request->rbd_dev;
> + rbd_assert(rbd_dev);
> + osdc = &rbd_dev->rbd_client->client->osdc;
> + rbd_assert(osdc);
> + snapc = rbd_dev->header.snapc;
> +
> + ceph_osdc_put_request(copyup_request->osd_req);
> +
> + copyup_request->osd_req = NULL;
> + osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
> + if (!osd_req)
> + goto out;
> +
> + pages = copyup_request->copyup_pages;
> + page_count = copyup_request->copyup_page_count;
> + object_size = (u64)1 << rbd_dev->header.obj_order;
> +
> + /* Initialize copyup op */
> + osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
> + osd_req_op_cls_request_data_pages(osd_req, 0, pages, object_size, 0,
> false, false);
> + osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
> + osd_req->r_callback = rbd_osd_req_copyup_callback;
> + osd_req->r_priv = copyup_request;
> +
> + osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
> + ceph_oid_set_name(&osd_req->r_base_oid, copyup_request->object_name);
> +
> + copyup_request->osd_req = osd_req;
> + copyup_request->callback = rbd_img_copyup_end;
> +
> + ceph_osdc_build_request(osd_req, 0, snapc, CEPH_NOSNAP, &mtime);
> + result = ceph_osdc_start_request(osdc, osd_req, false);
> + if(!result)
> + goto out;
> +
> + ceph_osdc_put_request(osd_req);
> +out:
> + return;
> +}
> +
> +static void rbd_img_copyup_start(struct rbd_img_request *img_request,
> + const char *object_name)
> +{
> + struct rbd_copyup_request *copyup_request = NULL;
> + struct rbd_device *rbd_dev = NULL;
> + struct ceph_snap_context *snapc = NULL;
> + struct ceph_osd_client *osdc = NULL;
> + struct ceph_osd_request *osd_req = NULL;
> + const char *parent_object_name = NULL;
> +
> + int result = 0;
> + u64 object_no = (u64)-1;
> + u64 object_size = 0;
> + u64 snap_id = 0;
> + __u8 obj_order = 0;
> + bool is_read = false;
> +
> + rbd_assert(img_request != NULL);
> + rbd_assert(object_name != NULL);
> +
> + rbd_dev = img_request->rbd_dev;
> + rbd_assert(rbd_dev != NULL);
> +
> + is_read = !img_request_write_test(img_request) &&
> + !img_request_discard_test(img_request);
> +
> + object_no = rbd_object_no(rbd_dev, object_name);
> + obj_order = rbd_dev->header.obj_order;
> + object_size = (u64)1 << obj_order;
> +
> + spin_lock(&img_request->copyup_list_lock);
> + /* Find if object_no exists in copyup_list */
> + for_each_copyup_request(img_request, copyup_request) {
> + /* Found, just return */
> + if(copyup_request->object_no == object_no) {
> + spin_unlock(&img_request->copyup_list_lock);
> + return;
> + }
> + }
> + spin_unlock(&img_request->copyup_list_lock);
> +
> + /* Not found, send new copyup request */
> + copyup_request = NULL;
> + osdc = &rbd_dev->rbd_client->client->osdc;
> + parent_object_name = rbd_segment_name(rbd_dev->parent, object_no <<
> obj_order);
> + if (!parent_object_name)
> + goto out;
> + osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
> + if (!osd_req)
> + goto out;
> + copyup_request = rbd_copyup_request_create(object_name, rbd_dev);
> + if (!copyup_request) {
> + ceph_osdc_put_request(osd_req);
> + goto out;
> + }
> +
> + /* Init osd_req */
> + osd_req_op_extent_init(osd_req, 0, CEPH_OSD_OP_READ, 0, object_size,
> 0, 0);
> + osd_req_op_extent_osd_data_pages(osd_req, 0,
> copyup_request->copyup_pages, object_size,
> + 0, false, false);
> +
> + osd_req->r_flags = CEPH_OSD_FLAG_READ;
> + osd_req->r_callback = rbd_osd_req_copyup_callback;
> + osd_req->r_priv = copyup_request;
> +
> + osd_req->r_base_oloc.pool =
> ceph_file_layout_pg_pool(rbd_dev->parent->layout);
> + ceph_oid_set_name(&osd_req->r_base_oid, parent_object_name);
> + rbd_segment_name_free(parent_object_name);
> +
> + /* Init copyup request */
> + rbd_assert(copyup_request->osd_req == NULL);
> + copyup_request->osd_req = osd_req;
> + copyup_request->callback = rbd_img_copyup_write_async;
> +
> + /* Encode osd_req data */
> + snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
> + ceph_osdc_build_request(osd_req, 0, NULL, snap_id, NULL);
> +
> + /* Add copyup request to img_request->copyup_list */
> + rbd_img_copyup_request_add(img_request, copyup_request);
> +
> + rbd_img_request_get(img_request);
> +
> + /* Send osd_req */
> + result = ceph_osdc_start_request(osdc, osd_req, false);
> + if (!result)
> + goto out;
> +out:
> + return;
> +}
> +
> +
> static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
> {
> struct rbd_obj_request *orig_request;
Hi,
Sorry for late feedback, I thought I had sent this..
I have a number of high-level issues with this. First and foremost,
the lifetime rules are unclear. It looks to me like there is nothing
preventing a bunch of async copyup requests from hanging around after
rbd unmap completes. These copyups bump img_request refcount, which
means that img_requests along with rbd_device, rbd_client and
ceph_client will also hang in there after rbd unmap, waiting for late
replies. This is wrong: if there are no images mapped, there should be
no rbd or libceph state around. I'm pretty sure async copyup requests
don't need an osd_client callback at all - you can just send and
forget.
Second, I think sync (copy-on-write) and async (copy-on-read) copyups
should be coordinated with each other. If there is an async copyup in
progress, a sync copyup can just wait for it to complete.
Related to the above is the copyup request list. I think it should be
a per image rather than a per img_request thing - copyup_list in struct
rbd_img_request doesn't make much sense to me. What exactly is it's
use there?
Fourth, unless I'm missing something, the following
rbd_img_parent_read(obj_request);
rbd_assert(obj_request->img_request);
if(is_copy_on_read(obj_request->img_request->rbd_dev))
rbd_img_copyup_start(...);
in rbd_osd_read_callback() means that async copyup requests will be
issued for objects that don't exist in the parent. I think the correct
way to handle this is to wait for a parent read request to complete and
issue a copyup only if it completes successfully.
Thanks,
Ilya
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html