In an effort to reduce fragmentation, prefix every rbd write with a
CEPH_OSD_OP_SETALLOCHINT osd op whose expected_write_size is set to the
object size (1 << order).  Backwards compatibility is taken care of on
the libceph/osd side.

Signed-off-by: Ilya Dryomov <[email protected]>
---
 drivers/block/rbd.c |   66 ++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 49 insertions(+), 17 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 6cf001ef00bc..14496f39c770 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1662,7 +1662,12 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
         */
        obj_request->xferred = osd_req->r_reply_op_len[0];
        rbd_assert(obj_request->xferred < (u64)UINT_MAX);
+
        opcode = osd_req->r_ops[0].op;
+       if (opcode == CEPH_OSD_OP_SETALLOCHINT) {
+               BUG_ON(osd_req->r_ops[1].op != CEPH_OSD_OP_WRITE);
+               opcode = CEPH_OSD_OP_WRITE;
+       }
        switch (opcode) {
        case CEPH_OSD_OP_READ:
                rbd_osd_read_callback(obj_request);
@@ -1715,6 +1720,12 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
                        snapc, CEPH_NOSNAP, &mtime);
 }
 
+/*
+ * Create an osd request.  A read request has one osd op (read).
+ * A write request has either one (watch) or two (hint+write) osd ops.
+ * (All rbd writes are prefixed with an allocation hint op, but
+ * technically osd watch is a write request, hence this distinction.)
+ */
 static struct ceph_osd_request *rbd_osd_req_create(
                                        struct rbd_device *rbd_dev,
                                        bool write_request,
@@ -1734,7 +1745,8 @@ static struct ceph_osd_request *rbd_osd_req_create(
                        snapc = img_request->snapc;
        }
 
-       rbd_assert(num_ops == 1);
+       rbd_assert((!write_request && num_ops == 1) ||
+                  (write_request && num_ops >= 1 && num_ops <= 2));
 
        /* Allocate and initialize the request, for the num_ops ops */
 
@@ -1760,8 +1772,8 @@ static struct ceph_osd_request *rbd_osd_req_create(
 
 /*
  * Create a copyup osd request based on the information in the
- * object request supplied.  A copyup request has two osd ops,
- * a copyup method call, and a "normal" write request.
+ * object request supplied.  A copyup request has three osd ops,
+ * a copyup method call, a hint op, and a write op.
  */
 static struct ceph_osd_request *
 rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
@@ -1777,12 +1789,12 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
        rbd_assert(img_request);
        rbd_assert(img_request_write_test(img_request));
 
-       /* Allocate and initialize the request, for the two ops */
+       /* Allocate and initialize the request, for the three ops */
 
        snapc = img_request->snapc;
        rbd_dev = img_request->rbd_dev;
        osdc = &rbd_dev->rbd_client->client->osdc;
-       osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC);
+       osd_req = ceph_osdc_alloc_request(osdc, snapc, 3, false, GFP_ATOMIC);
        if (!osd_req)
                return NULL;    /* ENOMEM */
 
@@ -2159,12 +2171,10 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,
        struct page **pages = NULL;
        u64 img_offset;
        u64 resid;
-       u16 opcode;
 
        dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
                (int)type, data_desc);
 
-       opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
        img_offset = img_request->offset;
        resid = img_request->length;
        rbd_assert(resid > 0);
@@ -2183,6 +2193,8 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,
                const char *object_name;
                u64 offset;
                u64 length;
+               unsigned int which;
+               u16 opcode;
 
                object_name = rbd_segment_name(rbd_dev, img_offset);
                if (!object_name)
@@ -2224,20 +2236,34 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,
                        pages += page_count;
                }
 
-               osd_req = rbd_osd_req_create(rbd_dev, write_request, 1,
+               osd_req = rbd_osd_req_create(rbd_dev, write_request,
+                                            (write_request ? 2 : 1),
                                             obj_request);
                if (!osd_req)
                        goto out_partial;
                obj_request->osd_req = osd_req;
                obj_request->callback = rbd_img_obj_callback;
 
-               osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
-                                               0, 0);
+               if (write_request) {
+                       osd_req_op_hint_init(osd_req, 0,
+                                            CEPH_OSD_OP_SETALLOCHINT,
+                                            rbd_obj_bytes(&rbd_dev->header),
+                                            rbd_obj_bytes(&rbd_dev->header),
+                                            0);
+
+                       which = 1;
+                       opcode = CEPH_OSD_OP_WRITE;
+               } else {
+                       which = 0;
+                       opcode = CEPH_OSD_OP_READ;
+               }
+               osd_req_op_extent_init(osd_req, which, opcode, offset, length,
+                                      0, 0);
                if (type == OBJ_REQUEST_BIO)
-                       osd_req_op_extent_osd_data_bio(osd_req, 0,
+                       osd_req_op_extent_osd_data_bio(osd_req, which,
                                        obj_request->bio_list, length);
                else
-                       osd_req_op_extent_osd_data_pages(osd_req, 0,
+                       osd_req_op_extent_osd_data_pages(osd_req, which,
                                        obj_request->pages, length,
                                        offset & ~PAGE_MASK, false, false);
 
@@ -2358,7 +2384,7 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
 
        /*
         * The original osd request is of no use to use any more.
-        * We need a new one that can hold the two ops in a copyup
+        * We need a new one that can hold the three ops in a copyup
         * request.  Allocate the new copyup osd request for the
         * original request, and release the old one.
         */
@@ -2377,17 +2403,23 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
        osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0,
                                                false, false);
 
-       /* Then the original write request op */
+       /* Then the hint op */
+
+       osd_req_op_hint_init(osd_req, 1, CEPH_OSD_OP_SETALLOCHINT,
+                            rbd_obj_bytes(&rbd_dev->header),
+                            rbd_obj_bytes(&rbd_dev->header), 0);
+
+       /* And the original write request op */
 
        offset = orig_request->offset;
        length = orig_request->length;
-       osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE,
+       osd_req_op_extent_init(osd_req, 2, CEPH_OSD_OP_WRITE,
                                        offset, length, 0, 0);
        if (orig_request->type == OBJ_REQUEST_BIO)
-               osd_req_op_extent_osd_data_bio(osd_req, 1,
+               osd_req_op_extent_osd_data_bio(osd_req, 2,
                                        orig_request->bio_list, length);
        else
-               osd_req_op_extent_osd_data_pages(osd_req, 1,
+               osd_req_op_extent_osd_data_pages(osd_req, 2,
                                        orig_request->pages, length,
                                        offset & ~PAGE_MASK, false, false);
 
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to