From: Mike Christie <[email protected]>

This patch adds rbd support for SCSI COMPARE_AND_WRITE commands. Higher
levels such as LIO submit IMG_REQ_CMP_AND_WRITE image requests, and rbd
turns each one into a single OSD request carrying CMPEXT and WRITE ops.

Signed-off-by: Mike Christie <[email protected]>
---
 drivers/block/rbd.c | 182 ++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 162 insertions(+), 20 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 425c3d8..b6d7f33 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -216,6 +216,7 @@ enum obj_operation_type {
        OBJ_OP_WRITE,
        OBJ_OP_READ,
        OBJ_OP_DISCARD,
+       OBJ_OP_CMP_AND_WRITE,
 };
 
 enum obj_req_flags {
@@ -289,6 +290,7 @@ enum img_req_flags {
        IMG_REQ_CHILD,          /* initiator: block = 0, child image = 1 */
        IMG_REQ_LAYERED,        /* ENOENT handling: normal = 0, layered = 1 */
        IMG_REQ_DISCARD,        /* discard: normal = 0, discard request = 1 */
+       IMG_REQ_CMP_AND_WRITE,  /* normal = 0, compare and write request = 1 */
 };
 
 struct rbd_img_request {
@@ -296,10 +298,9 @@ struct rbd_img_request {
        u64                     offset; /* starting image byte offset */
        u64                     length; /* byte count from offset */
        unsigned long           flags;
-       union {
-               u64                     snap_id;        /* for reads */
-               struct ceph_snap_context *snapc;        /* for writes */
-       };
+
+       u64                     snap_id;        /* for reads */
+       struct ceph_snap_context *snapc;        /* for writes */
 
        struct request          *rq;            /* block request */
        struct rbd_obj_request  *obj_request;   /* obj req initiator */
@@ -818,6 +819,8 @@ static int obj_num_ops(enum obj_operation_type op_type)
        switch (op_type) {
        case OBJ_OP_WRITE:
                return 2;
+       case OBJ_OP_CMP_AND_WRITE:
+               return 3;
        default:
                return 1;
        }
@@ -832,6 +835,8 @@ static char* obj_op_name(enum obj_operation_type op_type)
                return "write";
        case OBJ_OP_DISCARD:
                return "discard";
+       case OBJ_OP_CMP_AND_WRITE:
+               return "compare-and-write";
        default:
                return "???";
        }
@@ -1749,10 +1754,23 @@ static bool img_request_layered_test(struct rbd_img_request *img_request)
        return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
 }
 
+static void img_request_cmp_and_write_set(struct rbd_img_request *img_request)
+{
+       set_bit(IMG_REQ_CMP_AND_WRITE, &img_request->flags);
+       smp_mb();       /* order the flag store before later accesses */
+}
+
+static bool img_request_cmp_and_write_test(struct rbd_img_request *img_request)
+{
+       smp_mb();       /* order earlier accesses before the flag load */
+       return test_bit(IMG_REQ_CMP_AND_WRITE, &img_request->flags) != 0;
+}
+
 static bool img_request_is_write_type_test(struct rbd_img_request *img_request)
 {
        return img_request_write_test(img_request) ||
-              img_request_discard_test(img_request);
+              img_request_discard_test(img_request) ||
+              img_request_cmp_and_write_test(img_request);
 }
 
 static enum obj_operation_type
@@ -1762,6 +1780,8 @@ rbd_img_request_op_type(struct rbd_img_request *img_request)
                return OBJ_OP_WRITE;
        else if (img_request_discard_test(img_request))
                return OBJ_OP_DISCARD;
+       else if (img_request_cmp_and_write_test(img_request))
+               return OBJ_OP_CMP_AND_WRITE;
        else
                return OBJ_OP_READ;
 }
@@ -1856,6 +1876,23 @@ static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
        obj_request_done_set(obj_request);
 }
 
+static void rbd_osd_cmpext_callback(struct rbd_obj_request *obj_request,
+                                   struct ceph_osd_request *osd_req)
+{
+       dout("%s: obj %p result %d %llu\n", __func__, obj_request,
+               obj_request->result, obj_request->length);
+
+       if (obj_request->result == -EILSEQ)
+               /*
+                * On mismatch the reply buffer holds the offset and the
+                * mismatched data; index 1 is assumed to be the CMPEXT op.
+                */
+               obj_request->xferred = osd_req->r_reply_op_len[1];
+       else
+               obj_request->xferred = obj_request->length;
+       obj_request_done_set(obj_request);
+}
+
 static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request)
 {
        dout("%s: obj %p result %d %llu\n", __func__, obj_request,
@@ -1915,11 +1952,19 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
                rbd_osd_read_callback(obj_request);
                break;
        case CEPH_OSD_OP_SETALLOCHINT:
-               rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE);
-               /* fall through */
+               if (osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE)
+                       rbd_osd_write_callback(obj_request);
+               else if (osd_req->r_ops[1].op == CEPH_OSD_OP_CMPEXT)
+                       rbd_osd_cmpext_callback(obj_request, osd_req);
+               else
+                       rbd_assert(0);
+               break;
        case CEPH_OSD_OP_WRITE:
                rbd_osd_write_callback(obj_request);
                break;
+       case CEPH_OSD_OP_CMPEXT:
+               rbd_osd_cmpext_callback(obj_request, osd_req);
+               break;
        case CEPH_OSD_OP_STAT:
                rbd_osd_stat_callback(obj_request);
                break;
@@ -1943,6 +1988,22 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
                rbd_obj_request_complete(obj_request);
 }
 
+static void rbd_osd_req_format_rw(struct rbd_obj_request *obj_request)
+{
+       struct rbd_img_request *img_request = obj_request->img_request;
+       struct ceph_osd_request *osd_req = obj_request->osd_req;
+       struct ceph_snap_context *snapc;
+       struct timespec mtime = CURRENT_TIME;
+       u64 snap_id;
+       /* Build the OSD request with both a snap context and a snap id. */
+       rbd_assert(osd_req != NULL);
+       /* Without an image request, use no snap context and CEPH_NOSNAP. */
+       snapc = img_request ? img_request->snapc : NULL;
+       snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
+       ceph_osdc_build_request(osd_req, obj_request->offset,
+                               snapc, snap_id, &mtime);
+}
+
 static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
 {
        struct rbd_img_request *img_request = obj_request->img_request;
@@ -1975,6 +2036,7 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
  * A write request has either one (watch) or two (hint+write) osd ops.
  * (All rbd data writes are prefixed with an allocation hint op, but
  * technically osd watch is a write request, hence this distinction.)
+ * An extent compare-and-write has three (hint+cmp+write).
  */
 static struct ceph_osd_request *rbd_osd_req_create(
                                        struct rbd_device *rbd_dev,
@@ -1987,12 +2049,15 @@ static struct ceph_osd_request *rbd_osd_req_create(
        struct ceph_osd_request *osd_req;
 
        if (obj_request_img_data_test(obj_request) &&
-               (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) {
+               (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE ||
+                op_type == OBJ_OP_CMP_AND_WRITE)) {
                struct rbd_img_request *img_request = obj_request->img_request;
                if (op_type == OBJ_OP_WRITE) {
                        rbd_assert(img_request_write_test(img_request));
-               } else {
+               } else if (op_type == OBJ_OP_DISCARD) {
                        rbd_assert(img_request_discard_test(img_request));
+               } else if (op_type == OBJ_OP_CMP_AND_WRITE) {
+                       rbd_assert(img_request_cmp_and_write_test(img_request));
                }
                snapc = img_request->snapc;
        }
@@ -2007,7 +2072,8 @@ static struct ceph_osd_request *rbd_osd_req_create(
        if (!osd_req)
                return NULL;    /* ENOMEM */
 
-       if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
+       if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD ||
+           op_type == OBJ_OP_CMP_AND_WRITE)
                osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
        else
                osd_req->r_flags = CEPH_OSD_FLAG_READ;
@@ -2236,6 +2302,10 @@ static struct rbd_img_request *rbd_img_request_create(
        } else if (op_type == OBJ_OP_WRITE) {
                img_request_write_set(img_request);
                img_request->snapc = snapc;
+       } else if (op_type == OBJ_OP_CMP_AND_WRITE) {
+               img_request_cmp_and_write_set(img_request);
+               img_request->snapc = snapc;
+               img_request->snap_id = rbd_dev->spec->snap_id;
        } else {
                img_request->snap_id = rbd_dev->spec->snap_id;
        }
@@ -2332,18 +2402,11 @@ static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
        result = obj_request->result;
        if (result) {
                struct rbd_device *rbd_dev = img_request->rbd_dev;
-               enum obj_operation_type op_type;
-
-               if (img_request_discard_test(img_request))
-                       op_type = OBJ_OP_DISCARD;
-               else if (img_request_write_test(img_request))
-                       op_type = OBJ_OP_WRITE;
-               else
-                       op_type = OBJ_OP_READ;
 
                rbd_warn(rbd_dev, "%s %llx at %llx (%llx)",
-                       obj_op_name(op_type), obj_request->length,
-                       obj_request->img_offset, obj_request->offset);
+                       obj_op_name(rbd_img_request_op_type(img_request)),
+                       obj_request->length, obj_request->img_offset,
+                       obj_request->offset);
                rbd_warn(rbd_dev, "  result %d xferred %x",
                        result, xferred);
                if (!img_request->result)
@@ -2624,6 +2687,85 @@ out_unwind:
        return -ENOMEM;
 }
 
+int rbd_img_cmp_and_write_request_fill(struct rbd_img_request *img_request,
+                                      struct scatterlist *cmp_sgl,
+                                      u64 cmp_length,
+                                      struct scatterlist *write_sgl,
+                                      u64 write_length,
+                                      struct page **response_pages,
+                                      u64 response_length)
+{
+       struct rbd_device *rbd_dev = img_request->rbd_dev;
+       u64 object_size = rbd_obj_bytes(&rbd_dev->header);
+       struct rbd_obj_request *obj_request;
+       struct ceph_osd_request *osd_req;
+       const char *object_name;
+       int num_ops = 0;
+       u64 img_offset;
+       u64 offset;
+       /* Build one object request carrying hint, CMPEXT and WRITE ops. */
+       img_offset = img_request->offset;
+       offset = rbd_segment_offset(rbd_dev, img_offset);
+
+       /*
+        * LIO currently only issues 1-sector requests, so a compare that
+        * would span segments is rejected; write_length is not checked here.
+        */
+       if (rbd_segment_length(rbd_dev, offset, cmp_length) != cmp_length)
+               return -EOPNOTSUPP;
+
+       object_name = rbd_segment_name(rbd_dev, img_offset);
+       if (!object_name)
+               return -EINVAL;
+
+       obj_request = rbd_obj_request_create(object_name, offset,
+                                            cmp_length, OBJ_REQUEST_SG);
+       /* The object request keeps its own copy of the object name. */
+       rbd_segment_name_free(object_name);
+       if (!obj_request)
+               return -ENOMEM;
+
+       rbd_img_obj_request_add(img_request, obj_request);
+
+       obj_request->pages = response_pages;
+       obj_request->page_count = calc_pages_for(0, response_length);
+
+       osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_CMP_AND_WRITE, 3,
+                                    obj_request);
+       if (!osd_req)
+               goto del_obj_req;
+
+       obj_request->osd_req = osd_req;
+       obj_request->callback = rbd_img_obj_callback;
+       obj_request->img_offset = img_offset;
+       /* Op 0: allocation hint, both sizes set to the object size. */
+       osd_req_op_alloc_hint_init(osd_req, num_ops, object_size, object_size);
+       /* Op 1: CMPEXT over cmp_length bytes, fed cmp_sgl and the pages. */
+       num_ops++;
+       osd_req_op_extent_init(osd_req, num_ops, CEPH_OSD_OP_CMPEXT, offset,
+                              cmp_length, 0, 0);
+       osd_req_op_extent_osd_data_sg(osd_req, num_ops, cmp_sgl, 0, cmp_length);
+       osd_req_op_extent_osd_data_pages(osd_req, num_ops, obj_request->pages,
+                                        response_length, 0,
+                                        obj_request->page_count, false);
+       /* Op 2: WRITE of write_length bytes from write_sgl at the same offset. */
+       num_ops++;
+       osd_req_op_extent_init(osd_req, num_ops, CEPH_OSD_OP_WRITE, offset,
+                              write_length, 0, 0);
+       osd_req_op_extent_osd_data_sg(osd_req, num_ops, write_sgl, 0,
+                                     write_length);
+
+       rbd_osd_req_format_rw(obj_request);
+
+       rbd_img_request_get(img_request);
+       return 0;
+
+del_obj_req:
+       rbd_img_obj_request_del(img_request, obj_request);
+       return -ENOMEM;
+}
+EXPORT_SYMBOL(rbd_img_cmp_and_write_request_fill);
+
 static void
 rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
 {
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to