On Fri, 21 Feb 2014, Ilya Dryomov wrote:
> This is primarily for rbd's benefit and is supposed to combat
> fragmentation:
> 
> "... knowing that rbd images have a 4m size, librbd can pass a hint
> that will let the osd do the xfs allocation size ioctl on new files so
> that they are allocated in 1m or 4m chunks.  We've seen cases where
> users with rbd workloads have very high levels of fragmentation in xfs
> and this would mitigate that and probably have a pretty nice
> performance benefit."
> 
> SETALLOCHINT is considered advisory, so our backwards compatibility
> mechanism here is to set FAILOK flag for all SETALLOCHINT ops.
> 
> Signed-off-by: Ilya Dryomov <[email protected]>
> ---
>  include/linux/ceph/osd_client.h |    9 +++++++++
>  include/linux/ceph/rados.h      |    8 ++++++++
>  net/ceph/osd_client.c           |   30 ++++++++++++++++++++++++++++++
>  3 files changed, 47 insertions(+)
> 
> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
> index e94f5da251d6..6bfcb0eca8ab 100644
> --- a/include/linux/ceph/osd_client.h
> +++ b/include/linux/ceph/osd_client.h
> @@ -103,6 +103,11 @@ struct ceph_osd_req_op {
>                       u32 timeout;
>                       __u8 flag;
>               } watch;
> +             struct {
> +                     u64 expected_size;
> +                     u64 expected_write_size;
> +                     __u8 expected_size_probability;
> +             } hint;

s/hint/alloc_hint/ ?

>       };
>  };
>  
> @@ -294,6 +299,10 @@ extern void osd_req_op_cls_init(struct ceph_osd_request 
> *osd_req,
>  extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
>                                       unsigned int which, u16 opcode,
>                                       u64 cookie, u64 version, int flag);
> +extern void osd_req_op_hint_init(struct ceph_osd_request *osd_req,
> +                              unsigned int which, u16 opcode,
> +                              u64 expected_size, u64 expected_write_size,
> +                              u8 expected_size_probability);
>  
>  extern struct ceph_osd_request *ceph_osdc_alloc_request(struct 
> ceph_osd_client *osdc,
>                                              struct ceph_snap_context *snapc,
> diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
> index 8f9bf4570215..b8e2dd11f186 100644
> --- a/include/linux/ceph/rados.h
> +++ b/include/linux/ceph/rados.h
> @@ -227,6 +227,9 @@ enum {
>       CEPH_OSD_OP_OMAPRMKEYS    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA 
> | 24,
>       CEPH_OSD_OP_OMAP_CMP      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA 
> | 25,
>  
> +     /* hints */
> +     CEPH_OSD_OP_SETALLOCHINT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA 
> | 35,
> +
>       /** multi **/
>       CEPH_OSD_OP_CLONERANGE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_MULTI | 
> 1,
>       CEPH_OSD_OP_ASSERT_SRC_VERSION = CEPH_OSD_OP_MODE_RD | 
> CEPH_OSD_OP_TYPE_MULTI | 2,
> @@ -416,6 +419,11 @@ struct ceph_osd_op {
>                       __le64 offset, length;
>                       __le64 src_offset;
>               } __attribute__ ((packed)) clonerange;
> +             struct {
> +                     __le64 expected_size;
> +                     __le64 expected_write_size;
> +                     __u8 expected_size_probability;
> +             } __attribute__ ((packed)) hint;

s/hint/alloc_hint/, I think.  Just made the same comment on the user space 
side.

>       };
>       __le32 payload_len;
>  } __attribute__ ((packed));
> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> index 5d7fd0b8c1c8..4090f6e8db3a 100644
> --- a/net/ceph/osd_client.c
> +++ b/net/ceph/osd_client.c
> @@ -436,6 +436,7 @@ static bool osd_req_opcode_valid(u16 opcode)
>       case CEPH_OSD_OP_OMAPCLEAR:
>       case CEPH_OSD_OP_OMAPRMKEYS:
>       case CEPH_OSD_OP_OMAP_CMP:
> +     case CEPH_OSD_OP_SETALLOCHINT:
>       case CEPH_OSD_OP_CLONERANGE:
>       case CEPH_OSD_OP_ASSERT_SRC_VERSION:
>       case CEPH_OSD_OP_SRC_CMPXATTR:
> @@ -591,6 +592,28 @@ void osd_req_op_watch_init(struct ceph_osd_request 
> *osd_req,
>  }
>  EXPORT_SYMBOL(osd_req_op_watch_init);
>  
> +void osd_req_op_hint_init(struct ceph_osd_request *osd_req,
> +                       unsigned int which, u16 opcode,
> +                       u64 expected_size, u64 expected_write_size,
> +                       u8 expected_size_probability)
> +{
> +     struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
> +
> +     BUG_ON(opcode != CEPH_OSD_OP_SETALLOCHINT);

I would just drop the opcode argument altogether.  And 
s/hint/alloc_hint/ in the function name...  I wouldn't expect that any 
other type of hint would have these same arguments.

> +
> +     op->hint.expected_size = expected_size;
> +     op->hint.expected_write_size = expected_write_size;
> +     op->hint.expected_size_probability = expected_size_probability;
> +
> +     /*
> +      * CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed
> +      * not worth a feature bit.  Set FAILOK per-op flag to make
> +      * sure older osds don't trip over an unsupported opcode.
> +      */
> +     op->flags |= CEPH_OSD_OP_FLAG_FAILOK;
> +}
> +EXPORT_SYMBOL(osd_req_op_hint_init);
> +
>  static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
>                               struct ceph_osd_data *osd_data)
>  {
> @@ -681,6 +704,13 @@ static u64 osd_req_encode_op(struct ceph_osd_request 
> *req,
>               dst->watch.ver = cpu_to_le64(src->watch.ver);
>               dst->watch.flag = src->watch.flag;
>               break;
> +     case CEPH_OSD_OP_SETALLOCHINT:
> +             dst->hint.expected_size = cpu_to_le64(src->hint.expected_size);
> +             dst->hint.expected_write_size =
> +                 cpu_to_le64(src->hint.expected_write_size);
> +             dst->hint.expected_size_probability =
> +                 src->hint.expected_size_probability;
> +             break;
>       default:
>               pr_err("unsupported osd opcode %s\n",
>                       ceph_osd_op_name(src->op));
> -- 
> 1.7.10.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to [email protected]
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to