At Fri, 13 Feb 2015 18:20:53 +0900,
Teruaki Ishizaki wrote:
>
> Previously, the qemu block driver for sheepdog used a hard-coded VDI object size.
> This patch enables users to specify the VDI object size.
>
> When you start qemu, you don't need to specify any additional command option.
>
> But when you create a VDI that doesn't have the default object size
> with the qemu-img command, you specify the object_size option.
>
> If you want to create a VDI with an 8MB object size,
> you need to specify the following command option.
>
> # qemu-img create -o object_size=8M sheepdog:test1 100M
>
> In addition, when you don't specify the option to the qemu-img command,
> the default value of the sheepdog cluster is used for creating the VDI.
>
> # qemu-img create sheepdog:test2 100M
>
> Signed-off-by: Teruaki Ishizaki <ishizaki.teru...@lab.ntt.co.jp>
> ---
> V5:
> - Change option from block_size_shift to object_size.
> - Change parse type to QEMU_OPT_SIZE.
> - Add operation to verify max VDI size for resizing.
> - Change to use a 4MB object size when using an old Sheepdog.
>
> V4:
> - Limit the read/write buffer size for creating a preallocated VDI.
> - Replace the parse function for the block_size_shift option.
> - Fix an error message.
>
> V3:
> - Delete the needless buffer operation.
> - Delete the needless operations on the request header
>   for SD_OP_GET_CLUSTER_DEFAULT.
> - Fix coding style problems.
>
> V2:
> - Fix coding style problem (white space).
> - Add members store_policy and block_size_shift to struct SheepdogVdiReq.
> - Initialize request header to use block_size_shift specified by user.
> ---
> block/sheepdog.c | 155 ++++++++++++++++++++++++++++++++++++++-------
> include/block/block_int.h | 1 +
> 2 files changed, 134 insertions(+), 22 deletions(-)
Looks good to me. Acked-by: Hitoshi Mitake <mitake.hito...@lab.ntt.co.jp> Thanks, Hitoshi > > diff --git a/block/sheepdog.c b/block/sheepdog.c > index be3176f..f6fe97e 100644 > --- a/block/sheepdog.c > +++ b/block/sheepdog.c > @@ -37,6 +37,7 @@ > #define SD_OP_READ_VDIS 0x15 > #define SD_OP_FLUSH_VDI 0x16 > #define SD_OP_DEL_VDI 0x17 > +#define SD_OP_GET_CLUSTER_DEFAULT 0x18 > > #define SD_FLAG_CMD_WRITE 0x01 > #define SD_FLAG_CMD_COW 0x02 > @@ -91,6 +92,7 @@ > #define SD_NR_VDIS (1U << 24) > #define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22) > #define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS) > +#define SD_DEFAULT_BLOCK_SIZE_SHIFT 22 > /* > * For erasure coding, we use at most SD_EC_MAX_STRIP for data strips and > * (SD_EC_MAX_STRIP - 1) for parity strips > @@ -167,7 +169,8 @@ typedef struct SheepdogVdiReq { > uint32_t base_vdi_id; > uint8_t copies; > uint8_t copy_policy; > - uint8_t reserved[2]; > + uint8_t store_policy; > + uint8_t block_size_shift; > uint32_t snapid; > uint32_t type; > uint32_t pad[2]; > @@ -186,6 +189,21 @@ typedef struct SheepdogVdiRsp { > uint32_t pad[5]; > } SheepdogVdiRsp; > > +typedef struct SheepdogClusterRsp { > + uint8_t proto_ver; > + uint8_t opcode; > + uint16_t flags; > + uint32_t epoch; > + uint32_t id; > + uint32_t data_length; > + uint32_t result; > + uint8_t nr_copies; > + uint8_t copy_policy; > + uint8_t block_size_shift; > + uint8_t __pad1; > + uint32_t __pad2[6]; > +} SheepdogClusterRsp; > + > typedef struct SheepdogInode { > char name[SD_MAX_VDI_LEN]; > char tag[SD_MAX_VDI_TAG_LEN]; > @@ -1544,6 +1562,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t > *vdi_id, int snapshot, > hdr.vdi_size = s->inode.vdi_size; > hdr.copy_policy = s->inode.copy_policy; > hdr.copies = s->inode.nr_copies; > + hdr.block_size_shift = s->inode.block_size_shift; > > ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen); > > @@ -1569,9 +1588,12 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t > *vdi_id, int 
snapshot, > static int sd_prealloc(const char *filename, Error **errp) > { > BlockDriverState *bs = NULL; > + BDRVSheepdogState *base = NULL; > + unsigned long buf_size; > uint32_t idx, max_idx; > + uint32_t object_size; > int64_t vdi_size; > - void *buf = g_malloc0(SD_DATA_OBJ_SIZE); > + void *buf = NULL; > int ret; > > ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, > @@ -1585,18 +1607,24 @@ static int sd_prealloc(const char *filename, Error > **errp) > ret = vdi_size; > goto out; > } > - max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE); > + > + base = bs->opaque; > + object_size = (UINT32_C(1) << base->inode.block_size_shift); > + buf_size = MIN(object_size, SD_DATA_OBJ_SIZE); > + buf = g_malloc0(buf_size); > + > + max_idx = DIV_ROUND_UP(vdi_size, buf_size); > > for (idx = 0; idx < max_idx; idx++) { > /* > * The created image can be a cloned image, so we need to read > * a data from the source image. > */ > - ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE); > + ret = bdrv_pread(bs, idx * buf_size, buf, buf_size); > if (ret < 0) { > goto out; > } > - ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE); > + ret = bdrv_pwrite(bs, idx * buf_size, buf, buf_size); > if (ret < 0) { > goto out; > } > @@ -1669,6 +1697,27 @@ static int parse_redundancy(BDRVSheepdogState *s, > const char *opt) > return 0; > } > > +static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt) > +{ > + struct SheepdogInode *inode = &s->inode; > + uint64_t object_size; > + int obj_order; > + > + object_size = qemu_opt_get_size_del(opt, BLOCK_OPT_OBJECT_SIZE, 0); > + if (object_size) { > + if ((object_size - 1) & object_size) { /* not a power of 2? 
*/ > + return -EINVAL; > + } > + obj_order = ffs(object_size) - 1; > + if (obj_order < 20 || obj_order > 31) { > + return -EINVAL; > + } > + inode->block_size_shift = (uint8_t)obj_order; > + } > + > + return 0; > +} > + > static int sd_create(const char *filename, QemuOpts *opts, > Error **errp) > { > @@ -1679,6 +1728,7 @@ static int sd_create(const char *filename, QemuOpts > *opts, > BDRVSheepdogState *s; > char tag[SD_MAX_VDI_TAG_LEN]; > uint32_t snapid; > + uint64_t max_vdi_size; > bool prealloc = false; > > s = g_new0(BDRVSheepdogState, 1); > @@ -1717,10 +1767,11 @@ static int sd_create(const char *filename, QemuOpts > *opts, > goto out; > } > } > - > - if (s->inode.vdi_size > SD_MAX_VDI_SIZE) { > - error_setg(errp, "too big image size"); > - ret = -EINVAL; > + ret = parse_block_size_shift(s, opts); > + if (ret < 0) { > + error_setg(errp, "Invalid object_size." > + " obect_size needs to be power of 2" > + " and be limited from 2^20 to 2^31"); > goto out; > } > > @@ -1757,6 +1808,51 @@ static int sd_create(const char *filename, QemuOpts > *opts, > } > > s->aio_context = qemu_get_aio_context(); > + > + /* if block_size_shift is not specified, get cluster default value */ > + if (s->inode.block_size_shift == 0) { > + SheepdogVdiReq hdr; > + SheepdogClusterRsp *rsp = (SheepdogClusterRsp *)&hdr; > + Error *local_err = NULL; > + int fd; > + unsigned int wlen = 0, rlen = 0; > + > + fd = connect_to_sdog(s, &local_err); > + if (fd < 0) { > + error_report("%s", error_get_pretty(local_err)); > + error_free(local_err); > + ret = -EIO; > + goto out; > + } > + > + memset(&hdr, 0, sizeof(hdr)); > + hdr.opcode = SD_OP_GET_CLUSTER_DEFAULT; > + hdr.proto_ver = SD_PROTO_VER; > + > + ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, > + NULL, &wlen, &rlen); > + closesocket(fd); > + if (ret) { > + error_setg_errno(errp, -ret, "failed to get cluster default"); > + goto out; > + } > + if (rsp->result == SD_RES_SUCCESS) { > + s->inode.block_size_shift = rsp->block_size_shift; > + 
} else { > + s->inode.block_size_shift = SD_DEFAULT_BLOCK_SIZE_SHIFT; > + } > + } > + > + max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * > MAX_DATA_OBJS; > + > + if (s->inode.vdi_size > max_vdi_size) { > + error_setg(errp, "An image is too large." > + " The maximum image size is %"PRIu64 "GB", > + max_vdi_size / 1024 / 1024 / 1024); > + ret = -EINVAL; > + goto out; > + } > + > ret = do_sd_create(s, &vid, 0, errp); > if (ret) { > goto out; > @@ -1827,11 +1923,13 @@ static int sd_truncate(BlockDriverState *bs, int64_t > offset) > BDRVSheepdogState *s = bs->opaque; > int ret, fd; > unsigned int datalen; > + uint64_t max_vdi_size; > > + max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * > MAX_DATA_OBJS; > if (offset < s->inode.vdi_size) { > error_report("shrinking is not supported"); > return -EINVAL; > - } else if (offset > SD_MAX_VDI_SIZE) { > + } else if (offset > max_vdi_size) { > error_report("too big image size"); > return -EINVAL; > } > @@ -2013,9 +2111,10 @@ static int coroutine_fn sd_co_rw_vector(void *p) > SheepdogAIOCB *acb = p; > int ret = 0; > unsigned long len, done = 0, total = acb->nb_sectors * BDRV_SECTOR_SIZE; > - unsigned long idx = acb->sector_num * BDRV_SECTOR_SIZE / > SD_DATA_OBJ_SIZE; > + unsigned long idx; > + uint32_t object_size; > uint64_t oid; > - uint64_t offset = (acb->sector_num * BDRV_SECTOR_SIZE) % > SD_DATA_OBJ_SIZE; > + uint64_t offset; > BDRVSheepdogState *s = acb->common.bs->opaque; > SheepdogInode *inode = &s->inode; > AIOReq *aio_req; > @@ -2032,6 +2131,10 @@ static int coroutine_fn sd_co_rw_vector(void *p) > } > } > > + object_size = (UINT32_C(1) << inode->block_size_shift); > + idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size; > + offset = (acb->sector_num * BDRV_SECTOR_SIZE) % object_size; > + > /* > * Make sure we don't free the aiocb before we are done with all > requests. > * This additional reference is dropped at the end of this function. 
> @@ -2045,7 +2148,7 @@ static int coroutine_fn sd_co_rw_vector(void *p) > > oid = vid_to_data_oid(inode->data_vdi_id[idx], idx); > > - len = MIN(total - done, SD_DATA_OBJ_SIZE - offset); > + len = MIN(total - done, object_size - offset); > > switch (acb->aiocb_type) { > case AIOCB_READ_UDATA: > @@ -2069,7 +2172,7 @@ static int coroutine_fn sd_co_rw_vector(void *p) > * We discard the object only when the whole object is > * 1) allocated 2) trimmed. Otherwise, simply skip it. > */ > - if (len != SD_DATA_OBJ_SIZE || inode->data_vdi_id[idx] == 0) { > + if (len != object_size || inode->data_vdi_id[idx] == 0) { > goto done; > } > break; > @@ -2426,6 +2529,7 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, > uint8_t *data, > uint64_t offset; > uint32_t vdi_index; > uint32_t vdi_id = load ? s->inode.parent_vdi_id : s->inode.vdi_id; > + uint32_t object_size = (UINT32_C(1) << s->inode.block_size_shift); > > fd = connect_to_sdog(s, &local_err); > if (fd < 0) { > @@ -2435,10 +2539,10 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, > uint8_t *data, > } > > while (remaining) { > - vdi_index = pos / SD_DATA_OBJ_SIZE; > - offset = pos % SD_DATA_OBJ_SIZE; > + vdi_index = pos / object_size; > + offset = pos % object_size; > > - data_len = MIN(remaining, SD_DATA_OBJ_SIZE - offset); > + data_len = MIN(remaining, object_size - offset); > > vmstate_oid = vid_to_vmstate_oid(vdi_id, vdi_index); > > @@ -2525,10 +2629,11 @@ sd_co_get_block_status(BlockDriverState *bs, int64_t > sector_num, int nb_sectors, > { > BDRVSheepdogState *s = bs->opaque; > SheepdogInode *inode = &s->inode; > + uint32_t object_size = (UINT32_C(1) << inode->block_size_shift); > uint64_t offset = sector_num * BDRV_SECTOR_SIZE; > - unsigned long start = offset / SD_DATA_OBJ_SIZE, > + unsigned long start = offset / object_size, > end = DIV_ROUND_UP((sector_num + nb_sectors) * > - BDRV_SECTOR_SIZE, SD_DATA_OBJ_SIZE); > + BDRV_SECTOR_SIZE, object_size); > unsigned long idx; > int64_t ret = BDRV_BLOCK_DATA 
| BDRV_BLOCK_OFFSET_VALID | offset; > > @@ -2547,7 +2652,7 @@ sd_co_get_block_status(BlockDriverState *bs, int64_t > sector_num, int nb_sectors, > } > } > > - *pnum = (idx - start) * SD_DATA_OBJ_SIZE / BDRV_SECTOR_SIZE; > + *pnum = (idx - start) * object_size / BDRV_SECTOR_SIZE; > if (*pnum > nb_sectors) { > *pnum = nb_sectors; > } > @@ -2558,14 +2663,15 @@ static int64_t > sd_get_allocated_file_size(BlockDriverState *bs) > { > BDRVSheepdogState *s = bs->opaque; > SheepdogInode *inode = &s->inode; > - unsigned long i, last = DIV_ROUND_UP(inode->vdi_size, SD_DATA_OBJ_SIZE); > + uint32_t object_size = (UINT32_C(1) << inode->block_size_shift); > + unsigned long i, last = DIV_ROUND_UP(inode->vdi_size, object_size); > uint64_t size = 0; > > for (i = 0; i < last; i++) { > if (inode->data_vdi_id[i] == 0) { > continue; > } > - size += SD_DATA_OBJ_SIZE; > + size += object_size; > } > return size; > } > @@ -2594,6 +2700,11 @@ static QemuOptsList sd_create_opts = { > .type = QEMU_OPT_STRING, > .help = "Redundancy of the image" > }, > + { > + .name = BLOCK_OPT_OBJECT_SIZE, > + .type = QEMU_OPT_SIZE, > + .help = "Object size of the image" > + }, > { /* end of list */ } > } > }; > diff --git a/include/block/block_int.h b/include/block/block_int.h > index 7ad1950..5e718a3 100644 > --- a/include/block/block_int.h > +++ b/include/block/block_int.h > @@ -56,6 +56,7 @@ > #define BLOCK_OPT_ADAPTER_TYPE "adapter_type" > #define BLOCK_OPT_REDUNDANCY "redundancy" > #define BLOCK_OPT_NOCOW "nocow" > +#define BLOCK_OPT_OBJECT_SIZE "object_size" > > #define BLOCK_PROBE_BUF_SIZE 512 > > -- > 1.7.1 > > -- > sheepdog mailing list > sheep...@lists.wpkg.org > https://lists.wpkg.org/mailman/listinfo/sheepdog