Reviewed-by: Robin Dong <[email protected]>
2013/11/29 Liu Yuan <[email protected]> > This implements a per-vdi allocator for http extent-based container. > > We use meta object tracks the free information of data vdi for the object > allocation in a free list. Free list is a redundant structure for bitmap > for > faster allocation. > > future enhancements: > - Add allocation group for scalability and solve the meta size limitation > - use aio to speed up discard of objects > - add range support for inode update > > Signed-off-by: Liu Yuan <[email protected]> > --- > v2: > - use uint64_t for i,j,k > > sheep/Makefile.am | 3 +- > sheep/http/http.h | 6 + > sheep/http/oalloc.c | 306 > +++++++++++++++++++++++++++++++++++++++++++++++++++ > 3 files changed, 314 insertions(+), 1 deletion(-) > create mode 100644 sheep/http/oalloc.c > > diff --git a/sheep/Makefile.am b/sheep/Makefile.am > index 552e86a..af1087f 100644 > --- a/sheep/Makefile.am > +++ b/sheep/Makefile.am > @@ -30,7 +30,8 @@ sheep_SOURCES = sheep.c group.c request.c > gateway.c store.c vdi.c \ > plain_store.c config.c migrate.c md.c > > if BUILD_HTTP > -sheep_SOURCES += http/http.c http/kv.c http/s3.c http/swift.c > +sheep_SOURCES += http/http.c http/kv.c http/s3.c http/swift.c \ > + http/oalloc.c > endif > if BUILD_COROSYNC > sheep_SOURCES += cluster/corosync.c > diff --git a/sheep/http/http.h b/sheep/http/http.h > index 495a66c..046d412 100644 > --- a/sheep/http/http.h > +++ b/sheep/http/http.h > @@ -108,4 +108,10 @@ int http_request_writes(struct http_request *req, > const char *str); > __printf(2, 3) > int http_request_writef(struct http_request *req, const char *fmt, ...); > > +/* object_allocator.c */ > +int oalloc_new_prepare(uint32_t vid, uint64_t *start, uint64_t count); > +int oalloc_new_finish(uint32_t vid, uint64_t start, uint64_t count); > +int oalloc_free(uint32_t vid, uint64_t start, uint64_t count); > +int oalloc_init(uint32_t vid); > + > #endif /* __SHEEP_HTTP_H__ */ > diff --git a/sheep/http/oalloc.c b/sheep/http/oalloc.c > new file mode 100644 > index 0000000..56362fa > --- /dev/null > +++ b/sheep/http/oalloc.c > @@ -0,0 +1,306 @@ > +/* > + * Copyright (C) 2013 Taobao Inc. > + * > + * Liu Yuan <[email protected]> > + * > + * This program is free software; you can redistribute it and/or > + * modify it under the terms of the GNU General Public License version > + * 2 as published by the Free Software Foundation. > + * > + * You should have received a copy of the GNU General Public License > + * along with this program. If not, see <http://www.gnu.org/licenses/>. > + */ > + > +#include "sheep_priv.h" > +#include "http.h" > + > +/* > + * Meta Object tracks the free information of data vdi for the object > allocation > + * in a free list. Free list is a redundant structure for bitmap for > faster > + * allocation. > + * > + * +-------------------------------+ > + * | | > + * | sorted list v------v > + * +--------------------------------+-----------------------+ > +--------+ > + * | Header | fd1 | fd2 | ... | fdN | .... object data .... | <-- | > bitmap | > + * +--------------------------------+-----------------------+ > +--------- > + * |<-- 4M -->| > + * > + * Best-fit algorithm for allocation and merge and sort the free list at > + * deallocation. One simple sorted list is effecient enough for extent > based > + * invariable user object. > + * > + * XXX: Add allocation group for scalability and solve the meta size > limitation > + */ > + > +struct header { > + uint64_t used; > + uint64_t nr_free; > +}; > + > +struct free_desc { > + uint64_t start; > + uint64_t count; > +}; > + > +static inline uint32_t oalloc_meta_length(struct header *hd) > +{ > + return sizeof(struct header) + sizeof(struct free_desc) * > hd->nr_free; > +} > + > +#define HEADER_TO_FREE_DESC(hd) ((struct free_desc *) \ > + ((char *)hd + sizeof(struct header))) > + > +#define MAX_FREE_DESC ((SD_DATA_OBJ_SIZE - sizeof(struct header)) / \ > + sizeof(struct free_desc)) > + > +/* > + * Initialize the data vdi > + * > + * @vid: the vdi where the allocator resides > + */ > +int oalloc_init(uint32_t vid) > +{ > + struct strbuf buf = STRBUF_INIT; > + struct sd_inode *inode = xmalloc(sizeof(struct sd_inode)); > + struct header hd = { > + .nr_free = 1, > + }; > + struct free_desc fd = { > + .start = 1, /* Use first object as the meta object */ > + .count = MAX_DATA_OBJS - 1, > + }; > + int ret; > + > + strbuf_add(&buf, &hd, sizeof(hd)); > + strbuf_add(&buf, &fd, sizeof(fd)); > + > + ret = sd_read_object(vid_to_vdi_oid(vid), (char *)inode, > + sizeof(*inode), 0); > + if (ret != SD_RES_SUCCESS) { > + sd_err("failed to read inode, %" PRIx32", %s", vid, > + sd_strerror(ret)); > + goto out; > + } > + ret = sd_write_object(vid_to_data_oid(vid, 0), buf.buf, > + buf.len, 0, true); > + if (ret != SD_RES_SUCCESS) { > + sd_err("failed to create meta object for %" PRIx32", %s", > vid, > + sd_strerror(ret)); > + goto out; > + } > + INODE_SET_VID(inode, 0, vid); > + ret = sd_inode_write_vid(sheep_bnode_writer, inode, 0, > + vid, vid, 0, false, false); > + if (ret != SD_RES_SUCCESS) { > + sd_err("failed to update inode, %" PRIx32", %s", vid, > + sd_strerror(ret)); > + goto out; > + } > +out: > + strbuf_release(&buf); > + free(inode); > + return ret; > +} > + > +/* > + * Allocate the objects and upate the free list. > + * > + * Callers are expected to call oalloc_new_finish() to update the inode > bitmap > + * after filling up the data. > + * > + * @vid: the vdi where the allocator resides > + * @start: start index of the objects to allocate > + * @count: number of the objects to allocate > + */ > +int oalloc_new_prepare(uint32_t vid, uint64_t *start, uint64_t count) > +{ > + char *meta = xvalloc(SD_DATA_OBJ_SIZE); > + struct header *hd; > + struct free_desc *fd; > + uint64_t oid = vid_to_data_oid(vid, 0), i; > + int ret; > + > + ret = sd_read_object(oid, meta, SD_DATA_OBJ_SIZE, 0); > + if (ret != SD_RES_SUCCESS) { > + sd_err("failed to read meta %" PRIx64 ", %s", oid, > + sd_strerror(ret)); > + goto out; > + } > + > + hd = (struct header *)meta; > + fd = (struct free_desc *)(meta + oalloc_meta_length(hd)) - 1; > + sd_debug("used %"PRIu64", nr_free %"PRIu64, hd->used, hd->nr_free); > + for (i = 0; i < hd->nr_free; i++, fd--) { > + sd_debug("start %"PRIu64", count %"PRIu64, fd->start, > + fd->count); > + if (fd->count > count) > + break; > + } > + if (i == hd->nr_free) { > + ret = SD_RES_NO_SPACE; > + goto out; > + } > + > + *start = fd->start; > + fd->start += count; > + fd->count -= count; > + hd->used += count; > + > + /* Update the meta object */ > + ret = sd_write_object(oid, meta, oalloc_meta_length(hd), 0, false); > + if (ret != SD_RES_SUCCESS) > + sd_err("failed to update meta %"PRIx64 ", %s", oid, > + sd_strerror(ret)); > +out: > + free(meta); > + return ret; > +} > + > +/* > + * Update the inode map of the vid > + * > + * @vid: the vdi where the allocator resides > + * @start: start index of the objects to update > + * @count: number of the objects to update > + */ > +int oalloc_new_finish(uint32_t vid, uint64_t start, uint64_t count) > +{ > + struct sd_inode *inode = xmalloc(sizeof(struct sd_inode)); > + int ret; > + > + ret = sd_read_object(vid_to_vdi_oid(vid), (char *)inode, > + sizeof(*inode), 0); > + if (ret != SD_RES_SUCCESS) { > + sd_err("failed to read inode, %" PRIx64 ", %s", > + vid_to_vdi_oid(vid), sd_strerror(ret)); > + goto out; > + } > + > + /* TODO: add range support for inode update */ > + for (uint64_t i = 0; i < count; i++) { > + INODE_SET_VID(inode, start + i, vid); > + > + ret = sd_inode_write_vid(sheep_bnode_writer, inode, start > + i, > + vid, vid, 0, false, false); > + if (ret != SD_RES_SUCCESS) { > + sd_err("failed to update inode, %" PRIx64", %s", > + vid_to_vdi_oid(vid), sd_strerror(ret)); > + goto out; > + } > + } > +out: > + free(inode); > + return ret; > +} > + > +static int free_desc_cmp(struct free_desc *a, struct free_desc *b) > +{ > + return -intcmp(a->start, b->start); > +} > + > +static inline int update_and_merge_free_desc(char *meta, uint64_t start, > + uint64_t count, uint32_t vid) > +{ > + struct header *hd = (struct header *)meta; > + struct free_desc *tail, *fd = HEADER_TO_FREE_DESC(hd); > + uint64_t i, j; > + > + /* Try our best to merge it in place, or append it to tail */ > + for (i = 0; i < hd->nr_free; i++) { > + if (start + count == fd->start) { > + fd->start = start; > + fd->count += count; > + break; > + } else if(fd->start + fd->count == start) { > + fd->count +=count; > + break; > + } > + fd++; > + } > + > + if (i == hd->nr_free) { > + if (hd->nr_free >= MAX_FREE_DESC) > + return SD_RES_NO_SPACE; > + > + tail = (struct free_desc *)(meta + oalloc_meta_length(hd)); > + tail->start = start; > + tail->count = count; > + hd->nr_free++; > + } > + > + hd->used -= count; > + xqsort(HEADER_TO_FREE_DESC(hd), hd->nr_free, free_desc_cmp); > + > + /* Merge as hard as we can */ > + j = hd->nr_free - 1; > + tail = (struct free_desc *)(meta + oalloc_meta_length(hd)) - 1; > + for (i = 0; i < j; i++, tail--) { > + struct free_desc *front = tail - 1; > + > + sd_debug("start %"PRIu64", count %"PRIu64, tail->start, > + tail->count); > + if (tail->start + tail->count > front->start) > + sd_emerg("bad free descriptor found at %"PRIx32, > vid); > + if (tail->start + tail->count == front->start) { > + front->start = tail->start; > + front->count += tail->count; > + memmove(tail, tail + 1, sizeof(*tail) * i); > + hd->nr_free--; > + } > + } > + > + return SD_RES_SUCCESS; > +} > + > +/* > + * Discard the allocted objects and update the free list of the allocator > + * > + * Caller should check the return value since it might fail. > + * > + * @vid: the vdi where the allocator resides > + * @start: start index of the objects to free > + * @count: number of the objects to free > + */ > +int oalloc_free(uint32_t vid, uint64_t start, uint64_t count) > +{ > + char *meta = xvalloc(SD_DATA_OBJ_SIZE); > + struct header *hd; > + uint64_t oid = vid_to_data_oid(vid, 0), i; > + int ret; > + > + ret = sd_read_object(oid, meta, SD_DATA_OBJ_SIZE, 0); > + if (ret != SD_RES_SUCCESS) { > + sd_err("failed to read meta %" PRIx64 ", %s", oid, > + sd_strerror(ret)); > + goto out; > + } > + > + ret = update_and_merge_free_desc(meta, start, count, vid); > + if (ret != SD_RES_SUCCESS) > + goto out; > + > + /* XXX use aio to speed up discard of objects */ > + for (i = 0; i < count; i++) { > + struct sd_req hdr; > + > + sd_init_req(&hdr, SD_OP_DISCARD_OBJ); > + hdr.obj.oid = vid_to_data_oid(vid, start + i); > + ret = exec_local_req(&hdr, NULL); > + if (ret != SD_RES_SUCCESS) > + goto out; > + } > + > + hd = (struct header *)meta; > + ret = sd_write_object(oid, meta, oalloc_meta_length(hd), 0, false); > + if (ret != SD_RES_SUCCESS) { > + sd_err("failed to update meta %"PRIx64 ", %s", oid, > + sd_strerror(ret)); > + goto out; > + } > + sd_debug("used %"PRIu64", nr_free %"PRIu64, hd->used, hd->nr_free); > +out: > + free(meta); > + return ret; > +} > -- > 1.7.9.5 > > -- > sheepdog mailing list > [email protected] > http://lists.wpkg.org/mailman/listinfo/sheepdog > -- -- Best Regard Robin Dong
-- sheepdog mailing list [email protected] http://lists.wpkg.org/mailman/listinfo/sheepdog
