This implements a per-vdi allocator for http extent-based container. We use meta object tracks the free information of data vdi for the object allocation in a free list. Free list is a redundant structure for bitmap for faster allocation.
future enhancements: - Add allocation group for scalability and solve the meta size limitation - use aio to speed up discard of objects - add range support for inode update Signed-off-by: Liu Yuan <[email protected]> --- sheep/Makefile.am | 3 +- sheep/http/http.h | 6 + sheep/http/oalloc.c | 304 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 312 insertions(+), 1 deletion(-) create mode 100644 sheep/http/oalloc.c diff --git a/sheep/Makefile.am b/sheep/Makefile.am index 552e86a..af1087f 100644 --- a/sheep/Makefile.am +++ b/sheep/Makefile.am @@ -30,7 +30,8 @@ sheep_SOURCES = sheep.c group.c request.c gateway.c store.c vdi.c \ plain_store.c config.c migrate.c md.c if BUILD_HTTP -sheep_SOURCES += http/http.c http/kv.c http/s3.c http/swift.c +sheep_SOURCES += http/http.c http/kv.c http/s3.c http/swift.c \ + http/oalloc.c endif if BUILD_COROSYNC sheep_SOURCES += cluster/corosync.c diff --git a/sheep/http/http.h b/sheep/http/http.h index 495a66c..046d412 100644 --- a/sheep/http/http.h +++ b/sheep/http/http.h @@ -108,4 +108,10 @@ int http_request_writes(struct http_request *req, const char *str); __printf(2, 3) int http_request_writef(struct http_request *req, const char *fmt, ...); +/* object_allocator.c */ +int oalloc_new_prepare(uint32_t vid, uint64_t *start, uint64_t count); +int oalloc_new_finish(uint32_t vid, uint64_t start, uint64_t count); +int oalloc_free(uint32_t vid, uint64_t start, uint64_t count); +int oalloc_init(uint32_t vid); + #endif /* __SHEEP_HTTP_H__ */ diff --git a/sheep/http/oalloc.c b/sheep/http/oalloc.c new file mode 100644 index 0000000..dc6be68 --- /dev/null +++ b/sheep/http/oalloc.c @@ -0,0 +1,304 @@ +/* + * Copyright (C) 2013 Taobao Inc. + * + * Liu Yuan <[email protected]> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "sheep_priv.h" +#include "http.h" + +/* + * Meta Object tracks the free information of data vdi for the object allocation + * in a free list. Free list is a redundant structure for bitmap for faster + * allocation. + * + * +-------------------------------+ + * | | + * | sorted list v------v + * +--------------------------------+-----------------------+ +--------+ + * | Header | fd1 | fd2 | ... | fdN | .... object data .... | <-- | bitmap | + * +--------------------------------+-----------------------+ +--------- + * |<-- 4M -->| + * + * Best-fit algorithm for allocation and merge and sort the free list at + * deallocation. One simple sorted list is effecient enough for extent based + * invariable user object. + * + * XXX: Add allocation group for scalability and solve the meta size limitation + */ + +struct header { + uint64_t used; + uint64_t nr_free; +}; + +struct free_desc { + uint64_t start; + uint64_t count; +}; + +static inline uint32_t oalloc_meta_length(struct header *hd) +{ + return sizeof(struct header) + sizeof(struct free_desc) * hd->nr_free; +} + +#define HEADER_TO_FREE_DESC(hd) ((struct free_desc *) \ + ((char *)hd + sizeof(struct header))) + +#define MAX_FREE_DESC ((SD_DATA_OBJ_SIZE - sizeof(struct header)) / \ + sizeof(struct free_desc)) + +/* + * Initialize the data vdi + * + * @vid: the vdi where the allocator resides + */ +int oalloc_init(uint32_t vid) +{ + struct strbuf buf = STRBUF_INIT; + struct sd_inode *inode = xmalloc(sizeof(struct sd_inode)); + struct header hd = { + .nr_free = 1, + }; + struct free_desc fd = { + .start = 1, /* Use first object as the meta object */ + .count = MAX_DATA_OBJS - 1, + }; + int ret; + + strbuf_add(&buf, &hd, sizeof(hd)); + strbuf_add(&buf, &fd, sizeof(fd)); + + ret = sd_read_object(vid_to_vdi_oid(vid), (char *)inode, + sizeof(*inode), 0); + if (ret != SD_RES_SUCCESS) { + sd_err("failed to read inode, %" PRIx32", %s", vid, + sd_strerror(ret)); + goto out; + } + ret = sd_write_object(vid_to_data_oid(vid, 0), buf.buf, + buf.len, 0, true); + if (ret != SD_RES_SUCCESS) { + sd_err("failed to create meta object for %" PRIx32", %s", vid, + sd_strerror(ret)); + goto out; + } + INODE_SET_VID(inode, 0, vid); + ret = sd_inode_write_vid(sheep_bnode_writer, inode, 0, + vid, vid, 0, false, false); + if (ret != SD_RES_SUCCESS) { + sd_err("failed to update inode, %" PRIx32", %s", vid, + sd_strerror(ret)); + goto out; + } +out: + strbuf_release(&buf); + free(inode); + return ret; +} + +/* + * Allocate the objects and upate the free list. + * + * Callers are expected to call oalloc_new_finish() to update the inode bitmap + * after filling up the data. + * + * @vid: the vdi where the allocator resides + * @start: start index of the objects to allocate + * @count: number of the objects to allocate + */ +int oalloc_new_prepare(uint32_t vid, uint64_t *start, uint64_t count) +{ + char *meta = xvalloc(SD_DATA_OBJ_SIZE); + struct header *hd; + struct free_desc *fd; + uint64_t oid = vid_to_data_oid(vid, 0); + int i, ret; + + ret = sd_read_object(oid, meta, SD_DATA_OBJ_SIZE, 0); + if (ret != SD_RES_SUCCESS) { + sd_err("failed to read meta %" PRIx64 ", %s", oid, + sd_strerror(ret)); + goto out; + } + + hd = (struct header *)meta; + fd = (struct free_desc *)(meta + oalloc_meta_length(hd)) - 1; + sd_debug("used %"PRIu64", nr_free %"PRIu64, hd->used, hd->nr_free); + for (i = 0; i < hd->nr_free; i++, fd--) { + sd_debug("start %"PRIu64", count %"PRIu64, fd->start, + fd->count); + if (fd->count > count) + break; + } + if (i == hd->nr_free) { + ret = SD_RES_NO_SPACE; + goto out; + } + + *start = fd->start; + fd->start += count; + fd->count -= count; + hd->used += count; + + /* Update the meta object */ + ret = sd_write_object(oid, meta, oalloc_meta_length(hd), 0, false); + if (ret != SD_RES_SUCCESS) + sd_err("failed to update meta %"PRIx64 ", %s", oid, + sd_strerror(ret)); +out: + free(meta); + return ret; +} + +/* + * Update the inode map of the vid + * + * @vid: the vdi where the allocator resides + * @start: start index of the objects to update + * @count: number of the objects to update + */ +int oalloc_new_finish(uint32_t vid, uint64_t start, uint64_t count) +{ + struct sd_inode *inode = xmalloc(sizeof(struct sd_inode)); + int ret; + + ret = sd_read_object(vid_to_vdi_oid(vid), (char *)inode, + sizeof(*inode), 0); + if (ret != SD_RES_SUCCESS) { + sd_err("failed to read inode, %" PRIx64 ", %s", + vid_to_vdi_oid(vid), sd_strerror(ret)); + goto out; + } + /* TODO: add range support for inode update */ + for (uint64_t i = 0; i < count; i++) { + INODE_SET_VID(inode, start + i, vid); + ret = sd_inode_write_vid(sheep_bnode_writer, inode, start + i, + vid, vid, 0, false, false); + if (ret != SD_RES_SUCCESS) { + sd_err("failed to update inode, %" PRIx64", %s", + vid_to_vdi_oid(vid), sd_strerror(ret)); + goto out; + } + } +out: + free(inode); + return ret; +} + +static int free_desc_cmp(struct free_desc *a, struct free_desc *b) +{ + return -intcmp(a->start, b->start); +} + +static inline int update_and_merge_free_desc(char *meta, uint64_t start, + uint64_t count, uint32_t vid) +{ + struct header *hd = (struct header *)meta; + struct free_desc *tail, *fd = HEADER_TO_FREE_DESC(hd); + int i, j; + + /* Try our best to merge it in place, or append it to tail */ + for (i = 0; i < hd->nr_free; i++) { + if (start + count == fd->start) { + fd->start = start; + fd->count += count; + break; + } else if(fd->start + fd->count == start) { + fd->count +=count; + break; + } + fd++; + } + + if (i == hd->nr_free) { + if (hd->nr_free >= MAX_FREE_DESC) + return SD_RES_NO_SPACE; + + tail = (struct free_desc *)(meta + oalloc_meta_length(hd)); + tail->start = start; + tail->count = count; + hd->nr_free++; + } + + hd->used -= count; + xqsort(HEADER_TO_FREE_DESC(hd), hd->nr_free, free_desc_cmp); + + /* Merge as hard as we can */ + j = hd->nr_free - 1; + tail = (struct free_desc *)(meta + oalloc_meta_length(hd)) - 1; + for (i = 0; i < j; i++, tail--) { + struct free_desc *front = tail - 1; + + sd_debug("start %"PRIu64", count %"PRIu64, tail->start, + tail->count); + if (tail->start + tail->count > front->start) + sd_emerg("bad free descriptor found at %"PRIx32, vid); + if (tail->start + tail->count == front->start) { + front->start = tail->start; + front->count += tail->count; + memmove(tail, tail + 1, sizeof(*tail) * i); + hd->nr_free--; + } + } + + return SD_RES_SUCCESS; +} + +/* + * Discard the allocted objects and update the free list of the allocator + * + * Caller should check the return value since it might fail. + * + * @vid: the vdi where the allocator resides + * @start: start index of the objects to free + * @count: number of the objects to free + */ +int oalloc_free(uint32_t vid, uint64_t start, uint64_t count) +{ + char *meta = xvalloc(SD_DATA_OBJ_SIZE); + struct header *hd; + uint64_t oid = vid_to_data_oid(vid, 0); + int i, ret; + + ret = sd_read_object(oid, meta, SD_DATA_OBJ_SIZE, 0); + if (ret != SD_RES_SUCCESS) { + sd_err("failed to read meta %" PRIx64 ", %s", oid, + sd_strerror(ret)); + goto out; + } + + ret = update_and_merge_free_desc(meta, start, count, vid); + if (ret != SD_RES_SUCCESS) + goto out; + + /* XXX use aio to speed up discard of objects */ + for (i = 0; i < count; i++) { + struct sd_req hdr; + + sd_init_req(&hdr, SD_OP_DISCARD_OBJ); + hdr.obj.oid = vid_to_data_oid(vid, start + i); + ret = exec_local_req(&hdr, NULL); + if (ret != SD_RES_SUCCESS) + goto out; + } + + hd = (struct header *)meta; + ret = sd_write_object(oid, meta, oalloc_meta_length(hd), 0, false); + if (ret != SD_RES_SUCCESS) { + sd_err("failed to update meta %"PRIx64 ", %s", oid, + sd_strerror(ret)); + goto out; + } + sd_debug("used %"PRIu64", nr_free %"PRIu64, hd->used, hd->nr_free); +out: + free(meta); + return ret; +} -- 1.7.9.5 -- sheepdog mailing list [email protected] http://lists.wpkg.org/mailman/listinfo/sheepdog
