At Sat, 24 Mar 2012 16:47:13 +0800,
Liu Yuan wrote:
> 
> From: Liu Yuan <[email protected]>
> 
> Object cache caches data and vdi objects on the local node. It sits at a
> higher level than the backend store. This extra cache layer translates
> gateway requests into local requests, greatly reducing network traffic and
> improving IO performance.
> 
> Dirty objects will be flushed to cluster storage by a 'sync' request from
> the guest OS.
> 
> - use red-black tree to track dirty objects
> - use file locks to avoid read/write races at object granularity
> - use hash lists to maintain vdi space.
> - each vid has its own independent object cache
> 
> Signed-off-by: Liu Yuan <[email protected]>
> ---
>  sheep/object_cache.c |  421 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  sheep/sheep_priv.h   |   34 ++++
>  2 files changed, 455 insertions(+), 0 deletions(-)
>  create mode 100644 sheep/object_cache.c
> 
> diff --git a/sheep/object_cache.c b/sheep/object_cache.c
> new file mode 100644
> index 0000000..929e28d
> --- /dev/null
> +++ b/sheep/object_cache.c
> @@ -0,0 +1,421 @@
> +/*
> + * Copyright (C) 2012 Taobao Inc.
> + *
> + * Liu Yuan <[email protected]>
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License version
> + * 2 as published by the Free Software Foundation.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program. If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include <stdlib.h>
> +#include <stdio.h>
> +#include <sys/types.h>
> +#include <sys/stat.h>
> +#include <fcntl.h>
> +#include <unistd.h>
> +#include <pthread.h>
> +#include <errno.h>
> +#include <sys/file.h>
> +
> +#include "sheep_priv.h"
> +#include "util.h"
> +#include "strbuf.h"
> +#include "rbtree.h"
> +
> +#define HASH_BITS    5
> +#define HASH_SIZE    (1 << HASH_BITS)
> +
> +static char cache_dir[PATH_MAX];
> +static int def_open_flags = O_DSYNC | O_RDWR;

I think we don't need a sync flag (O_DSYNC) for a cache.


> +extern mode_t def_fmode;
> +extern mode_t def_dmode;
> +extern struct store_driver *sd_store;
> +
> +static pthread_mutex_t hashtable_lock[HASH_SIZE] = { [0 ... HASH_SIZE - 1] = 
> PTHREAD_MUTEX_INITIALIZER };
> +static struct hlist_head cache_hashtable[HASH_SIZE];
> +
> +static inline int hash(uint64_t vid)
> +{
> +     return hash_64(vid, HASH_BITS);
> +}
> +
> +static struct object_cache_entry *dirty_rb_insert(struct rb_root *root,
> +             struct object_cache_entry *new)
> +{
> +     struct rb_node **p = &root->rb_node;
> +     struct rb_node *parent = NULL;
> +     struct object_cache_entry *entry;
> +
> +     while (*p) {
> +             parent = *p;
> +             entry = rb_entry(parent, struct object_cache_entry, rb);
> +
> +             if (new->idx < entry->idx)
> +                     p = &(*p)->rb_left;
> +             else if (new->idx > entry->idx)
> +                     p = &(*p)->rb_right;
> +             else
> +                     return entry; /* already has this entry */
> +     }
> +     rb_link_node(&new->rb, parent, p);
> +     rb_insert_color(&new->rb, root);
> +
> +     return NULL; /* insert successfully */
> +}
> +
> +__attribute__ ((unused))
> +static struct object_cache_entry *dirty_rb_search(struct rb_root *root,
> +             struct object_cache_entry *entry)
> +{
> +     struct rb_node *n = root->rb_node;
> +     struct object_cache_entry *t;
> +
> +     while (n) {
> +             t = rb_entry(n, struct object_cache_entry, rb);
> +
> +             if (entry->idx < t->idx)
> +                     n = n->rb_left;
> +             else if (entry->idx > t->idx)
> +                     n = n->rb_right;
> +             else
> +                     return t; /* found it */
> +     }
> +
> +     return NULL;
> +}
> +
> +static int create_dir_for(uint32_t vid)
> +{
> +     int ret = 0;
> +     struct strbuf buf = STRBUF_INIT;
> +
> +     strbuf_addstr(&buf, cache_dir);
> +     strbuf_addf(&buf, "/%06"PRIx32, vid);
> +     if (mkdir(buf.buf, def_dmode) < 0)
> +             if (errno != EEXIST) {
> +                     eprintf("%m\n");
> +                     ret = -1;
> +                     goto err;
> +             }
> +err:
> +     strbuf_release(&buf);
> +     return ret;
> +}
> +
> +static struct object_cache *lookup_object_cache(uint32_t vid, int create)
> +{
> +     int h = hash(vid);
> +     struct hlist_head *head = cache_hashtable + h;
> +     struct object_cache *cache = NULL;
> +     struct hlist_node *node;
> +
> +     pthread_mutex_lock(&hashtable_lock[h]);
> +     if (hlist_empty(head))
> +             goto not_found;
> +
> +     hlist_for_each_entry(cache, node, head, hash) {
> +             if (cache->vid == vid)
> +                     goto out;
> +     }
> +not_found:
> +     if (create) {
> +             cache = xzalloc(sizeof(*cache));
> +             cache->vid = vid;
> +             create_dir_for(vid);
> +             cache->dirty_rb = RB_ROOT;
> +             pthread_mutex_init(&cache->lock, NULL);
> +             INIT_LIST_HEAD(&cache->dirty_list);
> +             hlist_add_head(&cache->hash, head);
> +     } else
> +             cache = NULL;
> +out:
> +     pthread_mutex_unlock(&hashtable_lock[h]);
> +     return cache;
> +}
> +
> +struct object_cache *find_object_cache(uint32_t vid)
> +{
> +     return lookup_object_cache(vid, 1);
> +}
> +
> +/* The caller is responsible to release fd */
> +int object_cache_lookup(struct object_cache *oc, uint32_t idx)
> +{
> +     struct strbuf buf;
> +     int fd, ret = 0;
> +
> +     strbuf_init(&buf, PATH_MAX);
> +     strbuf_addstr(&buf, cache_dir);
> +     strbuf_addf(&buf, "/%06"PRIx32"/%08"PRIx32, oc->vid, idx);
> +
> +     fd = open(buf.buf, def_open_flags, def_fmode);
> +     if (fd < 0) {
> +             ret = -1;
> +             goto out;
> +     }
> +     close(fd);
> +out:
> +     strbuf_release(&buf);
> +     return ret;
> +}
> +
> +static int write_cache_object(uint32_t vid, uint32_t idx, void *buf, size_t 
> count, off_t offset)
> +{
> +     size_t size;
> +     int fd, ret = SD_RES_SUCCESS;
> +     struct strbuf p;
> +
> +     strbuf_init(&p, PATH_MAX);
> +     strbuf_addstr(&p, cache_dir);
> +     strbuf_addf(&p, "/%06"PRIx32"/%08"PRIx32, vid, idx);
> +
> +     fd = open(p.buf, def_open_flags, def_fmode);
> +     if (flock(fd, LOCK_EX) < 0) {

Do we need flock here?  We don't assume that multiple clients open the
same VDI at the same time.


> +             ret = SD_RES_EIO;
> +             eprintf("%m\n");
> +             goto out;
> +     }
> +     size = xpwrite(fd, buf, count, offset);
> +     if (flock(fd, LOCK_UN) < 0) {
> +             ret = SD_RES_EIO;
> +             eprintf("%m\n");
> +             goto out;
> +     }
> +
> +     if (size != count)
> +             ret = SD_RES_EIO;
> +out:
> +     close(fd);
> +     strbuf_release(&p);
> +     return ret;
> +}
> +
> +static int read_cache_object(uint32_t vid, uint32_t idx, void *buf, size_t 
> count, off_t offset)
> +{
> +     size_t size;
> +     int fd, ret = SD_RES_SUCCESS;
> +     struct strbuf p;
> +
> +     strbuf_init(&p, PATH_MAX);
> +     strbuf_addstr(&p, cache_dir);
> +     strbuf_addf(&p, "/%06"PRIx32"/%08"PRIx32, vid, idx);
> +
> +     fd = open(p.buf, def_open_flags, def_fmode);
> +     if (flock(fd, LOCK_SH) < 0) {
> +             ret = SD_RES_EIO;
> +             eprintf("%m\n");
> +             goto out;
> +     }
> +     size = xpread(fd, buf, count, offset);
> +     if (flock(fd, LOCK_UN) < 0) {
> +             ret = SD_RES_EIO;
> +             eprintf("%m\n");
> +             goto out;
> +     }
> +     if (size != count)
> +             ret = SD_RES_EIO;
> +out:
> +     close(fd);
> +     strbuf_release(&p);
> +     return ret;
> +}
> +
> +static void add_to_dirty_rb_and_list(struct object_cache *oc, uint32_t idx)
> +{
> +     struct object_cache_entry *entry = xzalloc(sizeof(*entry));
> +
> +     entry->idx = idx;
> +     pthread_mutex_lock(&oc->lock);
> +     if (!dirty_rb_insert(&oc->dirty_rb, entry))
> +             list_add(&entry->list, &oc->dirty_list);
> +     else
> +             free(entry);
> +     pthread_mutex_unlock(&oc->lock);
> +}
> +
> +int object_cache_rw(struct object_cache *oc, uint32_t idx, struct request 
> *req)
> +{
> +     struct sd_obj_req *hdr = (struct sd_obj_req *)&req->rq;
> +     struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&req->rp;
> +     int ret;
> +
> +     dprintf("%"PRIx64", len %"PRIu32", off %"PRIu64"\n", oc->oid, 
> hdr->data_length, hdr->offset);
> +     if (hdr->flags & SD_FLAG_CMD_WRITE) {
> +             ret = write_cache_object(oc->vid, idx, req->data, 
> hdr->data_length, hdr->offset);
> +             if (ret != SD_RES_SUCCESS)
> +                     goto out;
> +             add_to_dirty_rb_and_list(oc, idx);
> +     } else {
> +             ret = read_cache_object(oc->vid, idx, req->data, 
> hdr->data_length, hdr->offset);
> +             if (ret != SD_RES_SUCCESS)
> +                     goto out;

We must call forward_read_obj when we miss the cache.

Thanks,

Kazutaka
-- 
sheepdog mailing list
[email protected]
http://lists.wpkg.org/mailman/listinfo/sheepdog

Reply via email to