On Wed, 2011-05-18 at 11:17 +0100, Prasad Joshi wrote:
> QCOW uses two levels of tables: a level 1 (L1) table
> and level 2 (L2) tables. L1 table entries point to the
> offsets of L2 tables. When a QCOW image is probed, the
> L1 table is cached in memory to avoid reading it from
> disk on every reference, which improves performance. A
> similar improvement can be observed when the L2 tables
> are cached as well. Caching all of the L2 tables is
> impossible because of memory constraints, so this patch
> adds the capability to cache up to 128 L2 tables, using
> a combination of an RB tree and a list to manage the
> cached tables. The linked list provides a simple LRU
> structure, and the RB tree improves the search time
> during read/write operations.
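
For readers following along: each cached L2 table sits in an rbtree
(keyed by the table's on-disk offset, for fast lookup) and on a list
ordered by recency (for cheap eviction). The hit path boils down to
something like this (untested sketch, not taken verbatim from the
patch; it reuses the search() helper added further down, and 'c' and
'l2t_offset' are placeholder names):

	/* Look the table up by its on-disk offset; on a hit, move it
	 * to the tail of the LRU list so the head of the list is
	 * always the least recently used entry. */
	c = search(&q->root, l2t_offset);
	if (c) {
		list_del_init(&c->list);
		list_add_tail(&c->list, &q->lru_list);
		return c->table;
	}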
> 
> Signed-off-by: Prasad Joshi <prasadjoshi...@gmail.com>
> ---
>  tools/kvm/include/kvm/qcow.h |   18 ++++
>  tools/kvm/qcow.c             |  230 ++++++++++++++++++++++++++++++++++++++----
>  2 files changed, 230 insertions(+), 18 deletions(-)
> 
> diff --git a/tools/kvm/include/kvm/qcow.h b/tools/kvm/include/kvm/qcow.h
> index b6e7493..095566a 100644
> --- a/tools/kvm/include/kvm/qcow.h
> +++ b/tools/kvm/include/kvm/qcow.h
> @@ -3,6 +3,8 @@
>  
>  #include <linux/types.h>
>  #include <stdbool.h>
> +#include <linux/rbtree.h>
> +#include <linux/list.h>
>  
>  #define QCOW_MAGIC           (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
>  
> @@ -17,6 +19,16 @@
>  #define QCOW2_OFLAG_COMPRESSED       (1LL << 62)
>  #define QCOW2_OFLAG_MASK     (QCOW2_OFLAG_COPIED|QCOW2_OFLAG_COMPRESSED)
>  
> +#define MAX_CACHE_NODES         128
> +
> +struct qcow_l2_cache {
> +     u64                     offset;
> +     u64                     *table;
> +     struct rb_node          node;
> +     struct list_head        list;
> +     struct qcow             *qcow;
> +};
> +
>  struct qcow_table {
>       u32                     table_size;
>       u64                     *l1_table;
> @@ -26,6 +38,12 @@ struct qcow {
>       void                    *header;
>       struct qcow_table       table;
>       int                     fd;
> +
> +     /* Level2 caching data structures */
> +     struct qcow_l2_cache    cache;

Why is 'cache' here?

> +     struct rb_root          root;
> +     struct list_head        lru_list;
> +     int                     no_cached;
>  };
>  
>  struct qcow_header {
> diff --git a/tools/kvm/qcow.c b/tools/kvm/qcow.c
> index bb2345c..ca61374 100644
> --- a/tools/kvm/qcow.c
> +++ b/tools/kvm/qcow.c
> @@ -16,6 +16,153 @@
>  #include <linux/kernel.h>
>  #include <linux/types.h>
>  
> +static inline int insert(struct rb_root *root, struct qcow_l2_cache *new)
> +{
> +     struct rb_node **link = &(root->rb_node), *parent = NULL;
> +     u64 offset = new->offset;
> +     
> +     /* search the tree */
> +     while (*link) {
> +             struct qcow_l2_cache *t;
> +
> +             t = rb_entry(*link, struct qcow_l2_cache, node);
> +             if (!t)
> +                     goto error;
> +
> +             parent = *link;
> +
> +             if (t->offset > offset)
> +                     link = &(*link)->rb_left;
> +             else if (t->offset < offset)
> +                     link = &(*link)->rb_right;
> +             else
> +                     goto out;
> +     }
> +
> +     /* add new node */
> +     rb_link_node(&new->node, parent, link);
> +     rb_insert_color(&new->node, root);
> +out:
> +     return 0;
> +error:
> +     return -1;
> +}
> +
> +static inline struct qcow_l2_cache *search(struct rb_root *root, u64 offset)
> +{
> +     struct rb_node *link = root->rb_node;
> +
> +     while (link) {
> +             struct qcow_l2_cache *t;
> +
> +             t = rb_entry(link, struct qcow_l2_cache, node);
> +             if (!t)
> +                     goto out;
> +
> +             if (t->offset > offset)
> +                     link = link->rb_left;
> +             else if (t->offset < offset)
> +                     link = link->rb_right;
> +             else
> +                     return t;
> +     }
> +out:
> +     return NULL;
> +}
> +
> +static inline void free_cache(struct qcow *q)
> +{
> +     struct list_head *pos, *n;
> +     struct qcow_l2_cache *t;
> +     struct rb_root *r = &q->root;
> +
> +     list_for_each_safe(pos, n, &q->lru_list) {
> +             /* Remove cache table from the list and RB tree */
> +             list_del(pos);
> +             t = list_entry(pos, struct qcow_l2_cache, list);
> +             rb_erase(&t->node, r);
> +
> +             /* Free the cached node */
> +             free(t->table);
> +             free(t);
> +     }
> +}
> +
> +static inline int cache_table(struct qcow *q, u64 *table, u64 offset)
> +{
> +     struct qcow_l2_cache *n;
> +     struct rb_root *r = &q->root;
> +     struct qcow_l2_cache *lru;
> +
> +     n = search(r, offset);
> +     if (n) {
> +             /* Update the LRU state */
> +             list_del_init(&n->list);
> +             list_add_tail(&n->list, &q->lru_list);
> +             return 0;
> +     }
> +
> +     lru = NULL;
> +     if (q->no_cached == MAX_CACHE_NODES) {
> +             /* Find the node for LRU replacement */
> +             lru = list_first_entry(&q->lru_list, struct qcow_l2_cache, list);
> +     }
> +
> +     n = calloc(1, sizeof(struct qcow_l2_cache));
                        sizeof(*n)

> +     if (!n)
> +             goto error;
> +
> +     n->offset = offset;
> +     n->table  = table;
> +     n->qcow   = q;
> +     RB_CLEAR_NODE(&n->node);
> +     INIT_LIST_HEAD(&n->list);
> +
> +     if (lru) {
> +             /* LRU Replacemen */
                         Replacement

> +             rb_replace_node(&lru->node, &n->node, r);

rb_replace_node() doesn't re-balance the tree; it's intended to be used
when both keys are the same.

Using it on nodes with different keys breaks the rbtree.
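
Something like the following instead (untested sketch, reusing the
insert() helper from this patch):

	if (lru) {
		/* Drop the evicted entry from the tree entirely... */
		rb_erase(&lru->node, r);

		/* ...and insert the new node starting from the root,
		 * so the tree is re-balanced for the new key. */
		if (insert(r, n) < 0)
			goto error;

		list_del_init(&lru->list);
		list_add_tail(&n->list, &q->lru_list);

		free(lru->table);
		free(lru);
	}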

> +
> +             /*
> +              * The new node should be the last node in the LRU list, so
> +              * do not use list_replace(); instead, delete the node being
> +              * replaced and add the new node at the end of the list.
> +              */
> +             list_del_init(&lru->list);
> +             list_add_tail(&n->list, &q->lru_list);
> +
> +             /* Free the LRUed node */
> +             free(lru->table);
> +             free(lru);
> +     } else {
> +             /* Add in RB Tree: Helps in searching faster */
> +             if (insert(r, n) < 0)
> +                     goto error;
> +
> +             /* Add in LRU replacement list */
> +             list_add_tail(&n->list, &q->lru_list);
> +             q->no_cached++;
> +     }
> +
> +     return 0;
> +error:
> +     free(n);
> +     return -1;
> +}
> +
> +static inline int search_table(struct qcow *q, u64 **table, u64 offset)
> +{
> +     struct qcow_l2_cache *c;
> +
> +     *table = NULL;
> +
> +     c = search(&q->root, offset);
> +     if (!c)
> +             return -1;
> +
> +     *table = c->table;
> +     return 0;
> +}
> +
>  static inline u64 get_l1_index(struct qcow *q, u64 offset)
>  {
>       struct qcow_header *header = q->header;
> @@ -37,6 +184,43 @@ static inline u64 get_cluster_offset(struct qcow *q, u64 offset)
>       return offset & ((1 << header->cluster_bits)-1);
>  }
>  
> +static int qcow1_read_l2_table(struct qcow *q, u64 **table, u64 offset)
> +{
> +     struct qcow_header *header = q->header;
> +     u64 size;
> +     u64 i;
> +     u64 *t;
> +
> +     *table = NULL;
> +     size   = 1 << header->l2_bits;
> +
> +     /* search an entry for offset in cache */
> +     if (search_table(q, table, offset) >= 0)
> +             return 0;
> +
> +     t = calloc(size, sizeof(u64));
                           sizeof(*t)

> +     if (!t)
> +             goto error;
> +
> +     /* table not cached: read from the disk */
> +     if (pread_in_full(q->fd, t, size * sizeof(u64), offset) < 0)
> +             goto error;
> +
> +     /* cache the table */
> +     if (cache_table(q, t, offset) < 0)
> +             goto error;
> +
> +     /* change cached table to CPU's byte-order */
> +     for (i = 0; i < size; i++)
> +             be64_to_cpus(&t[i]);
> +
> +     *table = t;
> +     return 0;
> +error:
> +     free(t);
> +     return -1;
> +}
> +
>  static ssize_t qcow1_read_cluster(struct qcow *q, u64 offset, void *dst, u32 dst_len)
>  {
>       struct qcow_header *header = q->header;
> @@ -51,8 +235,6 @@ static ssize_t qcow1_read_cluster(struct qcow *q, u64 offset, void *dst, u32 dst
>       u64 l1_idx;
>       u64 l2_idx;
>  
> -     l2_table = NULL;
> -
>       cluster_size = 1 << header->cluster_bits;
>  
>       l1_idx = get_l1_index(q, offset);
> @@ -72,18 +254,16 @@ static ssize_t qcow1_read_cluster(struct qcow *q, u64 offset, void *dst, u32 dst
>               goto zero_cluster;
>  
>       l2_table_size = 1 << header->l2_bits;
> -     l2_table = calloc(l2_table_size, sizeof(u64));
> -     if (!l2_table)
> -             goto out_error;
>  
> -     if (pread_in_full(q->fd, l2_table, l2_table_size * sizeof(u64), l2_table_offset) < 0)
> +     /* read and cache level 2 table */
> +     if (qcow1_read_l2_table(q, &l2_table, l2_table_offset) < 0)
>               goto out_error;
>  
>       l2_idx = get_l2_index(q, offset);
>       if (l2_idx >= l2_table_size)
>               goto out_error;
>  
> -     clust_start = be64_to_cpu(l2_table[l2_idx]) & ~header->oflag_mask;
> +     clust_start = l2_table[l2_idx] & ~header->oflag_mask;
>       if (!clust_start)
>               goto zero_cluster;
>  
> @@ -91,7 +271,6 @@ static ssize_t qcow1_read_cluster(struct qcow *q, u64 offset, void *dst, u32 dst
>               goto out_error;
>  
>  out:
> -     free(l2_table);
>       return length;
>  
>  zero_cluster:
> @@ -221,15 +400,16 @@ static ssize_t qcow1_write_cluster(struct qcow *q, u64 offset, void *buf, u32 sr
>       if (len > src_len)
>               len = src_len;
>  
> -     l2t = calloc(l2t_sz, sizeof(u64));
> -     if (!l2t)
> -             goto error;
> -
>       l2t_off         = table->l1_table[l1t_idx] & ~header->oflag_mask;
>       if (l2t_off) {
> -             if (pread_in_full(q->fd, l2t, l2t_sz * sizeof(u64), l2t_off) < 0)
> -                     goto free_l2;
> +             /* cache level2 table */
> +             if (qcow1_read_l2_table(q, &l2t, l2t_off) < 0)
> +                     goto error;
>       } else {
> +             l2t = calloc(l2t_sz, sizeof(u64));
                                       sizeof(*l2t)

> +             if (!l2t)
> +                     goto error;
> +
>               /* Capture the state of the consistent QCOW image */
>               f_sz            = file_size(q->fd);
>               if (!f_sz)
> @@ -251,6 +431,13 @@ static ssize_t qcow1_write_cluster(struct qcow *q, u64 offset, void *buf, u32 sr
>                       goto free_l2;
>               }
>  
> +             if (cache_table(q, l2t, l2t_off) < 0) {
> +                     if (ftruncate(q->fd, f_sz) < 0)
> +                             goto free_l2;
> +
> +                     goto free_l2;
> +             }
> +
>               /* Update the in-core entry */
>               table->l1_table[l1t_idx] = l2t_off;
>       }
> @@ -258,17 +445,15 @@ static ssize_t qcow1_write_cluster(struct qcow *q, u64 offset, void *buf, u32 sr
>       /* Capture the state of the consistent QCOW image */
>       f_sz            = file_size(q->fd);
>       if (!f_sz)
> -             goto free_l2;
> +             goto error;
>  
> -     clust_start     = be64_to_cpu(l2t[l2t_idx]) & ~header->oflag_mask;
> +     clust_start     = l2t[l2t_idx] & ~header->oflag_mask;
>       if (!clust_start) {
>               clust_start     = ALIGN(f_sz, clust_sz);
>               update_meta     = true;
>       } else
>               update_meta     = false;
>  
> -     free(l2t);
> -
>       /* Write actual data */
>       if (pwrite_in_full(q->fd, buf, len, clust_start + clust_off) < 0)
>               goto error;
> @@ -282,9 +467,13 @@ static ssize_t qcow1_write_cluster(struct qcow *q, u64 offset, void *buf, u32 sr
>  
>                       goto error;
>               }
> +
> +             /* Update the cached level2 entry */
> +             l2t[l2t_idx] = clust_start;
>       }
>  
>       return len;
> +
>  free_l2:
>       free(l2t);
>  error:
> @@ -335,6 +524,7 @@ static void qcow1_disk_close(struct disk_image *disk)
>  
>       q = disk->priv;
>  
> +     free_cache(q);
>       free(q->table.l1_table);
>       free(q->header);
>       free(q);
> @@ -425,6 +615,8 @@ static struct disk_image *qcow2_probe(int fd, bool readonly)
>               goto error;
>  
>       q->fd = fd;
> +     q->root = RB_ROOT;
> +     INIT_LIST_HEAD(&q->lru_list);
>  
>       h = q->header = qcow2_read_header(fd);
>       if (!h)
> @@ -519,6 +711,8 @@ static struct disk_image *qcow1_probe(int fd, bool readonly)
>               goto error;
>  
>       q->fd = fd;
> +     q->root = RB_ROOT;
> +     INIT_LIST_HEAD(&q->lru_list);
>  
>       h = q->header = qcow1_read_header(fd);
>       if (!h)


-- 

Sasha.
