This patch adds support for writing to zero refcount clusters. Refcount blocks
are cached like L2 tables and flushed upon VIRTIO_BLK_T_FLUSH and when
evicted from the LRU cache.

With this patch applied, 'qemu-img check' no longer complains about referenced
clusters with zero reference count after

  dd if=/dev/zero of=/mnt/tmp

where '/mnt' is a freshly generated QCOW2 image.

Cc: Asias He <[email protected]>
Cc: Cyrill Gorcunov <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Kevin Wolf <[email protected]>
Cc: Prasad Joshi <[email protected]>
Cc: Sasha Levin <[email protected]>
Signed-off-by: Pekka Enberg <[email protected]>
---
 tools/kvm/disk/qcow.c        |  267 ++++++++++++++++++++++++++++++++++++++++--
 tools/kvm/include/kvm/qcow.h |   24 ++++
 2 files changed, 283 insertions(+), 8 deletions(-)

diff --git a/tools/kvm/disk/qcow.c b/tools/kvm/disk/qcow.c
index 2ef1ecb..7c48dfb 100644
--- a/tools/kvm/disk/qcow.c
+++ b/tools/kvm/disk/qcow.c
@@ -382,6 +382,189 @@ static u64 qcow_write_l2_table(struct qcow *q, u64 *table)
        return off;
 }
 
+static void refcount_table_free_cache(struct qcow_refcount_table *rft)
+{
+       struct rb_root *r = &rft->root;
+       struct list_head *pos, *n;
+       struct qcow_refcount_block *t;
+
+       list_for_each_safe(pos, n, &rft->lru_list) {
+               list_del(pos);
+               t = list_entry(pos, struct qcow_refcount_block, list);
+               rb_erase(&t->node, r);
+
+               free(t);
+       }
+}
+
+static int refcount_block_insert(struct rb_root *root, struct qcow_refcount_block *new)
+{
+       struct rb_node **link = &(root->rb_node), *parent = NULL;
+       u64 offset = new->offset;
+
+       /* search the tree */
+       while (*link) {
+               struct qcow_refcount_block *t;
+
+               t = rb_entry(*link, struct qcow_refcount_block, node);
+               if (!t)
+                       goto error;
+
+               parent = *link;
+
+               if (t->offset > offset)
+                       link = &(*link)->rb_left;
+               else if (t->offset < offset)
+                       link = &(*link)->rb_right;
+               else
+                       goto out;
+       }
+
+       /* add new node */
+       rb_link_node(&new->node, parent, link);
+       rb_insert_color(&new->node, root);
+out:
+       return 0;
+error:
+       return -1;
+}
+
+static int write_refcount_block(struct qcow *q, struct qcow_refcount_block *rfb)
+{
+       if (!rfb->dirty)
+               return 0;
+
+       if (pwrite_in_full(q->fd, rfb->entries, rfb->size * sizeof(u16), rfb->offset) < 0)
+               return -1;
+
+       rfb->dirty = 0;
+
+       return 0;
+}
+
+static int cache_refcount_block(struct qcow *q, struct qcow_refcount_block *c)
+{
+       struct qcow_refcount_table *rft = &q->refcount_table;
+       struct rb_root *r = &rft->root;
+       struct qcow_refcount_block *lru;
+
+       if (rft->nr_cached == MAX_CACHE_NODES) {
+               lru = list_first_entry(&rft->lru_list, struct qcow_refcount_block, list);
+
+               if (write_refcount_block(q, lru) < 0)
+                       goto error;
+
+               rb_erase(&lru->node, r);
+               list_del_init(&lru->list);
+               rft->nr_cached--;
+
+               free(lru);
+       }
+
+       if (refcount_block_insert(r, c) < 0)
+               goto error;
+
+       list_add_tail(&c->list, &rft->lru_list);
+       rft->nr_cached++;
+
+       return 0;
+error:
+       return -1;
+}
+
+static struct qcow_refcount_block *new_refcount_block(struct qcow *q, u64 rfb_offset)
+{
+       struct qcow_header *header = q->header;
+       struct qcow_refcount_block *rfb;
+       u64 cluster_size;
+
+       cluster_size = 1 << header->cluster_bits;
+
+       rfb = malloc(sizeof *rfb + cluster_size);
+       if (!rfb)
+               return NULL;
+
+       rfb->offset = rfb_offset;
+       rfb->size = cluster_size / sizeof(u16);
+       RB_CLEAR_NODE(&rfb->node);
+       INIT_LIST_HEAD(&rfb->list);
+
+       return rfb;
+}
+
+static struct qcow_refcount_block *refcount_block_lookup(struct rb_root *root, u64 offset)
+{
+       struct rb_node *link = root->rb_node;
+
+       while (link) {
+               struct qcow_refcount_block *t;
+
+               t = rb_entry(link, struct qcow_refcount_block, node);
+               if (!t)
+                       goto out;
+
+               if (t->offset > offset)
+                       link = link->rb_left;
+               else if (t->offset < offset)
+                       link = link->rb_right;
+               else
+                       return t;
+       }
+out:
+       return NULL;
+}
+
+static struct qcow_refcount_block *refcount_block_search(struct qcow *q, u64 offset)
+{
+       struct qcow_refcount_table *rft = &q->refcount_table;
+       struct qcow_refcount_block *rfb;
+
+       rfb = refcount_block_lookup(&rft->root, offset);
+       if (!rfb)
+               return NULL;
+
+       /* Update the LRU state, by moving the searched node to list tail */
+       list_move_tail(&rfb->list, &rft->lru_list);
+
+       return rfb;
+}
+
+static struct qcow_refcount_block *qcow_read_refcount_block(struct qcow *q, u64 clust_idx)
+{
+       struct qcow_header *header = q->header;
+       struct qcow_refcount_table *rft = &q->refcount_table;
+       struct qcow_refcount_block *rfb;
+       u64 rfb_offset;
+       u64 rft_idx;
+
+       rft_idx = clust_idx >> (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT);
+       if (rft_idx >= rft->rf_size)
+               return NULL;
+
+       rfb_offset = be64_to_cpu(rft->rf_table[rft_idx]);
+
+       rfb = refcount_block_search(q, rfb_offset);
+       if (rfb)
+               return rfb;
+
+       rfb = new_refcount_block(q, rfb_offset);
+       if (!rfb)
+               return NULL;
+
+       if (pread_in_full(q->fd, rfb->entries, rfb->size * sizeof(u16), rfb_offset) < 0)
+               goto error_free_rfb;
+
+       if (cache_refcount_block(q, rfb) < 0)
+               goto error_free_rfb;
+
+       return rfb;
+
+error_free_rfb:
+       free(rfb);
+
+       return NULL;
+}
+
 /*
  * QCOW file might grow during a write operation. Not only data but metadata is
  * also written at the end of the file. Therefore it is necessary to ensure
@@ -398,6 +581,7 @@ static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src
        struct qcow_l1_table *l1t = &q->table;
        struct qcow_l2_table *l2t;
        u64 clust_start;
+       u64 clust_flags;
        u64 l2t_offset;
        u64 clust_off;
        u64 l2t_size;
@@ -435,7 +619,7 @@ static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src
                goto error;
        }
        if (!(l2t_offset & QCOW_OFLAG_COPIED)) {
-               pr_warning("copy-on-write clusters are not supported");
+               pr_warning("L2 copy-on-write clusters are not supported");
                goto error;
        }
 
@@ -476,23 +660,54 @@ static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src
        if (!f_sz)
                goto error;
 
-       clust_start     = be64_to_cpu(l2t->table[l2t_idx]);
-       if (clust_start & QCOW_OFLAG_COMPRESSED) {
+       clust_start = be64_to_cpu(l2t->table[l2t_idx]);
+
+       clust_flags = clust_start & QCOW_OFLAGS_MASK;
+       if (clust_flags & QCOW_OFLAG_COMPRESSED) {
                pr_warning("compressed clusters are not supported");
                goto error;
        }
-       if (!(clust_start & QCOW_OFLAG_COPIED)) {
-               pr_warning("copy-on-write clusters are not supported");
-               goto error;
-       }
 
        clust_start &= QCOW_OFFSET_MASK;
        if (!clust_start) {
                clust_start             = ALIGN(f_sz, clust_sz);
-               l2t->table[l2t_idx]     = cpu_to_be64(clust_start);
+               l2t->table[l2t_idx]     = cpu_to_be64(clust_start | QCOW_OFLAG_COPIED);
                l2t->dirty              = 1;
        }
 
+       if (!(clust_flags & QCOW_OFLAG_COPIED)) {
+               struct qcow_refcount_block *rfb = NULL;
+               u16 clust_refcount;
+               u64 clust_idx;
+               u64 rfb_idx;
+
+               clust_idx = (clust_start & QCOW_OFFSET_MASK) >> (header->cluster_bits);
+
+               rfb = qcow_read_refcount_block(q, clust_idx);
+               if (!rfb) {
+                       pr_warning("L1: error while reading refcount table");
+                       goto error;
+               }
+
+               rfb_idx = clust_idx & (((1ULL << (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
+               if (rfb_idx >= rfb->size) {
+                       pr_warning("L1: refcount block index out of bounds");
+                       goto error;
+               }
+
+               clust_refcount = be16_to_cpu(rfb->entries[rfb_idx]);
+               if (!clust_refcount) {
+                       clust_refcount = 1;
+                       rfb->entries[rfb_idx] = cpu_to_be16(clust_refcount);
+                       rfb->dirty = 1;
+               }
+
+               if (clust_refcount > 1) {
+                       pr_warning("L1 copy-on-write clusters are not supported");
+                       goto error;
+               }
+       }
+
        mutex_unlock(&q->mutex);
 
        /* Write actual data */
@@ -547,15 +762,24 @@ static ssize_t qcow_nowrite_sector(struct disk_image *disk, u64 sector, void *sr
 static int qcow_disk_flush(struct disk_image *disk)
 {
        struct qcow *q = disk->priv;
+       struct qcow_refcount_table *rft;
        struct qcow_header *header;
        struct list_head *pos, *n;
        struct qcow_l1_table *l1t;
 
        header = q->header;
        l1t = &q->table;
+       rft = &q->refcount_table;
 
        mutex_lock(&q->mutex);
 
+       list_for_each_safe(pos, n, &rft->lru_list) {
+               struct qcow_refcount_block *c = list_entry(pos, struct qcow_refcount_block, list);
+
+               if (write_refcount_block(q, c) < 0)
+                       goto error_unlock;
+       }
+
        list_for_each_safe(pos, n, &l1t->lru_list) {
                struct qcow_l2_table *c = list_entry(pos, struct qcow_l2_table, list);
 
@@ -587,7 +811,9 @@ static int qcow_disk_close(struct disk_image *disk)
 
        q = disk->priv;
 
+       refcount_table_free_cache(&q->refcount_table);
        l1_table_free_cache(&q->table);
+       free(q->refcount_table.rf_table);
        free(q->table.l1_table);
        free(q->header);
        free(q);
@@ -608,6 +834,26 @@ static struct disk_image_operations qcow_disk_ops = {
        .close                  = qcow_disk_close,
 };
 
+static int qcow_read_refcount_table(struct qcow *q)
+{
+       struct qcow_header *header = q->header;
+       struct qcow_refcount_table *rft = &q->refcount_table;
+       u64 cluster_size;
+
+       cluster_size = 1 << header->cluster_bits;
+
+       rft->rf_size = (header->refcount_table_size * cluster_size) / sizeof(u64);
+
+       rft->rf_table = calloc(rft->rf_size, sizeof(u64));
+       if (!rft->rf_table)
+               return -1;
+
+       rft->root = RB_ROOT;
+       INIT_LIST_HEAD(&rft->lru_list);
+
+       return pread_in_full(q->fd, rft->rf_table, sizeof(u64) * rft->rf_size, header->refcount_table_offset);
+}
+
 static int qcow_read_l1_table(struct qcow *q)
 {
        struct qcow_header *header = q->header;
@@ -656,6 +902,8 @@ static void *qcow2_read_header(int fd)
                .l1_size                = f_header.l1_size,
                .cluster_bits           = f_header.cluster_bits,
                .l2_bits                = f_header.cluster_bits - 3,
+               .refcount_table_offset  = f_header.refcount_table_offset,
+               .refcount_table_size    = f_header.refcount_table_clusters,
        };
 
        return header;
@@ -687,6 +935,9 @@ static struct disk_image *qcow2_probe(int fd, bool readonly)
        if (qcow_read_l1_table(q) < 0)
                goto error;
 
+       if (qcow_read_refcount_table(q) < 0)
+               goto error;
+
        /*
         * Do not use mmap use read/write instead
         */
diff --git a/tools/kvm/include/kvm/qcow.h b/tools/kvm/include/kvm/qcow.h
index 4ddc2b2..46db702 100644
--- a/tools/kvm/include/kvm/qcow.h
+++ b/tools/kvm/include/kvm/qcow.h
@@ -40,10 +40,32 @@ struct qcow_l1_table {
        int                             nr_cached;
 };
 
+#define QCOW_REFCOUNT_BLOCK_SHIFT      1
+
+struct qcow_refcount_block {
+       u64                             offset;
+       struct rb_node                  node;
+       struct list_head                list;
+       u64                             size;
+       u8                              dirty;
+       u16                             entries[];
+};
+
+struct qcow_refcount_table {
+       u32                             rf_size;
+       u64                             *rf_table;
+
+       /* Refcount block caching data structures */
+       struct rb_root                  root;
+       struct list_head                lru_list;
+       int                             nr_cached;
+};
+
 struct qcow {
        pthread_mutex_t                 mutex;
        void                            *header;
        struct qcow_l1_table            table;
+       struct qcow_refcount_table      refcount_table;
        int                             fd;
 };
 
@@ -53,6 +75,8 @@ struct qcow_header {
        u32                             l1_size;
        u8                              cluster_bits;
        u8                              l2_bits;
+       u64                             refcount_table_offset;
+       u32                             refcount_table_size;
 };
 
 struct qcow1_header_disk {
-- 
1.7.0.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to