On Mon, 25 Jul 2011, Kevin Wolf wrote:
Looks okay, except that in case of a crash you'll most likely corrupt
the image because the order in which refcounts and mapping are written
out is completely undefined.

For a reliable implementation you need to make sure that for cluster
allocation you first write out the refcount update, then fsync, then
write out the mapping. Otherwise if the mapping is written out first and
then a crash happens, you'll have clusters that are used, but marked
free, so that in the next run a second cluster can be mapped to the same
location.

Does this look better?

                        Pekka

From 4bc5a97711094d50ec4a25ba65b01b204e986574 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <[email protected]>
Date: Thu, 21 Jul 2011 12:04:39 +0300
Subject: [PATCH v2] kvm tools, qcow: Add support for writing to zero refcount 
clusters

This patch adds support for writing to zero refcount clusters. Refcount blocks
are cached like L2 tables and flushed upon VIRTIO_BLK_T_FLUSH and when
evicted from the LRU cache.

With this patch applied, 'qemu-img check' no longer complains about referenced
clusters with zero reference count after

  dd if=/dev/zero of=/mnt/tmp

where '/mnt' is a freshly generated QCOW2 image.

Cc: Asias He <[email protected]>
Cc: Cyrill Gorcunov <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Kevin Wolf <[email protected]>
Cc: Prasad Joshi <[email protected]>
Cc: Sasha Levin <[email protected]>
Signed-off-by: Pekka Enberg <[email protected]>
---
v1 -> v2: Use fdatasync() after writing out refcount tables in 
qcow_disk_flush().

 tools/kvm/disk/qcow.c        |  270 ++++++++++++++++++++++++++++++++++++++++--
 tools/kvm/include/kvm/qcow.h |   24 ++++
 2 files changed, 286 insertions(+), 8 deletions(-)

diff --git a/tools/kvm/disk/qcow.c b/tools/kvm/disk/qcow.c
index 2ef1ecb..2471aeb 100644
--- a/tools/kvm/disk/qcow.c
+++ b/tools/kvm/disk/qcow.c
@@ -382,6 +382,189 @@ static u64 qcow_write_l2_table(struct qcow *q, u64 *table)
        return off;
 }

+static void refcount_table_free_cache(struct qcow_refcount_table *rft)
+{
+       struct rb_root *r = &rft->root;
+       struct list_head *pos, *n;
+       struct qcow_refcount_block *t;
+
+       list_for_each_safe(pos, n, &rft->lru_list) {
+               list_del(pos);
+               t = list_entry(pos, struct qcow_refcount_block, list);
+               rb_erase(&t->node, r);
+
+               free(t);
+       }
+}
+
+static int refcount_block_insert(struct rb_root *root, struct 
qcow_refcount_block *new)
+{
+       struct rb_node **link = &(root->rb_node), *parent = NULL;
+       u64 offset = new->offset;
+
+       /* search the tree */
+       while (*link) {
+               struct qcow_refcount_block *t;
+
+               t = rb_entry(*link, struct qcow_refcount_block, node);
+               if (!t)
+                       goto error;
+
+               parent = *link;
+
+               if (t->offset > offset)
+                       link = &(*link)->rb_left;
+               else if (t->offset < offset)
+                       link = &(*link)->rb_right;
+               else
+                       goto out;
+       }
+
+       /* add new node */
+       rb_link_node(&new->node, parent, link);
+       rb_insert_color(&new->node, root);
+out:
+       return 0;
+error:
+       return -1;
+}
+
+static int write_refcount_block(struct qcow *q, struct qcow_refcount_block 
*rfb)
+{
+       if (!rfb->dirty)
+               return 0;
+
+       if (pwrite_in_full(q->fd, rfb->entries, rfb->size * sizeof(u16), 
rfb->offset) < 0)
+               return -1;
+
+       rfb->dirty = 0;
+
+       return 0;
+}
+
+static int cache_refcount_block(struct qcow *q, struct qcow_refcount_block *c)
+{
+       struct qcow_refcount_table *rft = &q->refcount_table;
+       struct rb_root *r = &rft->root;
+       struct qcow_refcount_block *lru;
+
+       if (rft->nr_cached == MAX_CACHE_NODES) {
+               lru = list_first_entry(&rft->lru_list, struct 
qcow_refcount_block, list);
+
+               if (write_refcount_block(q, lru) < 0)
+                       goto error;
+
+               rb_erase(&lru->node, r);
+               list_del_init(&lru->list);
+               rft->nr_cached--;
+
+               free(lru);
+       }
+
+       if (refcount_block_insert(r, c) < 0)
+               goto error;
+
+       list_add_tail(&c->list, &rft->lru_list);
+       rft->nr_cached++;
+
+       return 0;
+error:
+       return -1;
+}
+
+static struct qcow_refcount_block *new_refcount_block(struct qcow *q, u64 
rfb_offset)
+{
+       struct qcow_header *header = q->header;
+       struct qcow_refcount_block *rfb;
+       u64 cluster_size;
+
+       cluster_size = 1 << header->cluster_bits;
+
+       rfb = malloc(sizeof *rfb + cluster_size);
+       if (!rfb)
+               return NULL;
+
+       rfb->offset = rfb_offset;
+       rfb->size = cluster_size / sizeof(u16);
+       RB_CLEAR_NODE(&rfb->node);
+       INIT_LIST_HEAD(&rfb->list);
+
+       return rfb;
+}
+
+static struct qcow_refcount_block *refcount_block_lookup(struct rb_root *root, 
u64 offset)
+{
+       struct rb_node *link = root->rb_node;
+
+       while (link) {
+               struct qcow_refcount_block *t;
+
+               t = rb_entry(link, struct qcow_refcount_block, node);
+               if (!t)
+                       goto out;
+
+               if (t->offset > offset)
+                       link = link->rb_left;
+               else if (t->offset < offset)
+                       link = link->rb_right;
+               else
+                       return t;
+       }
+out:
+       return NULL;
+}
+
+static struct qcow_refcount_block *refcount_block_search(struct qcow *q, u64 
offset)
+{
+       struct qcow_refcount_table *rft = &q->refcount_table;
+       struct qcow_refcount_block *rfb;
+
+       rfb = refcount_block_lookup(&rft->root, offset);
+       if (!rfb)
+               return NULL;
+
+       /* Update the LRU state, by moving the searched node to list tail */
+       list_move_tail(&rfb->list, &rft->lru_list);
+
+       return rfb;
+}
+
+static struct qcow_refcount_block *qcow_read_refcount_block(struct qcow *q, 
u64 clust_idx)
+{
+       struct qcow_header *header = q->header;
+       struct qcow_refcount_table *rft = &q->refcount_table;
+       struct qcow_refcount_block *rfb;
+       u64 rfb_offset;
+       u64 rft_idx;
+
+       rft_idx = clust_idx >> (header->cluster_bits - 
QCOW_REFCOUNT_BLOCK_SHIFT);
+       if (rft_idx >= rft->rf_size)
+               return NULL;
+
+       rfb_offset = be64_to_cpu(rft->rf_table[rft_idx]);
+
+       rfb = refcount_block_search(q, rfb_offset);
+       if (rfb)
+               return rfb;
+
+       rfb = new_refcount_block(q, rfb_offset);
+       if (!rfb)
+               return NULL;
+
+       if (pread_in_full(q->fd, rfb->entries, rfb->size * sizeof(u16), 
rfb_offset) < 0)
+               goto error_free_rfb;
+
+       if (cache_refcount_block(q, rfb) < 0)
+               goto error_free_rfb;
+
+       return rfb;
+
+error_free_rfb:
+       free(rfb);
+
+       return NULL;
+}
+
 /*
  * QCOW file might grow during a write operation. Not only data but metadata is
  * also written at the end of the file. Therefore it is necessary to ensure
@@ -398,6 +581,7 @@ static ssize_t qcow_write_cluster(struct qcow *q, u64 
offset, void *buf, u32 src
        struct qcow_l1_table *l1t = &q->table;
        struct qcow_l2_table *l2t;
        u64 clust_start;
+       u64 clust_flags;
        u64 l2t_offset;
        u64 clust_off;
        u64 l2t_size;
@@ -435,7 +619,7 @@ static ssize_t qcow_write_cluster(struct qcow *q, u64 
offset, void *buf, u32 src
                goto error;
        }
        if (!(l2t_offset & QCOW_OFLAG_COPIED)) {
-               pr_warning("copy-on-write clusters are not supported");
+               pr_warning("L2 copy-on-write clusters are not supported");
                goto error;
        }

@@ -476,23 +660,54 @@ static ssize_t qcow_write_cluster(struct qcow *q, u64 
offset, void *buf, u32 src
        if (!f_sz)
                goto error;

-       clust_start     = be64_to_cpu(l2t->table[l2t_idx]);
-       if (clust_start & QCOW_OFLAG_COMPRESSED) {
+       clust_start = be64_to_cpu(l2t->table[l2t_idx]);
+
+       clust_flags = clust_start & QCOW_OFLAGS_MASK;
+       if (clust_flags & QCOW_OFLAG_COMPRESSED) {
                pr_warning("compressed clusters are not supported");
                goto error;
        }
-       if (!(clust_start & QCOW_OFLAG_COPIED)) {
-               pr_warning("copy-on-write clusters are not supported");
-               goto error;
-       }

        clust_start &= QCOW_OFFSET_MASK;
        if (!clust_start) {
                clust_start             = ALIGN(f_sz, clust_sz);
-               l2t->table[l2t_idx]  = cpu_to_be64(clust_start);
+               l2t->table[l2t_idx]  = cpu_to_be64(clust_start | 
QCOW_OFLAG_COPIED);
                l2t->dirty           = 1;
        }

+       if (!(clust_flags & QCOW_OFLAG_COPIED)) {
+               struct qcow_refcount_block *rfb = NULL;
+               u16 clust_refcount;
+               u64 clust_idx;
+               u64 rfb_idx;
+
+               clust_idx = (clust_start & QCOW_OFFSET_MASK) >> 
(header->cluster_bits);
+
+               rfb = qcow_read_refcount_block(q, clust_idx);
+               if (!rfb) {
+                       pr_warning("L1: error while reading refcount table");
+                       goto error;
+               }
+
+               rfb_idx = clust_idx & (((1ULL << (header->cluster_bits - 
QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
+               if (rfb_idx >= rfb->size) {
+                       pr_warning("L1: refcount block index out of bounds");
+                       goto error;
+               }
+
+               clust_refcount = be16_to_cpu(rfb->entries[rfb_idx]);
+               if (!clust_refcount) {
+                       clust_refcount = 1;
+                       rfb->entries[rfb_idx] = cpu_to_be16(clust_refcount);
+                       rfb->dirty = 1;
+               }
+
+               if (clust_refcount > 1) {
+                       pr_warning("L1 copy-on-write clusters are not 
supported");
+                       goto error;
+               }
+       }
+
        mutex_unlock(&q->mutex);

        /* Write actual data */
@@ -547,15 +762,27 @@ static ssize_t qcow_nowrite_sector(struct disk_image 
*disk, u64 sector, void *sr
 static int qcow_disk_flush(struct disk_image *disk)
 {
        struct qcow *q = disk->priv;
+       struct qcow_refcount_table *rft;
        struct qcow_header *header;
        struct list_head *pos, *n;
        struct qcow_l1_table *l1t;

        header = q->header;
        l1t = &q->table;
+       rft = &q->refcount_table;

        mutex_lock(&q->mutex);

+       list_for_each_safe(pos, n, &rft->lru_list) {
+               struct qcow_refcount_block *c = list_entry(pos, struct 
qcow_refcount_block, list);
+
+               if (write_refcount_block(q, c) < 0)
+                       goto error_unlock;
+       }
+
+       if (fdatasync(disk->fd) < 0)
+               goto error_unlock;
+
        list_for_each_safe(pos, n, &l1t->lru_list) {
                struct qcow_l2_table *c = list_entry(pos, struct qcow_l2_table, 
list);

@@ -587,7 +814,9 @@ static int qcow_disk_close(struct disk_image *disk)

        q = disk->priv;

+       refcount_table_free_cache(&q->refcount_table);
        l1_table_free_cache(&q->table);
+       free(q->refcount_table.rf_table);
        free(q->table.l1_table);
        free(q->header);
        free(q);
@@ -608,6 +837,26 @@ static struct disk_image_operations qcow_disk_ops = {
        .close                  = qcow_disk_close,
 };

+static int qcow_read_refcount_table(struct qcow *q)
+{
+       struct qcow_header *header = q->header;
+       struct qcow_refcount_table *rft = &q->refcount_table;
+       u64 cluster_size;
+
+       cluster_size = 1 << header->cluster_bits;
+
+       rft->rf_size = (header->refcount_table_size * cluster_size) / 
sizeof(u64);
+
+       rft->rf_table = calloc(rft->rf_size, sizeof(u64));
+       if (!rft->rf_table)
+               return -1;
+
+       rft->root = RB_ROOT;
+       INIT_LIST_HEAD(&rft->lru_list);
+
+       return pread_in_full(q->fd, rft->rf_table, sizeof(u64) * rft->rf_size, 
header->refcount_table_offset);
+}
+
 static int qcow_read_l1_table(struct qcow *q)
 {
        struct qcow_header *header = q->header;
@@ -656,6 +905,8 @@ static void *qcow2_read_header(int fd)
                .l1_size                = f_header.l1_size,
                .cluster_bits           = f_header.cluster_bits,
                .l2_bits                = f_header.cluster_bits - 3,
+               .refcount_table_offset  = f_header.refcount_table_offset,
+               .refcount_table_size    = f_header.refcount_table_clusters,
        };

        return header;
@@ -687,6 +938,9 @@ static struct disk_image *qcow2_probe(int fd, bool readonly)
        if (qcow_read_l1_table(q) < 0)
                goto error;

+       if (qcow_read_refcount_table(q) < 0)
+               goto error;
+
        /*
         * Do not use mmap use read/write instead
         */
diff --git a/tools/kvm/include/kvm/qcow.h b/tools/kvm/include/kvm/qcow.h
index 4ddc2b2..46db702 100644
--- a/tools/kvm/include/kvm/qcow.h
+++ b/tools/kvm/include/kvm/qcow.h
@@ -40,10 +40,32 @@ struct qcow_l1_table {
        int                             nr_cached;
 };

+#define QCOW_REFCOUNT_BLOCK_SHIFT      1
+
+struct qcow_refcount_block {
+       u64                             offset;
+       struct rb_node                  node;
+       struct list_head                list;
+       u64                             size;
+       u8                              dirty;
+       u16                             entries[];
+};
+
+struct qcow_refcount_table {
+       u32                             rf_size;
+       u64                             *rf_table;
+
+       /* Refcount block caching data structures */
+       struct rb_root                  root;
+       struct list_head                lru_list;
+       int                             nr_cached;
+};
+
 struct qcow {
        pthread_mutex_t                 mutex;
        void                            *header;
        struct qcow_l1_table            table;
+       struct qcow_refcount_table      refcount_table;
        int                             fd;
 };

@@ -53,6 +75,8 @@ struct qcow_header {
        u32                             l1_size;
        u8                              cluster_bits;
        u8                              l2_bits;
+       u64                             refcount_table_offset;
+       u32                             refcount_table_size;
 };

 struct qcow1_header_disk {
--
1.7.0.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to