On 18.11.2011 09:47, Lan Tianyu wrote:
> When a write request targets a cluster without the copied flag,
> allocate a new cluster and write the original data, with the
> modification applied, to the new cluster. This also adds write
> support for qcow2 compressed images. After testing, the image file
> passes "qemu-img check".
> 
> Signed-off-by: Lan Tianyu <tianyu....@intel.com>
> ---
>  tools/kvm/disk/qcow.c        |  366 +++++++++++++++++++++++++++++-------------
>  tools/kvm/include/kvm/qcow.h |    2 +
>  2 files changed, 255 insertions(+), 113 deletions(-)
> 
> diff --git a/tools/kvm/disk/qcow.c b/tools/kvm/disk/qcow.c
> index 680b37d..4d9125d 100644
> --- a/tools/kvm/disk/qcow.c
> +++ b/tools/kvm/disk/qcow.c
> @@ -122,9 +122,6 @@ static int cache_table(struct qcow *q, struct qcow_l2_table *c)
>                */
>               lru = list_first_entry(&l1t->lru_list, struct qcow_l2_table, list);
>  
> -             if (qcow_l2_cache_write(q, lru) < 0)
> -                     goto error;
> -
>               /* Remove the node from the cache */
>               rb_erase(&lru->node, r);
>               list_del_init(&lru->list);
> @@ -618,9 +615,6 @@ static int cache_refcount_block(struct qcow *q, struct qcow_refcount_block *c)
>       if (rft->nr_cached == MAX_CACHE_NODES) {
>               lru = list_first_entry(&rft->lru_list, struct qcow_refcount_block, list);
>  
> -             if (write_refcount_block(q, lru) < 0)
> -                     goto error;
> -
>               rb_erase(&lru->node, r);
>               list_del_init(&lru->list);
>               rft->nr_cached--;
> @@ -706,6 +700,11 @@ static struct qcow_refcount_block *qcow_read_refcount_block(struct qcow *q, u64
>  
>       rfb_offset = be64_to_cpu(rft->rf_table[rft_idx]);
>  
> +     if (!rfb_offset) {
> +             pr_warning("Growing the refcount table is not supported");
> +             return NULL;
> +     }
> +
>       rfb = refcount_block_search(q, rfb_offset);
>       if (rfb)
>               return rfb;
> @@ -728,35 +727,121 @@ error_free_rfb:
>       return NULL;
>  }
>  
> -/*
> - * QCOW file might grow during a write operation. Not only data but metadata is
> - * also written at the end of the file. Therefore it is necessary to ensure
> - * every write is committed to disk. Hence we use qcow_pwrite_sync() to
> - * synchronize the in-core state of QCOW image to disk.
> - *
> - * We also try to restore the image to a consistent state if the metadata
> - * operation fails. The two metadata operations are: level 1 and level 2 table
> - * update. If either of them fails the image is truncated to a consistent state.
> +static u16 qcow_get_refcount(struct qcow *q, u64 clust_idx)
> +{
> +     struct qcow_refcount_block *rfb = NULL;
> +     struct qcow_header *header = q->header;
> +     u64 rfb_idx;
> +
> +     rfb = qcow_read_refcount_block(q, clust_idx);
> +     if (!rfb) {
> +             pr_warning("Error while reading refcount table");
> +             return -1;
> +     }
> +
> +     rfb_idx = clust_idx & (((1ULL <<
> +             (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
> +
> +     if (rfb_idx >= rfb->size) {
> +             pr_warning("L1: refcount block index out of bounds");
> +             return -1;
> +     }
> +
> +     return be16_to_cpu(rfb->entries[rfb_idx]);
> +}
> +
> +static int update_cluster_refcount(struct qcow *q, u64 clust_idx, u16 append)
> +{
> +     struct qcow_refcount_block *rfb = NULL;
> +     struct qcow_header *header = q->header;
> +     u16 refcount;
> +     u64 rfb_idx;
> +
> +     rfb = qcow_read_refcount_block(q, clust_idx);
> +     if (!rfb) {
> +             pr_warning("error while reading refcount table");
> +             return -1;
> +     }
> +
> +     rfb_idx = clust_idx & (((1ULL <<
> +             (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
> +     if (rfb_idx >= rfb->size) {
> +             pr_warning("refcount block index out of bounds");
> +             return -1;
> +     }
> +
> +     refcount = be16_to_cpu(rfb->entries[rfb_idx]) + append;
> +     rfb->entries[rfb_idx] = cpu_to_be16(refcount);
> +     rfb->dirty = 1;
> +
> +     /*write refcount block*/
> +     write_refcount_block(q, rfb);

Missing error handling.

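Something like this (untested sketch) would propagate the failure:

	if (write_refcount_block(q, rfb) < 0) {
		pr_warning("refcount block write error");
		return -1;
	}
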
> +
> +     /*update free_clust_idx since refcount becomes zero*/
> +     if (!refcount && clust_idx < q->free_clust_idx)
> +             q->free_clust_idx = clust_idx;
> +
> +     return 0;
> +}
> +
> +static void  qcow_free_clusters(struct qcow *q, u64 clust_start, u64 size)
> +{
> +     struct qcow_header *header = q->header;
> +     u64 start, end, offset;
> +
> +     start = clust_start & ~(q->cluster_size - 1);
> +     end = (clust_start + size - 1) & ~(q->cluster_size - 1);
> +     for (offset = start; offset <= end; offset += q->cluster_size)
> +             update_cluster_refcount(q, offset >> header->cluster_bits, -1);
> +}
> +
> +/*Allocate clusters according to the size: find a position that
> + *can satisfy the size. free_clust_idx is initialized to zero and
> + *records the last scan position.
> +*/
> +static u64 qcow_alloc_clusters(struct qcow *q, u64 size)
> +{
> +     struct qcow_header *header = q->header;
> +     u16 clust_refcount;
> +     u32 clust_idx, i;
> +     u64 clust_num;
> +
> +     clust_num = (size + (q->cluster_size - 1)) >> header->cluster_bits;
> +
> +again:
> +     for (i = 0; i < clust_num; i++) {
> +             clust_idx = q->free_clust_idx++;
> +             clust_refcount = qcow_get_refcount(q, clust_idx);
> +             if (clust_refcount < 0)
> +                     return -1;
> +             else if (clust_refcount > 0)
> +                     goto again;
> +     }
> +
> +     for (i = 0; i < clust_num; i++)
> +             update_cluster_refcount(q,
> +                     q->free_clust_idx - clust_num + i, 1);

Error handling again.

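E.g. a minimal sketch (leaving aside the question of rolling back
refcounts that were already taken):

	for (i = 0; i < clust_num; i++)
		if (update_cluster_refcount(q,
				q->free_clust_idx - clust_num + i, 1) < 0)
			return -1;
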
> +
> +     return (q->free_clust_idx - clust_num) << header->cluster_bits;
> +}
> +
> +/*Get the l2 table. If the table has been copied, read it directly.
> + *If not, allocate a new cluster and copy the existing table (if any)
> + *into the new cluster.
>   */
> -static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src_len)
> +static int get_cluster_table(struct qcow *q, u64 offset,
> +     struct qcow_l2_table **result_l2t, u64 *result_l2_index)
>  {
>       struct qcow_header *header = q->header;
>       struct qcow_l1_table *l1t = &q->table;
>       struct qcow_l2_table *l2t;
> -     u64 clust_start;
> -     u64 clust_flags;
> -     u64 l2t_offset;
> -     u64 clust_off;
> -     u64 l2t_size;
> -     u64 clust_sz;
>       u64 l1t_idx;
> +     u64 l2t_offset;
>       u64 l2t_idx;
> -     u64 f_sz;
> -     u64 len;
> +     u64 l2t_size;
> +     u64 l2t_new_offset;
>  
> -     l2t             = NULL;
> -     l2t_size        = 1 << header->l2_bits;
> -     clust_sz        = 1 << header->cluster_bits;
> +     l2t_size = 1 << header->l2_bits;
>  
>       l1t_idx = get_l1_index(q, offset);
>       if (l1t_idx >= l1t->table_size)
> @@ -766,122 +851,166 @@ static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src
>       if (l2t_idx >= l2t_size)
>               return -1;
>  
> -     clust_off = get_cluster_offset(q, offset);
> -     if (clust_off >= clust_sz)
> -             return -1;
> -
> -     len = clust_sz - clust_off;
> -     if (len > src_len)
> -             len = src_len;
> -
> -     mutex_lock(&q->mutex);
> -
>       l2t_offset = be64_to_cpu(l1t->l1_table[l1t_idx]);
> -     if (l2t_offset & QCOW2_OFLAG_COMPRESSED) {
> -             pr_warning("compressed clusters are not supported");
> -             goto error;
> -     }
> -     if (!(l2t_offset & QCOW2_OFLAG_COPIED)) {
> -             pr_warning("L2 copy-on-write clusters are not supported");
> -             goto error;
> -     }
> -
> -     l2t_offset &= QCOW2_OFFSET_MASK;
> -     if (l2t_offset) {
> -             /* read and cache l2 table */
> +     if (l2t_offset & QCOW2_OFLAG_COPIED) {
> +             l2t_offset &= ~QCOW2_OFLAG_COPIED;
>               l2t = qcow_read_l2_table(q, l2t_offset);
>               if (!l2t)
>                       goto error;
>       } else {
> -             l2t = new_cache_table(q, l2t_offset);
> -             if (!l2t)
> +             l2t_new_offset = qcow_alloc_clusters(q, l2t_size*sizeof(u64));
> +             if (l2t_new_offset < 0)
>                       goto error;
>  
> -             /* Capture the state of the consistent QCOW image */
> -             f_sz = file_size(q->fd);
> -             if (!f_sz)
> -                     goto free_cache;
> +             l2t = new_cache_table(q, l2t_new_offset);
> +             if (!l2t)
> +                     goto free_cluster;
> +
> +             if (l2t_offset)
> +                     qcow2_read_cluster(q, l2t_offset, l2t->table,
> +                             l2t_size*sizeof(u64));

There must be a system behind it. :-)

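That is, qcow2_read_cluster() can fail, too. A sketch:

	if (qcow2_read_cluster(q, l2t_offset, l2t->table,
			l2t_size * sizeof(u64)) < 0)
		goto free_cache;
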
> +             else
> +                     memset(l2t->table, 0x00, l2t_size * sizeof(u64));
>  
> -             /* Write the l2 table of 0's at the end of the file */
> -             l2t_offset = qcow_write_l2_table(q, l2t->table);
> -             if (!l2t_offset)
> +             /*write l2 table*/
> +             l2t->dirty = 1;
> +             if (qcow_l2_cache_write(q, l2t) < 0)
>                       goto free_cache;

You need to make sure that the refcount update is written first (e.g.
with fsync), otherwise you risk corruption when the host crashes in the
middle.

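A minimal sketch, assuming a plain fsync() on q->fd is acceptable as
the barrier, placed before the qcow_l2_cache_write() call:

	/* Commit the refcount update before writing the L2 table
	 * that makes the new cluster reachable. */
	if (fsync(q->fd) < 0)
		goto free_cache;
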
>  
> -             if (cache_table(q, l2t) < 0) {
> -                     if (ftruncate(q->fd, f_sz) < 0)
> -                             goto free_cache;
> +             /* Update the l1 table */
> +             l1t->l1_table[l1t_idx] = cpu_to_be64(l2t_new_offset
> +                     | QCOW2_OFLAG_COPIED);
>  
> -                     goto free_cache;
> -             }
> +             if (pwrite_in_full(q->fd, l1t->l1_table,
> +                     l1t->table_size * sizeof(u64),
> +                     header->l1_table_offset) < 0)
> +                     goto error;

Likewise, the L1 table write must be ordered against the L2 write.

goto error is using the wrong label.

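Presumably, after syncing the L2 write, this wants to be:

	if (pwrite_in_full(q->fd, l1t->l1_table,
			l1t->table_size * sizeof(u64),
			header->l1_table_offset) < 0)
		goto free_cache;
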
>  
> -             /* Update the in-core entry */
> -             l1t->l1_table[l1t_idx] = cpu_to_be64(l2t_offset);
> +             /*cache l2 table*/
> +             cache_table(q, l2t);

After so many explicit comments, you can probably guess what's wrong here...

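A sketch; note that by now the on-disk L1 table already references the
new L2 table, so the unwind path needs more thought than a plain goto
free_cache:

	if (cache_table(q, l2t) < 0)
		goto error;
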
> +
> +             /*free old cluster*/
> +             qcow_free_clusters(q, l2t_offset, q->cluster_size);
>       }
>  
> -     /* Capture the state of the consistent QCOW image */
> -     f_sz            = file_size(q->fd);
> -     if (!f_sz)
> -             goto error;
> +     *result_l2t = l2t;
> +     *result_l2_index = l2t_idx;
>  
> -     clust_start = be64_to_cpu(l2t->table[l2t_idx]);
> +     return 0;
>  
> -     clust_flags = clust_start & QCOW2_OFLAGS_MASK;
> -     if (clust_flags & QCOW2_OFLAG_COMPRESSED) {
> -             pr_warning("compressed clusters are not supported");
> +free_cache:
> +     free(l2t);
> +
> +free_cluster:
> +     qcow_free_clusters(q, l2t_new_offset, q->cluster_size);
> +
> +error:
> +     return -1;
> +}
> +
> +/*If the cluster has been copied, write the data directly. If not,
> + *read the original data and write it, with the modification
> + *applied, to a new cluster.
> +*/
> +static ssize_t qcow_write_cluster(struct qcow *q, u64 offset,
> +             void *buf, u32 src_len)
> +{
> +     struct qcow_header *header = q->header;
> +     struct qcow_l2_table *l2t;
> +     u64 clust_new_idx;
> +     u64 clust_new_start;
> +     u64 clust_start;
> +     u64 clust_flags;
> +     u64 clust_off;
> +     u64 l2t_idx;
> +     u64 len;
> +
> +     l2t = NULL;
> +
> +     clust_off = get_cluster_offset(q, offset);
> +     if (clust_off >= q->cluster_size)
> +             return -1;
> +
> +     len = q->cluster_size - clust_off;
> +     if (len > src_len)
> +             len = src_len;
> +
> +     mutex_lock(&q->mutex);
> +
> +     if (get_cluster_table(q, offset, &l2t, &l2t_idx)) {
> +             pr_warning("Get l2 table error");
>               goto error;
>       }
>  
> -     clust_start &= QCOW2_OFFSET_MASK;
> -     if (!clust_start) {
> -             clust_start             = ALIGN(f_sz, clust_sz);
> -             l2t->table[l2t_idx]     = cpu_to_be64(clust_start | QCOW2_OFLAG_COPIED);
> -             l2t->dirty              = 1;
> -     }
> +     clust_start = be64_to_cpu(l2t->table[l2t_idx]);
> +     clust_flags = clust_start & QCOW2_OFLAGS_MASK;
>  
> +     clust_start &= QCOW2_OFFSET_MASK;
>       if (!(clust_flags & QCOW2_OFLAG_COPIED)) {
> -             struct qcow_refcount_block *rfb = NULL;
> -             u16 clust_refcount;
> -             u64 clust_idx;
> -             u64 rfb_idx;
> -
> -             clust_idx = (clust_start & QCOW2_OFFSET_MASK)
> -                     >> (header->cluster_bits);
>  
> -             rfb = qcow_read_refcount_block(q, clust_idx);
> -             if (!rfb) {
> -                     pr_warning("L1: error while reading refcount table");
> +             clust_new_start = qcow_alloc_clusters(q, q->cluster_size);
> +             if (clust_new_start < 0) {
> +                     pr_warning("Cluster alloc error!");
>                       goto error;
>               }
>  
> -             rfb_idx = clust_idx & (((1ULL << (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
> -             if (rfb_idx >= rfb->size) {
> -                     pr_warning("L1: refcount block index out of bounds");
> -                     goto error;
> -             }
> +             clust_new_idx = clust_new_start >> header->cluster_bits;
> +             offset &= ~(q->cluster_size - 1);
> +
> +             /*if clust_start is not zero, read the original data*/
> +             if (clust_start) {
> +                     mutex_unlock(&q->mutex);
> +                     if (qcow2_read_cluster(q, offset, q->copy_buff,
> +                             q->cluster_size) < 0) {
> +                             pr_warning("Read copy cluster error");
> +                             qcow_free_clusters(q, clust_new_start,
> +                                     q->cluster_size);
> +                             return -1;
> +                     }
> +                     mutex_lock(&q->mutex);
> +             } else
> +                     memset(q->copy_buff, 0x00, q->cluster_size);
> +
> +             memcpy(q->copy_buff + clust_off, buf, len);
> +
> +              /* Write actual data */
> +             if (pwrite_in_full(q->fd, q->copy_buff, q->cluster_size,
> +                     clust_new_start) < 0)
> +                     goto free_cluster;
> +
> +             /*update l2 table*/
> +             l2t->table[l2t_idx] = cpu_to_be64(clust_new_start
> +                     | QCOW2_OFLAG_COPIED);
> +             l2t->dirty = 1;
> +
> +             if (qcow_l2_cache_write(q, l2t))
> +                     goto free_cluster;

Cluster allocation must be ordered against L2 update.

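That is, the refcount update from the allocation and the data write
above must reach the disk before the L2 entry that makes the new
cluster visible, e.g. (sketch, again assuming fsync() as the barrier):

	/* New data cluster must be on disk before the L2 entry
	 * referencing it. */
	if (fsync(q->fd) < 0)
		goto free_cluster;
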
> +
> +             /*free old cluster*/
> +             if (clust_flags & QCOW2_OFLAG_COMPRESSED) {
> +                     int size;
> +                     size = ((clust_start >> q->csize_shift) &
> +                             q->csize_mask) + 1;
> +                     size *= 512;
> +                     clust_start &= q->cluster_offset_mask;
> +                     clust_start &= ~511;
> +
> +                     qcow_free_clusters(q, clust_start, size);
> +             } else if (clust_start)
> +                     qcow_free_clusters(q, clust_start, q->cluster_size);
>  
> -             clust_refcount = be16_to_cpu(rfb->entries[rfb_idx]);
> -             if (!clust_refcount) {
> -                     clust_refcount = 1;
> -                     rfb->entries[rfb_idx] = cpu_to_be16(clust_refcount);
> -                     rfb->dirty = 1;
> -             }
> -
> -             if (clust_refcount > 1) {
> -                     pr_warning("L1 copy-on-write clusters are not supported");
> +     } else {
> +             /* Write actual data */
> +             if (pwrite_in_full(q->fd, buf, len,
> +                     clust_start + clust_off) < 0)
>                       goto error;
> -             }
>       }
> -
>       mutex_unlock(&q->mutex);
> -
> -     /* Write actual data */
> -     if (pwrite_in_full(q->fd, buf, len, clust_start + clust_off) < 0)
> -             return -1;
> -
>       return len;
>  
> -free_cache:
> -     free(l2t);
> +free_cluster:
> +     qcow_free_clusters(q, clust_new_start, q->cluster_size);
> +
>  error:
>       mutex_unlock(&q->mutex);
>       return -1;
> @@ -993,6 +1122,7 @@ static int qcow_disk_close(struct disk_image *disk)
>  
>       refcount_table_free_cache(&q->refcount_table);
>       l1_table_free_cache(&q->table);
> +     free(q->copy_buff);
>       free(q->cluster_data);
>       free(q->cluster_cache);
>       free(q->refcount_table.rf_table);
> @@ -1117,10 +1247,16 @@ static struct disk_image *qcow2_probe(int fd, bool readonly)
>       q->cluster_offset_mask = (1LL << q->csize_shift) - 1;
>       q->cluster_size = 1 << q->header->cluster_bits;
>  
> +     q->copy_buff = malloc(q->cluster_size);
> +     if (!q->copy_buff) {
> +             pr_warning("copy buff malloc error!");
> +             goto free_header;
> +     }
> +
>       q->cluster_data = malloc(q->cluster_size);
>       if (!q->cluster_data) {
>               pr_warning("cluster data malloc error!");
> -             goto free_header;
> +             goto free_copy_buff;
>       }
>  
>       q->cluster_cache = malloc(q->cluster_size);
> @@ -1163,6 +1299,9 @@ free_cluster_cache:
>  free_cluster_data:
>       if (q->cluster_data)
>               free(q->cluster_data);
> +free_copy_buff:
> +     if (q->cluster_data)
> +             free(q->cluster_data);

This looks like the wrong field.

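Presumably this was meant to read:

free_copy_buff:
	if (q->copy_buff)
		free(q->copy_buff);
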
>  free_header:
>       if (q->header)
>               free(q->header);
> @@ -1252,6 +1391,7 @@ static struct disk_image *qcow1_probe(int fd, bool readonly)
>       q->version = QCOW1_VERSION;
>       q->cluster_size = 1 << q->header->cluster_bits;
>       q->cluster_offset_mask = (1LL << (63 - q->header->cluster_bits)) - 1;
> +     q->free_clust_idx = 0;
>  
>       q->cluster_data = malloc(q->cluster_size);
>       if (!q->cluster_data) {
> diff --git a/tools/kvm/include/kvm/qcow.h b/tools/kvm/include/kvm/qcow.h
> index bbf7913..e032a1e 100644
> --- a/tools/kvm/include/kvm/qcow.h
> +++ b/tools/kvm/include/kvm/qcow.h
> @@ -84,8 +84,10 @@ struct qcow {
>       u32                             version;
>       u64                             cluster_size;
>       u64                             cluster_offset_mask;
> +     u64                             free_clust_idx;
>       void                            *cluster_cache;
>       void                            *cluster_data;
> +     void                            *copy_buff;
>  };
>  
>  struct qcow1_header_disk {

Kevin