On Mon, Jul 07, 2014 at 10:54:27AM -0400, Milos Vyletel wrote: > VMDK's streamOptimized format is different than the regular sparse format. > L1(GD) and L2(GT) tables are not predefined but rather generated and > written during image creation mainly because there is no way to tell > how much space data will occupy once they are compressed. Also the > location of header, L1 and L2 tables differs. > > - L2 tables (grain tables) are written after all grains they point to > - L1 tables are written after all grains and L2 tables > - footer at the end is used instead of header in first sector > > This patch improves streamOptimized support and adds possibility to > create true streamOptimized images using qemu-img. Some of the changes > are from VMDK specs, some of them from hexdump-ing images from VMWare > and VirtualBox. > > I have compared these images to the ones generated by VMWare and vbox > and they are identical with the exception of DescriptorFile that has > some differences but none that would change behavior (CID and some > additional DDB entries differ) and streamOptimized image generated from > raw image was successfully imported (as OVA) into VMWare ESXi and Oracle > OVM. > > Signed-off-by: Milos Vyletel <milos.vyle...@gmail.com> > --- > block/vmdk.c | 363 > +++++++++++++++++++++++++++++++++++++++++++++------------- > 1 files changed, 281 insertions(+), 82 deletions(-)
What does this patch do beyond what QEMU already supports today? Is there a particular application that rejected QEMU's streamOptimized images? Is this a bug fix? Please use scripts/checkpatch.pl to check coding style. Fam: Please review > diff --git a/block/vmdk.c b/block/vmdk.c > index 27a78da..f482225 100644 > --- a/block/vmdk.c > +++ b/block/vmdk.c > @@ -81,6 +81,21 @@ typedef struct { > uint16_t compressAlgorithm; > } QEMU_PACKED VMDK4Header; > > +typedef struct { > + uint64_t val; > + uint32_t size; > + uint32_t type; > + uint8_t pad[BDRV_SECTOR_SIZE - sizeof(uint64_t) - 2*sizeof(uint32_t)]; > +} QEMU_PACKED VMDK4MetaMarker; > + > +typedef struct { > + VMDK4MetaMarker footer_marker; > + uint32_t magic; > + VMDK4Header header; > + uint8_t pad[BDRV_SECTOR_SIZE - sizeof(uint32_t) - sizeof(VMDK4Header)]; > + VMDK4MetaMarker eos_marker; > +} QEMU_PACKED VMDK4Footer; > + > #define L2_CACHE_SIZE 16 > > typedef struct VmdkExtent { > @@ -89,24 +104,29 @@ typedef struct VmdkExtent { > bool compressed; > bool has_marker; > bool has_zero_grain; > + bool has_footer; > int version; > int64_t sectors; > int64_t end_sector; > int64_t flat_start_offset; > int64_t l1_table_offset; > int64_t l1_backup_table_offset; > + uint32_t l1_index; > uint32_t *l1_table; > uint32_t *l1_backup_table; > unsigned int l1_size; > uint32_t l1_entry_sectors; > > unsigned int l2_size; > + uint32_t *l2_table; > uint32_t *l2_cache; > uint32_t l2_cache_offsets[L2_CACHE_SIZE]; > uint32_t l2_cache_counts[L2_CACHE_SIZE]; > > int64_t cluster_sectors; > char *type; > + > + VMDK4Footer footer; > } VmdkExtent; > > typedef struct BDRVVmdkState { > @@ -555,14 +575,51 @@ static char *vmdk_read_desc(BlockDriverState *file, > uint64_t desc_offset, > return buf; > } > > +static int vmdk_read_footer(BlockDriverState *bs, > + VMDK4Footer *footer) > +{ > + int ret; > + > + /* > + * footer starts 3 sectors from end > + * - footer marker > + * - footer > + * - end-of-stream marker > + */ > + ret = bdrv_pread(bs->file, 
> + (bs->file->total_sectors - 3) * BDRV_SECTOR_SIZE, > + footer, sizeof(*footer)); > + if (ret < 0) { > + goto out; > + } > + > + /* Some sanity checks for the footer */ > + if (be32_to_cpu(footer->magic) != VMDK4_MAGIC || > + le32_to_cpu(footer->footer_marker.size) != 0 || > + le32_to_cpu(footer->footer_marker.type) != MARKER_FOOTER || > + le64_to_cpu(footer->eos_marker.val) != 0 || > + le32_to_cpu(footer->eos_marker.size) != 0 || > + le32_to_cpu(footer->eos_marker.type) != MARKER_END_OF_STREAM) > + { > + ret = -EINVAL; > + goto out; > + } > + > + ret = VMDK_OK; > + out: > + return ret; > +} > + > static int vmdk_open_vmdk4(BlockDriverState *bs, > BlockDriverState *file, > int flags, Error **errp) > { > int ret; > + bool has_footer = false; > uint32_t magic; > uint32_t l1_size, l1_entry_sectors; > VMDK4Header header; > + VMDK4Footer footer; > VmdkExtent *extent; > BDRVVmdkState *s = bs->opaque; > int64_t l1_backup_offset = 0; > @@ -593,48 +650,13 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, > > if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) { > /* > - * The footer takes precedence over the header, so read it in. The > - * footer starts at offset -1024 from the end: One sector for the > - * footer, and another one for the end-of-stream marker. > + * The footer takes precedence over the header, so read it in. 
> */ > - struct { > - struct { > - uint64_t val; > - uint32_t size; > - uint32_t type; > - uint8_t pad[512 - 16]; > - } QEMU_PACKED footer_marker; > - > - uint32_t magic; > - VMDK4Header header; > - uint8_t pad[512 - 4 - sizeof(VMDK4Header)]; > - > - struct { > - uint64_t val; > - uint32_t size; > - uint32_t type; > - uint8_t pad[512 - 16]; > - } QEMU_PACKED eos_marker; > - } QEMU_PACKED footer; > - > - ret = bdrv_pread(file, > - bs->file->total_sectors * 512 - 1536, > - &footer, sizeof(footer)); > + ret = vmdk_read_footer(bs, &footer); > if (ret < 0) { > return ret; > } > - > - /* Some sanity checks for the footer */ > - if (be32_to_cpu(footer.magic) != VMDK4_MAGIC || > - le32_to_cpu(footer.footer_marker.size) != 0 || > - le32_to_cpu(footer.footer_marker.type) != MARKER_FOOTER || > - le64_to_cpu(footer.eos_marker.val) != 0 || > - le32_to_cpu(footer.eos_marker.size) != 0 || > - le32_to_cpu(footer.eos_marker.type) != MARKER_END_OF_STREAM) > - { > - return -EINVAL; > - } > - > + has_footer = true; > header = footer.header; > } > > @@ -645,11 +667,15 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, > error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, > bs->device_name, "vmdk", buf); > return -ENOTSUP; > - } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR)) { > + } else if (le32_to_cpu(header.version) == 3 && > + (flags & BDRV_O_RDWR) && > + !(le64_to_cpu(header.flags) & VMDK4_FLAG_COMPRESS)) { > /* VMware KB 2064959 explains that version 3 added support for > * persistent changed block tracking (CBT), and backup software can > * read it as version=1 if it doesn't care about the changed area > - * information. So we are safe to enable read only. */ > + * information. So we are safe to enable read only. 
> + * Note that this does not apply to streamOptimized images which > + * are written only once and are used as transport format */ > error_setg(errp, "VMDK version 3 must be read only"); > return -EINVAL; > } > @@ -689,11 +715,20 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, > if (ret < 0) { > return ret; > } > + if (has_footer) { > + extent->has_footer = has_footer; > + extent->footer = footer; > + } > + > extent->compressed = > le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE; > if (extent->compressed) { > g_free(s->create_type); > s->create_type = g_strdup("streamOptimized"); > + > + if (flags & BDRV_O_RDWR) > + bdrv_truncate(file, > + le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE); > } > extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER; > extent->version = le32_to_cpu(header.version); > @@ -1000,6 +1035,11 @@ static int vmdk_L2update(VmdkExtent *extent, > VmdkMetaData *m_data) > uint32_t offset; > QEMU_BUILD_BUG_ON(sizeof(offset) != sizeof(m_data->offset)); > offset = cpu_to_le32(m_data->offset); > + > + /* do not update on streamOptimized */ > + if (extent->compressed) > + return VMDK_OK; > + > /* update L2 table */ > if (bdrv_pwrite_sync( > extent->file, > @@ -1026,6 +1066,97 @@ static int vmdk_L2update(VmdkExtent *extent, > VmdkMetaData *m_data) > return VMDK_OK; > } > > +static int vmdk_write_footer(BlockDriverState *bs, > + VMDK4Footer *footer, > + VmdkExtent *extent) > +{ > + int i, ret, gd_buf_size; > + uint32_t *gd_buf = NULL; > + uint32_t grains, gd_sectors, gt_size, gt_count; > + uint64_t offset; > + VMDK4Header header; > + VMDK4MetaMarker gd_marker; > + > + header = footer->header; > + offset = le64_to_cpu(header.gd_offset); > + > + grains = DIV_ROUND_UP(header.capacity, header.granularity); > + gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t), > + BDRV_SECTOR_SIZE); > + gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt); > + gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), 
BDRV_SECTOR_SIZE); > + > + /* write grain directory marker */ > + memset(&gd_marker, 0, sizeof(gd_marker)); > + gd_marker.val = cpu_to_le64(gd_sectors); > + gd_marker.type = cpu_to_le32(MARKER_GRAIN_DIRECTORY); > + > + ret = bdrv_pwrite(bs, offset * BDRV_SECTOR_SIZE, &gd_marker, > sizeof(gd_marker)); > + if (ret < 0) > + goto exit; > + offset += sizeof(gd_marker) / BDRV_SECTOR_SIZE; > + > + /* write grain directory */ > + gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE; > + gd_buf = g_malloc0(gd_buf_size); > + if (extent) { > + /* copy over L1 table if we have it */ > + for (i = 0; i < gt_count; i++) { > + gd_buf[i] = cpu_to_le32(extent->l1_table[i]); > + } > + } > + ret = bdrv_pwrite(bs, offset * BDRV_SECTOR_SIZE, gd_buf, gd_buf_size); > + if (ret < 0) > + goto exit; > + > + /* save real gd_offset */ > + footer->header.gd_offset = cpu_to_le64(offset); > + offset += gd_sectors; > + > + /* write footer */ > + ret = bdrv_pwrite(bs, offset * BDRV_SECTOR_SIZE, footer, > sizeof(*footer)); > + if (ret < 0) > + goto exit; > + > + ret = 0; > + exit: > + return ret; > +} > + > +static int vmdk_write_grain_table(VmdkExtent *extent) > +{ > + int ret; > + VMDK4MetaMarker gtm; > + uint64_t offset; > + > + offset = le64_to_cpu(extent->footer.header.gd_offset) << 9; > + > + memset(&gtm, 0, sizeof(gtm)); > + gtm.val = cpu_to_le32((extent->l2_size * sizeof(uint32_t)) >> 9); > + gtm.type = cpu_to_le32(MARKER_GRAIN_TABLE); > + if (bdrv_pwrite(extent->file, offset, &gtm, sizeof(gtm)) < 0) { > + ret = -EIO; > + goto out; > + } > + > + offset += sizeof(gtm); > + extent->l1_table[extent->l1_index] = offset >> 9; > + if (bdrv_pwrite(extent->file, offset, > + extent->l2_table, extent->l2_size * sizeof(uint32_t) > + ) != extent->l2_size * sizeof(uint32_t)) { > + ret = -EIO; > + goto out; > + } > + > + offset += extent->l2_size * sizeof(uint32_t); > + extent->l1_table_offset = offset; > + extent->footer.header.gd_offset = cpu_to_le64(offset >> 9); > + > + ret = 0; > + out: > + return ret; > +} > + >
static int get_cluster_offset(BlockDriverState *bs, > VmdkExtent *extent, > VmdkMetaData *m_data, > @@ -1034,8 +1165,8 @@ static int get_cluster_offset(BlockDriverState *bs, > uint64_t *cluster_offset) > { > unsigned int l1_index, l2_offset, l2_index; > - int min_index, i, j; > - uint32_t min_count, *l2_table; > + int min_index, i, j, ret; > + uint32_t min_count; > bool zeroed = false; > > if (m_data) { > @@ -1048,11 +1179,25 @@ static int get_cluster_offset(BlockDriverState *bs, > > offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE; > l1_index = (offset >> 9) / extent->l1_entry_sectors; > - if (l1_index >= extent->l1_size) { > + if (extent->compressed && l1_index && > + extent->l1_index != l1_index) { > + ret = vmdk_write_grain_table(extent); > + if (ret < 0) > + return ret; > + } > + > + extent->l1_index = l1_index; > + if (extent->l1_index >= extent->l1_size) { > return VMDK_ERROR; > } > - l2_offset = extent->l1_table[l1_index]; > + retry: > + l2_offset = extent->l1_table[extent->l1_index]; > + > if (!l2_offset) { > + if (extent->compressed) { > + extent->l1_table[extent->l1_index] = > bdrv_getlength(extent->file); > + goto retry; > + } > return VMDK_UNALLOC; > } > for (i = 0; i < L2_CACHE_SIZE; i++) { > @@ -1063,7 +1208,7 @@ static int get_cluster_offset(BlockDriverState *bs, > extent->l2_cache_counts[j] >>= 1; > } > } > - l2_table = extent->l2_cache + (i * extent->l2_size); > + extent->l2_table = extent->l2_cache + (i * extent->l2_size); > goto found; > } > } > @@ -1076,11 +1221,11 @@ static int get_cluster_offset(BlockDriverState *bs, > min_index = i; > } > } > - l2_table = extent->l2_cache + (min_index * extent->l2_size); > + extent->l2_table = extent->l2_cache + (min_index * extent->l2_size); > if (bdrv_pread( > extent->file, > (int64_t)l2_offset * 512, > - l2_table, > + extent->l2_table, > extent->l2_size * sizeof(uint32_t) > ) != extent->l2_size * sizeof(uint32_t)) { > return VMDK_ERROR; > @@ -1090,15 +1235,15 @@ static int 
get_cluster_offset(BlockDriverState *bs, > extent->l2_cache_counts[min_index] = 1; > found: > l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size; > - *cluster_offset = le32_to_cpu(l2_table[l2_index]); > + *cluster_offset = le32_to_cpu(extent->l2_table[l2_index]); > > if (m_data) { > m_data->valid = 1; > - m_data->l1_index = l1_index; > + m_data->l1_index = extent->l1_index; > m_data->l2_index = l2_index; > m_data->offset = *cluster_offset; > m_data->l2_offset = l2_offset; > - m_data->l2_cache_entry = &l2_table[l2_index]; > + m_data->l2_cache_entry = &extent->l2_table[l2_index]; > } > if (extent->has_zero_grain && *cluster_offset == VMDK_GTE_ZEROED) { > zeroed = true; > @@ -1119,7 +1264,7 @@ static int get_cluster_offset(BlockDriverState *bs, > } > > *cluster_offset >>= 9; > - l2_table[l2_index] = cpu_to_le32(*cluster_offset); > + extent->l2_table[l2_index] = cpu_to_le32(*cluster_offset); > > /* First of all we write grain itself, to avoid race condition > * that may to corrupt the image. > @@ -1210,6 +1355,7 @@ static int vmdk_write_extent(VmdkExtent *extent, > int64_t cluster_offset, > uLongf buf_len; > const uint8_t *write_buf = buf; > int write_len = nb_sectors * 512; > + uint64_t gd_offset; > > if (extent->compressed) { > if (!extent->has_marker) { > @@ -1227,6 +1373,7 @@ static int vmdk_write_extent(VmdkExtent *extent, > int64_t cluster_offset, > data->size = buf_len; > write_buf = (uint8_t *)data; > write_len = buf_len + sizeof(VmdkGrainMarker); > + > } > ret = bdrv_pwrite(extent->file, > cluster_offset + offset_in_cluster, > @@ -1236,6 +1383,13 @@ static int vmdk_write_extent(VmdkExtent *extent, > int64_t cluster_offset, > ret = ret < 0 ? 
ret : -EIO; > goto out; > } > + if (extent->compressed) { > + /* update GD offset after each write */ > + gd_offset = bdrv_getlength(extent->file); > + extent->l1_table_offset = gd_offset; > + gd_offset /= BDRV_SECTOR_SIZE; > + extent->footer.header.gd_offset = cpu_to_le64(gd_offset); > + } > ret = 0; > out: > g_free(data); > @@ -1534,10 +1688,12 @@ static int vmdk_create_extent(const char *filename, > int64_t filesize, > int ret, i; > BlockDriverState *bs = NULL; > VMDK4Header header; > + VMDK4Footer footer; > Error *local_err = NULL; > uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count; > uint32_t *gd_buf = NULL; > int gd_buf_size; > + uint64_t grain_offset, rgd_offset, gd_offset; > > ret = bdrv_create_file(filename, opts, &local_err); > if (ret < 0) { > @@ -1562,28 +1718,38 @@ static int vmdk_create_extent(const char *filename, > int64_t filesize, > } > magic = cpu_to_be32(VMDK4_MAGIC); > memset(&header, 0, sizeof(header)); > - header.version = zeroed_grain ? 2 : 1; > - header.flags = VMDK4_FLAG_RGD | VMDK4_FLAG_NL_DETECT > - | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0) > + memset(&footer, 0, sizeof(footer)); > + > + header.version = (compress ? 3 : zeroed_grain ? 2 : 1); > + header.flags = VMDK4_FLAG_NL_DETECT > + | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER > + : VMDK4_FLAG_RGD) > | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0); > header.compressAlgorithm = compress ? 
VMDK4_COMPRESSION_DEFLATE : 0; > header.capacity = filesize / BDRV_SECTOR_SIZE; > header.granularity = 128; > header.num_gtes_per_gt = BDRV_SECTOR_SIZE; > > - grains = DIV_ROUND_UP(filesize / BDRV_SECTOR_SIZE, header.granularity); > + grains = DIV_ROUND_UP(header.capacity, header.granularity); > gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t), > BDRV_SECTOR_SIZE); > gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt); > gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE); > > header.desc_offset = 1; > - header.desc_size = 20; > - header.rgd_offset = header.desc_offset + header.desc_size; > - header.gd_offset = header.rgd_offset + gd_sectors + (gt_size * gt_count); > - header.grain_offset = > + header.desc_size = (compress ? 2 : 20); > + rgd_offset = header.desc_offset + header.desc_size; > + header.rgd_offset = ((header.flags & VMDK4_FLAG_RGD) ? rgd_offset : 0); > + gd_offset = rgd_offset + gd_sectors + (gt_size * gt_count); > + header.gd_offset = (compress ? VMDK4_GD_AT_END : gd_offset); > + grain_offset = > ROUND_UP(header.gd_offset + gd_sectors + (gt_size * gt_count), > header.granularity); > + /* streamOptimized reserves first 128 sectors */ > + header.grain_offset = (compress ? 
header.granularity : grain_offset); > + /* streamOptimzed's grain directory is at the end */ > + gd_offset = header.grain_offset + 1; > + > /* swap endianness for all header fields */ > header.version = cpu_to_le32(header.version); > header.flags = cpu_to_le32(header.flags); > @@ -1620,30 +1786,54 @@ static int vmdk_create_extent(const char *filename, > int64_t filesize, > goto exit; > } > > - /* write grain directory */ > - gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE; > - gd_buf = g_malloc0(gd_buf_size); > - for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors; > - i < gt_count; i++, tmp += gt_size) { > - gd_buf[i] = cpu_to_le32(tmp); > - } > - ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE, > - gd_buf, gd_buf_size); > - if (ret < 0) { > - error_set(errp, QERR_IO_ERROR); > - goto exit; > - } > + if (compress) { > + /* footer marker */ > + footer.footer_marker.val = cpu_to_le64(1); > + footer.footer_marker.type = cpu_to_le32(MARKER_FOOTER); > > - /* write backup grain directory */ > - for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors; > - i < gt_count; i++, tmp += gt_size) { > - gd_buf[i] = cpu_to_le32(tmp); > - } > - ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE, > - gd_buf, gd_buf_size); > - if (ret < 0) { > - error_set(errp, QERR_IO_ERROR); > - goto exit; > + /* header */ > + footer.magic = cpu_to_be32(VMDK4_MAGIC); > + footer.header = header; > + > + /* grain directory offset */ > + footer.header.gd_offset = cpu_to_le64(gd_offset); > + > + /* EOS marker */ > + footer.eos_marker.type = cpu_to_le32(MARKER_END_OF_STREAM); > + > + ret = vmdk_write_footer(bs, &footer, NULL); > + if (ret < 0) { > + error_set(errp, QERR_IO_ERROR); > + goto exit; > + } > + } else { > + /* write redundant grain directory (if applicable) */ > + if (le64_to_cpu(header.rgd_offset)) { > + gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE; > + gd_buf = g_malloc0(gd_buf_size); > + for (i = 0, tmp = le64_to_cpu(header.rgd_offset) 
+ gd_sectors; > + i < gt_count; i++, tmp += gt_size) { > + gd_buf[i] = cpu_to_le32(tmp); > + } > + ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) * > BDRV_SECTOR_SIZE, > + gd_buf, gd_buf_size); > + if (ret < 0) { > + error_set(errp, QERR_IO_ERROR); > + goto exit; > + } > + } > + > + /* write grain directory */ > + for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors; > + i < gt_count; i++, tmp += gt_size) { > + gd_buf[i] = cpu_to_le32(tmp); > + } > + ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * > BDRV_SECTOR_SIZE, > + gd_buf, gd_buf_size); > + if (ret < 0) { > + error_set(errp, QERR_IO_ERROR); > + goto exit; > + } > } > > ret = 0; > @@ -1914,6 +2104,15 @@ exit: > static void vmdk_close(BlockDriverState *bs) > { > BDRVVmdkState *s = bs->opaque; > + VmdkExtent *extent = &s->extents[0]; > + > + if (extent->compressed) { > + while (extent < &s->extents[s->num_extents]) { > + vmdk_write_grain_table(extent); > + vmdk_write_footer(extent->file, &extent->footer, extent); > + extent++; > + } > + } > > vmdk_free_extents(bs); > g_free(s->create_type); > -- > 1.7.1 >
pgpPTlz0GfS24.pgp
Description: PGP signature