Signed-off-by: Benoit Canet <ben...@irqsave.net> --- block/qcow2.h | 203 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 201 insertions(+), 2 deletions(-)
diff --git a/block/qcow2.h b/block/qcow2.h index 9421843..953edfe 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -57,7 +57,182 @@ #define REFCOUNT_CACHE_SIZE 4 #define DEFAULT_CLUSTER_SIZE 65536 - +#define DEFAULT_DEDUP_CLUSTER_SIZE 4096 + +#define HASH_LENGTH 32 + +/* indicate that this cluster hash has been deleted from the key value store */ +#define QCOW_DEDUP_DELETED (1LL << 61) +/* indicate that the hash structure is empty and miss offset */ +#define QCOW_DEDUP_FLAG_EMPTY (1LL << 62) + +#define SSD_ERASE_BLOCK_SIZE (512 * 1024) /* match SSD erase block size */ +#define JOURNAL_CLUSTER_SIZE 4096 /* used to read entries */ +#define HASH_STORE_CLUSTER_SIZE 4096 + +#define QCOW_LOG_END_SIZE 2 /* size of a end block journal entry */ +#define QCOW_LOG_STORE_ENTRY_USED (1LL << 60) /* mark used entry in table */ +#define QCOW_LOG_STORE_BUCKET_SIZE 4 /* size of a cuckoo hash bucket */ +#define QCOW_LOG_STORE_MAX_KICKS 128 /* max numbers of cuckoo hash kicks */ +#define QCOW_LOG_STORE_JOURNAL_RATIO 2 /* the ratio to compute the extra + * room the journal will take based + * on the log store size + */ +#define QCOW2_NB_INCARNATION_GOAL 128 /* targeted number of incarnation */ + +#define QCOW_DEDUP_DIRTY 1 /* dirty flag in the qcow2 header extension */ + +typedef enum { + QCOW_LOG_NONE = 0xFF, /* on SSD erased clusters will mark none */ + QCOW_LOG_END = 1, /* end a block and point to the next */ + QCOW_LOG_HASH = 2, /* used to journalize a QCowHashInfo */ +} QCowLogEntryType; + +typedef enum { + QCOW_HASH_SHA256 = 0, + QCOW_HASH_SHA3 = 1, + QCOW_HASH_SKEIN = 2, +} QCowHashAlgo; + +typedef struct { + uint8_t data[HASH_LENGTH]; /* 32 bytes hash of a given cluster */ +} __attribute__((packed)) QCowHash; + +/* deduplication info */ +typedef struct { + QCowHash hash; + uint64_t physical_sect; /* where the cluster is stored on disk */ + uint64_t first_logical_sect; /* logical sector of the first occurrence of + * this cluster + */ +} __attribute__((packed)) QCowHashInfo; + +/* Used to keep a single precomputed hash between the calls of the dedup + * function + */ +typedef struct { + QCowHashInfo hash_info; + bool reuse; /* The main deduplication function can set this field to + * true before exiting to avoid computing the same hash + * twice. It's a speed optimization. + */ +} QcowPersistentHash; + +/* Undedupable hashes that must be written later to disk */ +typedef struct QCowHashElement { + QCowHashInfo hash_info; + QTAILQ_ENTRY(QCowHashElement) next; +} QCowHashElement; + +typedef struct { + QcowPersistentHash phash; /* contains a hash persisting between calls of + * qcow2_dedup() + */ + QTAILQ_HEAD(, QCowHashElement) undedupables; + uint64_t nb_clusters_processed; + uint64_t nb_undedupable_sectors; +} QCowDedupState; + +/* The code must take care that the maximum size field of a QCowJournalEntry + * will be no more than 254 bytes. + * It's required to save the 2 bytes of room for QCOW_LOG_END entries + * in every cases + */ +typedef union { + QCowHashInfo hash_info; + uint8_t padding[254]; /* note the extra two bytes of padding to avoid + * read overflow. + */ +} QCowJournalEntryUnion; + +typedef struct { + uint8_t size; /* maximum size of a journal entry is 254 bytes */ + uint8_t type; /* contains a QCowLogEntryType for future usage */ + QCowJournalEntryUnion u; +} __attribute__((packed)) QCowJournalEntry; + +typedef struct { + uint64_t sector; /* the journal physical on disk sector */ + uint64_t size; /* the size of the journal in bytes */ + uint64_t index; /* index of next buf cluster to write */ + uint8_t *write_buf; /* used to buffer written data */ + uint64_t offset_in_buf; /* the offset in the write buffer */ + bool flushed; /* true if the buffer reached disk*/ + uint8_t *read_cache; /* used to cache read data */ + int64_t read_index; /* index the cached read cluster */ + bool started; /* has the journal been resumed */ +} QCowJournal; + +typedef struct { + QCowJournal journal; /* the journal this log store will use */ + uint32_t order; /* the number of bits used for the sub hashes + * as sub hashes will be used as an index for + * locating each bucket nb_bucket = 2^order + */ + uint16_t nb_kicks; /* the number of cuckoo hash kicks done */ + bool *kick_map; /* take care of not doing kick path loops */ + QCowHashInfo *hash_table; /* nb_buckets * QCOW_LOG_STORE_BUCKET_SIZE */ + QCowHashInfo *hash_table_copy; /* copy of the hash table for packing */ + + /* members required to freeze a log store into a hash store consumable + * hash table (incarnation) + */ + uint8_t *write_buf; /* the buffer used to write */ + uint64_t write_buf_offset; /* the on disk offset of the buffer */ + uint32_t in_buf_offset; /* the current offset in the write buffer */ +} QCowLogStore; + +/* a QCowIncarnation is a frozen QCowLogStore + * Freezes are read only and their in ram filters is queried from the youngest + * to the oldest in order to know if their hash table contains a given + * QCowHashInfo. + * If so a read is issued to retrieve the QCowHashInfo. + */ +typedef struct QCowIncarnation { + uint64_t filter_offset; /* the on disk offset of the ram filter */ + uint64_t hash_table_offset; /* the on disk offset of the hash table + * (should be just after the end of the filter) + */ + uint64_t size; /* the on disk size of the incarnation */ + uint8_t *filter; /* an in ram filter */ + QTAILQ_ENTRY(QCowIncarnation) next; +} QCowIncarnation; + +typedef struct QCowLimbo { + uint64_t offset; /* the on disk offset of the to reincarnate + * disk space + */ + QTAILQ_ENTRY(QCowLimbo) next; +} QCowLimbo; + +typedef struct { + uint32_t order; /* the number of bits used for the sub hashes + * as sub hashes will be used as an index for + * locating each bucket nb_bucket = 2^order + */ + uint32_t nb_incarnations; /* the current number of incarnations */ + uint32_t nb_in_limbo; /* the number of dead incarnations */ + QTAILQ_HEAD(incarnations_head, QCowIncarnation) incarnations; + /* a list of frozen hash table + * ordered from the youngest + * to the oldest + */ + QTAILQ_HEAD(in_limbo_head, QCowLimbo) in_limbo; + /* a list of dead incarnation which disk + * space can be recycled + */ +} QCowHashStore; + +typedef struct { + uint32_t order; + QCowLogStore log_store; /* the current log store */ + QCowLogStore frozen_log_store; /* the log store to incarnate */ + bool freezing; /* are we incarnating a log store */ + QCowHashStore hash_store; + CoMutex insert_lock; /* used to prevent multiple freeze attempts + * at the same time + */ +} QCowStore; #define QCOW2_OPT_LAZY_REFCOUNTS "lazy_refcounts" @@ -117,8 +292,10 @@ enum { enum { QCOW2_INCOMPAT_DIRTY_BITNR = 0, QCOW2_INCOMPAT_DIRTY = 1 << QCOW2_INCOMPAT_DIRTY_BITNR, + QCOW2_INCOMPAT_DEDUP_BITNR = 1, + QCOW2_INCOMPAT_DEDUP = 1 << QCOW2_INCOMPAT_DEDUP_BITNR, - QCOW2_INCOMPAT_MASK = QCOW2_INCOMPAT_DIRTY, + QCOW2_INCOMPAT_MASK = QCOW2_INCOMPAT_DIRTY | QCOW2_INCOMPAT_DEDUP, }; /* Compatible feature bits */ @@ -163,6 +340,28 @@ typedef struct BDRVQcowState { int64_t free_cluster_index; int64_t free_byte_offset; + bool has_dedup; /* indicate if this image has dedup */ + + /* the following fields are saved in QCOW2 dedup header extension */ + bool dedup_dirty; /* mapped to the header dirty flag */ + uint64_t dedup_conf_offset; /* disk offset of the dedup config */ + size_t dedup_conf_size; /* disk size of the dedup config */ + QCowHashAlgo dedup_hash_algo; /* the cryptographic hash algo used */ + uint32_t dedup_max_incarnations; /* the maximum number of incarnations in + * hash store -> oldest incarnation will + * be dropped. It's harmless for the + * dedup and allow to scale. + * The whole thing act as a FIFO or LRU + * if value is 0 there is no limits + */ + + QCowStore key_value_store; /* the key value store used to store + * dedup hashes + */ + int freeze_errno; /* catch errors when incarnating */ + Coroutine *freeze_co; /* coroutine used when freezing */ + Coroutine *load_filter_co; /* used to load incarnations filters */ + CoMutex lock; uint32_t crypt_method; /* current crypt method, 0 if no key yet */ -- 1.7.10.4