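Add a new bitmap mode ('B') for dm-integrity.

In this mode a dirty bitmap, kept in the journal area, is used instead of
the journal.  A write is held back until the bits covering it have been set
and written to disk; the bits are cleared again by a periodic flush (see the
new "bitmap_flush_interval" option).  Regions whose bits are still set when
the device is resumed have their integrity tags recalculated.  The size of
the region covered by one bit is selected with the new "sectors_per_bit"
option.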

Signed-off-by: Mikulas Patocka <[email protected]>

---
 Documentation/device-mapper/dm-integrity.txt |   23 +
 drivers/md/dm-integrity.c                    |  534 +++++++++++++++++++++++++--
 2 files changed, 525 insertions(+), 32 deletions(-)

Index: linux-2.6/drivers/md/dm-integrity.c
===================================================================
--- linux-2.6.orig/drivers/md/dm-integrity.c    2019-04-27 10:28:49.000000000 +0200
+++ linux-2.6/drivers/md/dm-integrity.c 2019-04-29 11:43:38.000000000 +0200
@@ -24,6 +24,7 @@
 
 #define DEFAULT_INTERLEAVE_SECTORS     32768
 #define DEFAULT_JOURNAL_SIZE_FACTOR    7
+#define DEFAULT_SECTORS_PER_BITMAP_BIT 32768
 #define DEFAULT_BUFFER_SECTORS         128
 #define DEFAULT_JOURNAL_WATERMARK      50
 #define DEFAULT_SYNC_MSEC              10000
@@ -33,6 +34,8 @@
 #define METADATA_WORKQUEUE_MAX_ACTIVE  16
 #define RECALC_SECTORS                 8192
 #define RECALC_WRITE_SUPER             16
+#define BITMAP_BLOCK_SIZE              4096    /* don't change it */
+#define BITMAP_FLUSH_INTERVAL          (10 * HZ)
 
 /*
  * Warning - DEBUG_PRINT prints security-sensitive data to the log,
@@ -48,6 +51,7 @@
 #define SB_MAGIC                       "integrt"
 #define SB_VERSION_1                   1
 #define SB_VERSION_2                   2
+#define SB_VERSION_3                   3
 #define SB_SECTORS                     8
 #define MAX_SECTORS_PER_BLOCK          8
 
@@ -60,12 +64,14 @@ struct superblock {
        __u64 provided_data_sectors;    /* userspace uses this value */
        __u32 flags;
        __u8 log2_sectors_per_block;
-       __u8 pad[3];
+       __u8 log2_blocks_per_bitmap_bit;
+       __u8 pad[2];
        __u64 recalc_sector;
 };
 
 #define SB_FLAG_HAVE_JOURNAL_MAC       0x1
 #define SB_FLAG_RECALCULATING          0x2
+#define SB_FLAG_DIRTY_BITMAP           0x4
 
 #define        JOURNAL_ENTRY_ROUNDUP           8
 
@@ -155,9 +161,16 @@ struct dm_integrity_c {
        struct workqueue_struct *metadata_wq;
        struct superblock *sb;
        unsigned journal_pages;
+       unsigned n_bitmap_blocks;
+
        struct page_list *journal;
        struct page_list *journal_io;
        struct page_list *journal_xor;
+       struct page_list *recalc_bitmap;
+       struct page_list *may_write_bitmap;
+       struct bitmap_block_status *bbs;
+       unsigned bitmap_flush_interval;
+       struct delayed_work bitmap_flush_work;
 
        struct crypto_skcipher *journal_crypt;
        struct scatterlist **journal_scatterlist;
@@ -184,6 +197,7 @@ struct dm_integrity_c {
        __s8 log2_metadata_run;
        __u8 log2_buffer_sectors;
        __u8 sectors_per_block;
+       __u8 log2_blocks_per_bitmap_bit;
 
        unsigned char mode;
        int suspending;
@@ -236,6 +250,7 @@ struct dm_integrity_c {
 
        bool journal_uptodate;
        bool just_formatted;
+       bool recalculate_flag;
 
        struct alg_spec internal_hash_alg;
        struct alg_spec journal_crypt_alg;
@@ -292,6 +307,16 @@ struct journal_io {
        struct journal_completion *comp;
 };
 
+struct bitmap_block_status {   /* state kept for one on-disk bitmap block in bitmap mode */
+       struct work_struct work;        /* processed by bitmap_block_work(); queued on ic->writer_wq */
+       struct dm_integrity_c *ic;      /* owning target instance */
+       unsigned idx;                   /* index of this bitmap block; selects its sector offset when it is written out */
+       unsigned long *bitmap;          /* NOTE(review): not referenced in the visible hunks - presumably this block's slice of the bitmap; confirm at the init site */
+       struct bio_list bio_queue;      /* write bios waiting for their bitmap bits to be set */
+       spinlock_t bio_queue_lock;      /* protects bio_queue */
+
+};
+
 static struct kmem_cache *journal_io_cache;
 
 #define JOURNAL_IO_MEMPOOL     32
@@ -427,7 +452,9 @@ static void wraparound_section(struct dm
 
 static void sb_set_version(struct dm_integrity_c *ic)
 {
-       if (ic->meta_dev || ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))
+       if (ic->mode == 'B' || ic->sb->flags & 
cpu_to_le32(SB_FLAG_DIRTY_BITMAP))
+               ic->sb->version = SB_VERSION_3;
+       else if (ic->meta_dev || ic->sb->flags & 
cpu_to_le32(SB_FLAG_RECALCULATING))
                ic->sb->version = SB_VERSION_2;
        else
                ic->sb->version = SB_VERSION_1;
@@ -451,6 +478,135 @@ static int sync_rw_sb(struct dm_integrit
        return dm_io(&io_req, 1, &io_loc, NULL);
 }
 
+#define BITMAP_OP_TEST_ALL_SET         0
+#define BITMAP_OP_TEST_ALL_CLEAR       1
+#define BITMAP_OP_SET                  2
+#define BITMAP_OP_CLEAR                        3
+
+static bool block_bitmap_op(struct dm_integrity_c *ic, struct page_list 
*bitmap, sector_t sector, sector_t n_sectors, int mode) /* Apply BITMAP_OP_* to the bits covering [sector, sector + n_sectors); the TEST ops return false on the first mismatching bit, everything else returns true */
+{
+       unsigned long bit, end_bit, this_end_bit, page, end_page;
+       unsigned long *data;
+
+       if (unlikely(((sector | n_sectors) & ((1 << 
ic->sb->log2_sectors_per_block) - 1)) != 0)) { /* both start and length must be multiples of the block size */
+               DMCRIT("invalid bitmap access (%llx,%llx,%d,%d,%d)\n",
+                       (unsigned long long)sector,
+                       (unsigned long long)n_sectors,
+                       ic->sb->log2_sectors_per_block,
+                       ic->log2_blocks_per_bitmap_bit,
+                       mode);
+               BUG();
+       }
+
+       if (unlikely(!n_sectors))
+               return true; /* empty range: nothing to test or modify */
+
+       bit = sector >> (ic->sb->log2_sectors_per_block + 
ic->log2_blocks_per_bitmap_bit); /* first bit of the range */
+       end_bit = (sector + n_sectors - 1) >> (ic->sb->log2_sectors_per_block + 
ic->log2_blocks_per_bitmap_bit); /* last bit of the range (inclusive) */
+
+       page = bit / (PAGE_SIZE * 8); /* split the bit numbers into page index ... */
+       bit %= PAGE_SIZE * 8; /* ... and bit offset inside that page */
+
+       end_page = end_bit / (PAGE_SIZE * 8);
+       end_bit %= PAGE_SIZE * 8;
+
+repeat: /* one pass per bitmap page touched by the range */
+       if (page < end_page) {
+               this_end_bit = PAGE_SIZE * 8 - 1; /* range continues on a later page: go to the end of this one */
+       } else {
+               this_end_bit = end_bit;
+       }
+
+       data = lowmem_page_address(bitmap[page].page);
+
+       if (mode == BITMAP_OP_TEST_ALL_SET) {
+               while (bit <= this_end_bit) {
+                       if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + 
BITS_PER_LONG - 1) { /* word-aligned with at least one full word left: handle whole words */
+                               do {
+                                       if (data[bit / BITS_PER_LONG] != -1)
+                                               return false;
+                                       bit += BITS_PER_LONG;
+                               } while (this_end_bit >= bit + BITS_PER_LONG - 
1);
+                               continue;
+                       }
+                       if (!test_bit(bit, data)) /* unaligned head or tail: bit by bit */
+                               return false;
+                       bit++;
+               }
+       } else if (mode == BITMAP_OP_TEST_ALL_CLEAR) {
+               while (bit <= this_end_bit) {
+                       if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + 
BITS_PER_LONG - 1) { /* whole-word fast path, as above */
+                               do {
+                                       if (data[bit / BITS_PER_LONG] != 0)
+                                               return false;
+                                       bit += BITS_PER_LONG;
+                               } while (this_end_bit >= bit + BITS_PER_LONG - 
1);
+                               continue;
+                       }
+                       if (test_bit(bit, data))
+                               return false;
+                       bit++;
+               }
+       } else if (mode == BITMAP_OP_SET) {
+               while (bit <= this_end_bit) {
+                       if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + 
BITS_PER_LONG - 1) { /* whole-word fast path, as above */
+                               do {
+                                       data[bit / BITS_PER_LONG] = -1;
+                                       bit += BITS_PER_LONG;
+                               } while (this_end_bit >= bit + BITS_PER_LONG - 
1);
+                               continue;
+                       }
+                       __set_bit(bit, data);
+                       bit++;
+               }
+       } else if (mode == BITMAP_OP_CLEAR) {
+               if (!bit && this_end_bit == PAGE_SIZE * 8 - 1) /* range covers the entire page */
+                       clear_page(data);
+               else while (bit <= this_end_bit) {
+                       if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + 
BITS_PER_LONG - 1) { /* whole-word fast path, as above */
+                               do {
+                                       data[bit / BITS_PER_LONG] = 0;
+                                       bit += BITS_PER_LONG;
+                               } while (this_end_bit >= bit + BITS_PER_LONG - 
1);
+                               continue;
+                       }
+                       __clear_bit(bit, data);
+                       bit++;
+               }
+       } else {
+               BUG(); /* unknown mode value */
+       }
+
+       if (unlikely(page < end_page)) { /* more pages to process: restart at bit 0 of the next one */
+               bit = 0;
+               page++;
+               goto repeat;
+       }
+
+       return true;
+}
+
+static void block_bitmap_copy(struct dm_integrity_c *ic, struct page_list 
*dst, struct page_list *src) /* Copy the whole bitmap, page by page, from src to dst */
+{
+       unsigned n_bitmap_pages = DIV_ROUND_UP(ic->n_bitmap_blocks, PAGE_SIZE / 
BITMAP_BLOCK_SIZE); /* pages needed to hold n_bitmap_blocks blocks of BITMAP_BLOCK_SIZE bytes */
+       unsigned i;
+
+       for (i = 0; i < n_bitmap_pages; i++) {
+               unsigned long *dst_data = lowmem_page_address(dst[i].page);
+               unsigned long *src_data = lowmem_page_address(src[i].page);
+               copy_page(dst_data, src_data);
+       }
+}
+
+static struct bitmap_block_status *sector_to_bitmap_block(struct 
dm_integrity_c *ic, sector_t sector) /* Return the bitmap_block_status of the bitmap block that holds the bit for 'sector' */
+{
+       unsigned bit = sector >> (ic->sb->log2_sectors_per_block + 
ic->log2_blocks_per_bitmap_bit); /* NOTE(review): sector_t result narrowed to unsigned - presumably safe because the constructor caps the bit count at UINT_MAX; confirm */
+       unsigned bitmap_block = bit / (BITMAP_BLOCK_SIZE * 8); /* BITMAP_BLOCK_SIZE * 8 bits per bitmap block */
+
+       BUG_ON(bitmap_block >= ic->n_bitmap_blocks);
+       return &ic->bbs[bitmap_block];
+}
+
 static void access_journal_check(struct dm_integrity_c *ic, unsigned section, 
unsigned offset,
                                 bool e, const char *function)
 {
@@ -1784,6 +1940,20 @@ offload_to_thread:
                goto journal_read_write;
        }
 
+       if (ic->mode == 'B' && dio->write) {
+               if (!block_bitmap_op(ic, ic->may_write_bitmap, 
dio->range.logical_sector, dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) {
+                       struct bitmap_block_status *bbs = 
sector_to_bitmap_block(ic, dio->range.logical_sector);
+
+                       spin_lock(&bbs->bio_queue_lock);
+                       bio_list_add(&bbs->bio_queue, bio);
+                       spin_unlock(&bbs->bio_queue_lock);
+
+                       queue_work(ic->writer_wq, &bbs->work);
+
+                       return;
+               }
+       }
+
        dio->in_flight = (atomic_t)ATOMIC_INIT(2);
 
        if (need_sync_io) {
@@ -1810,10 +1980,14 @@ offload_to_thread:
 
        if (need_sync_io) {
                wait_for_completion_io(&read_comp);
-               if (unlikely(ic->recalc_wq != NULL) &&
-                   ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) &&
+               if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) &&
                    dio->range.logical_sector + dio->range.n_sectors > 
le64_to_cpu(ic->sb->recalc_sector))
                        goto skip_check;
+               if (ic->mode == 'B') {
+                       if (!block_bitmap_op(ic, ic->recalc_bitmap, 
dio->range.logical_sector, dio->range.n_sectors, BITMAP_OP_TEST_ALL_CLEAR))
+                               goto skip_check;
+               }
+
                if (likely(!bio->bi_status))
                        integrity_metadata(&dio->work);
                else
@@ -1851,8 +2025,22 @@ static void pad_uncommitted(struct dm_in
                wraparound_section(ic, &ic->free_section);
                ic->n_uncommitted_sections++;
        }
-       WARN_ON(ic->journal_sections * ic->journal_section_entries !=
-               (ic->n_uncommitted_sections + ic->n_committed_sections) * 
ic->journal_section_entries + ic->free_sectors);
+       if (WARN_ON(ic->journal_sections * ic->journal_section_entries !=
+               (ic->n_uncommitted_sections + ic->n_committed_sections) * 
ic->journal_section_entries + ic->free_sectors)) {
+               printk(KERN_CRIT "dm-integrity: "
+                       "journal_sections %u, "
+                       "journal_section_entries %u, "
+                       "n_uncommitted_sections %u, "
+                       "n_committed_sections %u, "
+                       "journal_section_entries %u, "
+                       "free_sectors %u\n",
+                       ic->journal_sections,
+                       ic->journal_section_entries,
+                       ic->n_uncommitted_sections,
+                       ic->n_committed_sections,
+                       ic->journal_section_entries,
+                       ic->free_sectors);
+       }
 }
 
 static void integrity_commit(struct work_struct *w)
@@ -2139,11 +2327,14 @@ static void integrity_recalc(struct work
        sector_t area, offset;
        sector_t metadata_block;
        unsigned metadata_offset;
+       sector_t logical_sector, n_sectors;
        __u8 *t;
        unsigned i;
        int r;
        unsigned super_counter = 0;
 
+       DEBUG_print("start recalculation... (position %llx)\n", 
le64_to_cpu(ic->sb->recalc_sector));
+
        spin_lock_irq(&ic->endio_wait.lock);
 
 next_chunk:
@@ -2152,8 +2343,13 @@ next_chunk:
                goto unlock_ret;
 
        range.logical_sector = le64_to_cpu(ic->sb->recalc_sector);
-       if (unlikely(range.logical_sector >= ic->provided_data_sectors))
+       if (unlikely(range.logical_sector >= ic->provided_data_sectors)) {
+               if (ic->mode == 'B') {
+                       DEBUG_print("queue_delayed_work: bitmap_flush_work\n");
+                       queue_delayed_work(ic->commit_wq, 
&ic->bitmap_flush_work, 0);
+               }
                goto unlock_ret;
+       }
 
        get_area_and_offset(ic, range.logical_sector, &area, &offset);
        range.n_sectors = min((sector_t)RECALC_SECTORS, 
ic->provided_data_sectors - range.logical_sector);
@@ -2161,11 +2357,33 @@ next_chunk:
                range.n_sectors = min(range.n_sectors, ((sector_t)1U << 
ic->sb->log2_interleave_sectors) - (unsigned)offset);
 
        add_new_range_and_wait(ic, &range);
-
        spin_unlock_irq(&ic->endio_wait.lock);
+       logical_sector = range.logical_sector;
+       n_sectors = range.n_sectors;
+
+       if (ic->mode == 'B') {
+               if (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector, 
n_sectors, BITMAP_OP_TEST_ALL_CLEAR)) {
+                       goto advance_and_next;
+               }
+               while (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector, 
ic->sectors_per_block, BITMAP_OP_TEST_ALL_CLEAR)) {
+                       logical_sector += ic->sectors_per_block;
+                       n_sectors -= ic->sectors_per_block;
+                       cond_resched();
+               }
+               while (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector + 
n_sectors - ic->sectors_per_block, ic->sectors_per_block, 
BITMAP_OP_TEST_ALL_CLEAR)) {
+                       n_sectors -= ic->sectors_per_block;
+                       cond_resched();
+               }
+               get_area_and_offset(ic, logical_sector, &area, &offset);
+       }
+
+       DEBUG_print("recalculating: %lx, %lx\n", logical_sector, n_sectors);
 
        if (unlikely(++super_counter == RECALC_WRITE_SUPER)) {
                recalc_write_super(ic);
+               if (ic->mode == 'B') {
+                       queue_delayed_work(ic->commit_wq, 
&ic->bitmap_flush_work, ic->bitmap_flush_interval);
+               }
                super_counter = 0;
        }
 
@@ -2180,7 +2398,7 @@ next_chunk:
        io_req.client = ic->io;
        io_loc.bdev = ic->dev->bdev;
        io_loc.sector = get_data_sector(ic, area, offset);
-       io_loc.count = range.n_sectors;
+       io_loc.count = n_sectors;
 
        r = dm_io(&io_req, 1, &io_loc, NULL);
        if (unlikely(r)) {
@@ -2189,8 +2407,8 @@ next_chunk:
        }
 
        t = ic->recalc_tags;
-       for (i = 0; i < range.n_sectors; i += ic->sectors_per_block) {
-               integrity_sector_checksum(ic, range.logical_sector + i, 
ic->recalc_buffer + (i << SECTOR_SHIFT), t);
+       for (i = 0; i < n_sectors; i += ic->sectors_per_block) {
+               integrity_sector_checksum(ic, logical_sector + i, 
ic->recalc_buffer + (i << SECTOR_SHIFT), t);
                t += ic->tag_size;
        }
 
@@ -2202,6 +2420,9 @@ next_chunk:
                goto err;
        }
 
+advance_and_next:
+       cond_resched();
+
        spin_lock_irq(&ic->endio_wait.lock);
        remove_range_unlocked(ic, &range);
        ic->sb->recalc_sector = cpu_to_le64(range.logical_sector + 
range.n_sectors);
@@ -2217,6 +2438,89 @@ unlock_ret:
        recalc_write_super(ic);
 }
 
+static void bitmap_block_work(struct work_struct *w) /* Work handler for one bitmap block: make sure the bits for every queued write bio are set and written out, then let the bios proceed */
+{
+       struct bitmap_block_status *bbs = container_of(w, struct 
bitmap_block_status, work);
+       struct dm_integrity_c *ic = bbs->ic;
+       struct bio *bio;
+       struct bio_list bio_queue;
+       struct bio_list waiting;
+
+       bio_list_init(&waiting);
+
+       spin_lock(&bbs->bio_queue_lock);
+       bio_queue = bbs->bio_queue; /* take over the whole pending list ... */
+       bio_list_init(&bbs->bio_queue); /* ... and leave an empty one behind */
+       spin_unlock(&bbs->bio_queue_lock);
+
+       while ((bio = bio_list_pop(&bio_queue))) {
+               struct dm_integrity_io *dio;
+
+               dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
+
+               if (block_bitmap_op(ic, ic->may_write_bitmap, 
dio->range.logical_sector, dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) { /* bits were set in the meantime: no bitmap write needed for this bio */
+                       remove_range(ic, &dio->range);
+                       INIT_WORK(&dio->work, integrity_bio_wait);
+                       queue_work(ic->wait_wq, &dio->work);
+               } else {
+                       block_bitmap_op(ic, ic->journal, 
dio->range.logical_sector, dio->range.n_sectors, BITMAP_OP_SET); /* set the bits in the copy passed to rw_journal_sectors() below */
+                       bio_list_add(&waiting, bio); /* hold the bio until that write has been issued */
+               }
+       }
+
+       if (bio_list_empty(&waiting))
+               return;
+
+       rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, bbs->idx * 
(BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), BITMAP_BLOCK_SIZE >> SECTOR_SHIFT, NULL); /* write only this bitmap block, at sector offset idx * sectors-per-bitmap-block */
+
+       while ((bio = bio_list_pop(&waiting))) {
+               struct dm_integrity_io *dio = dm_per_bio_data(bio, 
sizeof(struct dm_integrity_io));
+
+               block_bitmap_op(ic, ic->may_write_bitmap, 
dio->range.logical_sector, dio->range.n_sectors, BITMAP_OP_SET); /* from now on writes to this range pass the check in the map path */
+
+               remove_range(ic, &dio->range);
+               INIT_WORK(&dio->work, integrity_bio_wait);
+               queue_work(ic->wait_wq, &dio->work);
+       }
+
+       queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 
ic->bitmap_flush_interval); /* bits were newly set: schedule the flush that clears them again */
+}
+
+static void bitmap_flush_work(struct work_struct *work) /* Delayed-work handler: flush buffers, then clear bitmap bits and write the whole bitmap out */
+{
+       struct dm_integrity_c *ic = container_of(work, struct dm_integrity_c, 
bitmap_flush_work.work);
+       struct dm_integrity_range range;
+       unsigned long limit;
+
+       dm_integrity_flush_buffers(ic);
+
+       range.logical_sector = 0;
+       range.n_sectors = ic->provided_data_sectors; /* range spans the whole provided device */
+
+       spin_lock_irq(&ic->endio_wait.lock);
+       add_new_range_and_wait(ic, &range); /* presumably blocks until no overlapping range is in progress - confirm in add_new_range_and_wait() */
+       spin_unlock_irq(&ic->endio_wait.lock);
+
+       dm_integrity_flush_buffers(ic);
+       if (ic->meta_dev)
+               blkdev_issue_flush(ic->dev->bdev, GFP_NOIO, NULL); /* separate metadata device in use: also flush ic->dev */
+
+       limit = ic->provided_data_sectors;
+       if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { /* recalculation in progress: only clear bits below recalc_sector, rounded down to a bitmap-bit boundary */
+               limit = le64_to_cpu(ic->sb->recalc_sector)
+                       >> (ic->sb->log2_sectors_per_block + 
ic->log2_blocks_per_bitmap_bit)
+                       << (ic->sb->log2_sectors_per_block + 
ic->log2_blocks_per_bitmap_bit);
+       }
+       DEBUG_print("zeroing journal\n");
+       block_bitmap_op(ic, ic->journal, 0, limit, BITMAP_OP_CLEAR);
+       block_bitmap_op(ic, ic->may_write_bitmap, 0, limit, BITMAP_OP_CLEAR); /* later writes to these regions must go through bitmap_block_work() again */
+
+       rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0, 
ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); /* write out all n_bitmap_blocks bitmap blocks */
+
+       remove_range(ic, &range); /* release the whole-device range taken above */
+}
+
+
 static void init_journal(struct dm_integrity_c *ic, unsigned start_section,
                         unsigned n_sections, unsigned char commit_seq)
 {
@@ -2416,6 +2720,7 @@ clear_journal:
 static void dm_integrity_postsuspend(struct dm_target *ti)
 {
        struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private;
+       int r;
 
        del_timer_sync(&ic->autocommit_timer);
 
@@ -2424,6 +2729,9 @@ static void dm_integrity_postsuspend(str
        if (ic->recalc_wq)
                drain_workqueue(ic->recalc_wq);
 
+       if (ic->mode == 'B')
+               cancel_delayed_work_sync(&ic->bitmap_flush_work);
+
        queue_work(ic->commit_wq, &ic->commit_work);
        drain_workqueue(ic->commit_wq);
 
@@ -2434,6 +2742,17 @@ static void dm_integrity_postsuspend(str
                dm_integrity_flush_buffers(ic);
        }
 
+       if (ic->mode == 'B') {
+               dm_integrity_flush_buffers(ic);
+#if 1
+               init_journal(ic, 0, ic->journal_sections, 0);
+               ic->sb->flags &= ~cpu_to_le32(SB_FLAG_DIRTY_BITMAP);
+               r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA);
+               if (unlikely(r))
+                       dm_integrity_io_error(ic, "writing superblock", r);
+#endif
+       }
+
        WRITE_ONCE(ic->suspending, 0);
 
        BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress));
@@ -2444,11 +2763,65 @@ static void dm_integrity_postsuspend(str
 static void dm_integrity_resume(struct dm_target *ti)
 {
        struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private;
+       int r;
+       DEBUG_print("resume\n");
 
-       replay_journal(ic);
+       if (ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP)) {
+               DEBUG_print("resume dirty_bitmap\n");
+               rw_journal_sectors(ic, REQ_OP_READ, 0, 0, ic->n_bitmap_blocks * 
(BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
+               if (ic->mode == 'B') {
+                       if (ic->sb->log2_blocks_per_bitmap_bit == 
ic->log2_blocks_per_bitmap_bit) {
+                               block_bitmap_copy(ic, ic->recalc_bitmap, 
ic->journal);
+                               block_bitmap_copy(ic, ic->may_write_bitmap, 
ic->journal);
+                               if (!block_bitmap_op(ic, ic->journal, 0, 
ic->provided_data_sectors, BITMAP_OP_TEST_ALL_CLEAR)) {
+                                       ic->sb->flags |= 
cpu_to_le32(SB_FLAG_RECALCULATING);
+                                       ic->sb->recalc_sector = cpu_to_le64(0);
+                               }
+                       } else {
+                               DEBUG_print("non-matching 
blocks_per_bitmap_bit: %u, %u\n", ic->sb->log2_blocks_per_bitmap_bit, 
ic->log2_blocks_per_bitmap_bit);
+                               ic->sb->log2_blocks_per_bitmap_bit = 
ic->log2_blocks_per_bitmap_bit;
+                               block_bitmap_op(ic, ic->recalc_bitmap, 0, 
ic->provided_data_sectors, BITMAP_OP_SET);
+                               block_bitmap_op(ic, ic->may_write_bitmap, 0, 
ic->provided_data_sectors, BITMAP_OP_SET);
+                               block_bitmap_op(ic, ic->journal, 0, 
ic->provided_data_sectors, BITMAP_OP_SET);
+                               rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | 
REQ_SYNC, 0, ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
+                               ic->sb->flags |= 
cpu_to_le32(SB_FLAG_RECALCULATING);
+                               ic->sb->recalc_sector = cpu_to_le64(0);
+                       }
+               } else {
+                       if (!(ic->sb->log2_blocks_per_bitmap_bit == 
ic->log2_blocks_per_bitmap_bit &&
+                             block_bitmap_op(ic, ic->journal, 0, 
ic->provided_data_sectors, BITMAP_OP_TEST_ALL_CLEAR))) {
+                               ic->sb->flags |= 
cpu_to_le32(SB_FLAG_RECALCULATING);
+                               ic->sb->recalc_sector = cpu_to_le64(0);
+                       }
+                       init_journal(ic, 0, ic->journal_sections, 0);
+                       replay_journal(ic);
+                       ic->sb->flags &= ~cpu_to_le32(SB_FLAG_DIRTY_BITMAP);
+               }
+               r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA);
+               if (unlikely(r))
+                       dm_integrity_io_error(ic, "writing superblock", r);
+       } else {
+               replay_journal(ic);
+               if (ic->mode == 'B') {
+                       int mode;
+                       ic->sb->flags |= cpu_to_le32(SB_FLAG_DIRTY_BITMAP);
+                       ic->sb->log2_blocks_per_bitmap_bit = 
ic->log2_blocks_per_bitmap_bit;
+                       r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA);
+                       if (unlikely(r))
+                               dm_integrity_io_error(ic, "writing superblock", 
r);
+
+                       mode = ic->recalculate_flag ? BITMAP_OP_SET : 
BITMAP_OP_CLEAR;
+                       block_bitmap_op(ic, ic->journal, 0, 
ic->provided_data_sectors, mode);
+                       block_bitmap_op(ic, ic->recalc_bitmap, 0, 
ic->provided_data_sectors, mode);
+                       block_bitmap_op(ic, ic->may_write_bitmap, 0, 
ic->provided_data_sectors, mode);
+                       rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | 
REQ_SYNC, 0, ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
+               }
+       }
 
-       if (ic->recalc_wq && ic->sb->flags & 
cpu_to_le32(SB_FLAG_RECALCULATING)) {
+       DEBUG_print("testing recalc: %x\n", ic->sb->flags);
+       if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
                __u64 recalc_pos = le64_to_cpu(ic->sb->recalc_sector);
+               DEBUG_print("recalc pos: %lx / %lx\n", (long)recalc_pos, 
ic->provided_data_sectors);
                if (recalc_pos < ic->provided_data_sectors) {
                        queue_work(ic->recalc_wq, &ic->recalc_work);
                } else if (recalc_pos > ic->provided_data_sectors) {
@@ -2486,6 +2859,8 @@ static void dm_integrity_status(struct d
                arg_count += !!(ic->sb->flags & 
cpu_to_le32(SB_FLAG_RECALCULATING));
                arg_count += ic->mode == 'J';
                arg_count += ic->mode == 'J';
+               arg_count += ic->mode == 'B';
+               arg_count += ic->mode == 'B';
                arg_count += !!ic->internal_hash_alg.alg_string;
                arg_count += !!ic->journal_crypt_alg.alg_string;
                arg_count += !!ic->journal_mac_alg.alg_string;
@@ -2495,7 +2870,7 @@ static void dm_integrity_status(struct d
                        DMEMIT(" meta_device:%s", ic->meta_dev->name);
                if (ic->sectors_per_block != 1)
                        DMEMIT(" block_size:%u", ic->sectors_per_block << 
SECTOR_SHIFT);
-               if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))
+               if (ic->recalculate_flag)
                        DMEMIT(" recalculate");
                DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS);
                DMEMIT(" interleave_sectors:%u", 1U << 
ic->sb->log2_interleave_sectors);
@@ -2504,6 +2879,10 @@ static void dm_integrity_status(struct d
                        DMEMIT(" journal_watermark:%u", 
(unsigned)watermark_percentage);
                        DMEMIT(" commit_time:%u", ic->autocommit_msec);
                }
+               if (ic->mode == 'B') {
+                       DMEMIT(" sectors_per_bit:%llu", (unsigned long 
long)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit);
+                       DMEMIT(" bitmap_flush_interval:%u", 
jiffies_to_msecs(ic->bitmap_flush_interval));
+               }
 
 #define EMIT_ALG(a, n)                                                 \
                do {                                                    \
@@ -3085,7 +3464,7 @@ bad:
  *     device
  *     offset from the start of the device
  *     tag size
- *     D - direct writes, J - journal writes, R - recovery mode
+ *     D - direct writes, J - journal writes, B - bitmap mode, R - recovery 
mode
  *     number of optional arguments
  *     optional arguments:
  *             journal_sectors
@@ -3095,6 +3474,8 @@ bad:
  *             commit_time
  *             meta_device
  *             block_size
+ *             sectors_per_bit
+ *             bitmap_flush_interval
  *             internal_hash
  *             journal_crypt
  *             journal_mac
@@ -3111,10 +3492,13 @@ static int dm_integrity_ctr(struct dm_ta
                {0, 9, "Invalid number of feature args"},
        };
        unsigned journal_sectors, interleave_sectors, buffer_sectors, 
journal_watermark, sync_msec;
-       bool recalculate;
        bool should_write_sb;
        __u64 threshold;
        unsigned long long start;
+       __s8 log2_sectors_per_bitmap_bit = -1;
+       __s8 log2_blocks_per_bitmap_bit;
+       __u64 bits_in_journal;
+       __u64 n_bitmap_bits;
 
 #define DIRECT_ARGUMENTS       4
 
@@ -3138,6 +3522,7 @@ static int dm_integrity_ctr(struct dm_ta
        init_waitqueue_head(&ic->copy_to_journal_wait);
        init_completion(&ic->crypto_backoff);
        atomic64_set(&ic->number_of_mismatches, 0);
+       ic->bitmap_flush_interval = BITMAP_FLUSH_INTERVAL;
 
        r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ic->dev);
        if (r) {
@@ -3160,10 +3545,10 @@ static int dm_integrity_ctr(struct dm_ta
                }
        }
 
-       if (!strcmp(argv[3], "J") || !strcmp(argv[3], "D") || !strcmp(argv[3], 
"R"))
+       if (!strcmp(argv[3], "J") || !strcmp(argv[3], "B") || !strcmp(argv[3], 
"D") || !strcmp(argv[3], "R")) {
                ic->mode = argv[3][0];
-       else {
-               ti->error = "Invalid mode (expecting J, D, R)";
+       } else {
+               ti->error = "Invalid mode (expecting J, B, D, R)";
                r = -EINVAL;
                goto bad;
        }
@@ -3173,7 +3558,6 @@ static int dm_integrity_ctr(struct dm_ta
        buffer_sectors = DEFAULT_BUFFER_SECTORS;
        journal_watermark = DEFAULT_JOURNAL_WATERMARK;
        sync_msec = DEFAULT_SYNC_MSEC;
-       recalculate = false;
        ic->sectors_per_block = 1;
 
        as.argc = argc - DIRECT_ARGUMENTS;
@@ -3185,6 +3569,7 @@ static int dm_integrity_ctr(struct dm_ta
        while (extra_args--) {
                const char *opt_string;
                unsigned val;
+               unsigned long long llval;
                opt_string = dm_shift_arg(&as);
                if (!opt_string) {
                        r = -EINVAL;
@@ -3220,6 +3605,15 @@ static int dm_integrity_ctr(struct dm_ta
                                goto bad;
                        }
                        ic->sectors_per_block = val >> SECTOR_SHIFT;
+               } else if (sscanf(opt_string, "sectors_per_bit:%llu%c", &llval, &dummy) == 1) {
+                       log2_sectors_per_bitmap_bit = !llval ? 0 : __ilog2_u64(llval);
+               } else if (sscanf(opt_string, "bitmap_flush_interval:%u%c", &val, &dummy) == 1) {
+                       if (val >= (uint64_t)UINT_MAX * 1000 / HZ) {
+                               r = -EINVAL;
+                               ti->error = "Invalid bitmap_flush_interval argument";
+                               goto bad; /* reject the table instead of continuing with r and ti->error set */
+                       }
+                       ic->bitmap_flush_interval = msecs_to_jiffies(val);
                } else if (!strncmp(opt_string, "internal_hash:", strlen("internal_hash:"))) {
                        r = get_alg_and_key(opt_string, &ic->internal_hash_alg, &ti->error,
                                            "Invalid internal_hash argument");
@@ -3236,7 +3629,7 @@ static int dm_integrity_ctr(struct dm_ta
                        if (r)
                                goto bad;
                } else if (!strcmp(opt_string, "recalculate")) {
-                       recalculate = true;
+                       ic->recalculate_flag = true;
                } else {
                        r = -EINVAL;
                        ti->error = "Invalid argument";
@@ -3287,6 +3680,12 @@ static int dm_integrity_ctr(struct dm_ta
        else
                ic->log2_tag_size = -1;
 
+       if (ic->mode == 'B' && !ic->internal_hash) {
+               r = -EINVAL;
+               ti->error = "Bitmap mode can be only used with internal hash";
+               goto bad;
+       }
+
        ic->autocommit_jiffies = msecs_to_jiffies(sync_msec);
        ic->autocommit_msec = sync_msec;
        timer_setup(&ic->autocommit_timer, autocommit_fn, 0);
@@ -3332,7 +3731,7 @@ static int dm_integrity_ctr(struct dm_ta
        }
        INIT_WORK(&ic->commit_work, integrity_commit);
 
-       if (ic->mode == 'J') {
+       if (ic->mode == 'J' || ic->mode == 'B') {
                ic->writer_wq = alloc_workqueue("dm-integrity-writer", 
WQ_MEM_RECLAIM, 1);
                if (!ic->writer_wq) {
                        ti->error = "Cannot allocate workqueue";
@@ -3373,7 +3772,7 @@ static int dm_integrity_ctr(struct dm_ta
                        should_write_sb = true;
        }
 
-       if (!ic->sb->version || ic->sb->version > SB_VERSION_2) {
+       if (!ic->sb->version || ic->sb->version > SB_VERSION_3) {
                r = -EINVAL;
                ti->error = "Unknown version";
                goto bad;
@@ -3433,6 +3832,27 @@ try_smaller_buffer:
                ti->error = "The device is too small";
                goto bad;
        }
+
+       if (log2_sectors_per_bitmap_bit < 0)
+               log2_sectors_per_bitmap_bit = 
__fls(DEFAULT_SECTORS_PER_BITMAP_BIT);
+       if (log2_sectors_per_bitmap_bit < ic->sb->log2_sectors_per_block)
+               log2_sectors_per_bitmap_bit = ic->sb->log2_sectors_per_block;
+
+       bits_in_journal = ((__u64)ic->journal_section_sectors * 
ic->journal_sections) << (SECTOR_SHIFT + 3);
+       if (bits_in_journal > UINT_MAX)
+               bits_in_journal = UINT_MAX;
+       while (bits_in_journal < (ic->provided_data_sectors + ((sector_t)1 << 
log2_sectors_per_bitmap_bit) - 1) >> log2_sectors_per_bitmap_bit)
+               log2_sectors_per_bitmap_bit++;
+
+       log2_blocks_per_bitmap_bit = log2_sectors_per_bitmap_bit - 
ic->sb->log2_sectors_per_block;
+       ic->log2_blocks_per_bitmap_bit = log2_blocks_per_bitmap_bit;
+       if (should_write_sb) {
+               ic->sb->log2_blocks_per_bitmap_bit = log2_blocks_per_bitmap_bit;
+       }
+       n_bitmap_bits = ((ic->provided_data_sectors >> 
ic->sb->log2_sectors_per_block)
+                               + (((sector_t)1 << log2_blocks_per_bitmap_bit) 
- 1)) >> log2_blocks_per_bitmap_bit;
+       ic->n_bitmap_blocks = DIV_ROUND_UP(n_bitmap_bits, BITMAP_BLOCK_SIZE * 
8);
+
        if (!ic->meta_dev)
                ic->log2_buffer_sectors = min(ic->log2_buffer_sectors, 
(__u8)__ffs(ic->metadata_run));
 
@@ -3457,25 +3877,21 @@ try_smaller_buffer:
        DEBUG_print("   journal_sections %u\n", 
(unsigned)le32_to_cpu(ic->sb->journal_sections));
        DEBUG_print("   journal_entries %u\n", ic->journal_entries);
        DEBUG_print("   log2_interleave_sectors %d\n", 
ic->sb->log2_interleave_sectors);
-       DEBUG_print("   device_sectors 0x%llx\n", (unsigned long 
long)ic->device_sectors);
+       DEBUG_print("   device_sectors 0x%llx\n", 
i_size_read(ic->dev->bdev->bd_inode) >> SECTOR_SHIFT);
        DEBUG_print("   initial_sectors 0x%x\n", ic->initial_sectors);
        DEBUG_print("   metadata_run 0x%x\n", ic->metadata_run);
        DEBUG_print("   log2_metadata_run %d\n", ic->log2_metadata_run);
        DEBUG_print("   provided_data_sectors 0x%llx (%llu)\n", (unsigned long 
long)ic->provided_data_sectors,
                    (unsigned long long)ic->provided_data_sectors);
        DEBUG_print("   log2_buffer_sectors %u\n", ic->log2_buffer_sectors);
+       DEBUG_print("   bits_in_journal %llu\n", (unsigned long 
long)bits_in_journal);
 
-       if (recalculate && !(ic->sb->flags & 
cpu_to_le32(SB_FLAG_RECALCULATING))) {
+       if (ic->recalculate_flag && !(ic->sb->flags & 
cpu_to_le32(SB_FLAG_RECALCULATING))) {
                ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
                ic->sb->recalc_sector = cpu_to_le64(0);
        }
 
-       if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
-               if (!ic->internal_hash) {
-                       r = -EINVAL;
-                       ti->error = "Recalculate is only valid with internal 
hash";
-                       goto bad;
-               }
+       if (ic->internal_hash) {
                ic->recalc_wq = alloc_workqueue("dm-integrity-recalc", 
WQ_MEM_RECLAIM, 1);
                if (!ic->recalc_wq ) {
                        ti->error = "Cannot allocate workqueue";
@@ -3512,6 +3928,45 @@ try_smaller_buffer:
                r = create_journal(ic, &ti->error);
                if (r)
                        goto bad;
+
+       }
+
+       if (ic->mode == 'B') {
+               unsigned i;
+               unsigned n_bitmap_pages = DIV_ROUND_UP(ic->n_bitmap_blocks, 
PAGE_SIZE / BITMAP_BLOCK_SIZE);
+
+               ic->recalc_bitmap = 
dm_integrity_alloc_page_list(n_bitmap_pages);
+               if (!ic->recalc_bitmap) {
+                       r = -ENOMEM;
+                       goto bad;
+               }
+               ic->may_write_bitmap = 
dm_integrity_alloc_page_list(n_bitmap_pages);
+               if (!ic->may_write_bitmap) {
+                       r = -ENOMEM;
+                       goto bad;
+               }
+               ic->bbs = kvmalloc_array(ic->n_bitmap_blocks, sizeof(struct 
bitmap_block_status), GFP_KERNEL);
+               if (!ic->bbs) {
+                       r = -ENOMEM;
+                       goto bad;
+               }
+               INIT_DELAYED_WORK(&ic->bitmap_flush_work, bitmap_flush_work);
+               for (i = 0; i < ic->n_bitmap_blocks; i++) {
+                       struct bitmap_block_status *bbs = &ic->bbs[i];
+                       unsigned sector, pl_index, pl_offset;
+
+                       INIT_WORK(&bbs->work, bitmap_block_work);
+                       bbs->ic = ic;
+                       bbs->idx = i;
+                       bio_list_init(&bbs->bio_queue);
+                       spin_lock_init(&bbs->bio_queue_lock);
+
+                       sector = i * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT);
+                       pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
+                       pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
+
+                       bbs->bitmap = 
lowmem_page_address(ic->journal[pl_index].page) + pl_offset;
+               }
        }
 
        if (should_write_sb) {
@@ -3536,6 +3991,17 @@ try_smaller_buffer:
                if (r)
                        goto bad;
        }
+       if (ic->mode == 'B') {
+               unsigned max_io_len = ((sector_t)ic->sectors_per_block << 
ic->log2_blocks_per_bitmap_bit) * (BITMAP_BLOCK_SIZE * 8);
+               if (!max_io_len)
+                       max_io_len = 1U << 31;
+               DEBUG_print("max_io_len: old %u, new %u\n", ti->max_io_len, 
max_io_len);
+               if (!ti->max_io_len || ti->max_io_len > max_io_len) {
+                       r = dm_set_target_max_io_len(ti, max_io_len);
+                       if (r)
+                               goto bad;
+               }
+       }
 
        if (!ic->internal_hash)
                dm_integrity_set(ti, ic);
@@ -3544,6 +4010,7 @@ try_smaller_buffer:
        ti->flush_supported = true;
 
        return 0;
+
 bad:
        dm_integrity_dtr(ti);
        return r;
@@ -3568,6 +4035,7 @@ static void dm_integrity_dtr(struct dm_t
                destroy_workqueue(ic->recalc_wq);
        vfree(ic->recalc_buffer);
        kvfree(ic->recalc_tags);
+       kvfree(ic->bbs);
        if (ic->bufio)
                dm_bufio_client_destroy(ic->bufio);
        mempool_exit(&ic->journal_io_mempool);
@@ -3580,6 +4048,8 @@ static void dm_integrity_dtr(struct dm_t
        dm_integrity_free_page_list(ic->journal);
        dm_integrity_free_page_list(ic->journal_io);
        dm_integrity_free_page_list(ic->journal_xor);
+       dm_integrity_free_page_list(ic->recalc_bitmap);
+       dm_integrity_free_page_list(ic->may_write_bitmap);
        if (ic->journal_scatterlist)
                dm_integrity_free_journal_scatterlist(ic, 
ic->journal_scatterlist);
        if (ic->journal_io_scatterlist)
@@ -3617,7 +4087,7 @@ static void dm_integrity_dtr(struct dm_t
 
 static struct target_type integrity_target = {
        .name                   = "integrity",
-       .version                = {1, 2, 0},
+       .version                = {1, 3, 0},
        .module                 = THIS_MODULE,
        .features               = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY,
        .ctr                    = dm_integrity_ctr,
Index: linux-2.6/Documentation/device-mapper/dm-integrity.txt
===================================================================
--- linux-2.6.orig/Documentation/device-mapper/dm-integrity.txt 2019-04-27 
10:28:49.000000000 +0200
+++ linux-2.6/Documentation/device-mapper/dm-integrity.txt      2019-04-27 
10:28:49.000000000 +0200
@@ -21,6 +21,14 @@ mode it calculates and verifies the inte
 mode, the dm-integrity target can be used to detect silent data
 corruption on the disk or in the I/O path.
 
+There's an alternative mode of operation where dm-integrity uses a bitmap
+instead of a journal. If a bit in the bitmap is 1, the corresponding
+region's data and integrity tags are not synchronized - if the machine
+crashes, the unsynchronized regions will be recalculated. The bitmap mode
+is faster than the journal mode, because we don't have to write the data
+twice, but it is also less reliable, because if data corruption happens
+when the machine crashes, it may not be detected.
+
 
 When loading the target for the first time, the kernel driver will format
 the device. But it will only format the device if the superblock contains
@@ -59,6 +67,10 @@ Target arguments:
                either both data and tag or none of them are written. The
                journaled mode degrades write throughput twice because the
                data have to be written twice.
+       B - bitmap mode - data and metadata are written without any
+               synchronization; the driver maintains a bitmap of dirty
+               regions where data and metadata don't match. This mode can
+               only be used with internal hash.
        R - recovery mode - in this mode, journal is not replayed,
                checksums are not checked and writes to the device are not
                allowed. This mode is useful for data recovery if the
@@ -150,6 +162,15 @@ block_size:number
        Supported values are 512, 1024, 2048 and 4096 bytes.  If not
        specified the default block size is 512 bytes.
 
+sectors_per_bit:number
+       In the bitmap mode, this parameter specifies the number of
+       512-byte sectors that correspond to one bitmap bit.
+
+bitmap_flush_interval:number
+       The bitmap flush interval in milliseconds. The metadata buffers
+       are synchronized when this interval expires.
+
+
 The journal mode (D/J), buffer_sectors, journal_watermark, commit_time can
 be changed when reloading the target (load an inactive table and swap the
 tables with suspend and resume). The other arguments should not be changed
@@ -174,6 +195,8 @@ The layout of the formatted block device
        * flags
          SB_FLAG_HAVE_JOURNAL_MAC - a flag is set if journal_mac is used
          SB_FLAG_RECALCULATING - recalculating is in progress
+         SB_FLAG_DIRTY_BITMAP - journal area contains the bitmap of dirty
+               blocks
        * log2(sectors per block)
        * a position where recalculating finished
 * journal

--
dm-devel mailing list
[email protected]
https://www.redhat.com/mailman/listinfo/dm-devel

Reply via email to