dm-integrity: support recalculation in the 'I' mode
authorMikulas Patocka <mpatocka@redhat.com>
Thu, 5 Sep 2024 19:01:52 +0000 (21:01 +0200)
committerMikulas Patocka <mpatocka@redhat.com>
Fri, 6 Sep 2024 10:34:05 +0000 (12:34 +0200)
In kernel 6.11, dm-integrity was enhanced with an inline ('I') mode.
This mode uses devices with a non-power-of-2 sector size; the extra
metadata after each sector is used to hold the integrity hash.

This commit enhances the inline mode so that the integrity hashes are
recalculated automatically when the 'recalculate' parameter is used.
This allows the device to be activated instantly, with the
recalculation done in the background.

If the device is deactivated while recalculation is in progress, it
remembers the point where it stopped and continues from that point
when it is activated again.
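
For illustration, a minimal userspace sketch of this
checkpoint-and-resume pattern follows. The state file, chunk size and
helper names are invented for the sketch; in the driver the frontier
is kept in the superblock field recalc_sector and persisted by
recalc_write_super:

    #include <stdint.h>
    #include <stdio.h>

    #define CHUNK_SECTORS     2048 /* stands in for RECALC_SECTORS */
    #define WRITE_SUPER_EVERY 16   /* stands in for RECALC_WRITE_SUPER */

    static uint64_t load_progress(const char *path)
    {
        uint64_t s = 0;
        FILE *f = fopen(path, "rb");

        if (f) {
            if (fread(&s, sizeof(s), 1, f) != 1)
                s = 0;
            fclose(f);
        }
        return s;
    }

    static void save_progress(const char *path, uint64_t s)
    {
        FILE *f = fopen(path, "wb");

        if (f) {
            fwrite(&s, sizeof(s), 1, f);
            fclose(f);
        }
    }

    int main(void)
    {
        const uint64_t provided_data_sectors = 1 << 20;
        uint64_t sector = load_progress("recalc.state");
        unsigned counter = 0;

        while (sector < provided_data_sectors) {
            /* ... read chunk, compute hashes, write data and tags ... */
            sector += CHUNK_SECTORS;
            if (++counter == WRITE_SUPER_EVERY) {
                save_progress("recalc.state", sector);
                counter = 0;
            }
        }
        save_progress("recalc.state", sector);
        printf("recalculated up to sector %llu\n",
               (unsigned long long)sector);
        return 0;
    }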

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
drivers/md/dm-integrity.c

index 34fa98efa9fa9550e4e003b9ed1779369908c3ce..c40df05e05211dba50e2e42d599b1ee399aa0bd0 100644 (file)
@@ -284,6 +284,7 @@ struct dm_integrity_c {
 
        mempool_t recheck_pool;
        struct bio_set recheck_bios;
+       struct bio_set recalc_bios;
 
        struct notifier_block reboot_notifier;
 };
@@ -321,7 +322,9 @@ struct dm_integrity_io {
        struct dm_bio_details bio_details;
 
        char *integrity_payload;
+       unsigned payload_len;
        bool integrity_payload_from_mempool;
+       bool integrity_range_locked;
 };
 
 struct journal_completion {
@@ -359,7 +362,7 @@ static struct kmem_cache *journal_io_cache;
 #endif
 
 static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map);
-static int dm_integrity_map_inline(struct dm_integrity_io *dio);
+static int dm_integrity_map_inline(struct dm_integrity_io *dio, bool from_map);
 static void integrity_bio_wait(struct work_struct *w);
 static void dm_integrity_dtr(struct dm_target *ti);
 
@@ -1946,8 +1949,13 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
        dio->bi_status = 0;
        dio->op = bio_op(bio);
 
-       if (ic->mode == 'I')
-               return dm_integrity_map_inline(dio);
+       if (ic->mode == 'I') {
+               bio->bi_iter.bi_sector = dm_target_offset(ic->ti, bio->bi_iter.bi_sector);
+               dio->integrity_payload = NULL;
+               dio->integrity_payload_from_mempool = false;
+               dio->integrity_range_locked = false;
+               return dm_integrity_map_inline(dio, true);
+       }
 
        if (unlikely(dio->op == REQ_OP_DISCARD)) {
                if (ti->max_io_len) {
@@ -2395,15 +2403,13 @@ journal_read_write:
        do_endio_flush(ic, dio);
 }
 
-static int dm_integrity_map_inline(struct dm_integrity_io *dio)
+static int dm_integrity_map_inline(struct dm_integrity_io *dio, bool from_map)
 {
        struct dm_integrity_c *ic = dio->ic;
        struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
        struct bio_integrity_payload *bip;
-       unsigned payload_len, digest_size, extra_size, ret;
-
-       dio->integrity_payload = NULL;
-       dio->integrity_payload_from_mempool = false;
+       unsigned ret;
+       sector_t recalc_sector;
 
        if (unlikely(bio_integrity(bio))) {
                bio->bi_status = BLK_STS_NOTSUPP;
@@ -2416,28 +2422,67 @@ static int dm_integrity_map_inline(struct dm_integrity_io *dio)
                return DM_MAPIO_REMAPPED;
 
 retry:
-       payload_len = ic->tuple_size * (bio_sectors(bio) >> ic->sb->log2_sectors_per_block);
-       digest_size = crypto_shash_digestsize(ic->internal_hash);
-       extra_size = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0;
-       payload_len += extra_size;
-       dio->integrity_payload = kmalloc(payload_len, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
-       if (unlikely(!dio->integrity_payload)) {
-               const unsigned x_size = PAGE_SIZE << 1;
-               if (payload_len > x_size) {
-                       unsigned sectors = ((x_size - extra_size) / ic->tuple_size) << ic->sb->log2_sectors_per_block;
-                       if (WARN_ON(!sectors || sectors >= bio_sectors(bio))) {
-                               bio->bi_status = BLK_STS_NOTSUPP;
-                               bio_endio(bio);
-                               return DM_MAPIO_SUBMITTED;
+       if (!dio->integrity_payload) {
+               unsigned digest_size, extra_size;
+               dio->payload_len = ic->tuple_size * (bio_sectors(bio) >> ic->sb->log2_sectors_per_block);
+               digest_size = crypto_shash_digestsize(ic->internal_hash);
+               extra_size = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0;
+               dio->payload_len += extra_size;
+               dio->integrity_payload = kmalloc(dio->payload_len, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+               if (unlikely(!dio->integrity_payload)) {
+                       const unsigned x_size = PAGE_SIZE << 1;
+                       if (dio->payload_len > x_size) {
+                               unsigned sectors = ((x_size - extra_size) / ic->tuple_size) << ic->sb->log2_sectors_per_block;
+                               if (WARN_ON(!sectors || sectors >= bio_sectors(bio))) {
+                                       bio->bi_status = BLK_STS_NOTSUPP;
+                                       bio_endio(bio);
+                                       return DM_MAPIO_SUBMITTED;
+                               }
+                               dm_accept_partial_bio(bio, sectors);
+                               goto retry;
                        }
-                       dm_accept_partial_bio(bio, sectors);
-                       goto retry;
                }
+       }
+
+       dio->range.logical_sector = bio->bi_iter.bi_sector;
+       dio->range.n_sectors = bio_sectors(bio);
+
+       if (!(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)))
+               goto skip_spinlock;
+#ifdef CONFIG_64BIT
+       /*
+        * On 64-bit CPUs we can optimize the lock away (so that it won't cause
+        * cache line bouncing) and use acquire/release barriers instead.
+        *
+        * Paired with smp_store_release in integrity_recalc_inline.
+        */
+       recalc_sector = le64_to_cpu(smp_load_acquire(&ic->sb->recalc_sector));
+       if (likely(dio->range.logical_sector + dio->range.n_sectors <= recalc_sector))
+               goto skip_spinlock;
+#endif
+       spin_lock_irq(&ic->endio_wait.lock);
+       recalc_sector = le64_to_cpu(ic->sb->recalc_sector);
+       if (dio->range.logical_sector + dio->range.n_sectors <= recalc_sector)
+               goto skip_unlock;
+       if (unlikely(!add_new_range(ic, &dio->range, true))) {
+               if (from_map) {
+                       spin_unlock_irq(&ic->endio_wait.lock);
+                       INIT_WORK(&dio->work, integrity_bio_wait);
+                       queue_work(ic->wait_wq, &dio->work);
+                       return DM_MAPIO_SUBMITTED;
+               }
+               wait_and_add_new_range(ic, &dio->range);
+       }
+       dio->integrity_range_locked = true;
+skip_unlock:
+       spin_unlock_irq(&ic->endio_wait.lock);
+skip_spinlock:
+
+       if (unlikely(!dio->integrity_payload)) {
                dio->integrity_payload = page_to_virt((struct page *)mempool_alloc(&ic->recheck_pool, GFP_NOIO));
                dio->integrity_payload_from_mempool = true;
        }
 
-       bio->bi_iter.bi_sector = dm_target_offset(ic->ti, bio->bi_iter.bi_sector);
        dio->bio_details.bi_iter = bio->bi_iter;
 
        if (unlikely(!dm_integrity_check_limits(ic, bio->bi_iter.bi_sector, bio))) {
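
On 64-bit machines the hot path above reads the recalculation frontier
without taking ic->endio_wait.lock, relying on the acquire/release
pairing described in the comment. A minimal userspace model of that
pairing, with C11 atomics standing in for smp_load_acquire() and
smp_store_release() (the __le64 conversion and all locking are
omitted; names are illustrative):

    /* Build: cc -std=c11 -pthread model.c */
    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    /* stands in for ic->sb->recalc_sector */
    static _Atomic uint64_t recalc_sector;

    /* Recalculation side: after a chunk's tags are written, publish
     * the new frontier with release semantics. */
    static void *recalc_thread(void *arg)
    {
        (void)arg;
        for (uint64_t s = 0; s < 1024; s += 32) {
            /* ... recalculate tags for sectors [s, s + 32) ... */
            atomic_store_explicit(&recalc_sector, s + 32,
                                  memory_order_release);
        }
        return NULL;
    }

    /* I/O side: if the whole range lies below the frontier, its tags
     * are already valid and the spinlock can be skipped. */
    static int range_is_recalculated(uint64_t start, uint64_t n_sectors)
    {
        uint64_t frontier = atomic_load_explicit(&recalc_sector,
                                                 memory_order_acquire);
        return start + n_sectors <= frontier;
    }

    int main(void)
    {
        pthread_t t;

        pthread_create(&t, NULL, recalc_thread, NULL);
        printf("range [0, 64) done: %d\n", range_is_recalculated(0, 64));
        pthread_join(t, NULL);
        printf("range [0, 64) done: %d\n", range_is_recalculated(0, 64));
        return 0;
    }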
@@ -2468,8 +2513,8 @@ retry:
        }
 
        ret = bio_integrity_add_page(bio, virt_to_page(dio->integrity_payload),
-                                       payload_len, offset_in_page(dio->integrity_payload));
-       if (unlikely(ret != payload_len)) {
+                                       dio->payload_len, offset_in_page(dio->integrity_payload));
+       if (unlikely(ret != dio->payload_len)) {
                bio->bi_status = BLK_STS_RESOURCE;
                bio_endio(bio);
                return DM_MAPIO_SUBMITTED;
@@ -2577,6 +2622,9 @@ static int dm_integrity_end_io(struct dm_target *ti, struct bio *bio, blk_status
                struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
                if (dio->op == REQ_OP_READ && likely(*status == BLK_STS_OK)) {
                        unsigned pos = 0;
+                       if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) &&
+                           unlikely(dio->integrity_range_locked))
+                               goto skip_check;
                        while (dio->bio_details.bi_iter.bi_size) {
                                char digest[HASH_MAX_DIGESTSIZE];
                                struct bio_vec bv = bio_iter_iovec(bio, dio->bio_details.bi_iter);
@@ -2596,9 +2644,10 @@ static int dm_integrity_end_io(struct dm_target *ti, struct bio *bio, blk_status
                                bio_advance_iter_single(bio, &dio->bio_details.bi_iter, ic->sectors_per_block << SECTOR_SHIFT);
                        }
                }
-               if (likely(dio->op == REQ_OP_READ) || likely(dio->op == REQ_OP_WRITE)) {
-                       dm_integrity_free_payload(dio);
-               }
+skip_check:
+               dm_integrity_free_payload(dio);
+               if (unlikely(dio->integrity_range_locked))
+                       remove_range(ic, &dio->range);
        }
        return DM_ENDIO_DONE;
 }
@@ -2606,8 +2655,26 @@ static int dm_integrity_end_io(struct dm_target *ti, struct bio *bio, blk_status
 static void integrity_bio_wait(struct work_struct *w)
 {
        struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work);
+       struct dm_integrity_c *ic = dio->ic;
 
-       dm_integrity_map_continue(dio, false);
+       if (ic->mode == 'I') {
+               struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
+               int r = dm_integrity_map_inline(dio, false);
+               switch (r) {
+                       case DM_MAPIO_KILL:
+                               bio->bi_status = BLK_STS_IOERR;
+                               fallthrough;
+                       case DM_MAPIO_REMAPPED:
+                               submit_bio_noacct(bio);
+                               fallthrough;
+                       case DM_MAPIO_SUBMITTED:
+                               return;
+                       default:
+                               BUG();
+               }
+       } else {
+               dm_integrity_map_continue(dio, false);
+       }
 }
 
 static void pad_uncommitted(struct dm_integrity_c *ic)
@@ -3079,6 +3146,133 @@ free_ret:
        kvfree(recalc_tags);
 }
 
+static void integrity_recalc_inline(struct work_struct *w)
+{
+       struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, recalc_work);
+       size_t recalc_tags_size;
+       u8 *recalc_buffer = NULL;
+       u8 *recalc_tags = NULL;
+       struct dm_integrity_range range;
+       struct bio *bio;
+       struct bio_integrity_payload *bip;
+       __u8 *t;
+       unsigned int i;
+       int r;
+       unsigned ret;
+       unsigned int super_counter = 0;
+       unsigned recalc_sectors = RECALC_SECTORS;
+
+retry:
+       recalc_buffer = kmalloc(recalc_sectors << SECTOR_SHIFT, GFP_NOIO | __GFP_NOWARN);
+       if (!recalc_buffer) {
+oom:
+               recalc_sectors >>= 1;
+               if (recalc_sectors >= 1U << ic->sb->log2_sectors_per_block)
+                       goto retry;
+               DMCRIT("out of memory for recalculate buffer - recalculation disabled");
+               goto free_ret;
+       }
+
+       recalc_tags_size = (recalc_sectors >> ic->sb->log2_sectors_per_block) * ic->tuple_size;
+       if (crypto_shash_digestsize(ic->internal_hash) > ic->tuple_size)
+               recalc_tags_size += crypto_shash_digestsize(ic->internal_hash) - ic->tuple_size;
+       recalc_tags = kmalloc(recalc_tags_size, GFP_NOIO | __GFP_NOWARN);
+       if (!recalc_tags) {
+               kfree(recalc_buffer);
+               recalc_buffer = NULL;
+               goto oom;
+       }
+
+       spin_lock_irq(&ic->endio_wait.lock);
+
+next_chunk:
+       if (unlikely(dm_post_suspending(ic->ti)))
+               goto unlock_ret;
+
+       range.logical_sector = le64_to_cpu(ic->sb->recalc_sector);
+       if (unlikely(range.logical_sector >= ic->provided_data_sectors))
+               goto unlock_ret;
+       range.n_sectors = min((sector_t)recalc_sectors, ic->provided_data_sectors - range.logical_sector);
+
+       add_new_range_and_wait(ic, &range);
+       spin_unlock_irq(&ic->endio_wait.lock);
+
+       if (unlikely(++super_counter == RECALC_WRITE_SUPER)) {
+               recalc_write_super(ic);
+               super_counter = 0;
+       }
+
+       if (unlikely(dm_integrity_failed(ic)))
+               goto err;
+
+       DEBUG_print("recalculating: %llx - %llx\n", range.logical_sector, range.n_sectors);
+
+       bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_READ, GFP_NOIO, &ic->recalc_bios);
+       bio->bi_iter.bi_sector = ic->start + SB_SECTORS + range.logical_sector;
+       __bio_add_page(bio, virt_to_page(recalc_buffer), range.n_sectors << SECTOR_SHIFT, offset_in_page(recalc_buffer));
+       r = submit_bio_wait(bio);
+       bio_put(bio);
+       if (unlikely(r)) {
+               dm_integrity_io_error(ic, "reading data", r);
+               goto err;
+       }
+
+       t = recalc_tags;
+       for (i = 0; i < range.n_sectors; i += ic->sectors_per_block) {
+               memset(t, 0, ic->tuple_size);
+               integrity_sector_checksum(ic, range.logical_sector + i, recalc_buffer + (i << SECTOR_SHIFT), t);
+               t += ic->tuple_size;
+       }
+
+       bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_WRITE, GFP_NOIO, &ic->recalc_bios);
+       bio->bi_iter.bi_sector = ic->start + SB_SECTORS + range.logical_sector;
+       __bio_add_page(bio, virt_to_page(recalc_buffer), range.n_sectors << SECTOR_SHIFT, offset_in_page(recalc_buffer));
+
+       bip = bio_integrity_alloc(bio, GFP_NOIO, 1);
+       if (unlikely(IS_ERR(bip))) {
+               bio_put(bio);
+               DMCRIT("out of memory for bio integrity payload - recalculation disabled");
+               goto err;
+       }
+       ret = bio_integrity_add_page(bio, virt_to_page(recalc_tags), t - recalc_tags, offset_in_page(recalc_tags));
+       if (unlikely(ret != t - recalc_tags)) {
+               bio_put(bio);
+               dm_integrity_io_error(ic, "attaching integrity tags", -ENOMEM);
+               goto err;
+       }
+
+       r = submit_bio_wait(bio);
+       bio_put(bio);
+       if (unlikely(r)) {
+               dm_integrity_io_error(ic, "writing data", r);
+               goto err;
+       }
+
+       cond_resched();
+       spin_lock_irq(&ic->endio_wait.lock);
+       remove_range_unlocked(ic, &range);
+#ifdef CONFIG_64BIT
+       /* Paired with smp_load_acquire in dm_integrity_map_inline. */
+       smp_store_release(&ic->sb->recalc_sector, cpu_to_le64(range.logical_sector + range.n_sectors));
+#else
+       ic->sb->recalc_sector = cpu_to_le64(range.logical_sector + range.n_sectors);
+#endif
+       goto next_chunk;
+
+err:
+       remove_range(ic, &range);
+       goto free_ret;
+
+unlock_ret:
+       spin_unlock_irq(&ic->endio_wait.lock);
+
+       recalc_write_super(ic);
+
+free_ret:
+       kfree(recalc_buffer);
+       kfree(recalc_tags);
+}
+
 static void bitmap_block_work(struct work_struct *w)
 {
        struct bitmap_block_status *bbs = container_of(w, struct bitmap_block_status, work);
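
The allocation fallback at the top of integrity_recalc_inline can be
modeled in isolation: halve the chunk size on allocation failure and
give up once a chunk would be smaller than one block. A userspace
sketch, with malloc() standing in for kmalloc() and an illustrative
starting size:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define SECTOR_SHIFT 9

    static void *alloc_recalc_buffer(unsigned *recalc_sectors,
                                     unsigned log2_sectors_per_block)
    {
        for (;;) {
            void *buf = malloc((size_t)*recalc_sectors << SECTOR_SHIFT);

            if (buf)
                return buf;
            *recalc_sectors >>= 1;
            if (*recalc_sectors < 1U << log2_sectors_per_block)
                return NULL; /* too small - recalculation disabled */
        }
    }

    int main(void)
    {
        unsigned recalc_sectors = 2048; /* illustrative RECALC_SECTORS */
        void *buf = alloc_recalc_buffer(&recalc_sectors, 3);

        printf("chunk of %u sectors: %s\n", recalc_sectors,
               buf ? "allocated" : "failed");
        free(buf);
        return 0;
    }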
@@ -4617,6 +4811,17 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv
                        r = -ENOMEM;
                        goto bad;
                }
+               r = bioset_init(&ic->recalc_bios, 1, 0, BIOSET_NEED_BVECS);
+               if (r) {
+                       ti->error = "Cannot allocate bio set";
+                       goto bad;
+               }
+               r = bioset_integrity_create(&ic->recalc_bios, 1);
+               if (r) {
+                       ti->error = "Cannot allocate bio integrity set";
+                       r = -ENOMEM;
+                       goto bad;
+               }
        }
 
        ic->metadata_wq = alloc_workqueue("dm-integrity-metadata",
@@ -4833,7 +5038,7 @@ try_smaller_buffer:
                        r = -ENOMEM;
                        goto bad;
                }
-               INIT_WORK(&ic->recalc_work, integrity_recalc);
+               INIT_WORK(&ic->recalc_work, ic->mode == 'I' ? integrity_recalc_inline : integrity_recalc);
        } else {
                if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
                        ti->error = "Recalculate can only be specified with internal_hash";
@@ -4850,17 +5055,15 @@ try_smaller_buffer:
                goto bad;
        }
 
-       if (ic->mode != 'I') {
-               ic->bufio = dm_bufio_client_create(ic->meta_dev ? ic->meta_dev->bdev : ic->dev->bdev,
-                               1U << (SECTOR_SHIFT + ic->log2_buffer_sectors), 1, 0, NULL, NULL, 0);
-               if (IS_ERR(ic->bufio)) {
-                       r = PTR_ERR(ic->bufio);
-                       ti->error = "Cannot initialize dm-bufio";
-                       ic->bufio = NULL;
-                       goto bad;
-               }
-               dm_bufio_set_sector_offset(ic->bufio, ic->start + ic->initial_sectors);
+       ic->bufio = dm_bufio_client_create(ic->meta_dev ? ic->meta_dev->bdev : ic->dev->bdev,
+                       1U << (SECTOR_SHIFT + ic->log2_buffer_sectors), 1, 0, NULL, NULL, 0);
+       if (IS_ERR(ic->bufio)) {
+               r = PTR_ERR(ic->bufio);
+               ti->error = "Cannot initialize dm-bufio";
+               ic->bufio = NULL;
+               goto bad;
        }
+       dm_bufio_set_sector_offset(ic->bufio, ic->start + ic->initial_sectors);
 
        if (ic->mode != 'R' && ic->mode != 'I') {
                r = create_journal(ic, &ti->error);
@@ -4982,6 +5185,7 @@ static void dm_integrity_dtr(struct dm_target *ti)
        kvfree(ic->bbs);
        if (ic->bufio)
                dm_bufio_client_destroy(ic->bufio);
+       bioset_exit(&ic->recalc_bios);
        bioset_exit(&ic->recheck_bios);
        mempool_exit(&ic->recheck_pool);
        mempool_exit(&ic->journal_io_mempool);
@@ -5036,7 +5240,7 @@ static void dm_integrity_dtr(struct dm_target *ti)
 
 static struct target_type integrity_target = {
        .name                   = "integrity",
-       .version                = {1, 12, 0},
+       .version                = {1, 13, 0},
        .module                 = THIS_MODULE,
        .features               = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY,
        .ctr                    = dm_integrity_ctr,