dm: fix deadlock when swapping to encrypted device
author Mikulas Patocka <mpatocka@redhat.com>
Wed, 10 Feb 2021 20:26:23 +0000 (15:26 -0500)
committer Mike Snitzer <snitzer@redhat.com>
Thu, 11 Feb 2021 14:45:28 +0000 (09:45 -0500)
The system would deadlock when swapping to a dm-crypt device. The reason
is that for each incoming write bio, dm-crypt allocates memory that holds
encrypted data. These excessive allocations exhaust all the memory and the
result is either a deadlock or an OOM trigger.

This patch limits the number of in-flight swap bios, so that the memory
consumed by dm-crypt is limited. The limit is enforced if the target sets
the "limit_swap_bios" variable and if the bio has REQ_SWAP set.

Non-swap bios are not affected because taking the semaphore would cause
performance degradation.

This is similar to request-based drivers - they will also block when the
number of requests is over the limit.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Cc: stable@vger.kernel.org
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
drivers/md/dm-core.h
drivers/md/dm-crypt.c
drivers/md/dm.c
include/linux/device-mapper.h

index bf3e66f39a4a0f72102b0a03412e762228d017b1..5953ff2bd2609040bcd70d9e7890294f27f25cd3 100644 (file)
@@ -103,6 +103,10 @@ struct mapped_device {
        /* kobject and completion */
        struct dm_kobject_holder kobj_holder;
 
+       int swap_bios;
+       struct semaphore swap_bios_semaphore;
+       struct mutex swap_bios_lock;
+
        struct dm_stats stats;
 
        /* for blk-mq request-based DM support */
index ae0f0a4e3689115b330e8c30aff59be312b6f206..11c105ecd165a07f44969d44e58a874fab42a9d3 100644 (file)
@@ -3342,6 +3342,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
        wake_up_process(cc->write_thread);
 
        ti->num_flush_bios = 1;
+       ti->limit_swap_bios = true;
 
        return 0;
 
index 7021aea82aa4841aa29b5ffd9345e9b18fc25f64..50b693d776d603727bb4cabc1f4b452c7e262d78 100644 (file)
@@ -153,6 +153,16 @@ EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);
 #define DM_NUMA_NODE NUMA_NO_NODE
 static int dm_numa_node = DM_NUMA_NODE;
 
+#define DEFAULT_SWAP_BIOS      (8 * 1048576 / PAGE_SIZE)
+static int swap_bios = DEFAULT_SWAP_BIOS;
+static int get_swap_bios(void)
+{
+       int latch = READ_ONCE(swap_bios);
+       if (unlikely(latch <= 0))
+               latch = DEFAULT_SWAP_BIOS;
+       return latch;
+}
+
 /*
  * For mempools pre-allocation at the table loading time.
  */
@@ -974,6 +984,11 @@ void disable_write_zeroes(struct mapped_device *md)
        limits->max_write_zeroes_sectors = 0;
 }
 
+static bool swap_bios_limit(struct dm_target *ti, struct bio *bio)
+{
+       return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios);
+}
+
 static void clone_endio(struct bio *bio)
 {
        blk_status_t error = bio->bi_status;
@@ -1025,6 +1040,11 @@ static void clone_endio(struct bio *bio)
                }
        }
 
+       if (unlikely(swap_bios_limit(tio->ti, bio))) {
+               struct mapped_device *md = io->md;
+               up(&md->swap_bios_semaphore);
+       }
+
        free_tio(tio);
        dec_pending(io, error);
 }
@@ -1258,6 +1278,22 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
 }
 EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
 
+static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch)
+{
+       mutex_lock(&md->swap_bios_lock);
+       while (latch < md->swap_bios) {
+               cond_resched();
+               down(&md->swap_bios_semaphore);
+               md->swap_bios--;
+       }
+       while (latch > md->swap_bios) {
+               cond_resched();
+               up(&md->swap_bios_semaphore);
+               md->swap_bios++;
+       }
+       mutex_unlock(&md->swap_bios_lock);
+}
+
 static blk_qc_t __map_bio(struct dm_target_io *tio)
 {
        int r;
@@ -1277,6 +1313,14 @@ static blk_qc_t __map_bio(struct dm_target_io *tio)
        atomic_inc(&io->io_count);
        sector = clone->bi_iter.bi_sector;
 
+       if (unlikely(swap_bios_limit(ti, clone))) {
+               struct mapped_device *md = io->md;
+               int latch = get_swap_bios();
+               if (unlikely(latch != md->swap_bios))
+                       __set_swap_bios_limit(md, latch);
+               down(&md->swap_bios_semaphore);
+       }
+
        r = ti->type->map(ti, clone);
        switch (r) {
        case DM_MAPIO_SUBMITTED:
@@ -1287,10 +1331,18 @@ static blk_qc_t __map_bio(struct dm_target_io *tio)
                ret = submit_bio_noacct(clone);
                break;
        case DM_MAPIO_KILL:
+               if (unlikely(swap_bios_limit(ti, clone))) {
+                       struct mapped_device *md = io->md;
+                       up(&md->swap_bios_semaphore);
+               }
                free_tio(tio);
                dec_pending(io, BLK_STS_IOERR);
                break;
        case DM_MAPIO_REQUEUE:
+               if (unlikely(swap_bios_limit(ti, clone))) {
+                       struct mapped_device *md = io->md;
+                       up(&md->swap_bios_semaphore);
+               }
                free_tio(tio);
                dec_pending(io, BLK_STS_DM_REQUEUE);
                break;
@@ -1767,6 +1819,7 @@ static void cleanup_mapped_device(struct mapped_device *md)
        mutex_destroy(&md->suspend_lock);
        mutex_destroy(&md->type_lock);
        mutex_destroy(&md->table_devices_lock);
+       mutex_destroy(&md->swap_bios_lock);
 
        dm_mq_cleanup_mapped_device(md);
 }
@@ -1834,6 +1887,10 @@ static struct mapped_device *alloc_dev(int minor)
        init_waitqueue_head(&md->eventq);
        init_completion(&md->kobj_holder.completion);
 
+       md->swap_bios = get_swap_bios();
+       sema_init(&md->swap_bios_semaphore, md->swap_bios);
+       mutex_init(&md->swap_bios_lock);
+
        md->disk->major = _major;
        md->disk->first_minor = minor;
        md->disk->fops = &dm_blk_dops;
@@ -3117,6 +3174,9 @@ MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
 module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
 
+module_param(swap_bios, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(swap_bios, "Maximum allowed inflight swap IOs");
+
 MODULE_DESCRIPTION(DM_NAME " driver");
 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
 MODULE_LICENSE("GPL");
index c98d847b0f0b8ed8f070c613a0abddd8909ee1f0..7f4ac87c0b323ad162b4d8dda2729b110e702be5 100644 (file)
@@ -343,6 +343,11 @@ struct dm_target {
         * whether or not its underlying devices have support.
         */
        bool discards_supported:1;
+
+       /*
+        * Set if we need to limit the number of in-flight bios when swapping.
+        */
+       bool limit_swap_bios:1;
 };
 
 void *dm_per_bio_data(struct bio *bio, size_t data_size);