diff options
author | Jens Axboe <axboe@kernel.dk> | 2022-07-03 10:18:10 -0600 |
---|---|---|
committer | Jens Axboe <axboe@kernel.dk> | 2022-07-03 10:18:10 -0600 |
commit | d01051ed365d95dfd5dec100a2b22e0c0cb8912d (patch) | |
tree | ee929cefcde61de4ad2a0def8b54bee3dd8e9651 | |
parent | 197f80d97e9ccc8a496f4935ba939f3ed7432d53 (diff) | |
parent | ff4ec5f79108cf82fe7168547c76fe754c4ade0a (diff) |
Merge branch 'md-next' of https://git.kernel.org/pub/scm/linux/kernel/git/song/md into for-5.20/drivers
Pull MD updates from Song:
"1. Improve raid5 lock contention, by Logan Gunthorpe.
2. Misc fixes to raid5, by Logan Gunthorpe.
3. Fix race condition with md_reap_sync_thread(), by Guoqing Jiang."
* 'md-next' of https://git.kernel.org/pub/scm/linux/kernel/git/song/md: (29 commits)
md: Fix spelling mistake in comments
md/raid5: Increase restriction on max segments per request
md/raid5: Improve debug prints
md/raid5: Pivot raid5_make_request()
md/raid5: Check all disks in a stripe_head for reshape progress
md/raid5: Refactor add_stripe_bio()
md/raid5: Keep a reference to last stripe_head for batch
md/raid5: Refactor for loop in raid5_make_request() into while loop
md/raid5: Move read_seqcount_begin() into make_stripe_request()
md/raid5: Drop the do_prepare flag in raid5_make_request()
md/raid5: Factor out helper from raid5_make_request() loop
md/raid5: Move common stripe get code into new find_get_stripe() helper
md/raid5: Move stripe_add_to_batch_list() call out of add_stripe_bio()
md/raid5: Refactor raid5_make_request loop
md/raid5: Factor out ahead_of_reshape() function
md/raid5: Make logic blocking check consistent with logic that blocks
md: unlock mddev before reap sync_thread in action_store
md: Explicitly create command-line configured devices
md: Notify sysfs sync_completed in md_reap_sync_thread()
md: Ensure resync is reported after it starts
...
-rw-r--r-- | MAINTAINERS | 1 | ||||
-rw-r--r-- | drivers/md/dm-raid.c | 1 | ||||
-rw-r--r-- | drivers/md/md-autodetect.c | 1 | ||||
-rw-r--r-- | drivers/md/md-cluster.c | 4 | ||||
-rw-r--r-- | drivers/md/md.c | 76 | ||||
-rw-r--r-- | drivers/md/md.h | 16 | ||||
-rw-r--r-- | drivers/md/raid5-cache.c | 40 | ||||
-rw-r--r-- | drivers/md/raid5-log.h | 77 | ||||
-rw-r--r-- | drivers/md/raid5-ppl.c | 2 | ||||
-rw-r--r-- | drivers/md/raid5.c | 646 |
10 files changed, 549 insertions, 315 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index fe5daf141501..68147e525440 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18540,6 +18540,7 @@ SOFTWARE RAID (Multiple Disks) SUPPORT M: Song Liu <song@kernel.org> L: linux-raid@vger.kernel.org S: Supported +Q: https://patchwork.kernel.org/project/linux-raid/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/song/md.git F: drivers/md/Kconfig F: drivers/md/Makefile diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 9526ccbedafb..d43b8075c055 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3725,6 +3725,7 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv, if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_unregister_thread(&mddev->sync_thread); md_reap_sync_thread(mddev); } } else if (decipher_sync_action(mddev, mddev->recovery) != st_idle) diff --git a/drivers/md/md-autodetect.c b/drivers/md/md-autodetect.c index 2cf973722f59..344910ba435c 100644 --- a/drivers/md/md-autodetect.c +++ b/drivers/md/md-autodetect.c @@ -169,6 +169,7 @@ static void __init md_setup_drive(struct md_setup_args *args) pr_info("md: Loading %s: %s\n", name, args->device_names); + md_alloc(mdev, name); bdev = blkdev_get_by_dev(mdev, FMODE_READ, NULL); if (IS_ERR(bdev)) { pr_err("md: open failed - cannot start array %s\n", name); diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 37cbcce3cc66..742b2349fea3 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -40,7 +40,7 @@ struct resync_info { /* Lock the send communication. This is done through * bit manipulation as opposed to a mutex in order to - * accomodate lock and hold. See next comment. + * accommodate lock and hold. See next comment. */ #define MD_CLUSTER_SEND_LOCK 4 /* If cluster operations (such as adding a disk) must lock the @@ -689,7 +689,7 @@ static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked) /* * If resync thread run after raid1d thread, then process_metadata_update * could not continue if raid1d held reconfig_mutex (and raid1d is blocked - * since another node already got EX on Token and waitting the EX of Ack), + * since another node already got EX on Token and waiting the EX of Ack), * so let resync wake up thread in case flag is set. */ if (mddev_locked && !test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, diff --git a/drivers/md/md.c b/drivers/md/md.c index c7ecb0bffda0..b64de313838f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4830,6 +4830,19 @@ action_store(struct mddev *mddev, const char *page, size_t len) if (work_pending(&mddev->del_work)) flush_workqueue(md_misc_wq); if (mddev->sync_thread) { + sector_t save_rp = mddev->reshape_position; + + mddev_unlock(mddev); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_unregister_thread(&mddev->sync_thread); + mddev_lock_nointr(mddev); + /* + * set RECOVERY_INTR again and restore reshape + * position in case others changed them after + * got lock, eg, reshape_position_store and + * md_check_recovery. + */ + mddev->reshape_position = save_rp; set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_reap_sync_thread(mddev); } @@ -5001,7 +5014,7 @@ static ssize_t sync_speed_show(struct mddev *mddev, char *page) { unsigned long resync, dt, db; - if (mddev->curr_resync == 0) + if (mddev->curr_resync == MD_RESYNC_NONE) return sprintf(page, "none\n"); resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); dt = (jiffies - mddev->resync_mark) / HZ; @@ -5020,8 +5033,8 @@ sync_completed_show(struct mddev *mddev, char *page) if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return sprintf(page, "none\n"); - if (mddev->curr_resync == 1 || - mddev->curr_resync == 2) + if (mddev->curr_resync == MD_RESYNC_YIELDED || + mddev->curr_resync == MD_RESYNC_DELAYED) return sprintf(page, "delayed\n"); if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || @@ -5623,7 +5636,7 @@ int mddev_init_writes_pending(struct mddev *mddev) } EXPORT_SYMBOL_GPL(mddev_init_writes_pending); -static int md_alloc(dev_t dev, char *name) +int md_alloc(dev_t dev, char *name) { /* * If dev is zero, name is the name of a device to allocate with @@ -6197,6 +6210,7 @@ static void __md_stop_writes(struct mddev *mddev) flush_workqueue(md_misc_wq); if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_unregister_thread(&mddev->sync_thread); md_reap_sync_thread(mddev); } @@ -8018,16 +8032,26 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev) max_sectors = mddev->dev_sectors; resync = mddev->curr_resync; - if (resync <= 3) { + if (resync < MD_RESYNC_ACTIVE) { if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) /* Still cleaning up */ resync = max_sectors; - } else if (resync > max_sectors) + } else if (resync > max_sectors) { resync = max_sectors; - else + } else { resync -= atomic_read(&mddev->recovery_active); + if (resync < MD_RESYNC_ACTIVE) { + /* + * Resync has started, but the subtraction has + * yielded one of the special values. Force it + * to active to ensure the status reports an + * active resync. + */ + resync = MD_RESYNC_ACTIVE; + } + } - if (resync == 0) { + if (resync == MD_RESYNC_NONE) { if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) { struct md_rdev *rdev; @@ -8051,7 +8075,7 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev) } return 0; } - if (resync < 3) { + if (resync < MD_RESYNC_ACTIVE) { seq_printf(seq, "\tresync=DELAYED"); return 1; } @@ -8729,13 +8753,7 @@ void md_do_sync(struct md_thread *thread) mddev->last_sync_action = action ?: desc; - /* we overload curr_resync somewhat here. - * 0 == not engaged in resync at all - * 2 == checking that there is no conflict with another sync - * 1 == like 2, but have yielded to allow conflicting resync to - * commence - * other == active in resync - this many blocks - * + /* * Before starting a resync we must have set curr_resync to * 2, and then checked that every "conflicting" array has curr_resync * less than ours. When we find one that is the same or higher @@ -8747,7 +8765,7 @@ void md_do_sync(struct md_thread *thread) do { int mddev2_minor = -1; - mddev->curr_resync = 2; + mddev->curr_resync = MD_RESYNC_DELAYED; try_again: if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) @@ -8759,12 +8777,14 @@ void md_do_sync(struct md_thread *thread) && mddev2->curr_resync && match_mddev_units(mddev, mddev2)) { DEFINE_WAIT(wq); - if (mddev < mddev2 && mddev->curr_resync == 2) { + if (mddev < mddev2 && + mddev->curr_resync == MD_RESYNC_DELAYED) { /* arbitrarily yield */ - mddev->curr_resync = 1; + mddev->curr_resync = MD_RESYNC_YIELDED; wake_up(&resync_wait); } - if (mddev > mddev2 && mddev->curr_resync == 1) + if (mddev > mddev2 && + mddev->curr_resync == MD_RESYNC_YIELDED) /* no need to wait here, we can wait the next * time 'round when curr_resync == 2 */ @@ -8792,7 +8812,7 @@ void md_do_sync(struct md_thread *thread) finish_wait(&resync_wait, &wq); } } - } while (mddev->curr_resync < 2); + } while (mddev->curr_resync < MD_RESYNC_DELAYED); j = 0; if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { @@ -8876,7 +8896,7 @@ void md_do_sync(struct md_thread *thread) desc, mdname(mddev)); mddev->curr_resync = j; } else - mddev->curr_resync = 3; /* no longer delayed */ + mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */ mddev->curr_resync_completed = j; sysfs_notify_dirent_safe(mddev->sysfs_completed); md_new_event(); @@ -9011,14 +9031,14 @@ void md_do_sync(struct md_thread *thread) if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && - mddev->curr_resync > 3) { + mddev->curr_resync >= MD_RESYNC_ACTIVE) { mddev->curr_resync_completed = mddev->curr_resync; sysfs_notify_dirent_safe(mddev->sysfs_completed); } mddev->pers->sync_request(mddev, max_sectors, &skipped); if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && - mddev->curr_resync > 3) { + mddev->curr_resync >= MD_RESYNC_ACTIVE) { if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { if (mddev->curr_resync >= mddev->recovery_cp) { @@ -9082,7 +9102,7 @@ void md_do_sync(struct md_thread *thread) } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) mddev->resync_min = mddev->curr_resync_completed; set_bit(MD_RECOVERY_DONE, &mddev->recovery); - mddev->curr_resync = 0; + mddev->curr_resync = MD_RESYNC_NONE; spin_unlock(&mddev->lock); wake_up(&resync_wait); @@ -9303,6 +9323,7 @@ void md_check_recovery(struct mddev *mddev) * ->spare_active and clear saved_raid_disk */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_unregister_thread(&mddev->sync_thread); md_reap_sync_thread(mddev); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); @@ -9338,6 +9359,7 @@ void md_check_recovery(struct mddev *mddev) goto unlock; } if (mddev->sync_thread) { + md_unregister_thread(&mddev->sync_thread); md_reap_sync_thread(mddev); goto unlock; } @@ -9417,8 +9439,7 @@ void md_reap_sync_thread(struct mddev *mddev) sector_t old_dev_sectors = mddev->dev_sectors; bool is_reshaped = false; - /* resync has finished, collect result */ - md_unregister_thread(&mddev->sync_thread); + /* sync_thread should be unregistered, collect result */ if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && mddev->degraded != mddev->raid_disks) { @@ -9466,6 +9487,7 @@ void md_reap_sync_thread(struct mddev *mddev) wake_up(&resync_wait); /* flag recovery needed just to double check */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + sysfs_notify_dirent_safe(mddev->sysfs_completed); sysfs_notify_dirent_safe(mddev->sysfs_action); md_new_event(); if (mddev->event_work.func) diff --git a/drivers/md/md.h b/drivers/md/md.h index cf2cbb17acbd..1a85dbe78a71 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -288,6 +288,21 @@ struct serial_info { sector_t _subtree_last; /* highest sector in subtree of rb node */ }; +/* + * mddev->curr_resync stores the current sector of the resync but + * also has some overloaded values. + */ +enum { + /* No resync in progress */ + MD_RESYNC_NONE = 0, + /* Yielded to allow another conflicting resync to commence */ + MD_RESYNC_YIELDED = 1, + /* Delayed to check that there is no conflict with another sync */ + MD_RESYNC_DELAYED = 2, + /* Any value greater than or equal to this is in an active resync */ + MD_RESYNC_ACTIVE = 3, +}; + struct mddev { void *private; struct md_personality *pers; @@ -751,6 +766,7 @@ extern int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev); extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); extern void mddev_init(struct mddev *mddev); +int md_alloc(dev_t dev, char *name); extern int md_run(struct mddev *mddev); extern int md_start(struct mddev *mddev); extern void md_stop(struct mddev *mddev); diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 83c184eddbda..b2e6016934ee 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1590,18 +1590,13 @@ void r5l_quiesce(struct r5l_log *log, int quiesce) bool r5l_log_disk_error(struct r5conf *conf) { - struct r5l_log *log; - bool ret; - /* don't allow write if journal disk is missing */ - rcu_read_lock(); - log = rcu_dereference(conf->log); + struct r5l_log *log = conf->log; + /* don't allow write if journal disk is missing */ if (!log) - ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); + return test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); else - ret = test_bit(Faulty, &log->rdev->flags); - rcu_read_unlock(); - return ret; + return test_bit(Faulty, &log->rdev->flags); } #define R5L_RECOVERY_PAGE_POOL_SIZE 256 @@ -2534,12 +2529,13 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) struct r5conf *conf; int ret; - spin_lock(&mddev->lock); + ret = mddev_lock(mddev); + if (ret) + return ret; + conf = mddev->private; - if (!conf || !conf->log) { - spin_unlock(&mddev->lock); - return 0; - } + if (!conf || !conf->log) + goto out_unlock; switch (conf->log->r5c_journal_mode) { case R5C_JOURNAL_MODE_WRITE_THROUGH: @@ -2557,7 +2553,9 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) default: ret = 0; } - spin_unlock(&mddev->lock); + +out_unlock: + mddev_unlock(mddev); return ret; } @@ -2639,7 +2637,7 @@ int r5c_try_caching_write(struct r5conf *conf, int i; struct r5dev *dev; int to_cache = 0; - void **pslot; + void __rcu **pslot; sector_t tree_index; int ret; uintptr_t refcount; @@ -2806,7 +2804,7 @@ void r5c_finish_stripe_write_out(struct r5conf *conf, int i; int do_wakeup = 0; sector_t tree_index; - void **pslot; + void __rcu **pslot; uintptr_t refcount; if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)) @@ -3145,7 +3143,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) spin_lock_init(&log->stripe_in_journal_lock); atomic_set(&log->stripe_in_journal_count, 0); - rcu_assign_pointer(conf->log, log); + conf->log = log; set_bit(MD_HAS_JOURNAL, &conf->mddev->flags); return 0; @@ -3167,13 +3165,13 @@ void r5l_exit_log(struct r5conf *conf) { struct r5l_log *log = conf->log; - conf->log = NULL; - synchronize_rcu(); - /* Ensure disable_writeback_work wakes up and exits */ wake_up(&conf->mddev->sb_wait); flush_work(&log->disable_writeback_work); md_unregister_thread(&log->reclaim_thread); + + conf->log = NULL; + mempool_exit(&log->meta_pool); bioset_exit(&log->bs); mempool_exit(&log->io_pool); diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h index 43c714a8798c..c8332502669e 100644 --- a/drivers/md/raid5-log.h +++ b/drivers/md/raid5-log.h @@ -2,49 +2,46 @@ #ifndef _RAID5_LOG_H #define _RAID5_LOG_H -extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev); -extern void r5l_exit_log(struct r5conf *conf); -extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh); -extern void r5l_write_stripe_run(struct r5l_log *log); -extern void r5l_flush_stripe_to_raid(struct r5l_log *log); -extern void r5l_stripe_write_finished(struct stripe_head *sh); -extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio); -extern void r5l_quiesce(struct r5l_log *log, int quiesce); -extern bool r5l_log_disk_error(struct r5conf *conf); -extern bool r5c_is_writeback(struct r5l_log *log); -extern int -r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh, - struct stripe_head_state *s, int disks); -extern void -r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh, - struct stripe_head_state *s); -extern void r5c_release_extra_page(struct stripe_head *sh); -extern void r5c_use_extra_page(struct stripe_head *sh); -extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space); -extern void r5c_handle_cached_data_endio(struct r5conf *conf, - struct stripe_head *sh, int disks); -extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh); -extern void r5c_make_stripe_write_out(struct stripe_head *sh); -extern void r5c_flush_cache(struct r5conf *conf, int num); -extern void r5c_check_stripe_cache_usage(struct r5conf *conf); -extern void r5c_check_cached_full_stripe(struct r5conf *conf); +int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev); +void r5l_exit_log(struct r5conf *conf); +int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh); +void r5l_write_stripe_run(struct r5l_log *log); +void r5l_flush_stripe_to_raid(struct r5l_log *log); +void r5l_stripe_write_finished(struct stripe_head *sh); +int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio); +void r5l_quiesce(struct r5l_log *log, int quiesce); +bool r5l_log_disk_error(struct r5conf *conf); +bool r5c_is_writeback(struct r5l_log *log); +int r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh, + struct stripe_head_state *s, int disks); +void r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh, + struct stripe_head_state *s); +void r5c_release_extra_page(struct stripe_head *sh); +void r5c_use_extra_page(struct stripe_head *sh); +void r5l_wake_reclaim(struct r5l_log *log, sector_t space); +void r5c_handle_cached_data_endio(struct r5conf *conf, + struct stripe_head *sh, int disks); +int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh); +void r5c_make_stripe_write_out(struct stripe_head *sh); +void r5c_flush_cache(struct r5conf *conf, int num); +void r5c_check_stripe_cache_usage(struct r5conf *conf); +void r5c_check_cached_full_stripe(struct r5conf *conf); extern struct md_sysfs_entry r5c_journal_mode; -extern void r5c_update_on_rdev_error(struct mddev *mddev, - struct md_rdev *rdev); -extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect); -extern int r5l_start(struct r5l_log *log); +void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev); +bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect); +int r5l_start(struct r5l_log *log); -extern struct dma_async_tx_descriptor * +struct dma_async_tx_descriptor * ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu, struct dma_async_tx_descriptor *tx); -extern int ppl_init_log(struct r5conf *conf); -extern void ppl_exit_log(struct r5conf *conf); -extern int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh); -extern void ppl_write_stripe_run(struct r5conf *conf); -extern void ppl_stripe_write_finished(struct stripe_head *sh); -extern int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add); -extern void ppl_quiesce(struct r5conf *conf, int quiesce); -extern int ppl_handle_flush_request(struct r5l_log *log, struct bio *bio); +int ppl_init_log(struct r5conf *conf); +void ppl_exit_log(struct r5conf *conf); +int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh); +void ppl_write_stripe_run(struct r5conf *conf); +void ppl_stripe_write_finished(struct stripe_head *sh); +int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add); +void ppl_quiesce(struct r5conf *conf, int quiesce); +int ppl_handle_flush_request(struct bio *bio); extern struct md_sysfs_entry ppl_write_hint; static inline bool raid5_has_log(struct r5conf *conf) @@ -111,7 +108,7 @@ static inline int log_handle_flush_request(struct r5conf *conf, struct bio *bio) if (conf->log) ret = r5l_handle_flush_request(conf->log, bio); else if (raid5_has_ppl(conf)) - ret = ppl_handle_flush_request(conf->log, bio); + ret = ppl_handle_flush_request(bio); return ret; } diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 0a2e4806b1ec..db49edec362a 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -679,7 +679,7 @@ void ppl_quiesce(struct r5conf *conf, int quiesce) } } -int ppl_handle_flush_request(struct r5l_log *log, struct bio *bio) +int ppl_handle_flush_request(struct bio *bio) { if (bio->bi_iter.bi_size == 0) { bio_endio(bio); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5d09256d7f81..184145b49b7c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -61,6 +61,8 @@ #define cpu_to_group(cpu) cpu_to_node(cpu) #define ANY_GROUP NUMA_NO_NODE +#define RAID5_MAX_REQ_STRIPES 256 + static bool devices_handle_discard_safely = false; module_param(devices_handle_discard_safely, bool, 0644); MODULE_PARM_DESC(devices_handle_discard_safely, @@ -624,6 +626,49 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, return NULL; } +static struct stripe_head *find_get_stripe(struct r5conf *conf, + sector_t sector, short generation, int hash) +{ + int inc_empty_inactive_list_flag; + struct stripe_head *sh; + + sh = __find_stripe(conf, sector, generation); + if (!sh) + return NULL; + + if (atomic_inc_not_zero(&sh->count)) + return sh; + + /* + * Slow path. The reference count is zero which means the stripe must + * be on a list (sh->lru). Must remove the stripe from the list that + * references it with the device_lock held. + */ + + spin_lock(&conf->device_lock); + if (!atomic_read(&sh->count)) { + if (!test_bit(STRIPE_HANDLE, &sh->state)) + atomic_inc(&conf->active_stripes); + BUG_ON(list_empty(&sh->lru) && + !test_bit(STRIPE_EXPANDING, &sh->state)); + inc_empty_inactive_list_flag = 0; + if (!list_empty(conf->inactive_list + hash)) + inc_empty_inactive_list_flag = 1; + list_del_init(&sh->lru); + if (list_empty(conf->inactive_list + hash) && + inc_empty_inactive_list_flag) + atomic_inc(&conf->empty_inactive_list_nr); + if (sh->group) { + sh->group->stripes_cnt--; + sh->group = NULL; + } + } + atomic_inc(&sh->count); + spin_unlock(&conf->device_lock); + + return sh; +} + /* * Need to check if array has failed when deciding whether to: * - start an array @@ -716,7 +761,6 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector, { struct stripe_head *sh; int hash = stripe_hash_locks_hash(conf, sector); - int inc_empty_inactive_list_flag; pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); @@ -726,57 +770,34 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector, wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0 || noquiesce, *(conf->hash_locks + hash)); - sh = __find_stripe(conf, sector, conf->generation - previous); - if (!sh) { - if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) { - sh = get_free_stripe(conf, hash); - if (!sh && !test_bit(R5_DID_ALLOC, - &conf->cache_state)) - set_bit(R5_ALLOC_MORE, - &conf->cache_state); - } - if (noblock && sh == NULL) - break; + sh = find_get_stripe(conf, sector, conf->generation - previous, + hash); + if (sh) + break; - r5c_check_stripe_cache_usage(conf); - if (!sh) { - set_bit(R5_INACTIVE_BLOCKED, - &conf->cache_state); - r5l_wake_reclaim(conf->log, 0); - wait_event_lock_irq( - conf->wait_for_stripe, + if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) { + sh = get_free_stripe(conf, hash); + if (!sh && !test_bit(R5_DID_ALLOC, &conf->cache_state)) + set_bit(R5_ALLOC_MORE, &conf->cache_state); + } + if (noblock && !sh) + break; + + r5c_check_stripe_cache_usage(conf); + if (!sh) { + set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); + r5l_wake_reclaim(conf->log, 0); + wait_event_lock_irq(conf->wait_for_stripe, !list_empty(conf->inactive_list + hash) && (atomic_read(&conf->active_stripes) < (conf->max_nr_stripes * 3 / 4) || !test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)), *(conf->hash_locks + hash)); - clear_bit(R5_INACTIVE_BLOCKED, - &conf->cache_state); - } else { - init_stripe(sh, sector, previous); - atomic_inc(&sh->count); - } - } else if (!atomic_inc_not_zero(&sh->count)) { - spin_lock(&conf->device_lock); - if (!atomic_read(&sh->count)) { - if (!test_bit(STRIPE_HANDLE, &sh->state)) - atomic_inc(&conf->active_stripes); - BUG_ON(list_empty(&sh->lru) && - !test_bit(STRIPE_EXPANDING, &sh->state)); - inc_empty_inactive_list_flag = 0; - if (!list_empty(conf->inactive_list + hash)) - inc_empty_inactive_list_flag = 1; - list_del_init(&sh->lru); - if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag) - atomic_inc(&conf->empty_inactive_list_nr); - if (sh->group) { - sh->group->stripes_cnt--; - sh->group = NULL; - } - } + clear_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); + } else { + init_stripe(sh, sector, previous); atomic_inc(&sh->count); - spin_unlock(&conf->device_lock); } } while (sh == NULL); @@ -824,13 +845,13 @@ static bool stripe_can_batch(struct stripe_head *sh) } /* we only do back search */ -static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh) +static void stripe_add_to_batch_list(struct r5conf *conf, + struct stripe_head *sh, struct stripe_head *last_sh) { struct stripe_head *head; sector_t head_sector, tmp_sec; int hash; int dd_idx; - int inc_empty_inactive_list_flag; /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */ tmp_sec = sh->sector; @@ -838,36 +859,20 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh return; head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf); - hash = stripe_hash_locks_hash(conf, head_sector); - spin_lock_irq(conf->hash_locks + hash); - head = __find_stripe(conf, head_sector, conf->generation); - if (head && !atomic_inc_not_zero(&head->count)) { - spin_lock(&conf->device_lock); - if (!atomic_read(&head->count)) { - if (!test_bit(STRIPE_HANDLE, &head->state)) - atomic_inc(&conf->active_stripes); - BUG_ON(list_empty(&head->lru) && - !test_bit(STRIPE_EXPANDING, &head->state)); - inc_empty_inactive_list_flag = 0; - if (!list_empty(conf->inactive_list + hash)) - inc_empty_inactive_list_flag = 1; - list_del_init(&head->lru); - if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag) - atomic_inc(&conf->empty_inactive_list_nr); - if (head->group) { - head->group->stripes_cnt--; - head->group = NULL; - } - } + if (last_sh && head_sector == last_sh->sector) { + head = last_sh; atomic_inc(&head->count); - spin_unlock(&conf->device_lock); + } else { + hash = stripe_hash_locks_hash(conf, head_sector); + spin_lock_irq(conf->hash_locks + hash); + head = find_get_stripe(conf, head_sector, conf->generation, + hash); + spin_unlock_irq(conf->hash_locks + hash); + if (!head) + return; + if (!stripe_can_batch(head)) + goto out; } - spin_unlock_irq(conf->hash_locks + hash); - - if (!head) - return; - if (!stripe_can_batch(head)) - goto out; lock_two_stripes(head, sh); /* clear_batch_ready clear the flag */ @@ -3412,39 +3417,32 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, s->locked, s->ops_request); } -/* - * Each stripe/dev can have one or more bion attached. - * toread/towrite point to the first in a chain. - * The bi_next chain must be in order. - */ -static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, - int forwrite, int previous) +static bool stripe_bio_overlaps(struct stripe_head *sh, struct bio *bi, + int dd_idx, int forwrite) { - struct bio **bip; struct r5conf *conf = sh->raid_conf; - int firstwrite=0; + struct bio **bip; - pr_debug("adding bi b#%llu to stripe s#%llu\n", - (unsigned long long)bi->bi_iter.bi_sector, - (unsigned long long)sh->sector); + pr_debug("checking bi b#%llu to stripe s#%llu\n", + bi->bi_iter.bi_sector, sh->sector); - spin_lock_irq(&sh->stripe_lock); /* Don't allow new IO added to stripes in batch list */ if (sh->batch_head) - goto overlap; - if (forwrite) { + return true; + + if (forwrite) bip = &sh->dev[dd_idx].towrite; - if (*bip == NULL) - firstwrite = 1; - } else + else bip = &sh->dev[dd_idx].toread; + while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) { if (bio_end_sector(*bip) > bi->bi_iter.bi_sector) - goto overlap; - bip = & (*bip)->bi_next; + return true; + bip = &(*bip)->bi_next; } + if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi)) - goto overlap; + return true; if (forwrite && raid5_has_ppl(conf)) { /* @@ -3473,9 +3471,30 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, } if (first + conf->chunk_sectors * (count - 1) != last) - goto overlap; + return true; } + return false; +} + +static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi, + int dd_idx, int forwrite, int previous) +{ + struct r5conf *conf = sh->raid_conf; + struct bio **bip; + int firstwrite = 0; + + if (forwrite) { + bip = &sh->dev[dd_idx].towrite; + if (!*bip) + firstwrite = 1; + } else { + bip = &sh->dev[dd_idx].toread; + } + + while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) + bip = &(*bip)->bi_next; + if (!forwrite || previous) clear_bit(STRIPE_BATCH_READY, &sh->state); @@ -3501,9 +3520,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, sh->overwrite_disks++; } - pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", - (unsigned long long)(*bip)->bi_iter.bi_sector, - (unsigned long long)sh->sector, dd_idx); + pr_debug("added bi b#%llu to stripe s#%llu, disk %d, logical %llu\n", + (*bip)->bi_iter.bi_sector, sh->sector, dd_idx, + sh->dev[dd_idx].sector); if (conf->mddev->bitmap && firstwrite) { /* Cannot hold spinlock over bitmap_startwrite, @@ -3511,7 +3530,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, * we have added to the bitmap and set bm_seq. * So set STRIPE_BITMAP_PENDING to prevent * batching. - * If multiple add_stripe_bio() calls race here they + * If multiple __add_stripe_bio() calls race here they * much all set STRIPE_BITMAP_PENDING. So only the first one * to complete "bitmap_startwrite" gets to set * STRIPE_BIT_DELAY. This is important as once a stripe @@ -3529,16 +3548,27 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, set_bit(STRIPE_BIT_DELAY, &sh->state); } } - spin_unlock_irq(&sh->stripe_lock); +} - if (stripe_can_batch(sh)) - stripe_add_to_batch_list(conf, sh); - return 1; +/* + * Each stripe/dev can have one or more bios attached. + * toread/towrite point to the first in a chain. + * The bi_next chain must be in order. + */ +static bool add_stripe_bio(struct stripe_head *sh, struct bio *bi, + int dd_idx, int forwrite, int previous) +{ + spin_lock_irq(&sh->stripe_lock); + + if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) { + set_bit(R5_Overlap, &sh->dev[dd_idx].flags); + spin_unlock_irq(&sh->stripe_lock); + return false; + } - overlap: - set_bit(R5_Overlap, &sh->dev[dd_idx].flags); + __add_stripe_bio(sh, bi, dd_idx, forwrite, previous); spin_unlock_irq(&sh->stripe_lock); - return 0; + return true; } static void end_reshape(struct r5conf *conf); @@ -5784,17 +5814,239 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) bio_endio(bi); } -static bool raid5_make_request(struct mddev *mddev, struct bio * bi) +static bool ahead_of_reshape(struct mddev *mddev, sector_t sector, + sector_t reshape_sector) { - struct r5conf *conf = mddev->private; + return mddev->reshape_backwards ? sector < reshape_sector : + sector >= reshape_sector; +} + +static bool range_ahead_of_reshape(struct mddev *mddev, sector_t min, + sector_t max, sector_t reshape_sector) +{ + return mddev->reshape_backwards ? max < reshape_sector : + min >= reshape_sector; +} + +static bool stripe_ahead_of_reshape(struct mddev *mddev, struct r5conf *conf, + struct stripe_head *sh) +{ + sector_t max_sector = 0, min_sector = MaxSector; + bool ret = false; int dd_idx; - sector_t new_sector; - sector_t logical_sector, last_sector; + + for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) { + if (dd_idx == sh->pd_idx) + continue; + + min_sector = min(min_sector, sh->dev[dd_idx].sector); + max_sector = min(max_sector, sh->dev[dd_idx].sector); + } + + spin_lock_irq(&conf->device_lock); + + if (!range_ahead_of_reshape(mddev, min_sector, max_sector, + conf->reshape_progress)) + /* mismatch, need to try again */ + ret = true; + + spin_unlock_irq(&conf->device_lock); + + return ret; +} + +enum stripe_result { + STRIPE_SUCCESS = 0, + STRIPE_RETRY, + STRIPE_SCHEDULE_AND_RETRY, + STRIPE_FAIL, +}; + +struct stripe_request_ctx { + /* a reference to the last stripe_head for batching */ + struct stripe_head *batch_last; + + /* first sector in the request */ + sector_t first_sector; + + /* last sector in the request */ + sector_t last_sector; + + /* bitmap to track stripe sectors that have been added to stripes */ + DECLARE_BITMAP(sectors_to_do, RAID5_MAX_REQ_STRIPES); + + /* the request had REQ_PREFLUSH, cleared after the first stripe_head */ + bool do_flush; +}; + +static int add_all_stripe_bios(struct r5conf *conf, + struct stripe_request_ctx *ctx, struct stripe_head *sh, + struct bio *bi, int forwrite, int previous) +{ + int dd_idx; + int ret = 1; + + spin_lock_irq(&sh->stripe_lock); + + for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) { + struct r5dev *dev = &sh->dev[dd_idx]; + + if (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) + continue; + + if (dev->sector < ctx->first_sector || + dev->sector >= ctx->last_sector) + continue; + + if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) { + set_bit(R5_Overlap, &dev->flags); + ret = 0; + continue; + } + } + + if (!ret) + goto out; + + for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) { + struct r5dev *dev = &sh->dev[dd_idx]; + + if (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) + continue; + + if (dev->sector < ctx->first_sector || + dev->sector >= ctx->last_sector) + continue; + + __add_stripe_bio(sh, bi, dd_idx, forwrite, previous); + clear_bit((dev->sector - ctx->first_sector) >> + RAID5_STRIPE_SHIFT(conf), ctx->sectors_to_do); + } + +out: + spin_unlock_irq(&sh->stripe_lock); + return ret; +} + +static enum stripe_result make_stripe_request(struct mddev *mddev, + struct r5conf *conf, struct stripe_request_ctx *ctx, + sector_t logical_sector, struct bio *bi) +{ + const int rw = bio_data_dir(bi); + enum stripe_result ret; struct stripe_head *sh; + sector_t new_sector; + int previous = 0; + int seq, dd_idx; + + seq = read_seqcount_begin(&conf->gen_lock); + + if (unlikely(conf->reshape_progress != MaxSector)) { + /* + * Spinlock is needed as reshape_progress may be + * 64bit on a 32bit platform, and so it might be + * possible to see a half-updated value + * Of course reshape_progress could change after + * the lock is dropped, so once we get a reference + * to the stripe that we think it is, we will have + * to check again. + */ + spin_lock_irq(&conf->device_lock); + if (ahead_of_reshape(mddev, logical_sector, + conf->reshape_progress)) { + previous = 1; + } else { + if (ahead_of_reshape(mddev, logical_sector, + conf->reshape_safe)) { + spin_unlock_irq(&conf->device_lock); + return STRIPE_SCHEDULE_AND_RETRY; + } + } + spin_unlock_irq(&conf->device_lock); + } + + new_sector = raid5_compute_sector(conf, logical_sector, previous, + &dd_idx, NULL); + pr_debug("raid456: %s, sector %llu logical %llu\n", __func__, + new_sector, logical_sector); + + sh = raid5_get_active_stripe(conf, new_sector, previous, + (bi->bi_opf & REQ_RAHEAD), 0); + if (unlikely(!sh)) { + /* cannot get stripe, just give-up */ + bi->bi_status = BLK_STS_IOERR; + return STRIPE_FAIL; + } + + if (unlikely(previous) && + stripe_ahead_of_reshape(mddev, conf, sh)) { + /* + * Expansion moved on while waiting for a stripe. + * Expansion could still move past after this + * test, but as we are holding a reference to + * 'sh', we know that if that happens, + * STRIPE_EXPANDING will get set and the expansion + * won't proceed until we finish with the stripe. + */ + ret = STRIPE_SCHEDULE_AND_RETRY; + goto out_release; + } + + if (read_seqcount_retry(&conf->gen_lock, seq)) { + /* Might have got the wrong stripe_head by accident */ + ret = STRIPE_RETRY; + goto out_release; + } + + if (test_bit(STRIPE_EXPANDING, &sh->state) || + !add_all_stripe_bios(conf, ctx, sh, bi, rw, previous)) { + /* + * Stripe is busy expanding or add failed due to + * overlap. Flush everything and wait a while. + */ + md_wakeup_thread(mddev->thread); + ret = STRIPE_SCHEDULE_AND_RETRY; + goto out_release; + } + + if (stripe_can_batch(sh)) { + stripe_add_to_batch_list(conf, sh, ctx->batch_last); + if (ctx->batch_last) + raid5_release_stripe(ctx->batch_last); + atomic_inc(&sh->count); + ctx->batch_last = sh; + } + + if (ctx->do_flush) { + set_bit(STRIPE_R5C_PREFLUSH, &sh->state); + /* we only need flush for one stripe */ + ctx->do_flush = false; + } + + set_bit(STRIPE_HANDLE, &sh->state); + clear_bit(STRIPE_DELAYED, &sh->state); + if ((!sh->batch_head || sh == sh->batch_head) && + (bi->bi_opf & REQ_SYNC) && + !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + atomic_inc(&conf->preread_active_stripes); + + release_stripe_plug(mddev, sh); + return STRIPE_SUCCESS; + +out_release: + raid5_release_stripe(sh); + return ret; +} + +static bool raid5_make_request(struct mddev *mddev, struct bio * bi) +{ + struct r5conf *conf = mddev->private; + sector_t logical_sector; + struct stripe_request_ctx ctx = {}; const int rw = bio_data_dir(bi); + enum stripe_result res; DEFINE_WAIT(w); - bool do_prepare; - bool do_flush = false; + int s; if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { int ret = log_handle_flush_request(conf, bi); @@ -5810,7 +6062,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH, * we need to flush journal device */ - do_flush = bi->bi_opf & REQ_PREFLUSH; + ctx.do_flush = bi->bi_opf & REQ_PREFLUSH; } if (!md_write_start(mddev, bi)) @@ -5834,15 +6086,22 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) } logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1); - last_sector = bio_end_sector(bi); + ctx.first_sector = logical_sector; + ctx.last_sector = bio_end_sector(bi); bi->bi_next = NULL; + bitmap_set(ctx.sectors_to_do, 0, + DIV_ROUND_UP_SECTOR_T(ctx.last_sector - logical_sector, + RAID5_STRIPE_SECTORS(conf))); + + pr_debug("raid456: %s, logical %llu to %llu\n", __func__, + bi->bi_iter.bi_sector, ctx.last_sector); + /* Bail out if conflicts with reshape and REQ_NOWAIT is set */ if ((bi->bi_opf & REQ_NOWAIT) && (conf->reshape_progress != MaxSector) && - (mddev->reshape_backwards - ? (logical_sector > conf->reshape_progress && logical_sector <= conf->reshape_safe) - : (logical_sector >= conf->reshape_safe && logical_sector < conf->reshape_progress))) { + !ahead_of_reshape(mddev, logical_sector, conf->reshape_progress) && + ahead_of_reshape(mddev, logical_sector, conf->reshape_safe)) { bio_wouldblock_error(bi); if (rw == WRITE) md_write_end(mddev); @@ -5850,119 +6109,47 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) } md_account_bio(mddev, &bi); prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); - for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) { - int previous; - int seq; - - do_prepare = false; - retry: - seq = read_seqcount_begin(&conf->gen_lock); - previous = 0; - if (do_prepare) - prepare_to_wait(&conf->wait_for_overlap, &w, - TASK_UNINTERRUPTIBLE); - if (unlikely(conf->reshape_progress != MaxSector)) { - /* spinlock is needed as reshape_progress may be - * 64bit on a 32bit platform, and so it might be - * possible to see a half-updated value - * Of course reshape_progress could change after - * the lock is dropped, so once we get a reference - * to the stripe that we think it is, we will have - * to check again. - */ - spin_lock_irq(&conf->device_lock); - if (mddev->reshape_backwards - ? logical_sector < conf->reshape_progress - : logical_sector >= conf->reshape_progress) { - previous = 1; - } else { - if (mddev->reshape_backwards - ? logical_sector < conf->reshape_safe - : logical_sector >= conf->reshape_safe) { - spin_unlock_irq(&conf->device_lock); - schedule(); - do_prepare = true; - goto retry; - } - } - spin_unlock_irq(&conf->device_lock); - } + while (1) { + res = make_stripe_request(mddev, conf, &ctx, logical_sector, + bi); + if (res == STRIPE_FAIL) + break; - new_sector = raid5_compute_sector(conf, logical_sector, - previous, - &dd_idx, NULL); - pr_debug("raid456: raid5_make_request, sector %llu logical %llu\n", - (unsigned long long)new_sector, - (unsigned long long)logical_sector); + if (res == STRIPE_RETRY) + continue; - sh = raid5_get_active_stripe(conf, new_sector, previous, - (bi->bi_opf & REQ_RAHEAD), 0); - if (sh) { - if (unlikely(previous)) { - /* expansion might have moved on while waiting for a - * stripe, so we must do the range check again. - * Expansion could still move past after this - * test, but as we are holding a reference to - * 'sh', we know that if that happens, - * STRIPE_EXPANDING will get set and the expansion - * won't proceed until we finish with the stripe. - */ - int must_retry = 0; - spin_lock_irq(&conf->device_lock); - if (mddev->reshape_backwards - ? logical_sector >= conf->reshape_progress - : logical_sector < conf->reshape_progress) - /* mismatch, need to try again */ - must_retry = 1; - spin_unlock_irq(&conf->device_lock); - if (must_retry) { - raid5_release_stripe(sh); - schedule(); - do_prepare = true; - goto retry; - } - } - if (read_seqcount_retry(&conf->gen_lock, seq)) { - /* Might have got the wrong stripe_head - * by accident - */ - raid5_release_stripe(sh); - goto retry; + if (res == STRIPE_SCHEDULE_AND_RETRY) { + /* + * Must release the reference to batch_last before + * scheduling and waiting for work to be done, + * otherwise the batch_last stripe head could prevent + * raid5_activate_delayed() from making progress + * and thus deadlocking. + */ + if (ctx.batch_last) { + raid5_release_stripe(ctx.batch_last); + ctx.batch_last = NULL; } - if (test_bit(STRIPE_EXPANDING, &sh->state) || - !add_stripe_bio(sh, bi, dd_idx, rw, previous)) { - /* Stripe is busy expanding or - * add failed due to overlap. Flush everything - * and wait a while - */ - md_wakeup_thread(mddev->thread); - raid5_release_stripe(sh); - schedule(); - do_prepare = true; - goto retry; - } - if (do_flush) { - set_bit(STRIPE_R5C_PREFLUSH, &sh->state); - /* we only need flush for one stripe */ - do_flush = false; - } + schedule(); + prepare_to_wait(&conf->wait_for_overlap, &w, + TASK_UNINTERRUPTIBLE); + continue; + } - set_bit(STRIPE_HANDLE, &sh->state); - clear_bit(STRIPE_DELAYED, &sh->state); - if ((!sh->batch_head || sh == sh->batch_head) && - (bi->bi_opf & REQ_SYNC) && - !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) - atomic_inc(&conf->preread_active_stripes); - release_stripe_plug(mddev, sh); - } else { - /* cannot get stripe for read-ahead, just give-up */ - bi->bi_status = BLK_STS_IOERR; + s = find_first_bit(ctx.sectors_to_do, RAID5_MAX_REQ_STRIPES); + if (s == RAID5_MAX_REQ_STRIPES) break; - } + + logical_sector = ctx.first_sector + + (s << RAID5_STRIPE_SHIFT(conf)); } + finish_wait(&conf->wait_for_overlap, &w); + if (ctx.batch_last) + raid5_release_stripe(ctx.batch_last); + if (rw == WRITE) md_write_end(mddev); bio_endio(bi); @@ -7812,7 +7999,15 @@ static int raid5_run(struct mddev *mddev) mddev->queue->limits.discard_granularity < stripe) blk_queue_max_discard_sectors(mddev->queue, 0); - blk_queue_max_hw_sectors(mddev->queue, UINT_MAX); + /* + * Requests require having a bitmap for each stripe. + * Limit the max sectors based on this. + */ + blk_queue_max_hw_sectors(mddev->queue, + RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf)); + + /* No restrictions on the number of segments in the request */ + blk_queue_max_segments(mddev->queue, USHRT_MAX); } if (log_init(conf, journal_dev, raid5_has_ppl(conf))) @@ -8697,8 +8892,11 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf) err = log_init(conf, NULL, true); if (!err) { err = resize_stripes(conf, conf->pool_size); - if (err) + if (err) { + mddev_suspend(mddev); log_exit(conf); + mddev_resume(mddev); + } } } else err = -EINVAL; |