summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJens Axboe <axboe@kernel.dk>2022-07-03 10:18:10 -0600
committerJens Axboe <axboe@kernel.dk>2022-07-03 10:18:10 -0600
commitd01051ed365d95dfd5dec100a2b22e0c0cb8912d (patch)
treeee929cefcde61de4ad2a0def8b54bee3dd8e9651
parent197f80d97e9ccc8a496f4935ba939f3ed7432d53 (diff)
parentff4ec5f79108cf82fe7168547c76fe754c4ade0a (diff)
Merge branch 'md-next' of https://git.kernel.org/pub/scm/linux/kernel/git/song/md into for-5.20/drivers
Pull MD updates from Song: "1. Improve raid5 lock contention, by Logan Gunthorpe. 2. Misc fixes to raid5, by Logan Gunthorpe. 3. Fix race condition with md_reap_sync_thread(), by Guoqing Jiang." * 'md-next' of https://git.kernel.org/pub/scm/linux/kernel/git/song/md: (29 commits) md: Fix spelling mistake in comments md/raid5: Increase restriction on max segments per request md/raid5: Improve debug prints md/raid5: Pivot raid5_make_request() md/raid5: Check all disks in a stripe_head for reshape progress md/raid5: Refactor add_stripe_bio() md/raid5: Keep a reference to last stripe_head for batch md/raid5: Refactor for loop in raid5_make_request() into while loop md/raid5: Move read_seqcount_begin() into make_stripe_request() md/raid5: Drop the do_prepare flag in raid5_make_request() md/raid5: Factor out helper from raid5_make_request() loop md/raid5: Move common stripe get code into new find_get_stripe() helper md/raid5: Move stripe_add_to_batch_list() call out of add_stripe_bio() md/raid5: Refactor raid5_make_request loop md/raid5: Factor out ahead_of_reshape() function md/raid5: Make logic blocking check consistent with logic that blocks md: unlock mddev before reap sync_thread in action_store md: Explicitly create command-line configured devices md: Notify sysfs sync_completed in md_reap_sync_thread() md: Ensure resync is reported after it starts ...
-rw-r--r--MAINTAINERS1
-rw-r--r--drivers/md/dm-raid.c1
-rw-r--r--drivers/md/md-autodetect.c1
-rw-r--r--drivers/md/md-cluster.c4
-rw-r--r--drivers/md/md.c76
-rw-r--r--drivers/md/md.h16
-rw-r--r--drivers/md/raid5-cache.c40
-rw-r--r--drivers/md/raid5-log.h77
-rw-r--r--drivers/md/raid5-ppl.c2
-rw-r--r--drivers/md/raid5.c646
10 files changed, 549 insertions, 315 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index fe5daf141501..68147e525440 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -18540,6 +18540,7 @@ SOFTWARE RAID (Multiple Disks) SUPPORT
M: Song Liu <song@kernel.org>
L: linux-raid@vger.kernel.org
S: Supported
+Q: https://patchwork.kernel.org/project/linux-raid/list/
T: git git://git.kernel.org/pub/scm/linux/kernel/git/song/md.git
F: drivers/md/Kconfig
F: drivers/md/Makefile
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 9526ccbedafb..d43b8075c055 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -3725,6 +3725,7 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv,
if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) {
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ md_unregister_thread(&mddev->sync_thread);
md_reap_sync_thread(mddev);
}
} else if (decipher_sync_action(mddev, mddev->recovery) != st_idle)
diff --git a/drivers/md/md-autodetect.c b/drivers/md/md-autodetect.c
index 2cf973722f59..344910ba435c 100644
--- a/drivers/md/md-autodetect.c
+++ b/drivers/md/md-autodetect.c
@@ -169,6 +169,7 @@ static void __init md_setup_drive(struct md_setup_args *args)
pr_info("md: Loading %s: %s\n", name, args->device_names);
+ md_alloc(mdev, name);
bdev = blkdev_get_by_dev(mdev, FMODE_READ, NULL);
if (IS_ERR(bdev)) {
pr_err("md: open failed - cannot start array %s\n", name);
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 37cbcce3cc66..742b2349fea3 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -40,7 +40,7 @@ struct resync_info {
/* Lock the send communication. This is done through
* bit manipulation as opposed to a mutex in order to
- * accomodate lock and hold. See next comment.
+ * accommodate lock and hold. See next comment.
*/
#define MD_CLUSTER_SEND_LOCK 4
/* If cluster operations (such as adding a disk) must lock the
@@ -689,7 +689,7 @@ static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked)
/*
* If resync thread run after raid1d thread, then process_metadata_update
* could not continue if raid1d held reconfig_mutex (and raid1d is blocked
- * since another node already got EX on Token and waitting the EX of Ack),
+ * since another node already got EX on Token and waiting the EX of Ack),
* so let resync wake up thread in case flag is set.
*/
if (mddev_locked && !test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
diff --git a/drivers/md/md.c b/drivers/md/md.c
index c7ecb0bffda0..b64de313838f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4830,6 +4830,19 @@ action_store(struct mddev *mddev, const char *page, size_t len)
if (work_pending(&mddev->del_work))
flush_workqueue(md_misc_wq);
if (mddev->sync_thread) {
+ sector_t save_rp = mddev->reshape_position;
+
+ mddev_unlock(mddev);
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ md_unregister_thread(&mddev->sync_thread);
+ mddev_lock_nointr(mddev);
+ /*
+ * set RECOVERY_INTR again and restore reshape
+ * position in case others changed them after
+ * got lock, eg, reshape_position_store and
+ * md_check_recovery.
+ */
+ mddev->reshape_position = save_rp;
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_reap_sync_thread(mddev);
}
@@ -5001,7 +5014,7 @@ static ssize_t
sync_speed_show(struct mddev *mddev, char *page)
{
unsigned long resync, dt, db;
- if (mddev->curr_resync == 0)
+ if (mddev->curr_resync == MD_RESYNC_NONE)
return sprintf(page, "none\n");
resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
dt = (jiffies - mddev->resync_mark) / HZ;
@@ -5020,8 +5033,8 @@ sync_completed_show(struct mddev *mddev, char *page)
if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return sprintf(page, "none\n");
- if (mddev->curr_resync == 1 ||
- mddev->curr_resync == 2)
+ if (mddev->curr_resync == MD_RESYNC_YIELDED ||
+ mddev->curr_resync == MD_RESYNC_DELAYED)
return sprintf(page, "delayed\n");
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
@@ -5623,7 +5636,7 @@ int mddev_init_writes_pending(struct mddev *mddev)
}
EXPORT_SYMBOL_GPL(mddev_init_writes_pending);
-static int md_alloc(dev_t dev, char *name)
+int md_alloc(dev_t dev, char *name)
{
/*
* If dev is zero, name is the name of a device to allocate with
@@ -6197,6 +6210,7 @@ static void __md_stop_writes(struct mddev *mddev)
flush_workqueue(md_misc_wq);
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ md_unregister_thread(&mddev->sync_thread);
md_reap_sync_thread(mddev);
}
@@ -8018,16 +8032,26 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev)
max_sectors = mddev->dev_sectors;
resync = mddev->curr_resync;
- if (resync <= 3) {
+ if (resync < MD_RESYNC_ACTIVE) {
if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
/* Still cleaning up */
resync = max_sectors;
- } else if (resync > max_sectors)
+ } else if (resync > max_sectors) {
resync = max_sectors;
- else
+ } else {
resync -= atomic_read(&mddev->recovery_active);
+ if (resync < MD_RESYNC_ACTIVE) {
+ /*
+ * Resync has started, but the subtraction has
+ * yielded one of the special values. Force it
+ * to active to ensure the status reports an
+ * active resync.
+ */
+ resync = MD_RESYNC_ACTIVE;
+ }
+ }
- if (resync == 0) {
+ if (resync == MD_RESYNC_NONE) {
if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) {
struct md_rdev *rdev;
@@ -8051,7 +8075,7 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev)
}
return 0;
}
- if (resync < 3) {
+ if (resync < MD_RESYNC_ACTIVE) {
seq_printf(seq, "\tresync=DELAYED");
return 1;
}
@@ -8729,13 +8753,7 @@ void md_do_sync(struct md_thread *thread)
mddev->last_sync_action = action ?: desc;
- /* we overload curr_resync somewhat here.
- * 0 == not engaged in resync at all
- * 2 == checking that there is no conflict with another sync
- * 1 == like 2, but have yielded to allow conflicting resync to
- * commence
- * other == active in resync - this many blocks
- *
+ /*
* Before starting a resync we must have set curr_resync to
* 2, and then checked that every "conflicting" array has curr_resync
* less than ours. When we find one that is the same or higher
@@ -8747,7 +8765,7 @@ void md_do_sync(struct md_thread *thread)
do {
int mddev2_minor = -1;
- mddev->curr_resync = 2;
+ mddev->curr_resync = MD_RESYNC_DELAYED;
try_again:
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
@@ -8759,12 +8777,14 @@ void md_do_sync(struct md_thread *thread)
&& mddev2->curr_resync
&& match_mddev_units(mddev, mddev2)) {
DEFINE_WAIT(wq);
- if (mddev < mddev2 && mddev->curr_resync == 2) {
+ if (mddev < mddev2 &&
+ mddev->curr_resync == MD_RESYNC_DELAYED) {
/* arbitrarily yield */
- mddev->curr_resync = 1;
+ mddev->curr_resync = MD_RESYNC_YIELDED;
wake_up(&resync_wait);
}
- if (mddev > mddev2 && mddev->curr_resync == 1)
+ if (mddev > mddev2 &&
+ mddev->curr_resync == MD_RESYNC_YIELDED)
/* no need to wait here, we can wait the next
* time 'round when curr_resync == 2
*/
@@ -8792,7 +8812,7 @@ void md_do_sync(struct md_thread *thread)
finish_wait(&resync_wait, &wq);
}
}
- } while (mddev->curr_resync < 2);
+ } while (mddev->curr_resync < MD_RESYNC_DELAYED);
j = 0;
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
@@ -8876,7 +8896,7 @@ void md_do_sync(struct md_thread *thread)
desc, mdname(mddev));
mddev->curr_resync = j;
} else
- mddev->curr_resync = 3; /* no longer delayed */
+ mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */
mddev->curr_resync_completed = j;
sysfs_notify_dirent_safe(mddev->sysfs_completed);
md_new_event();
@@ -9011,14 +9031,14 @@ void md_do_sync(struct md_thread *thread)
if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
- mddev->curr_resync > 3) {
+ mddev->curr_resync >= MD_RESYNC_ACTIVE) {
mddev->curr_resync_completed = mddev->curr_resync;
sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
mddev->pers->sync_request(mddev, max_sectors, &skipped);
if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
- mddev->curr_resync > 3) {
+ mddev->curr_resync >= MD_RESYNC_ACTIVE) {
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
if (mddev->curr_resync >= mddev->recovery_cp) {
@@ -9082,7 +9102,7 @@ void md_do_sync(struct md_thread *thread)
} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
mddev->resync_min = mddev->curr_resync_completed;
set_bit(MD_RECOVERY_DONE, &mddev->recovery);
- mddev->curr_resync = 0;
+ mddev->curr_resync = MD_RESYNC_NONE;
spin_unlock(&mddev->lock);
wake_up(&resync_wait);
@@ -9303,6 +9323,7 @@ void md_check_recovery(struct mddev *mddev)
* ->spare_active and clear saved_raid_disk
*/
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ md_unregister_thread(&mddev->sync_thread);
md_reap_sync_thread(mddev);
clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -9338,6 +9359,7 @@ void md_check_recovery(struct mddev *mddev)
goto unlock;
}
if (mddev->sync_thread) {
+ md_unregister_thread(&mddev->sync_thread);
md_reap_sync_thread(mddev);
goto unlock;
}
@@ -9417,8 +9439,7 @@ void md_reap_sync_thread(struct mddev *mddev)
sector_t old_dev_sectors = mddev->dev_sectors;
bool is_reshaped = false;
- /* resync has finished, collect result */
- md_unregister_thread(&mddev->sync_thread);
+ /* sync_thread should be unregistered, collect result */
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
mddev->degraded != mddev->raid_disks) {
@@ -9466,6 +9487,7 @@ void md_reap_sync_thread(struct mddev *mddev)
wake_up(&resync_wait);
/* flag recovery needed just to double check */
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
sysfs_notify_dirent_safe(mddev->sysfs_action);
md_new_event();
if (mddev->event_work.func)
diff --git a/drivers/md/md.h b/drivers/md/md.h
index cf2cbb17acbd..1a85dbe78a71 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -288,6 +288,21 @@ struct serial_info {
sector_t _subtree_last; /* highest sector in subtree of rb node */
};
+/*
+ * mddev->curr_resync stores the current sector of the resync but
+ * also has some overloaded values.
+ */
+enum {
+ /* No resync in progress */
+ MD_RESYNC_NONE = 0,
+ /* Yielded to allow another conflicting resync to commence */
+ MD_RESYNC_YIELDED = 1,
+ /* Delayed to check that there is no conflict with another sync */
+ MD_RESYNC_DELAYED = 2,
+ /* Any value greater than or equal to this is in an active resync */
+ MD_RESYNC_ACTIVE = 3,
+};
+
struct mddev {
void *private;
struct md_personality *pers;
@@ -751,6 +766,7 @@ extern int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev);
extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
extern void mddev_init(struct mddev *mddev);
+int md_alloc(dev_t dev, char *name);
extern int md_run(struct mddev *mddev);
extern int md_start(struct mddev *mddev);
extern void md_stop(struct mddev *mddev);
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 83c184eddbda..b2e6016934ee 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -1590,18 +1590,13 @@ void r5l_quiesce(struct r5l_log *log, int quiesce)
bool r5l_log_disk_error(struct r5conf *conf)
{
- struct r5l_log *log;
- bool ret;
- /* don't allow write if journal disk is missing */
- rcu_read_lock();
- log = rcu_dereference(conf->log);
+ struct r5l_log *log = conf->log;
+ /* don't allow write if journal disk is missing */
if (!log)
- ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
+ return test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
else
- ret = test_bit(Faulty, &log->rdev->flags);
- rcu_read_unlock();
- return ret;
+ return test_bit(Faulty, &log->rdev->flags);
}
#define R5L_RECOVERY_PAGE_POOL_SIZE 256
@@ -2534,12 +2529,13 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
struct r5conf *conf;
int ret;
- spin_lock(&mddev->lock);
+ ret = mddev_lock(mddev);
+ if (ret)
+ return ret;
+
conf = mddev->private;
- if (!conf || !conf->log) {
- spin_unlock(&mddev->lock);
- return 0;
- }
+ if (!conf || !conf->log)
+ goto out_unlock;
switch (conf->log->r5c_journal_mode) {
case R5C_JOURNAL_MODE_WRITE_THROUGH:
@@ -2557,7 +2553,9 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
default:
ret = 0;
}
- spin_unlock(&mddev->lock);
+
+out_unlock:
+ mddev_unlock(mddev);
return ret;
}
@@ -2639,7 +2637,7 @@ int r5c_try_caching_write(struct r5conf *conf,
int i;
struct r5dev *dev;
int to_cache = 0;
- void **pslot;
+ void __rcu **pslot;
sector_t tree_index;
int ret;
uintptr_t refcount;
@@ -2806,7 +2804,7 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
int i;
int do_wakeup = 0;
sector_t tree_index;
- void **pslot;
+ void __rcu **pslot;
uintptr_t refcount;
if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
@@ -3145,7 +3143,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
spin_lock_init(&log->stripe_in_journal_lock);
atomic_set(&log->stripe_in_journal_count, 0);
- rcu_assign_pointer(conf->log, log);
+ conf->log = log;
set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
return 0;
@@ -3167,13 +3165,13 @@ void r5l_exit_log(struct r5conf *conf)
{
struct r5l_log *log = conf->log;
- conf->log = NULL;
- synchronize_rcu();
-
/* Ensure disable_writeback_work wakes up and exits */
wake_up(&conf->mddev->sb_wait);
flush_work(&log->disable_writeback_work);
md_unregister_thread(&log->reclaim_thread);
+
+ conf->log = NULL;
+
mempool_exit(&log->meta_pool);
bioset_exit(&log->bs);
mempool_exit(&log->io_pool);
diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h
index 43c714a8798c..c8332502669e 100644
--- a/drivers/md/raid5-log.h
+++ b/drivers/md/raid5-log.h
@@ -2,49 +2,46 @@
#ifndef _RAID5_LOG_H
#define _RAID5_LOG_H
-extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
-extern void r5l_exit_log(struct r5conf *conf);
-extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
-extern void r5l_write_stripe_run(struct r5l_log *log);
-extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
-extern void r5l_stripe_write_finished(struct stripe_head *sh);
-extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
-extern void r5l_quiesce(struct r5l_log *log, int quiesce);
-extern bool r5l_log_disk_error(struct r5conf *conf);
-extern bool r5c_is_writeback(struct r5l_log *log);
-extern int
-r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh,
- struct stripe_head_state *s, int disks);
-extern void
-r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh,
- struct stripe_head_state *s);
-extern void r5c_release_extra_page(struct stripe_head *sh);
-extern void r5c_use_extra_page(struct stripe_head *sh);
-extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
-extern void r5c_handle_cached_data_endio(struct r5conf *conf,
- struct stripe_head *sh, int disks);
-extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh);
-extern void r5c_make_stripe_write_out(struct stripe_head *sh);
-extern void r5c_flush_cache(struct r5conf *conf, int num);
-extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
-extern void r5c_check_cached_full_stripe(struct r5conf *conf);
+int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
+void r5l_exit_log(struct r5conf *conf);
+int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
+void r5l_write_stripe_run(struct r5l_log *log);
+void r5l_flush_stripe_to_raid(struct r5l_log *log);
+void r5l_stripe_write_finished(struct stripe_head *sh);
+int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
+void r5l_quiesce(struct r5l_log *log, int quiesce);
+bool r5l_log_disk_error(struct r5conf *conf);
+bool r5c_is_writeback(struct r5l_log *log);
+int r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh,
+ struct stripe_head_state *s, int disks);
+void r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh,
+ struct stripe_head_state *s);
+void r5c_release_extra_page(struct stripe_head *sh);
+void r5c_use_extra_page(struct stripe_head *sh);
+void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
+void r5c_handle_cached_data_endio(struct r5conf *conf,
+ struct stripe_head *sh, int disks);
+int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh);
+void r5c_make_stripe_write_out(struct stripe_head *sh);
+void r5c_flush_cache(struct r5conf *conf, int num);
+void r5c_check_stripe_cache_usage(struct r5conf *conf);
+void r5c_check_cached_full_stripe(struct r5conf *conf);
extern struct md_sysfs_entry r5c_journal_mode;
-extern void r5c_update_on_rdev_error(struct mddev *mddev,
- struct md_rdev *rdev);
-extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
-extern int r5l_start(struct r5l_log *log);
+void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev);
+bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
+int r5l_start(struct r5l_log *log);
-extern struct dma_async_tx_descriptor *
+struct dma_async_tx_descriptor *
ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
struct dma_async_tx_descriptor *tx);
-extern int ppl_init_log(struct r5conf *conf);
-extern void ppl_exit_log(struct r5conf *conf);
-extern int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh);
-extern void ppl_write_stripe_run(struct r5conf *conf);
-extern void ppl_stripe_write_finished(struct stripe_head *sh);
-extern int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add);
-extern void ppl_quiesce(struct r5conf *conf, int quiesce);
-extern int ppl_handle_flush_request(struct r5l_log *log, struct bio *bio);
+int ppl_init_log(struct r5conf *conf);
+void ppl_exit_log(struct r5conf *conf);
+int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh);
+void ppl_write_stripe_run(struct r5conf *conf);
+void ppl_stripe_write_finished(struct stripe_head *sh);
+int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add);
+void ppl_quiesce(struct r5conf *conf, int quiesce);
+int ppl_handle_flush_request(struct bio *bio);
extern struct md_sysfs_entry ppl_write_hint;
static inline bool raid5_has_log(struct r5conf *conf)
@@ -111,7 +108,7 @@ static inline int log_handle_flush_request(struct r5conf *conf, struct bio *bio)
if (conf->log)
ret = r5l_handle_flush_request(conf->log, bio);
else if (raid5_has_ppl(conf))
- ret = ppl_handle_flush_request(conf->log, bio);
+ ret = ppl_handle_flush_request(bio);
return ret;
}
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 0a2e4806b1ec..db49edec362a 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -679,7 +679,7 @@ void ppl_quiesce(struct r5conf *conf, int quiesce)
}
}
-int ppl_handle_flush_request(struct r5l_log *log, struct bio *bio)
+int ppl_handle_flush_request(struct bio *bio)
{
if (bio->bi_iter.bi_size == 0) {
bio_endio(bio);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5d09256d7f81..184145b49b7c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -61,6 +61,8 @@
#define cpu_to_group(cpu) cpu_to_node(cpu)
#define ANY_GROUP NUMA_NO_NODE
+#define RAID5_MAX_REQ_STRIPES 256
+
static bool devices_handle_discard_safely = false;
module_param(devices_handle_discard_safely, bool, 0644);
MODULE_PARM_DESC(devices_handle_discard_safely,
@@ -624,6 +626,49 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
return NULL;
}
+static struct stripe_head *find_get_stripe(struct r5conf *conf,
+ sector_t sector, short generation, int hash)
+{
+ int inc_empty_inactive_list_flag;
+ struct stripe_head *sh;
+
+ sh = __find_stripe(conf, sector, generation);
+ if (!sh)
+ return NULL;
+
+ if (atomic_inc_not_zero(&sh->count))
+ return sh;
+
+ /*
+ * Slow path. The reference count is zero which means the stripe must
+ * be on a list (sh->lru). Must remove the stripe from the list that
+ * references it with the device_lock held.
+ */
+
+ spin_lock(&conf->device_lock);
+ if (!atomic_read(&sh->count)) {
+ if (!test_bit(STRIPE_HANDLE, &sh->state))
+ atomic_inc(&conf->active_stripes);
+ BUG_ON(list_empty(&sh->lru) &&
+ !test_bit(STRIPE_EXPANDING, &sh->state));
+ inc_empty_inactive_list_flag = 0;
+ if (!list_empty(conf->inactive_list + hash))
+ inc_empty_inactive_list_flag = 1;
+ list_del_init(&sh->lru);
+ if (list_empty(conf->inactive_list + hash) &&
+ inc_empty_inactive_list_flag)
+ atomic_inc(&conf->empty_inactive_list_nr);
+ if (sh->group) {
+ sh->group->stripes_cnt--;
+ sh->group = NULL;
+ }
+ }
+ atomic_inc(&sh->count);
+ spin_unlock(&conf->device_lock);
+
+ return sh;
+}
+
/*
* Need to check if array has failed when deciding whether to:
* - start an array
@@ -716,7 +761,6 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
{
struct stripe_head *sh;
int hash = stripe_hash_locks_hash(conf, sector);
- int inc_empty_inactive_list_flag;
pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
@@ -726,57 +770,34 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
wait_event_lock_irq(conf->wait_for_quiescent,
conf->quiesce == 0 || noquiesce,
*(conf->hash_locks + hash));
- sh = __find_stripe(conf, sector, conf->generation - previous);
- if (!sh) {
- if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
- sh = get_free_stripe(conf, hash);
- if (!sh && !test_bit(R5_DID_ALLOC,
- &conf->cache_state))
- set_bit(R5_ALLOC_MORE,
- &conf->cache_state);
- }
- if (noblock && sh == NULL)
- break;
+ sh = find_get_stripe(conf, sector, conf->generation - previous,
+ hash);
+ if (sh)
+ break;
- r5c_check_stripe_cache_usage(conf);
- if (!sh) {
- set_bit(R5_INACTIVE_BLOCKED,
- &conf->cache_state);
- r5l_wake_reclaim(conf->log, 0);
- wait_event_lock_irq(
- conf->wait_for_stripe,
+ if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
+ sh = get_free_stripe(conf, hash);
+ if (!sh && !test_bit(R5_DID_ALLOC, &conf->cache_state))
+ set_bit(R5_ALLOC_MORE, &conf->cache_state);
+ }
+ if (noblock && !sh)
+ break;
+
+ r5c_check_stripe_cache_usage(conf);
+ if (!sh) {
+ set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state);
+ r5l_wake_reclaim(conf->log, 0);
+ wait_event_lock_irq(conf->wait_for_stripe,
!list_empty(conf->inactive_list + hash) &&
(atomic_read(&conf->active_stripes)
< (conf->max_nr_stripes * 3 / 4)
|| !test_bit(R5_INACTIVE_BLOCKED,
&conf->cache_state)),
*(conf->hash_locks + hash));
- clear_bit(R5_INACTIVE_BLOCKED,
- &conf->cache_state);
- } else {
- init_stripe(sh, sector, previous);
- atomic_inc(&sh->count);
- }
- } else if (!atomic_inc_not_zero(&sh->count)) {
- spin_lock(&conf->device_lock);
- if (!atomic_read(&sh->count)) {
- if (!test_bit(STRIPE_HANDLE, &sh->state))
- atomic_inc(&conf->active_stripes);
- BUG_ON(list_empty(&sh->lru) &&
- !test_bit(STRIPE_EXPANDING, &sh->state));
- inc_empty_inactive_list_flag = 0;
- if (!list_empty(conf->inactive_list + hash))
- inc_empty_inactive_list_flag = 1;
- list_del_init(&sh->lru);
- if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
- atomic_inc(&conf->empty_inactive_list_nr);
- if (sh->group) {
- sh->group->stripes_cnt--;
- sh->group = NULL;
- }
- }
+ clear_bit(R5_INACTIVE_BLOCKED, &conf->cache_state);
+ } else {
+ init_stripe(sh, sector, previous);
atomic_inc(&sh->count);
- spin_unlock(&conf->device_lock);
}
} while (sh == NULL);
@@ -824,13 +845,13 @@ static bool stripe_can_batch(struct stripe_head *sh)
}
/* we only do back search */
-static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh)
+static void stripe_add_to_batch_list(struct r5conf *conf,
+ struct stripe_head *sh, struct stripe_head *last_sh)
{
struct stripe_head *head;
sector_t head_sector, tmp_sec;
int hash;
int dd_idx;
- int inc_empty_inactive_list_flag;
/* Don't cross chunks, so stripe pd_idx/qd_idx is the same */
tmp_sec = sh->sector;
@@ -838,36 +859,20 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh
return;
head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf);
- hash = stripe_hash_locks_hash(conf, head_sector);
- spin_lock_irq(conf->hash_locks + hash);
- head = __find_stripe(conf, head_sector, conf->generation);
- if (head && !atomic_inc_not_zero(&head->count)) {
- spin_lock(&conf->device_lock);
- if (!atomic_read(&head->count)) {
- if (!test_bit(STRIPE_HANDLE, &head->state))
- atomic_inc(&conf->active_stripes);
- BUG_ON(list_empty(&head->lru) &&
- !test_bit(STRIPE_EXPANDING, &head->state));
- inc_empty_inactive_list_flag = 0;
- if (!list_empty(conf->inactive_list + hash))
- inc_empty_inactive_list_flag = 1;
- list_del_init(&head->lru);
- if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
- atomic_inc(&conf->empty_inactive_list_nr);
- if (head->group) {
- head->group->stripes_cnt--;
- head->group = NULL;
- }
- }
+ if (last_sh && head_sector == last_sh->sector) {
+ head = last_sh;
atomic_inc(&head->count);
- spin_unlock(&conf->device_lock);
+ } else {
+ hash = stripe_hash_locks_hash(conf, head_sector);
+ spin_lock_irq(conf->hash_locks + hash);
+ head = find_get_stripe(conf, head_sector, conf->generation,
+ hash);
+ spin_unlock_irq(conf->hash_locks + hash);
+ if (!head)
+ return;
+ if (!stripe_can_batch(head))
+ goto out;
}
- spin_unlock_irq(conf->hash_locks + hash);
-
- if (!head)
- return;
- if (!stripe_can_batch(head))
- goto out;
lock_two_stripes(head, sh);
/* clear_batch_ready clear the flag */
@@ -3412,39 +3417,32 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
s->locked, s->ops_request);
}
-/*
- * Each stripe/dev can have one or more bion attached.
- * toread/towrite point to the first in a chain.
- * The bi_next chain must be in order.
- */
-static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
- int forwrite, int previous)
+static bool stripe_bio_overlaps(struct stripe_head *sh, struct bio *bi,
+ int dd_idx, int forwrite)
{
- struct bio **bip;
struct r5conf *conf = sh->raid_conf;
- int firstwrite=0;
+ struct bio **bip;
- pr_debug("adding bi b#%llu to stripe s#%llu\n",
- (unsigned long long)bi->bi_iter.bi_sector,
- (unsigned long long)sh->sector);
+ pr_debug("checking bi b#%llu to stripe s#%llu\n",
+ bi->bi_iter.bi_sector, sh->sector);
- spin_lock_irq(&sh->stripe_lock);
/* Don't allow new IO added to stripes in batch list */
if (sh->batch_head)
- goto overlap;
- if (forwrite) {
+ return true;
+
+ if (forwrite)
bip = &sh->dev[dd_idx].towrite;
- if (*bip == NULL)
- firstwrite = 1;
- } else
+ else
bip = &sh->dev[dd_idx].toread;
+
while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
- goto overlap;
- bip = & (*bip)->bi_next;
+ return true;
+ bip = &(*bip)->bi_next;
}
+
if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
- goto overlap;
+ return true;
if (forwrite && raid5_has_ppl(conf)) {
/*
@@ -3473,9 +3471,30 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
}
if (first + conf->chunk_sectors * (count - 1) != last)
- goto overlap;
+ return true;
}
+ return false;
+}
+
+static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi,
+ int dd_idx, int forwrite, int previous)
+{
+ struct r5conf *conf = sh->raid_conf;
+ struct bio **bip;
+ int firstwrite = 0;
+
+ if (forwrite) {
+ bip = &sh->dev[dd_idx].towrite;
+ if (!*bip)
+ firstwrite = 1;
+ } else {
+ bip = &sh->dev[dd_idx].toread;
+ }
+
+ while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector)
+ bip = &(*bip)->bi_next;
+
if (!forwrite || previous)
clear_bit(STRIPE_BATCH_READY, &sh->state);
@@ -3501,9 +3520,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
sh->overwrite_disks++;
}
- pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
- (unsigned long long)(*bip)->bi_iter.bi_sector,
- (unsigned long long)sh->sector, dd_idx);
+ pr_debug("added bi b#%llu to stripe s#%llu, disk %d, logical %llu\n",
+ (*bip)->bi_iter.bi_sector, sh->sector, dd_idx,
+ sh->dev[dd_idx].sector);
if (conf->mddev->bitmap && firstwrite) {
/* Cannot hold spinlock over bitmap_startwrite,
@@ -3511,7 +3530,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
* we have added to the bitmap and set bm_seq.
* So set STRIPE_BITMAP_PENDING to prevent
* batching.
- * If multiple add_stripe_bio() calls race here they
+ * If multiple __add_stripe_bio() calls race here they
* much all set STRIPE_BITMAP_PENDING. So only the first one
* to complete "bitmap_startwrite" gets to set
* STRIPE_BIT_DELAY. This is important as once a stripe
@@ -3529,16 +3548,27 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
set_bit(STRIPE_BIT_DELAY, &sh->state);
}
}
- spin_unlock_irq(&sh->stripe_lock);
+}
- if (stripe_can_batch(sh))
- stripe_add_to_batch_list(conf, sh);
- return 1;
+/*
+ * Each stripe/dev can have one or more bios attached.
+ * toread/towrite point to the first in a chain.
+ * The bi_next chain must be in order.
+ */
+static bool add_stripe_bio(struct stripe_head *sh, struct bio *bi,
+ int dd_idx, int forwrite, int previous)
+{
+ spin_lock_irq(&sh->stripe_lock);
+
+ if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) {
+ set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
+ spin_unlock_irq(&sh->stripe_lock);
+ return false;
+ }
- overlap:
- set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
+ __add_stripe_bio(sh, bi, dd_idx, forwrite, previous);
spin_unlock_irq(&sh->stripe_lock);
- return 0;
+ return true;
}
static void end_reshape(struct r5conf *conf);
@@ -5784,17 +5814,239 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
bio_endio(bi);
}
-static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
+static bool ahead_of_reshape(struct mddev *mddev, sector_t sector,
+ sector_t reshape_sector)
{
- struct r5conf *conf = mddev->private;
+ return mddev->reshape_backwards ? sector < reshape_sector :
+ sector >= reshape_sector;
+}
+
+static bool range_ahead_of_reshape(struct mddev *mddev, sector_t min,
+ sector_t max, sector_t reshape_sector)
+{
+ return mddev->reshape_backwards ? max < reshape_sector :
+ min >= reshape_sector;
+}
+
+static bool stripe_ahead_of_reshape(struct mddev *mddev, struct r5conf *conf,
+ struct stripe_head *sh)
+{
+ sector_t max_sector = 0, min_sector = MaxSector;
+ bool ret = false;
int dd_idx;
- sector_t new_sector;
- sector_t logical_sector, last_sector;
+
+ for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) {
+ if (dd_idx == sh->pd_idx)
+ continue;
+
+ min_sector = min(min_sector, sh->dev[dd_idx].sector);
+ max_sector = min(max_sector, sh->dev[dd_idx].sector);
+ }
+
+ spin_lock_irq(&conf->device_lock);
+
+ if (!range_ahead_of_reshape(mddev, min_sector, max_sector,
+ conf->reshape_progress))
+ /* mismatch, need to try again */
+ ret = true;
+
+ spin_unlock_irq(&conf->device_lock);
+
+ return ret;
+}
+
+enum stripe_result {
+ STRIPE_SUCCESS = 0,
+ STRIPE_RETRY,
+ STRIPE_SCHEDULE_AND_RETRY,
+ STRIPE_FAIL,
+};
+
+struct stripe_request_ctx {
+ /* a reference to the last stripe_head for batching */
+ struct stripe_head *batch_last;
+
+ /* first sector in the request */
+ sector_t first_sector;
+
+ /* last sector in the request */
+ sector_t last_sector;
+
+ /* bitmap to track stripe sectors that have been added to stripes */
+ DECLARE_BITMAP(sectors_to_do, RAID5_MAX_REQ_STRIPES);
+
+ /* the request had REQ_PREFLUSH, cleared after the first stripe_head */
+ bool do_flush;
+};
+
+static int add_all_stripe_bios(struct r5conf *conf,
+ struct stripe_request_ctx *ctx, struct stripe_head *sh,
+ struct bio *bi, int forwrite, int previous)
+{
+ int dd_idx;
+ int ret = 1;
+
+ spin_lock_irq(&sh->stripe_lock);
+
+ for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) {
+ struct r5dev *dev = &sh->dev[dd_idx];
+
+ if (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
+ continue;
+
+ if (dev->sector < ctx->first_sector ||
+ dev->sector >= ctx->last_sector)
+ continue;
+
+ if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) {
+ set_bit(R5_Overlap, &dev->flags);
+ ret = 0;
+ continue;
+ }
+ }
+
+ if (!ret)
+ goto out;
+
+ for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) {
+ struct r5dev *dev = &sh->dev[dd_idx];
+
+ if (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
+ continue;
+
+ if (dev->sector < ctx->first_sector ||
+ dev->sector >= ctx->last_sector)
+ continue;
+
+ __add_stripe_bio(sh, bi, dd_idx, forwrite, previous);
+ clear_bit((dev->sector - ctx->first_sector) >>
+ RAID5_STRIPE_SHIFT(conf), ctx->sectors_to_do);
+ }
+
+out:
+ spin_unlock_irq(&sh->stripe_lock);
+ return ret;
+}
+
+static enum stripe_result make_stripe_request(struct mddev *mddev,
+ struct r5conf *conf, struct stripe_request_ctx *ctx,
+ sector_t logical_sector, struct bio *bi)
+{
+ const int rw = bio_data_dir(bi);
+ enum stripe_result ret;
struct stripe_head *sh;
+ sector_t new_sector;
+ int previous = 0;
+ int seq, dd_idx;
+
+ seq = read_seqcount_begin(&conf->gen_lock);
+
+ if (unlikely(conf->reshape_progress != MaxSector)) {
+ /*
+ * Spinlock is needed as reshape_progress may be
+ * 64bit on a 32bit platform, and so it might be
+ * possible to see a half-updated value
+ * Of course reshape_progress could change after
+ * the lock is dropped, so once we get a reference
+ * to the stripe that we think it is, we will have
+ * to check again.
+ */
+ spin_lock_irq(&conf->device_lock);
+ if (ahead_of_reshape(mddev, logical_sector,
+ conf->reshape_progress)) {
+ previous = 1;
+ } else {
+ if (ahead_of_reshape(mddev, logical_sector,
+ conf->reshape_safe)) {
+ spin_unlock_irq(&conf->device_lock);
+ return STRIPE_SCHEDULE_AND_RETRY;
+ }
+ }
+ spin_unlock_irq(&conf->device_lock);
+ }
+
+ new_sector = raid5_compute_sector(conf, logical_sector, previous,
+ &dd_idx, NULL);
+ pr_debug("raid456: %s, sector %llu logical %llu\n", __func__,
+ new_sector, logical_sector);
+
+ sh = raid5_get_active_stripe(conf, new_sector, previous,
+ (bi->bi_opf & REQ_RAHEAD), 0);
+ if (unlikely(!sh)) {
+ /* cannot get stripe, just give-up */
+ bi->bi_status = BLK_STS_IOERR;
+ return STRIPE_FAIL;
+ }
+
+ if (unlikely(previous) &&
+ stripe_ahead_of_reshape(mddev, conf, sh)) {
+ /*
+ * Expansion moved on while waiting for a stripe.
+ * Expansion could still move past after this
+ * test, but as we are holding a reference to
+ * 'sh', we know that if that happens,
+ * STRIPE_EXPANDING will get set and the expansion
+ * won't proceed until we finish with the stripe.
+ */
+ ret = STRIPE_SCHEDULE_AND_RETRY;
+ goto out_release;
+ }
+
+ if (read_seqcount_retry(&conf->gen_lock, seq)) {
+ /* Might have got the wrong stripe_head by accident */
+ ret = STRIPE_RETRY;
+ goto out_release;
+ }
+
+ if (test_bit(STRIPE_EXPANDING, &sh->state) ||
+ !add_all_stripe_bios(conf, ctx, sh, bi, rw, previous)) {
+ /*
+ * Stripe is busy expanding or add failed due to
+ * overlap. Flush everything and wait a while.
+ */
+ md_wakeup_thread(mddev->thread);
+ ret = STRIPE_SCHEDULE_AND_RETRY;
+ goto out_release;
+ }
+
+ if (stripe_can_batch(sh)) {
+ stripe_add_to_batch_list(conf, sh, ctx->batch_last);
+ if (ctx->batch_last)
+ raid5_release_stripe(ctx->batch_last);
+ atomic_inc(&sh->count);
+ ctx->batch_last = sh;
+ }
+
+ if (ctx->do_flush) {
+ set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
+ /* we only need flush for one stripe */
+ ctx->do_flush = false;
+ }
+
+ set_bit(STRIPE_HANDLE, &sh->state);
+ clear_bit(STRIPE_DELAYED, &sh->state);
+ if ((!sh->batch_head || sh == sh->batch_head) &&
+ (bi->bi_opf & REQ_SYNC) &&
+ !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ atomic_inc(&conf->preread_active_stripes);
+
+ release_stripe_plug(mddev, sh);
+ return STRIPE_SUCCESS;
+
+out_release:
+ raid5_release_stripe(sh);
+ return ret;
+}
+
+static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
+{
+ struct r5conf *conf = mddev->private;
+ sector_t logical_sector;
+ struct stripe_request_ctx ctx = {};
const int rw = bio_data_dir(bi);
+ enum stripe_result res;
DEFINE_WAIT(w);
- bool do_prepare;
- bool do_flush = false;
+ int s;
if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
int ret = log_handle_flush_request(conf, bi);
@@ -5810,7 +6062,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
* if r5l_handle_flush_request() didn't clear REQ_PREFLUSH,
* we need to flush journal device
*/
- do_flush = bi->bi_opf & REQ_PREFLUSH;
+ ctx.do_flush = bi->bi_opf & REQ_PREFLUSH;
}
if (!md_write_start(mddev, bi))
@@ -5834,15 +6086,22 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
}
logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
- last_sector = bio_end_sector(bi);
+ ctx.first_sector = logical_sector;
+ ctx.last_sector = bio_end_sector(bi);
bi->bi_next = NULL;
+ bitmap_set(ctx.sectors_to_do, 0,
+ DIV_ROUND_UP_SECTOR_T(ctx.last_sector - logical_sector,
+ RAID5_STRIPE_SECTORS(conf)));
+
+ pr_debug("raid456: %s, logical %llu to %llu\n", __func__,
+ bi->bi_iter.bi_sector, ctx.last_sector);
+
/* Bail out if conflicts with reshape and REQ_NOWAIT is set */
if ((bi->bi_opf & REQ_NOWAIT) &&
(conf->reshape_progress != MaxSector) &&
- (mddev->reshape_backwards
- ? (logical_sector > conf->reshape_progress && logical_sector <= conf->reshape_safe)
- : (logical_sector >= conf->reshape_safe && logical_sector < conf->reshape_progress))) {
+ !ahead_of_reshape(mddev, logical_sector, conf->reshape_progress) &&
+ ahead_of_reshape(mddev, logical_sector, conf->reshape_safe)) {
bio_wouldblock_error(bi);
if (rw == WRITE)
md_write_end(mddev);
@@ -5850,119 +6109,47 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
}
md_account_bio(mddev, &bi);
prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
- for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) {
- int previous;
- int seq;
-
- do_prepare = false;
- retry:
- seq = read_seqcount_begin(&conf->gen_lock);
- previous = 0;
- if (do_prepare)
- prepare_to_wait(&conf->wait_for_overlap, &w,
- TASK_UNINTERRUPTIBLE);
- if (unlikely(conf->reshape_progress != MaxSector)) {
- /* spinlock is needed as reshape_progress may be
- * 64bit on a 32bit platform, and so it might be
- * possible to see a half-updated value
- * Of course reshape_progress could change after
- * the lock is dropped, so once we get a reference
- * to the stripe that we think it is, we will have
- * to check again.
- */
- spin_lock_irq(&conf->device_lock);
- if (mddev->reshape_backwards
- ? logical_sector < conf->reshape_progress
- : logical_sector >= conf->reshape_progress) {
- previous = 1;
- } else {
- if (mddev->reshape_backwards
- ? logical_sector < conf->reshape_safe
- : logical_sector >= conf->reshape_safe) {
- spin_unlock_irq(&conf->device_lock);
- schedule();
- do_prepare = true;
- goto retry;
- }
- }
- spin_unlock_irq(&conf->device_lock);
- }
+ while (1) {
+ res = make_stripe_request(mddev, conf, &ctx, logical_sector,
+ bi);
+ if (res == STRIPE_FAIL)
+ break;
- new_sector = raid5_compute_sector(conf, logical_sector,
- previous,
- &dd_idx, NULL);
- pr_debug("raid456: raid5_make_request, sector %llu logical %llu\n",
- (unsigned long long)new_sector,
- (unsigned long long)logical_sector);
+ if (res == STRIPE_RETRY)
+ continue;
- sh = raid5_get_active_stripe(conf, new_sector, previous,
- (bi->bi_opf & REQ_RAHEAD), 0);
- if (sh) {
- if (unlikely(previous)) {
- /* expansion might have moved on while waiting for a
- * stripe, so we must do the range check again.
- * Expansion could still move past after this
- * test, but as we are holding a reference to
- * 'sh', we know that if that happens,
- * STRIPE_EXPANDING will get set and the expansion
- * won't proceed until we finish with the stripe.
- */
- int must_retry = 0;
- spin_lock_irq(&conf->device_lock);
- if (mddev->reshape_backwards
- ? logical_sector >= conf->reshape_progress
- : logical_sector < conf->reshape_progress)
- /* mismatch, need to try again */
- must_retry = 1;
- spin_unlock_irq(&conf->device_lock);
- if (must_retry) {
- raid5_release_stripe(sh);
- schedule();
- do_prepare = true;
- goto retry;
- }
- }
- if (read_seqcount_retry(&conf->gen_lock, seq)) {
- /* Might have got the wrong stripe_head
- * by accident
- */
- raid5_release_stripe(sh);
- goto retry;
+ if (res == STRIPE_SCHEDULE_AND_RETRY) {
+ /*
+ * Must release the reference to batch_last before
+ * scheduling and waiting for work to be done,
+ * otherwise the batch_last stripe head could prevent
+ * raid5_activate_delayed() from making progress
+ * and thus deadlocking.
+ */
+ if (ctx.batch_last) {
+ raid5_release_stripe(ctx.batch_last);
+ ctx.batch_last = NULL;
}
- if (test_bit(STRIPE_EXPANDING, &sh->state) ||
- !add_stripe_bio(sh, bi, dd_idx, rw, previous)) {
- /* Stripe is busy expanding or
- * add failed due to overlap. Flush everything
- * and wait a while
- */
- md_wakeup_thread(mddev->thread);
- raid5_release_stripe(sh);
- schedule();
- do_prepare = true;
- goto retry;
- }
- if (do_flush) {
- set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
- /* we only need flush for one stripe */
- do_flush = false;
- }
+ schedule();
+ prepare_to_wait(&conf->wait_for_overlap, &w,
+ TASK_UNINTERRUPTIBLE);
+ continue;
+ }
- set_bit(STRIPE_HANDLE, &sh->state);
- clear_bit(STRIPE_DELAYED, &sh->state);
- if ((!sh->batch_head || sh == sh->batch_head) &&
- (bi->bi_opf & REQ_SYNC) &&
- !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
- atomic_inc(&conf->preread_active_stripes);
- release_stripe_plug(mddev, sh);
- } else {
- /* cannot get stripe for read-ahead, just give-up */
- bi->bi_status = BLK_STS_IOERR;
+ s = find_first_bit(ctx.sectors_to_do, RAID5_MAX_REQ_STRIPES);
+ if (s == RAID5_MAX_REQ_STRIPES)
break;
- }
+
+ logical_sector = ctx.first_sector +
+ (s << RAID5_STRIPE_SHIFT(conf));
}
+
finish_wait(&conf->wait_for_overlap, &w);
+ if (ctx.batch_last)
+ raid5_release_stripe(ctx.batch_last);
+
if (rw == WRITE)
md_write_end(mddev);
bio_endio(bi);
@@ -7812,7 +7999,15 @@ static int raid5_run(struct mddev *mddev)
mddev->queue->limits.discard_granularity < stripe)
blk_queue_max_discard_sectors(mddev->queue, 0);
- blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);
+ /*
+ * Requests require having a bitmap for each stripe.
+ * Limit the max sectors based on this.
+ */
+ blk_queue_max_hw_sectors(mddev->queue,
+ RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf));
+
+ /* No restrictions on the number of segments in the request */
+ blk_queue_max_segments(mddev->queue, USHRT_MAX);
}
if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
@@ -8697,8 +8892,11 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
err = log_init(conf, NULL, true);
if (!err) {
err = resize_stripes(conf, conf->pool_size);
- if (err)
+ if (err) {
+ mddev_suspend(mddev);
log_exit(conf);
+ mddev_resume(mddev);
+ }
}
} else
err = -EINVAL;