// SPDX-License-Identifier: GPL-2.0
/* Maximum size of each resync request */
#define RESYNC_BLOCK_SIZE (64*1024)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)

/*
 * Number of guaranteed raid bios in case of extreme VM load:
 */
#define NR_RAID_BIOS 256

/* When we get a read error on a read-only array, we redirect the read to
 * another device without failing the first device, and without trying to
 * over-write to correct the read error. To keep track of bad blocks on a
 * per-bio level, we store IO_BLOCKED in the appropriate 'bios' pointer.
 */
#define IO_BLOCKED ((struct bio *)1)
/* When we successfully write to a known bad-block, we need to remove the
 * bad-block marking, which must be done from process context. So we record
 * the success by setting devs[n].bio to IO_MADE_GOOD.
 */
#define IO_MADE_GOOD ((struct bio *)2)

#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
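/*
 * Limit on plugged write bios: raid1_add_bio_to_plug() below flushes the
 * plug early once roughly MAX_PLUG_BIO bios per copy have been queued.
 */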
#define MAX_PLUG_BIO 32

/* for managing resync I/O pages */
struct resync_pages {
	void *raid_bio;
	struct page *pages[RESYNC_PAGES];
};

struct raid1_plug_cb {
	struct blk_plug_cb cb;
	struct bio_list pending;
	unsigned int count;
};

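/*
 * Free callback for the raid bio mempools; the entries are plain
 * kmalloc()-ed allocations, so kfree() is all that is needed.
 */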
static void rbio_pool_free(void *rbio, void *data)
{
	kfree(rbio);
}

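/*
 * Allocate the RESYNC_PAGES pages backing one resync request; on failure,
 * release any pages already allocated and return -ENOMEM.
 */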
static inline int resync_alloc_pages(struct resync_pages *rp,
				     gfp_t gfp_flags)
{
	int i;

	for (i = 0; i < RESYNC_PAGES; i++) {
		rp->pages[i] = alloc_page(gfp_flags);
		if (!rp->pages[i])
			goto out_free;
	}

	return 0;

out_free:
	while (--i >= 0)
		put_page(rp->pages[i]);
	return -ENOMEM;
}

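/* Drop one reference on each page of a resync request. */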
static inline void resync_free_pages(struct resync_pages *rp)
{
	int i;

	for (i = 0; i < RESYNC_PAGES; i++)
		put_page(rp->pages[i]);
}

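/* Take an extra reference on each page of a resync request. */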
static inline void resync_get_all_pages(struct resync_pages *rp)
{
	int i;

	for (i = 0; i < RESYNC_PAGES; i++)
		get_page(rp->pages[i]);
}

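/* Return the page at @idx, or NULL (with a one-time warning) if out of range. */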
static inline struct page *resync_fetch_page(struct resync_pages *rp,
					     unsigned idx)
{
	if (WARN_ON_ONCE(idx >= RESYNC_PAGES))
		return NULL;
	return rp->pages[idx];
}

/*
 * 'struct resync_pages' stores the actual pages used for doing the resync
 * IO, and it is per-bio, so make .bi_private point to it.
 */
static inline struct resync_pages *get_resync_pages(struct bio *bio)
{
	return bio->bi_private;
}

/* generally called after bio_reset() for resetting the bvec table */
static void md_bio_reset_resync_pages(struct bio *bio, struct resync_pages *rp,
				       int size)
{
	int idx = 0;

	/* initialize bvec table again */
	do {
		struct page *page = resync_fetch_page(rp, idx);
		int len = min_t(int, size, PAGE_SIZE);

		if (WARN_ON(!bio_add_page(bio, page, len, 0))) {
			bio->bi_status = BLK_STS_RESOURCE;
			bio_endio(bio);
			return;
		}

		size -= len;
	} while (idx++ < RESYNC_PAGES && size > 0);
}

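/*
 * Submit a queued write to its target device. The caller is expected to have
 * stashed the struct md_rdev pointer in bio->bi_bdev before queueing: fail
 * the bio if the rdev is Faulty, silently complete discards the device cannot
 * handle, otherwise submit the bio to the real block device.
 */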
static inline void raid1_submit_write(struct bio *bio)
{
	struct md_rdev *rdev = (void *)bio->bi_bdev;

	bio->bi_next = NULL;
	bio_set_dev(bio, rdev->bdev);
	if (test_bit(Faulty, &rdev->flags))
		bio_io_error(bio);
	else if (unlikely(bio_op(bio) == REQ_OP_DISCARD &&
			  !bdev_max_discard_sectors(bio->bi_bdev)))
		/* Just ignore it */
		bio_endio(bio);
	else
		submit_bio_noacct(bio);
}

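/*
 * Queue a write bio on the current blk plug if possible; returns false if
 * there is no plug and the caller must submit the bio itself. If the bitmap
 * is disabled the bio is submitted immediately instead of being plugged.
 */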
static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
					 blk_plug_cb_fn unplug, int copies)
{
	struct raid1_plug_cb *plug = NULL;
	struct blk_plug_cb *cb;

	/*
	 * If the bitmap is not enabled, it's safe to submit the io directly,
	 * and this gives optimal performance.
	 */
	if (!mddev->bitmap_ops->enabled(mddev)) {
		raid1_submit_write(bio);
		return true;
	}

	cb = blk_check_plugged(unplug, mddev, sizeof(*plug));
	if (!cb)
		return false;

	plug = container_of(cb, struct raid1_plug_cb, cb);
	bio_list_add(&plug->pending, bio);
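	/* flush early once MAX_PLUG_BIO bios per copy have been plugged */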
	if (++plug->count / MAX_PLUG_BIO >= copies) {
		list_del(&cb->list);
		cb->callback(cb, false);
	}

	return true;
}

/*
 * current->bio_list is set when we are called from submit_bio() context. In
 * that case the bitmap io would be added to that list and would have to wait
 * for the current io submission to finish, while the current io submission
 * must wait for the bitmap io to be done. To avoid this deadlock, submit the
 * bitmap io asynchronously.
 */
static inline void raid1_prepare_flush_writes(struct mddev *mddev)
{
	mddev->bitmap_ops->unplug(mddev, current->bio_list == NULL);
}

/*
 * Used by fix_read_error() to decay the per-rdev read_errors.
 * We halve the read error count for every hour that has elapsed
 * since the last recorded read error.
 */
static inline void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
{
	long cur_time_mon;
	unsigned long hours_since_last;
	unsigned int read_errors = atomic_read(&rdev->read_errors);

	cur_time_mon = ktime_get_seconds();

	if (rdev->last_read_error == 0) {
		/* first time we've seen a read error */
		rdev->last_read_error = cur_time_mon;
		return;
	}

	hours_since_last = (long)(cur_time_mon -
				  rdev->last_read_error) / 3600;

	rdev->last_read_error = cur_time_mon;

	/*
	 * If hours_since_last is >= the number of bits in read_errors,
	 * just set read_errors to 0. We do this to avoid
	 * overflowing the shift of read_errors by hours_since_last.
	 */
	if (hours_since_last >= 8 * sizeof(read_errors))
		atomic_set(&rdev->read_errors, 0);
	else
		atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
}

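/*
 * Account a new read error on @rdev (after decaying old ones) and, if the
 * count exceeds max_corr_read_errors, fail the device via md_error().
 * Returns true if the threshold was exceeded.
 */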
static inline bool exceed_read_errors(struct mddev *mddev, struct md_rdev *rdev)
{
	int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
	int read_errors;

	check_decay_read_errors(mddev, rdev);
	read_errors = atomic_inc_return(&rdev->read_errors);
	if (read_errors > max_read_errors) {
		pr_notice("md/"RAID_1_10_NAME":%s: %pg: Raid device exceeded read_error threshold [cur %d:max %d]\n",
			  mdname(mddev), rdev->bdev, read_errors, max_read_errors);
		pr_notice("md/"RAID_1_10_NAME":%s: %pg: Failing raid device\n",
			  mdname(mddev), rdev->bdev);
		md_error(mddev, rdev);
		return true;
	}

	return false;
}

/**
 * raid1_check_read_range() - check a given read range for bad blocks and
 * return the available read length;
 * @rdev: the rdev to read;
 * @this_sector: read position;
 * @len: read length;
 *
 * helper function for read_balance()
 *
 * 1) If there are no bad blocks in the range, @len is returned;
 * 2) If the range is all bad blocks, 0 is returned;
 * 3) If there are partial bad blocks:
 *  - If the bad block range starts after @this_sector, the length of the
 *    first good region is returned;
 *  - If the bad block range starts before @this_sector, 0 is returned and
 *    @len is updated to the number of sectors before we get to the good
 *    blocks;
 */
static inline int raid1_check_read_range(struct md_rdev *rdev,
					 sector_t this_sector, int *len)
{
	sector_t first_bad;
	sector_t bad_sectors;

	/* no bad block overlap */
	if (!is_badblock(rdev, this_sector, *len, &first_bad, &bad_sectors))
		return *len;

	/*
	 * bad block range starts offset into our range so we can return the
	 * number of sectors before the bad blocks start.
	 */
	if (first_bad > this_sector)
		return first_bad - this_sector;

	/* read range is fully consumed by bad blocks. */
	if (this_sector + *len <= first_bad + bad_sectors)
		return 0;

	/*
	 * final case, bad block range starts before or at the start of our
	 * range but does not cover our entire range so we still return 0 but
	 * update the length with the number of sectors before we get to the
	 * good ones.
	 */
	*len = first_bad + bad_sectors - this_sector;
	return 0;
}

/*
 * Check if read should choose the first rdev.
 *
 * Balance on the whole device if no resync is going on (recovery is ok) or
 * below the resync window. Otherwise, take the first readable disk.
 */
static inline bool raid1_should_read_first(struct mddev *mddev,
					   sector_t this_sector, int len)
{
	if (mddev->recovery_cp < this_sector + len)
		return true;

	if (mddev_is_clustered(mddev) &&
	    mddev->cluster_ops->area_resyncing(mddev, READ, this_sector,
					       this_sector + len))
		return true;

	return false;
}

/*
 * A bio with REQ_RAHEAD or REQ_NOWAIT can fail at any time before it is
 * submitted to the underlying disks, hence don't record badblocks or retry
 * in this case.
 */
static inline bool raid1_should_handle_error(struct bio *bio)
{
	return !(bio->bi_opf & (REQ_RAHEAD | REQ_NOWAIT));
}