// SPDX-License-Identifier: GPL-2.0
/* Maximum size of each resync request */
#define RESYNC_BLOCK_SIZE (64*1024)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
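
/*
 * For example, with 4 KiB pages this works out to
 * (65536 + 4095) / 4096 == 16 pages per resync request.
 */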

/*
 * Number of guaranteed raid bios in case of extreme VM load:
 */
#define NR_RAID_BIOS 256

/* when we get a read error on a read-only array, we redirect to another
 * device without failing the first device, or trying to over-write to
 * correct the read error. To keep track of bad blocks on a per-bio
 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
 */
#define IO_BLOCKED ((struct bio *)1)
/* When we successfully write to a known bad-block, we need to remove the
 * bad-block marking which must be done from process context. So we record
 * the success by setting devs[n].bio to IO_MADE_GOOD
 */
#define IO_MADE_GOOD ((struct bio *)2)
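
/*
 * A 'bios' slot may hold NULL or one of the sentinel values above instead
 * of a real bio; BIO_SPECIAL() tests for those cases so such a pointer is
 * never dereferenced.
 */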
#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
#define MAX_PLUG_BIO 32

/* for managing resync I/O pages */
struct resync_pages {
	void		*raid_bio;
	struct page	*pages[RESYNC_PAGES];
};
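
/*
 * Per-plug state: write bios are gathered on ->pending and submitted in
 * one batch by the unplug callback.
 */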
struct raid1_plug_cb {
	struct blk_plug_cb	cb;
	struct bio_list		pending;
	unsigned int		count;
};
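
/* mempool free callback for the raid bio pools; elements are plain kmalloc()ed */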
static void rbio_pool_free(void *rbio, void *data)
{
	kfree(rbio);
}
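
/*
 * Allocate all RESYNC_PAGES pages for one resync request; on failure, drop
 * the pages allocated so far and return -ENOMEM.
 */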
static inline int resync_alloc_pages(struct resync_pages *rp,
				     gfp_t gfp_flags)
{
	int i;

	for (i = 0; i < RESYNC_PAGES; i++) {
		rp->pages[i] = alloc_page(gfp_flags);
		if (!rp->pages[i])
			goto out_free;
	}

	return 0;

out_free:
	while (--i >= 0)
		put_page(rp->pages[i]);
	return -ENOMEM;
}

static inline void resync_free_pages(struct resync_pages *rp)
{
	int i;

	for (i = 0; i < RESYNC_PAGES; i++)
		put_page(rp->pages[i]);
}
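
/*
 * Take an extra reference on every page so that, for example, several bios
 * can share one set of resync pages and drop their references independently.
 */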
static inline void resync_get_all_pages(struct resync_pages *rp)
{
	int i;

	for (i = 0; i < RESYNC_PAGES; i++)
		get_page(rp->pages[i]);
}

static inline struct page *resync_fetch_page(struct resync_pages *rp,
					     unsigned idx)
{
	if (WARN_ON_ONCE(idx >= RESYNC_PAGES))
		return NULL;
	return rp->pages[idx];
}

/*
 * 'struct resync_pages' stores the actual pages used for doing the resync
 * IO, and it is per-bio, so make .bi_private point to it.
 */
static inline struct resync_pages *get_resync_pages(struct bio *bio)
{
	return bio->bi_private;
}
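
/*
 * Illustrative sketch of the pattern (names are hypothetical, not actual
 * raid1/raid10 code):
 *
 *	bio->bi_private = rp;
 *	...
 *	rp = get_resync_pages(bio);
 *	page = resync_fetch_page(rp, 0);
 */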

/* generally called after bio_reset() for resetting the bvec */
static void md_bio_reset_resync_pages(struct bio *bio, struct resync_pages *rp,
				      int size)
{
	int idx = 0;

	/* initialize bvec table again */
	do {
		struct page *page = resync_fetch_page(rp, idx);
		int len = min_t(int, size, PAGE_SIZE);

		if (WARN_ON(!bio_add_page(bio, page, len, 0))) {
			bio->bi_status = BLK_STS_RESOURCE;
			bio_endio(bio);
			return;
		}

		size -= len;
	} while (idx++ < RESYNC_PAGES && size > 0);
}
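
/*
 * Submit one queued write. On entry bio->bi_bdev does not point at a real
 * block device: the write path stashes the target md_rdev there, and
 * bio_set_dev() below redirects the bio to the underlying rdev->bdev.
 */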
static inline void raid1_submit_write(struct bio *bio)
{
	struct md_rdev *rdev = (void *)bio->bi_bdev;

	bio->bi_next = NULL;
	bio_set_dev(bio, rdev->bdev);
	if (test_bit(Faulty, &rdev->flags))
		bio_io_error(bio);
	else if (unlikely(bio_op(bio) == REQ_OP_DISCARD &&
			  !bdev_max_discard_sectors(bio->bi_bdev)))
		/* Just ignore it */
		bio_endio(bio);
	else
		submit_bio_noacct(bio);
}
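
/*
 * Try to queue a write on the current task's plug so that writes can be
 * submitted in batches; returns false if there is no plug and the caller
 * must submit the bio itself.
 */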
static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
					 blk_plug_cb_fn unplug, int copies)
{
	struct raid1_plug_cb *plug = NULL;
	struct blk_plug_cb *cb;

	/*
	 * If bitmap is not enabled, it's safe to submit the io directly, and
	 * this can get optimal performance.
	 */
	if (!md_bitmap_enabled(mddev->bitmap)) {
		raid1_submit_write(bio);
		return true;
	}

	cb = blk_check_plugged(unplug, mddev, sizeof(*plug));
	if (!cb)
		return false;

	plug = container_of(cb, struct raid1_plug_cb, cb);
	bio_list_add(&plug->pending, bio);
	if (++plug->count / MAX_PLUG_BIO >= copies) {
		list_del(&cb->list);
		cb->callback(cb, false);
	}

	return true;
}
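
/*
 * The early flush above caps a batch at MAX_PLUG_BIO bios per copy: with
 * two mirrors (copies == 2), for example, the plug is flushed once 64 bios
 * are pending.
 */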

/*
 * current->bio_list will be set under the submit_bio() context; in that
 * case bitmap io will be added to the list and will wait for the current
 * io submission to finish, while the current io submission must wait for
 * the bitmap io to be done. To avoid such a deadlock, submit bitmap io
 * asynchronously.
 */
static inline void raid1_prepare_flush_writes(struct bitmap *bitmap)
{
	if (current->bio_list)
		md_bitmap_unplug_async(bitmap);
	else
		md_bitmap_unplug(bitmap);
}

/*
 * Used by fix_read_error() to decay the per rdev read_errors.
 * We halve the read error count for every hour that has elapsed
 * since the last recorded read error.
 */
static inline void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
{
	long cur_time_mon;
	unsigned long hours_since_last;
	unsigned int read_errors = atomic_read(&rdev->read_errors);

	cur_time_mon = ktime_get_seconds();

	if (rdev->last_read_error == 0) {
		/* first time we've seen a read error */
		rdev->last_read_error = cur_time_mon;
		return;
	}

	hours_since_last = (long)(cur_time_mon -
				  rdev->last_read_error) / 3600;

	rdev->last_read_error = cur_time_mon;

	/*
	 * if hours_since_last is > the number of bits in read_errors
	 * just set read errors to 0. We do this to avoid
	 * overflowing the shift of read_errors by hours_since_last.
	 */
	if (hours_since_last >= 8 * sizeof(read_errors))
		atomic_set(&rdev->read_errors, 0);
	else
		atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
}
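
/*
 * Example: a device with read_errors == 40 whose last error was two hours
 * ago decays to 40 >> 2 == 10 before the new error is counted.
 */

/*
 * Decay the old count, then account one new read error against rdev; if
 * the threshold is exceeded, fail the device via md_error() and return
 * true.
 */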
static inline bool exceed_read_errors(struct mddev *mddev, struct md_rdev *rdev)
{
	int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
	int read_errors;

	check_decay_read_errors(mddev, rdev);
	read_errors = atomic_inc_return(&rdev->read_errors);
	if (read_errors > max_read_errors) {
		pr_notice("md/"RAID_1_10_NAME":%s: %pg: Raid device exceeded read_error threshold [cur %d:max %d]\n",
			  mdname(mddev), rdev->bdev, read_errors, max_read_errors);
		pr_notice("md/"RAID_1_10_NAME":%s: %pg: Failing raid device\n",
			  mdname(mddev), rdev->bdev);
		md_error(mddev, rdev);
		return true;
	}

	return false;
}