[linux-block.git] / mm / page_io.c

// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/mm/page_io.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, 
 *  Asynchronous swapping added 30.12.95. Stephen Tweedie
 *  Removed race in async swapping. 14.4.1996. Bruno Haible
 *  Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie
 *  Always use brw_page, life becomes simpler. 12 May 1998 Eric Biederman
 */

#include <linux/mm.h>
#include <linux/kernel_stat.h>
#include <linux/gfp.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/swapops.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/frontswap.h>
#include <linux/blkdev.h>
#include <linux/psi.h>
#include <linux/uio.h>
#include <linux/sched/task.h>

/*
 * Completion handler for a swap-out bio.
 *
 * Ends writeback on the bio's first page and drops a reference on the bio.
 * If the bio completed with an error status the page is additionally
 * flagged and re-dirtied before writeback is ended.
 */
void end_swap_bio_write(struct bio *bio)
{
	struct page *swapped = bio_first_page_all(bio);

	if (!bio->bi_status)
		goto done;

	/*
	 * The page could not be written out to swap-space.  Flag the error
	 * and re-dirty the page so that it is not reclaimed, print a
	 * rate-limited alert because things will go bad very quickly, and
	 * clear PG_reclaim to avoid rotate_reclaimable_page().
	 */
	SetPageError(swapped);
	set_page_dirty(swapped);
	pr_alert_ratelimited("Write-error on swap-device (%u:%u:%llu)\n",
			     MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
			     (unsigned long long)bio->bi_iter.bi_sector);
	ClearPageReclaim(swapped);
done:
	end_page_writeback(swapped);
	bio_put(bio);
}

/*
 * After a successful swap-in, ask the backing block driver to release the
 * swap slot that held @page.
 *
 * Nothing happens unless the page is in the swap cache, the swap area is
 * flagged SWP_BLKDEV, the driver provides ->swap_slot_free_notify and
 * __swap_count() reports exactly one user of the slot.
 */
static void swap_slot_free_notify(struct page *page)
{
	struct swap_info_struct *si;
	struct gendisk *disk;
	swp_entry_t slot;

	/*
	 * The page is not guaranteed to be in the swap cache: the software
	 * suspend code (at least) runs end_swap_bio_read() against a
	 * non-swapcache page.  Check PG_swapcache before going any further.
	 */
	if (unlikely(!PageSwapCache(page)))
		return;

	si = page_swap_info(page);
	if (data_race(!(si->flags & SWP_BLKDEV)))
		return;

	disk = si->bdev->bd_disk;
	slot.val = page_private(page);
	if (!disk->fops->swap_slot_free_notify || __swap_count(slot) != 1)
		return;

	/*
	 * Swap slots are freed lazily on the expectation that the page will
	 * be swapped out again, which saves a write when the page has not
	 * been redirtied.  That suits real swap storage: less I/O and
	 * better wear-leveling when an SSD is used as the swap device.
	 * With an in-memory swap device (e.g. zram), however, it leaves two
	 * copies around: the uncompressed data in VM-owned memory and the
	 * compressed data in zram-owned memory.  So free the zram-owned
	 * copy and mark the VM-owned page *dirty*, so that it is written
	 * out again should we want to reclaim it later.
	 */
	SetPageDirty(page);
	disk->fops->swap_slot_free_notify(si->bdev, swp_offset(slot));
}

/*
 * Completion handler for a swap-in bio.
 *
 * Records the outcome in the page flags, unlocks the page, clears
 * bio->bi_private and drops a bio reference.  If a task was stored in
 * bio->bi_private (swap_readpage() does this for synchronous reads), that
 * task is woken and the task reference is released.
 */
static void end_swap_bio_read(struct bio *bio)
{
	/* Sample the waiter first: bi_private is cleared further down. */
	struct task_struct *sleeper = bio->bi_private;
	struct page *pg = bio_first_page_all(bio);

	if (!bio->bi_status) {
		SetPageUptodate(pg);
		swap_slot_free_notify(pg);
	} else {
		SetPageError(pg);
		ClearPageUptodate(pg);
		pr_alert_ratelimited("Read-error on swap-device (%u:%u:%llu)\n",
				     MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
				     (unsigned long long)bio->bi_iter.bi_sector);
	}

	unlock_page(pg);
	/* swap_readpage() polls this field to detect completion. */
	WRITE_ONCE(bio->bi_private, NULL);
	bio_put(bio);

	if (!sleeper)
		return;

	blk_wake_io_task(sleeper);
	put_task_struct(sleeper);
}

/*
 * Build the swap extent list for a swap file by probing it with bmap().
 *
 * The file is walked in filesystem blocks.  Every run of blocks that is
 * PAGE_SIZE long, PAGE_SIZE aligned on disk and contiguous is registered
 * with add_swap_extent() as one swap page; runs that do not qualify are
 * skipped by advancing the probe one block at a time.
 *
 * On success @sis->max, @sis->pages and @sis->highest_bit are updated from
 * the number of pages found, and *@span is set to
 * 1 + highest - lowest page-sized block seen (the header page excluded).
 *
 * Returns the sum of the add_swap_extent() return values, a negative value
 * returned by add_swap_extent(), or -EINVAL if bmap() fails or reports
 * block 0 for any probed block.
 */
int generic_swapfile_activate(struct swap_info_struct *sis,
				struct file *swap_file,
				sector_t *span)
{
	struct inode *inode = swap_file->f_mapping->host;
	unsigned blkbits = inode->i_blkbits;
	unsigned blocks_per_page = PAGE_SIZE >> blkbits;
	sector_t last_block = i_size_read(inode) >> blkbits;
	sector_t probe_block = 0;
	sector_t lowest_block = -1;
	sector_t highest_block = 0;
	unsigned long page_no = 0;
	int nr_extents = 0;

	/*
	 * Map all the blocks into the extent tree.  Nothing clever is
	 * attempted here.
	 */
	while (probe_block + blocks_per_page <= last_block &&
	       page_no < sis->max) {
		sector_t first_block = probe_block;
		unsigned idx;
		int err;

		cond_resched();

		err = bmap(inode, &first_block);
		if (err || !first_block)
			goto holes;

		/* The run must be PAGE_SIZE aligned on-disk. */
		if (first_block & (blocks_per_page - 1)) {
			probe_block++;
			continue;
		}

		/* The remaining blocks of the page must follow on directly. */
		for (idx = 1; idx < blocks_per_page; idx++) {
			sector_t block = probe_block + idx;

			err = bmap(inode, &block);
			if (err || !block)
				goto holes;

			if (block != first_block + idx)
				break;	/* Discontiguity */
		}
		if (idx < blocks_per_page) {
			/* Left the scan early: retry one block further on. */
			probe_block++;
			continue;
		}

		/* Convert from filesystem blocks to page-sized blocks. */
		first_block >>= (PAGE_SHIFT - blkbits);
		if (page_no) {	/* exclude the header page */
			if (first_block < lowest_block)
				lowest_block = first_block;
			if (first_block > highest_block)
				highest_block = first_block;
		}

		/*
		 * A PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks was
		 * found: record it.
		 */
		err = add_swap_extent(sis, page_no, 1, first_block);
		if (err < 0)
			return err;
		nr_extents += err;
		page_no++;
		probe_block += blocks_per_page;
	}

	*span = 1 + highest_block - lowest_block;
	if (page_no == 0)
		page_no = 1;	/* force Empty message */
	sis->max = page_no;
	sis->pages = page_no - 1;
	sis->highest_bit = page_no - 1;
	return nr_extents;

holes:
	pr_err("swapon: swapfile has holes\n");
	return -EINVAL;
}

/*
 * Write a swap cache page out to its swap slot.
 *
 * Stale swap cache pages may still be sitting in memory: when
 * try_to_free_swap() gets rid of one, the final write is unnecessary and
 * the page is simply unlocked.  Otherwise the page is offered to
 * frontswap first and, failing that, handed to __swap_writepage().
 *
 * Returns 0, the error from arch_prepare_to_swap(), or whatever
 * __swap_writepage() returns.
 */
int swap_writepage(struct page *page, struct writeback_control *wbc)
{
	int err;

	if (try_to_free_swap(page)) {
		unlock_page(page);
		return 0;
	}

	/*
	 * Architecture code may need to save more than the page contents,
	 * e.g. memory tags.  On failure keep the page dirty and give up.
	 */
	err = arch_prepare_to_swap(page);
	if (err) {
		set_page_dirty(page);
		unlock_page(page);
		return err;
	}

	/* frontswap took the page: run through an empty writeback cycle. */
	if (frontswap_store(page) == 0) {
		set_page_writeback(page);
		unlock_page(page);
		end_page_writeback(page);
		return 0;
	}

	return __swap_writepage(page, wbc, end_swap_bio_write);
}

static sector_t swap_page_sector(struct page *page)
{
	return (sector_t)__page_file_index(page) << (PAGE_SHIFT - 9);
}

/*
 * Account a swap-out of @page in the VM event counters: one THP_SWPOUT
 * event if the page is a transparent huge page (only when
 * CONFIG_TRANSPARENT_HUGEPAGE is enabled), followed by
 * thp_nr_pages(page) PSWPOUT events.
 */
static inline void count_swpout_vm_event(struct page *page)
{
	long nr_pages;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (unlikely(PageTransHuge(page)))
		count_vm_event(THP_SWPOUT);
#endif
	nr_pages = thp_nr_pages(page);
	count_vm_events(PSWPOUT, nr_pages);
}

#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
/*
 * Associate @bio with a blkcg based on the memory cgroup of @page.
 *
 * Does nothing when page_memcg() returns NULL.  Otherwise the css that
 * cgroup_e_css() returns for the memcg's cgroup and the io controller is
 * passed to bio_associate_blkg_from_css(), all under rcu_read_lock().
 */
static void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
{
	struct mem_cgroup *owner = page_memcg(page);

	if (owner) {
		rcu_read_lock();
		bio_associate_blkg_from_css(bio,
				cgroup_e_css(owner->css.cgroup,
					     &io_cgrp_subsys));
		rcu_read_unlock();
	}
}
#else
/* Without both MEMCG and BLK_CGROUP there is nothing to associate. */
#define bio_associate_blkg_from_page(bio, page)		do { } while (0)
#endif /* CONFIG_MEMCG && CONFIG_BLK_CGROUP */

int __swap_writepage(struct page *page, struct writeback_control *wbc,
		bio_end_io_t end_write_func)
{
	struct bio *bio;
	int ret;
	struct swap_info_struct *sis = page_swap_info(page);

	VM_BUG_ON_PAGE(!PageSwapCache(page), page);
	if (data_race(sis->flags & SWP_FS_OPS)) {
		struct kiocb kiocb;
		struct file *swap_file = sis->swap_file;
		struct address_space *mapping = swap_file->f_mapping;
		struct bio_vec bv = {
			.bv_page = page,
			.bv_len  = PAGE_SIZE,
			.bv_offset = 0
		};
		struct iov_iter from;

		iov_iter_bvec(&from, WRITE, &bv, 1, PAGE_SIZE);
		init_sync_kiocb(&kiocb, swap_file);
		kiocb.ki_pos = page_file_offset(page);

		set_page_writeback(page);
		unlock_page(page);
		ret = mapping->a_ops->direct_IO(&kiocb, &from);
		if (ret == PAGE_SIZE) {
			count_vm_event(PSWPOUT);
			ret = 0;
		} else {
			/*
			 * In the case of swap-over-nfs, this can be a
			 * temporary failure if the system has limited
			 * memory for allocating transmit buffers.
			 * Mark the page dirty and avoid
			 * rotate_reclaimable_page but rate-limit the
			 * messages but do not flag PageError like
			 * the normal direct-to-bio case as it could
			 * be temporary.
			 */
			set_page_dirty(page);
			ClearPageReclaim(page);
			pr_err_ratelimited("Write error on dio swapfile (%llu)\n",
					   page_file_offset(page));
		}
		end_page_writeback(page);
		return ret;
	}

	ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc);
	if (!ret) {
		count_swpout_vm_event(page);
		return 0;
	}

	bio = bio_alloc(GFP_NOIO, 1);
	bio_set_dev(bio, sis->bdev);
	bio->bi_iter.bi_sector = swap_page_sector(page);
	bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc);
	bio->bi_end_io = end_write_func;
	bio_add_page(bio, page, thp_size(page), 0);

	bio_associate_blkg_from_page(bio, page);
	count_swpout_vm_event(page);
	set_page_writeback(page);
	unlock_page(page);
	submit_bio(bio);

	return 0;
}

/*
 * Read the contents of a locked, not-yet-uptodate page back in from swap.
 *
 * @page:        page to fill; must be locked and must not be uptodate.  It
 *               must also be in the swap cache unless @synchronous is true.
 * @synchronous: when true and the bio path is taken, the bio is flagged
 *               REQ_HIPRI and this function polls/sleeps until
 *               end_swap_bio_read() has cleared bio->bi_private.
 *
 * Sources are tried in this order: frontswap, the swap file's ->readpage
 * method (SWP_FS_OPS), bdev_read_page() (SWP_SYNCHRONOUS_IO), and finally a
 * single-vector read bio.
 *
 * Returns 0, or the error from ->readpage on the SWP_FS_OPS path.  A
 * bdev_read_page() failure is not reported; the bio path is used instead.
 */
int swap_readpage(struct page *page, bool synchronous)
{
	struct bio *bio;
	int ret = 0;
	struct swap_info_struct *sis = page_swap_info(page);
	blk_qc_t qc;
	struct gendisk *disk;
	unsigned long pflags;

	VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page);
	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(PageUptodate(page), page);

	/*
	 * Count submission time as memory stall. When the device is congested,
	 * or the submitting cgroup IO-throttled, submission can be a
	 * significant part of overall IO time.
	 */
	psi_memstall_enter(&pflags);

	/* frontswap had the data: the page is complete, no I/O is needed. */
	if (frontswap_load(page) == 0) {
		SetPageUptodate(page);
		unlock_page(page);
		goto out;
	}

	/* Filesystem-backed swap: let the file's ->readpage do the work. */
	if (data_race(sis->flags & SWP_FS_OPS)) {
		struct file *swap_file = sis->swap_file;
		struct address_space *mapping = swap_file->f_mapping;

		ret = mapping->a_ops->readpage(swap_file, page);
		if (!ret)
			count_vm_event(PSWPIN);
		goto out;
	}

	if (sis->flags & SWP_SYNCHRONOUS_IO) {
		ret = bdev_read_page(sis->bdev, swap_page_sector(page), page);
		if (!ret) {
			/*
			 * NOTE(review): the trylock suggests bdev_read_page()
			 * leaves the page unlocked on success - confirm.  The
			 * slot notification is skipped if the lock is not
			 * obtained.
			 */
			if (trylock_page(page)) {
				swap_slot_free_notify(page);
				unlock_page(page);
			}

			count_vm_event(PSWPIN);
			goto out;
		}
	}

	/* Discard any bdev_read_page() error; fall back to a regular bio. */
	ret = 0;
	bio = bio_alloc(GFP_KERNEL, 1);
	bio_set_dev(bio, sis->bdev);
	bio->bi_opf = REQ_OP_READ;
	bio->bi_iter.bi_sector = swap_page_sector(page);
	bio->bi_end_io = end_swap_bio_read;
	bio_add_page(bio, page, thp_size(page), 0);

	disk = bio->bi_bdev->bd_disk;
	/*
	 * Keep this task valid during swap readpage because the oom killer may
	 * attempt to access it in the page fault retry time check.
	 */
	if (synchronous) {
		bio->bi_opf |= REQ_HIPRI;
		get_task_struct(current);
		bio->bi_private = current;
	}
	count_vm_event(PSWPIN);
	/*
	 * Extra reference, dropped by the bio_put() below: bio->bi_private is
	 * still read in the loop after end_swap_bio_read() has done its own
	 * bio_put().
	 */
	bio_get(bio);
	qc = submit_bio(bio);
	while (synchronous) {
		/* Set the task state before testing the completion condition. */
		set_current_state(TASK_UNINTERRUPTIBLE);
		/* end_swap_bio_read() clears bi_private when the read is done. */
		if (!READ_ONCE(bio->bi_private))
			break;

		if (!blk_poll(disk->queue, qc, true))
			blk_io_schedule();
	}
	__set_current_state(TASK_RUNNING);
	bio_put(bio);

out:
	psi_memstall_leave(&pflags);
	return ret;
}

/*
 * Mark a swap page dirty.
 *
 * For swap areas flagged SWP_FS_OPS the call is forwarded to the
 * ->set_page_dirty method of the swap file's address_space; all other
 * swap areas use __set_page_dirty_no_writeback().  Returns whatever the
 * selected function returns.
 */
int swap_set_page_dirty(struct page *page)
{
	struct swap_info_struct *sis = page_swap_info(page);
	struct address_space *mapping;

	if (!data_race(sis->flags & SWP_FS_OPS))
		return __set_page_dirty_no_writeback(page);

	mapping = sis->swap_file->f_mapping;

	VM_BUG_ON_PAGE(!PageSwapCache(page), page);
	return mapping->a_ops->set_page_dirty(page);
}
Commit	Line	Data
b2441318	1	// SPDX-License-Identifier: GPL-2.0
1da177e4 LT	2	/*
	3	* linux/mm/page_io.c
	4	*
	5	* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
	6	*
	7	* Swap reorganised 29.12.95,
	8	* Asynchronous swapping added 30.12.95. Stephen Tweedie
	9	* Removed race in async swapping. 14.4.1996. Bruno Haible
	10	* Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie
	11	* Always use brw_page, life becomes simpler. 12 May 1998 Eric Biederman
	12	*/
	13
	14	#include <linux/mm.h>
	15	#include <linux/kernel_stat.h>
5a0e3ad6	16	#include <linux/gfp.h>
1da177e4 LT	17	#include <linux/pagemap.h>
	18	#include <linux/swap.h>
	19	#include <linux/bio.h>
	20	#include <linux/swapops.h>
62c230bc	21	#include <linux/buffer_head.h>
1da177e4	22	#include <linux/writeback.h>
38b5faf4	23	#include <linux/frontswap.h>
b430e9d1	24	#include <linux/blkdev.h>
93779069	25	#include <linux/psi.h>
e2e40f2c	26	#include <linux/uio.h>
b0ba2d0f	27	#include <linux/sched/task.h>
1da177e4	28
4246a0b6	29	void end_swap_bio_write(struct bio *bio)
1da177e4	30	{
263663cd	31	struct page *page = bio_first_page_all(bio);
1da177e4	32
4e4cbee9	33	if (bio->bi_status) {
1da177e4	34	SetPageError(page);
6ddab3b9 PZ	35	/*
	36	* We failed to write the page out to swap-space.
	37	* Re-dirty the page in order to avoid it being reclaimed.
	38	* Also print a dire warning that things will go BAD (tm)
	39	* very quickly.
	40	*
	41	* Also clear PG_reclaim to avoid rotate_reclaimable_page()
	42	*/
	43	set_page_dirty(page);
25eaab43 GD	44	pr_alert_ratelimited("Write-error on swap-device (%u:%u:%llu)\n",
	45	MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
	46	(unsigned long long)bio->bi_iter.bi_sector);
6ddab3b9 PZ	47	ClearPageReclaim(page);
6ddab3b9 PZ	48	}
1da177e4 LT	49	end_page_writeback(page);
1da177e4 LT	50	bio_put(bio);
1da177e4 LT	51	}
1da177e4 LT	52
3f2b1a04 MK	53	static void swap_slot_free_notify(struct page *page)
	54	{
	55	struct swap_info_struct *sis;
	56	struct gendisk *disk;
5df373e9	57	swp_entry_t entry;
3f2b1a04 MK	58
	59	/*
	60	* There is no guarantee that the page is in swap cache - the software
	61	* suspend code (at least) uses end_swap_bio_read() against a non-
	62	* swapcache page. So we must check PG_swapcache before proceeding with
	63	* this optimization.
	64	*/
	65	if (unlikely(!PageSwapCache(page)))
	66	return;
	67
	68	sis = page_swap_info(page);
7b37e226	69	if (data_race(!(sis->flags & SWP_BLKDEV)))
3f2b1a04 MK	70	return;
	71
	72	/*
	73	* The swap subsystem performs lazy swap slot freeing,
	74	* expecting that the page will be swapped out again.
	75	* So we can avoid an unnecessary write if the page
	76	* isn't redirtied.
	77	* This is good for real swap storage because we can
	78	* reduce unnecessary I/O and enhance wear-leveling
	79	* if an SSD is used as the as swap device.
	80	* But if in-memory swap device (eg zram) is used,
	81	* this causes a duplicated copy between uncompressed
	82	* data in VM-owned memory and compressed data in
	83	* zram-owned memory. So let's free zram-owned memory
	84	* and make the VM-owned decompressed page dirty,
	85	* so the page should be swapped out somewhere again if
	86	* we again wish to reclaim it.
	87	*/
	88	disk = sis->bdev->bd_disk;
5df373e9 VM	89	entry.val = page_private(page);
5df373e9 VM	90	if (disk->fops->swap_slot_free_notify && __swap_count(entry) == 1) {
3f2b1a04 MK	91	unsigned long offset;
3f2b1a04 MK	92
3f2b1a04 MK	93	offset = swp_offset(entry);
	94
	95	SetPageDirty(page);
	96	disk->fops->swap_slot_free_notify(sis->bdev,
	97	offset);
	98	}
	99	}
	100
4246a0b6	101	static void end_swap_bio_read(struct bio *bio)
1da177e4	102	{
263663cd	103	struct page *page = bio_first_page_all(bio);
23955622	104	struct task_struct *waiter = bio->bi_private;
1da177e4	105
4e4cbee9	106	if (bio->bi_status) {
1da177e4 LT	107	SetPageError(page);
1da177e4 LT	108	ClearPageUptodate(page);
25eaab43 GD	109	pr_alert_ratelimited("Read-error on swap-device (%u:%u:%llu)\n",
	110	MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
	111	(unsigned long long)bio->bi_iter.bi_sector);
b430e9d1	112	goto out;
1da177e4	113	}
b430e9d1 MK	114
b430e9d1 MK	115	SetPageUptodate(page);
3f2b1a04	116	swap_slot_free_notify(page);
b430e9d1	117	out:
1da177e4	118	unlock_page(page);
23955622	119	WRITE_ONCE(bio->bi_private, NULL);
1da177e4	120	bio_put(bio);
87518530 ON	121	if (waiter) {
	122	blk_wake_io_task(waiter);
	123	put_task_struct(waiter);
	124	}
1da177e4 LT	125	}
1da177e4 LT	126
a509bc1a MG	127	int generic_swapfile_activate(struct swap_info_struct *sis,
	128	struct file *swap_file,
	129	sector_t *span)
	130	{
	131	struct address_space *mapping = swap_file->f_mapping;
	132	struct inode *inode = mapping->host;
	133	unsigned blocks_per_page;
	134	unsigned long page_no;
	135	unsigned blkbits;
	136	sector_t probe_block;
	137	sector_t last_block;
	138	sector_t lowest_block = -1;
	139	sector_t highest_block = 0;
	140	int nr_extents = 0;
	141	int ret;
	142
	143	blkbits = inode->i_blkbits;
	144	blocks_per_page = PAGE_SIZE >> blkbits;
	145
	146	/*
4efaceb1	147	* Map all the blocks into the extent tree. This code doesn't try
a509bc1a MG	148	* to be very smart.
	149	*/
	150	probe_block = 0;
	151	page_no = 0;
	152	last_block = i_size_read(inode) >> blkbits;
	153	while ((probe_block + blocks_per_page) <= last_block &&
	154	page_no < sis->max) {
	155	unsigned block_in_page;
	156	sector_t first_block;
	157
7e4411bf MP	158	cond_resched();
7e4411bf MP	159
30460e1e CM	160	first_block = probe_block;
	161	ret = bmap(inode, &first_block);
	162	if (ret \|\| !first_block)
a509bc1a MG	163	goto bad_bmap;
	164
	165	/*
	166	* It must be PAGE_SIZE aligned on-disk
	167	*/
	168	if (first_block & (blocks_per_page - 1)) {
	169	probe_block++;
	170	goto reprobe;
	171	}
	172
	173	for (block_in_page = 1; block_in_page < blocks_per_page;
	174	block_in_page++) {
	175	sector_t block;
	176
30460e1e CM	177	block = probe_block + block_in_page;
	178	ret = bmap(inode, &block);
	179	if (ret \|\| !block)
a509bc1a	180	goto bad_bmap;
30460e1e	181
a509bc1a MG	182	if (block != first_block + block_in_page) {
	183	/* Discontiguity */
	184	probe_block++;
	185	goto reprobe;
	186	}
	187	}
	188
	189	first_block >>= (PAGE_SHIFT - blkbits);
	190	if (page_no) { /* exclude the header page */
	191	if (first_block < lowest_block)
	192	lowest_block = first_block;
	193	if (first_block > highest_block)
	194	highest_block = first_block;
	195	}
	196
	197	/*
	198	* We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
	199	*/
	200	ret = add_swap_extent(sis, page_no, 1, first_block);
	201	if (ret < 0)
	202	goto out;
	203	nr_extents += ret;
	204	page_no++;
	205	probe_block += blocks_per_page;
	206	reprobe:
	207	continue;
	208	}
	209	ret = nr_extents;
	210	*span = 1 + highest_block - lowest_block;
	211	if (page_no == 0)
	212	page_no = 1; /* force Empty message */
	213	sis->max = page_no;
	214	sis->pages = page_no - 1;
	215	sis->highest_bit = page_no - 1;
	216	out:
	217	return ret;
	218	bad_bmap:
1170532b	219	pr_err("swapon: swapfile has holes\n");
a509bc1a MG	220	ret = -EINVAL;
	221	goto out;
	222	}
	223
1da177e4 LT	224	/*
	225	* We may have stale swap cache pages in memory: notice
	226	* them here and get rid of the unnecessary final write.
	227	*/
	228	int swap_writepage(struct page page, struct writeback_control wbc)
	229	{
2f772e6c	230	int ret = 0;
1da177e4	231
a2c43eed	232	if (try_to_free_swap(page)) {
1da177e4 LT	233	unlock_page(page);
	234	goto out;
	235	}
8a84802e SP	236	/*
	237	* Arch code may have to preserve more data than just the page
	238	* contents, e.g. memory tags.
	239	*/
	240	ret = arch_prepare_to_swap(page);
	241	if (ret) {
	242	set_page_dirty(page);
	243	unlock_page(page);
	244	goto out;
	245	}
165c8aed	246	if (frontswap_store(page) == 0) {
38b5faf4 DM	247	set_page_writeback(page);
	248	unlock_page(page);
	249	end_page_writeback(page);
	250	goto out;
	251	}
1eec6702	252	ret = __swap_writepage(page, wbc, end_swap_bio_write);
2f772e6c SJ	253	out:
	254	return ret;
	255	}
	256
dd6bd0d9 MW	257	static sector_t swap_page_sector(struct page *page)
dd6bd0d9 MW	258	{
09cbfeaf	259	return (sector_t)__page_file_index(page) << (PAGE_SHIFT - 9);
dd6bd0d9 MW	260	}
dd6bd0d9 MW	261
225311a4 HY	262	static inline void count_swpout_vm_event(struct page *page)
	263	{
	264	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	265	if (unlikely(PageTransHuge(page)))
	266	count_vm_event(THP_SWPOUT);
	267	#endif
6c357848	268	count_vm_events(PSWPOUT, thp_nr_pages(page));
225311a4 HY	269	}
225311a4 HY	270
a18b9b15 CH	271	#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
	272	static void bio_associate_blkg_from_page(struct bio bio, struct page page)
	273	{
	274	struct cgroup_subsys_state *css;
bcfe06bf	275	struct mem_cgroup *memcg;
a18b9b15	276
bcfe06bf RG	277	memcg = page_memcg(page);
bcfe06bf RG	278	if (!memcg)
a18b9b15 CH	279	return;
	280
	281	rcu_read_lock();
bcfe06bf	282	css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys);
a18b9b15 CH	283	bio_associate_blkg_from_css(bio, css);
	284	rcu_read_unlock();
	285	}
	286	#else
	287	#define bio_associate_blkg_from_page(bio, page) do { } while (0)
	288	#endif /* CONFIG_MEMCG && CONFIG_BLK_CGROUP */
	289
1eec6702	290	int __swap_writepage(struct page page, struct writeback_control wbc,
4246a0b6	291	bio_end_io_t end_write_func)
2f772e6c SJ	292	{
2f772e6c SJ	293	struct bio *bio;
4e49ea4a	294	int ret;
2f772e6c	295	struct swap_info_struct *sis = page_swap_info(page);
62c230bc	296
cc30c5d6	297	VM_BUG_ON_PAGE(!PageSwapCache(page), page);
32646315	298	if (data_race(sis->flags & SWP_FS_OPS)) {
62c230bc MG	299	struct kiocb kiocb;
	300	struct file *swap_file = sis->swap_file;
	301	struct address_space *mapping = swap_file->f_mapping;
62a8067a AV	302	struct bio_vec bv = {
	303	.bv_page = page,
	304	.bv_len = PAGE_SIZE,
	305	.bv_offset = 0
	306	};
05afcb77	307	struct iov_iter from;
62c230bc	308
aa563d7b	309	iov_iter_bvec(&from, WRITE, &bv, 1, PAGE_SIZE);
62c230bc MG	310	init_sync_kiocb(&kiocb, swap_file);
62c230bc MG	311	kiocb.ki_pos = page_file_offset(page);
62c230bc	312
0cdc444a	313	set_page_writeback(page);
62c230bc	314	unlock_page(page);
c8b8e32d	315	ret = mapping->a_ops->direct_IO(&kiocb, &from);
62c230bc MG	316	if (ret == PAGE_SIZE) {
	317	count_vm_event(PSWPOUT);
	318	ret = 0;
2d30d31e	319	} else {
0cdc444a MG	320	/*
	321	* In the case of swap-over-nfs, this can be a
	322	* temporary failure if the system has limited
	323	* memory for allocating transmit buffers.
	324	* Mark the page dirty and avoid
	325	* rotate_reclaimable_page but rate-limit the
	326	* messages but do not flag PageError like
	327	* the normal direct-to-bio case as it could
	328	* be temporary.
	329	*/
2d30d31e	330	set_page_dirty(page);
0cdc444a	331	ClearPageReclaim(page);
1170532b JP	332	pr_err_ratelimited("Write error on dio swapfile (%llu)\n",
1170532b JP	333	page_file_offset(page));
62c230bc	334	}
0cdc444a	335	end_page_writeback(page);
62c230bc MG	336	return ret;
	337	}
	338
dd6bd0d9 MW	339	ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc);
dd6bd0d9 MW	340	if (!ret) {
225311a4	341	count_swpout_vm_event(page);
dd6bd0d9 MW	342	return 0;
	343	}
	344
48d15436 CH	345	bio = bio_alloc(GFP_NOIO, 1);
	346	bio_set_dev(bio, sis->bdev);
	347	bio->bi_iter.bi_sector = swap_page_sector(page);
0d1e0c7c	348	bio->bi_opf = REQ_OP_WRITE \| REQ_SWAP \| wbc_to_write_flags(wbc);
48d15436 CH	349	bio->bi_end_io = end_write_func;
	350	bio_add_page(bio, page, thp_size(page), 0);
	351
6a7f6d86	352	bio_associate_blkg_from_page(bio, page);
225311a4	353	count_swpout_vm_event(page);
1da177e4 LT	354	set_page_writeback(page);
1da177e4 LT	355	unlock_page(page);
4e49ea4a	356	submit_bio(bio);
548d9782 ML	357
548d9782 ML	358	return 0;
1da177e4 LT	359	}
1da177e4 LT	360
0bcac06f	361	int swap_readpage(struct page *page, bool synchronous)
1da177e4 LT	362	{
	363	struct bio *bio;
	364	int ret = 0;
62c230bc	365	struct swap_info_struct *sis = page_swap_info(page);
23955622	366	blk_qc_t qc;
74d46992	367	struct gendisk *disk;
93779069	368	unsigned long pflags;
1da177e4	369
0bcac06f	370	VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page);
309381fe SL	371	VM_BUG_ON_PAGE(!PageLocked(page), page);
309381fe SL	372	VM_BUG_ON_PAGE(PageUptodate(page), page);
93779069 MK	373
	374	/*
	375	* Count submission time as memory stall. When the device is congested,
	376	* or the submitting cgroup IO-throttled, submission can be a
	377	* significant part of overall IO time.
	378	*/
	379	psi_memstall_enter(&pflags);
	380
165c8aed	381	if (frontswap_load(page) == 0) {
38b5faf4 DM	382	SetPageUptodate(page);
	383	unlock_page(page);
	384	goto out;
	385	}
62c230bc	386
32646315	387	if (data_race(sis->flags & SWP_FS_OPS)) {
62c230bc MG	388	struct file *swap_file = sis->swap_file;
	389	struct address_space *mapping = swap_file->f_mapping;
	390
	391	ret = mapping->a_ops->readpage(swap_file, page);
	392	if (!ret)
	393	count_vm_event(PSWPIN);
93779069	394	goto out;
62c230bc MG	395	}
62c230bc MG	396
5115db10 CH	397	if (sis->flags & SWP_SYNCHRONOUS_IO) {
	398	ret = bdev_read_page(sis->bdev, swap_page_sector(page), page);
	399	if (!ret) {
	400	if (trylock_page(page)) {
	401	swap_slot_free_notify(page);
	402	unlock_page(page);
	403	}
b06bad17	404
5115db10 CH	405	count_vm_event(PSWPIN);
	406	goto out;
	407	}
dd6bd0d9 MW	408	}
	409
	410	ret = 0;
48d15436 CH	411	bio = bio_alloc(GFP_KERNEL, 1);
	412	bio_set_dev(bio, sis->bdev);
	413	bio->bi_opf = REQ_OP_READ;
	414	bio->bi_iter.bi_sector = swap_page_sector(page);
	415	bio->bi_end_io = end_swap_bio_read;
	416	bio_add_page(bio, page, thp_size(page), 0);
	417
309dca30	418	disk = bio->bi_bdev->bd_disk;
b0ba2d0f TH	419	/*
	420	* Keep this task valid during swap readpage because the oom killer may
	421	* attempt to access it in the page fault retry time check.
	422	*/
87518530	423	if (synchronous) {
b685a735	424	bio->bi_opf \|= REQ_HIPRI;
87518530 ON	425	get_task_struct(current);
	426	bio->bi_private = current;
	427	}
f8891e5e	428	count_vm_event(PSWPIN);
23955622 SL	429	bio_get(bio);
23955622 SL	430	qc = submit_bio(bio);
0bcac06f	431	while (synchronous) {
1ac5cd49	432	set_current_state(TASK_UNINTERRUPTIBLE);
23955622 SL	433	if (!READ_ONCE(bio->bi_private))
	434	break;
	435
0a1b8b87	436	if (!blk_poll(disk->queue, qc, true))
0f190a7a	437	blk_io_schedule();
23955622 SL	438	}
	439	__set_current_state(TASK_RUNNING);
	440	bio_put(bio);
	441
1da177e4	442	out:
93779069	443	psi_memstall_leave(&pflags);
1da177e4 LT	444	return ret;
1da177e4 LT	445	}
62c230bc MG	446
	447	int swap_set_page_dirty(struct page *page)
	448	{
	449	struct swap_info_struct *sis = page_swap_info(page);
	450
32646315	451	if (data_race(sis->flags & SWP_FS_OPS)) {
62c230bc	452	struct address_space *mapping = sis->swap_file->f_mapping;
cc30c5d6 AM	453
cc30c5d6 AM	454	VM_BUG_ON_PAGE(!PageSwapCache(page), page);
62c230bc MG	455	return mapping->a_ops->set_page_dirty(page);
	456	} else {
	457	return __set_page_dirty_no_writeback(page);
	458	}
	459	}