// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/scrub.h"
#include "scrub/trace.h"
#include <linux/shmem_fs.h>
/*
 * Swappable Temporary Memory
 * ==========================
 *
 * Online checking sometimes needs to be able to stage a large amount of data
 * in memory. This information might not fit in the available memory and it
 * doesn't all need to be accessible at all times. In other words, we want an
 * indexed data buffer to store data that can be paged out.
 *
 * When CONFIG_TMPFS=y, shmemfs is enough of a filesystem to meet those
 * requirements. Therefore, the xfile mechanism uses an unlinked shmem file to
 * store our staging data. This file is not installed in the file descriptor
 * table so that user programs cannot access the data, which means that the
 * xfile must be freed with xfile_destroy.
 *
 * xfiles assume that the caller will handle all required concurrency
 * management; standard vfs locks (freezer and inode) are not taken. Reads
 * and writes are satisfied directly from the page cache.
 *
 * NOTE: The current shmemfs implementation has a quirk that in-kernel reads
 * of a hole cause a page to be mapped into the file. If you are going to
 * create a sparse xfile, please be careful about reading from uninitialized
 * parts of the file. These pages are !Uptodate and will eventually be
 * reclaimed if not written, but in the short term this boosts memory
 * consumption.
 */
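/*
 * Illustration only, not part of the original file: a minimal sketch of the
 * xfile lifecycle described above, compiled out so it adds no code. The
 * helper name, the 64k size, the offset, and the payload are all arbitrary
 * values invented for this sketch; only the xfile_* calls are real.
 */
#if 0
static int
xfile_usage_sketch(void)
{
	struct xfile	*xf;
	__u64		payload = 0xDECAFBAD;
	__u64		readback;
	ssize_t		ret;
	int		error;

	/* Stage data in an unlinked shmem file; no fd is ever exposed. */
	error = xfile_create("usage sketch", 65536, &xf);
	if (error)
		return error;

	/* Store an object at an arbitrary byte offset. */
	ret = xfile_pwrite(xf, &payload, sizeof(payload), 8192);
	if (ret < 0) {
		error = ret;
		goto out;
	}

	/* Read it back later, possibly after the page was swapped out. */
	ret = xfile_pread(xf, &readback, sizeof(readback), 8192);
	if (ret < 0)
		error = ret;
out:
	/* The backing file only goes away when the xfile is destroyed. */
	xfile_destroy(xf);
	return error;
}
#endif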
/*
 * xfiles must not be exposed to userspace and require upper layers to
 * coordinate access to the one handle returned by the constructor, so
 * establish a separate lock class for xfiles to avoid confusing lockdep.
 */
static struct lock_class_key xfile_i_mutex_key;
/*
 * Create an xfile of the given size. The description will be used in the
 * trace output.
 */
int
xfile_create(
	const char		*description,
	loff_t			isize,
	struct xfile		**xfilep)
{
	struct inode		*inode;
	struct xfile		*xf;
	int			error = -ENOMEM;

	xf = kmalloc(sizeof(struct xfile), XCHK_GFP_FLAGS);
	if (!xf)
		return -ENOMEM;

	xf->file = shmem_file_setup(description, isize, 0);
	if (!xf->file)
		goto out_xfile;
	if (IS_ERR(xf->file)) {
		error = PTR_ERR(xf->file);
		goto out_xfile;
	}

	/*
	 * We want a large sparse file that we can pread, pwrite, and seek.
	 * xfile users are responsible for keeping the xfile hidden away from
	 * all other callers, so we skip timestamp updates and security checks.
	 * Make the inode only accessible by root, just in case the xfile ever
	 * escapes.
	 */
	xf->file->f_mode |= FMODE_PREAD | FMODE_PWRITE | FMODE_NOCMTIME |
			    FMODE_LSEEK;
	xf->file->f_flags |= O_RDWR | O_LARGEFILE | O_NOATIME;
	inode = file_inode(xf->file);
	inode->i_flags |= S_PRIVATE | S_NOCMTIME | S_NOATIME;
	inode->i_mode &= ~0177;
	inode->i_uid = GLOBAL_ROOT_UID;
	inode->i_gid = GLOBAL_ROOT_GID;

	lockdep_set_class(&inode->i_rwsem, &xfile_i_mutex_key);

	trace_xfile_create(xf);

	*xfilep = xf;
	return 0;
out_xfile:
	kfree(xf);
	return error;
}
/* Close the file and release all resources. */
void
xfile_destroy(
	struct xfile		*xf)
{
	struct inode		*inode = file_inode(xf->file);

	trace_xfile_destroy(xf);

	lockdep_set_class(&inode->i_rwsem, &inode->i_sb->s_type->i_mutex_key);
	fput(xf->file);
	kfree(xf);
}
/*
 * Read a memory object directly from the xfile's page cache. Unlike regular
 * pread, we return -E2BIG and -EFBIG for reads that are too large or at too
 * high an offset, instead of truncating the read. Otherwise, we return
 * bytes read or an error code, like regular pread.
 */
ssize_t
xfile_pread(
	struct xfile		*xf,
	void			*buf,
	size_t			count,
	loff_t			pos)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	struct page		*page = NULL;
	ssize_t			read = 0;
	unsigned int		pflags;
	int			error = 0;

	if (count > MAX_RW_COUNT)
		return -E2BIG;
	if (inode->i_sb->s_maxbytes - pos < count)
		return -EFBIG;

	trace_xfile_pread(xf, pos, count);

	pflags = memalloc_nofs_save();
	while (count > 0) {
		void		*p, *kaddr;
		unsigned int	len;

		len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));

		/*
		 * In-kernel reads of a shmem file cause it to allocate a page
		 * if the mapping shows a hole. Therefore, if we hit ENOMEM
		 * we can continue by zeroing the caller's buffer.
		 */
		page = shmem_read_mapping_page_gfp(mapping, pos >> PAGE_SHIFT,
				__GFP_NOWARN);
		if (IS_ERR(page)) {
			error = PTR_ERR(page);
			if (error != -ENOMEM)
				break;

			memset(buf, 0, len);
			goto advance;
		}

		if (PageUptodate(page)) {
			/*
			 * xfile pages must never be mapped into userspace, so
			 * we skip the dcache flush.
			 */
			kaddr = kmap_local_page(page);
			p = kaddr + offset_in_page(pos);
			memcpy(buf, p, len);
			kunmap_local(kaddr);
		} else {
			memset(buf, 0, len);
		}
		put_page(page);

advance:
		count -= len;
		pos += len;
		buf += len;
		read += len;
	}
	memalloc_nofs_restore(pflags);

	if (read > 0)
		return read;
	return error;
}
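/*
 * Illustration only, not part of the original file: reading a hole. Per the
 * NOTE at the top of this file, sparse regions that were never written come
 * back as zeroes (the backing pages are !Uptodate). Compiled out; the helper
 * name and the offset are invented for this sketch.
 */
#if 0
static void
xfile_sparse_read_sketch(struct xfile *xf)
{
	char	buf[16];

	/* Nothing was written at this offset; buf comes back all zeroes. */
	if (xfile_pread(xf, buf, sizeof(buf), 1048576) == sizeof(buf))
		ASSERT(buf[0] == 0);
}
#endif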
/*
 * Write a memory object directly to the xfile's page cache. Unlike regular
 * pwrite, we return -E2BIG and -EFBIG for writes that are too large or at too
 * high an offset, instead of truncating the write. Otherwise, we return
 * bytes written or an error code, like regular pwrite.
 */
ssize_t
xfile_pwrite(
	struct xfile		*xf,
	const void		*buf,
	size_t			count,
	loff_t			pos)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	const struct address_space_operations *aops = mapping->a_ops;
	struct page		*page = NULL;
	ssize_t			written = 0;
	unsigned int		pflags;
	int			error = 0;

	if (count > MAX_RW_COUNT)
		return -E2BIG;
	if (inode->i_sb->s_maxbytes - pos < count)
		return -EFBIG;

	trace_xfile_pwrite(xf, pos, count);

	pflags = memalloc_nofs_save();
	while (count > 0) {
		void		*fsdata = NULL;
		void		*p, *kaddr;
		ssize_t		ret;
		unsigned int	len;

		len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));

		/*
		 * We call write_begin directly here to avoid all the freezer
		 * protection lock-taking that happens in the normal path.
		 * shmem doesn't support fs freeze, but lockdep doesn't know
		 * that and will trip over that.
		 */
		error = aops->write_begin(NULL, mapping, pos, len, &page,
				&fsdata);
		if (error)
			break;

		/*
		 * xfile pages must never be mapped into userspace, so we skip
		 * the dcache flush. If the page is not uptodate, zero it
		 * before writing data.
		 */
		kaddr = kmap_local_page(page);
		if (!PageUptodate(page)) {
			memset(kaddr, 0, PAGE_SIZE);
			SetPageUptodate(page);
		}
		p = kaddr + offset_in_page(pos);
		memcpy(p, buf, len);
		kunmap_local(kaddr);

		ret = aops->write_end(NULL, mapping, pos, len, len, page,
				fsdata);
		if (ret < 0) {
			error = ret;
			break;
		}

		written += ret;
		if (ret != len)
			break;

		count -= ret;
		pos += ret;
		buf += ret;
	}
	memalloc_nofs_restore(pflags);

	if (written > 0)
		return written;
	return error;
}
/* Find the next written area in the xfile data for a given offset. */
loff_t
xfile_seek_data(
	struct xfile		*xf,
	loff_t			pos)
{
	loff_t			ret;

	ret = vfs_llseek(xf->file, pos, SEEK_DATA);
	trace_xfile_seek_data(xf, pos, ret);
	return ret;
}
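/*
 * Illustration only, not part of the original file: walking the written
 * regions of a sparse xfile with xfile_seek_data. vfs_llseek returns -ENXIO
 * once no data remains past pos, which ends the loop. Compiled out; the
 * helper name and the page-at-a-time stride are invented for this sketch.
 */
#if 0
static void
xfile_walk_data_sketch(struct xfile *xf, loff_t isize)
{
	loff_t	pos = 0;

	while ((pos = xfile_seek_data(xf, pos)) >= 0 && pos < isize) {
		/* pos now points at written data; consume one page of it. */
		pos += PAGE_SIZE;
	}
}
#endif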
/* Query stat information for an xfile. */
int
xfile_stat(
	struct xfile		*xf,
	struct xfile_stat	*statbuf)
{
	struct kstat		ks;
	int			error;

	error = vfs_getattr_nosec(&xf->file->f_path, &ks,
			STATX_SIZE | STATX_BLOCKS, AT_STATX_DONT_SYNC);
	if (error)
		return error;

	statbuf->size = ks.size;
	statbuf->bytes = ks.blocks << SECTOR_SHIFT;
	return 0;
}
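/*
 * Illustration only, not part of the original file: using xfile_stat to
 * report the memory footprint of an xfile. STATX_BLOCKS counts only blocks
 * that shmem actually allocated, so sparse regions cost nothing here.
 * Compiled out; the helper name is invented for this sketch.
 */
#if 0
static loff_t
xfile_bytes_sketch(struct xfile *xf)
{
	struct xfile_stat	xs;

	if (xfile_stat(xf, &xs))
		return 0;
	return xs.bytes;
}
#endif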
/*
 * Grab the (locked) page for a memory object. The object cannot span a page
 * boundary. Returns 0 (and a locked page) if successful, -ENOTBLK if we
 * cannot grab the page, or the usual negative errno.
 */
int
xfile_get_page(
	struct xfile		*xf,
	loff_t			pos,
	unsigned int		len,
	struct xfile_page	*xfpage)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	const struct address_space_operations *aops = mapping->a_ops;
	struct page		*page = NULL;
	void			*fsdata = NULL;
	loff_t			key = round_down(pos, PAGE_SIZE);
	unsigned int		pflags;
	int			error;

	if (inode->i_sb->s_maxbytes - pos < len)
		return -ENOMEM;
	if (len > PAGE_SIZE - offset_in_page(pos))
		return -ENOTBLK;

	trace_xfile_get_page(xf, pos, len);

	pflags = memalloc_nofs_save();

	/*
	 * We call write_begin directly here to avoid all the freezer
	 * protection lock-taking that happens in the normal path. shmem
	 * doesn't support fs freeze, but lockdep doesn't know that and will
	 * trip over that.
	 */
	error = aops->write_begin(NULL, mapping, key, PAGE_SIZE, &page,
			&fsdata);
	if (error)
		goto out_pflags;

	/* We got the page, so make sure we push out EOF. */
	if (i_size_read(inode) < pos + len)
		i_size_write(inode, pos + len);

	/*
	 * If the page isn't up to date, fill it with zeroes before we hand it
	 * to the caller and make sure the backing store will hold on to them.
	 */
	if (!PageUptodate(page)) {
		void	*kaddr;

		kaddr = kmap_local_page(page);
		memset(kaddr, 0, PAGE_SIZE);
		kunmap_local(kaddr);
		SetPageUptodate(page);
	}

	/*
	 * Mark each page dirty so that the contents are written to some
	 * backing store when we drop this buffer, and take an extra reference
	 * to prevent the xfile page from being swapped or removed from the
	 * page cache by reclaim if the caller unlocks the page.
	 */
	set_page_dirty(page);
	get_page(page);

	xfpage->page = page;
	xfpage->fsdata = fsdata;
	xfpage->pos = key;
out_pflags:
	memalloc_nofs_restore(pflags);
	return error;
}
/*
 * Release the (locked) page for a memory object. Returns 0 or a negative
 * errno.
 */
int
xfile_put_page(
	struct xfile		*xf,
	struct xfile_page	*xfpage)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	const struct address_space_operations *aops = mapping->a_ops;
	unsigned int		pflags;
	int			ret;

	trace_xfile_put_page(xf, xfpage->pos, PAGE_SIZE);

	/* Give back the reference that we took in xfile_get_page. */
	put_page(xfpage->page);

	pflags = memalloc_nofs_save();
	ret = aops->write_end(NULL, mapping, xfpage->pos, PAGE_SIZE, PAGE_SIZE,
			xfpage->page, xfpage->fsdata);
	memalloc_nofs_restore(pflags);
	memset(xfpage, 0, sizeof(struct xfile_page));

	if (ret < 0)
		return ret;
	if (ret != PAGE_SIZE)
		return -EIO;
	return 0;
}
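/*
 * Illustration only, not part of the original file: the paired
 * xfile_get_page/xfile_put_page protocol for direct access to a cached page.
 * The object must not cross a page boundary. Compiled out; the helper name,
 * the offset, and the value written are invented for this sketch.
 */
#if 0
static int
xfile_direct_update_sketch(struct xfile *xf)
{
	struct xfile_page	xfpage = { };
	void			*kaddr;
	int			error;

	/* Grab the locked, uptodate page backing the first four bytes. */
	error = xfile_get_page(xf, 0, sizeof(__u32), &xfpage);
	if (error)
		return error;

	/* Update the object in place; get_page already dirtied the page. */
	kaddr = kmap_local_page(xfpage.page);
	*(__u32 *)kaddr = 0xC0FFEE;
	kunmap_local(kaddr);

	/* Unlock the page and give back the extra reference we took. */
	return xfile_put_page(xf, &xfpage);
}
#endif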