shmem: convert shmem_read_mapping_page_gfp() to use shmem_get_folio_gfp()
[linux-block.git] / mm / shmem.c
1/*
2 * Resizable virtual memory filesystem for Linux.
3 *
4 * Copyright (C) 2000 Linus Torvalds.
5 * 2000 Transmeta Corp.
6 * 2000-2001 Christoph Rohland
7 * 2000-2001 SAP AG
8 * 2002 Red Hat Inc.
9 * Copyright (C) 2002-2011 Hugh Dickins.
10 * Copyright (C) 2011 Google Inc.
 11 * Copyright (C) 2002-2005 VERITAS Software Corporation.
12 * Copyright (C) 2004 Andi Kleen, SuSE Labs
13 *
14 * Extended attribute support for tmpfs:
15 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
16 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
17 *
18 * tiny-shmem:
19 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
20 *
21 * This file is released under the GPL.
22 */
23
24#include <linux/fs.h>
25#include <linux/init.h>
26#include <linux/vfs.h>
27#include <linux/mount.h>
 28#include <linux/ramfs.h>
 29#include <linux/pagemap.h>
 30#include <linux/file.h>
 31#include <linux/fileattr.h>
 32#include <linux/mm.h>
 33#include <linux/random.h>
 34#include <linux/sched/signal.h>
 35#include <linux/export.h>
 36#include <linux/swap.h>
 37#include <linux/uio.h>
 38#include <linux/hugetlb.h>
 39#include <linux/fs_parser.h>
 40#include <linux/swapfile.h>
 41#include "swap.h"
 42
43static struct vfsmount *shm_mnt;
44
45#ifdef CONFIG_SHMEM
46/*
47 * This virtual memory filesystem is heavily based on the ramfs. It
48 * extends ramfs by the ability to use swap and honor resource limits
49 * which makes it a completely usable filesystem.
50 */
51
 52#include <linux/xattr.h>
 53#include <linux/exportfs.h>
 54#include <linux/posix_acl.h>
 55#include <linux/posix_acl_xattr.h>
 56#include <linux/mman.h>
57#include <linux/string.h>
58#include <linux/slab.h>
59#include <linux/backing-dev.h>
60#include <linux/shmem_fs.h>
 61#include <linux/writeback.h>
 62#include <linux/pagevec.h>
 63#include <linux/percpu_counter.h>
 64#include <linux/falloc.h>
 65#include <linux/splice.h>
66#include <linux/security.h>
67#include <linux/swapops.h>
68#include <linux/mempolicy.h>
69#include <linux/namei.h>
 70#include <linux/ctype.h>
 71#include <linux/migrate.h>
 72#include <linux/highmem.h>
 73#include <linux/seq_file.h>
 74#include <linux/magic.h>
 75#include <linux/syscalls.h>
 76#include <linux/fcntl.h>
 77#include <uapi/linux/memfd.h>
 78#include <linux/userfaultfd_k.h>
 79#include <linux/rmap.h>
 80#include <linux/uuid.h>
 81
 82#include <linux/uaccess.h>
 83
84#include "internal.h"
85
86#define BLOCKS_PER_PAGE (PAGE_SIZE/512)
87#define VM_ACCT(size) (PAGE_ALIGN(size) >> PAGE_SHIFT)
 88
89/* Pretend that each entry is of this size in directory's i_size */
90#define BOGO_DIRENT_SIZE 20
91
92/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
93#define SHORT_SYMLINK_LEN 128
94
 95/*
 96 * shmem_fallocate communicates with shmem_fault or shmem_writepage via
 97 * inode->i_private (with i_rwsem making sure that it has only one user at
 98 * a time): we would prefer not to enlarge the shmem inode just for that.
99 */
100struct shmem_falloc {
 101 wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
102 pgoff_t start; /* start of range currently being fallocated */
103 pgoff_t next; /* the next page offset to be fallocated */
104 pgoff_t nr_falloced; /* how many new pages have been fallocated */
105 pgoff_t nr_unswapped; /* how often writepage refused to swap out */
106};
107
108struct shmem_options {
109 unsigned long long blocks;
110 unsigned long long inodes;
111 struct mempolicy *mpol;
112 kuid_t uid;
113 kgid_t gid;
114 umode_t mode;
 115 bool full_inums;
116 int huge;
117 int seen;
118#define SHMEM_SEEN_BLOCKS 1
119#define SHMEM_SEEN_INODES 2
120#define SHMEM_SEEN_HUGE 4
 121#define SHMEM_SEEN_INUMS 8
122};
123
 124#ifdef CONFIG_TMPFS
 125static unsigned long shmem_default_max_blocks(void)
 126{
 127 return totalram_pages() / 2;
 128}
129
130static unsigned long shmem_default_max_inodes(void)
131{
132 unsigned long nr_pages = totalram_pages();
133
134 return min(nr_pages - totalhigh_pages(), nr_pages / 2);
 135}
 136#endif
 137
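As a rough illustration of these defaults, here is a small user-space sketch; the page counts below are made-up sample values standing in for totalram_pages() and totalhigh_pages():

#include <stdio.h>

int main(void)
{
	/* Hypothetical machine: 4 GiB of 4 KiB pages, no highmem (assumed values). */
	unsigned long nr_pages = 1048576;	/* totalram_pages() */
	unsigned long highmem_pages = 0;	/* totalhigh_pages() */

	unsigned long max_blocks = nr_pages / 2;
	unsigned long lowmem = nr_pages - highmem_pages;
	unsigned long max_inodes = lowmem < nr_pages / 2 ? lowmem : nr_pages / 2;

	/* tmpfs defaults to half of RAM for data, and at most half of it for inodes */
	printf("default max_blocks = %lu pages\n", max_blocks);
	printf("default max_inodes = %lu\n", max_inodes);
	return 0;
}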
138static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
139 struct folio **foliop, enum sgp_type sgp,
140 gfp_t gfp, struct vm_area_struct *vma,
141 vm_fault_t *fault_type);
 142
143static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
144{
145 return sb->s_fs_info;
146}
147
148/*
149 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
150 * for shared memory and for shared anonymous (/dev/zero) mappings
151 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
152 * consistent with the pre-accounting of private mappings ...
153 */
154static inline int shmem_acct_size(unsigned long flags, loff_t size)
155{
 156 return (flags & VM_NORESERVE) ?
 157 0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
158}
159
160static inline void shmem_unacct_size(unsigned long flags, loff_t size)
161{
 162 if (!(flags & VM_NORESERVE))
163 vm_unacct_memory(VM_ACCT(size));
164}
165
166static inline int shmem_reacct_size(unsigned long flags,
167 loff_t oldsize, loff_t newsize)
168{
169 if (!(flags & VM_NORESERVE)) {
170 if (VM_ACCT(newsize) > VM_ACCT(oldsize))
171 return security_vm_enough_memory_mm(current->mm,
172 VM_ACCT(newsize) - VM_ACCT(oldsize));
173 else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
174 vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
175 }
176 return 0;
177}
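To make the VM_ACCT arithmetic above concrete, a minimal user-space sketch; a 4 KiB page size is an assumption here, the kernel macro uses the real PAGE_SIZE:

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_SHIFT	12
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
#define VM_ACCT(size)	(PAGE_ALIGN(size) >> PAGE_SHIFT)

int main(void)
{
	/* Growing a 10000-byte object to 20000 bytes only charges the delta. */
	unsigned long oldsize = 10000, newsize = 20000;

	printf("VM_ACCT(old) = %lu pages\n", VM_ACCT(oldsize));	/* 3 */
	printf("VM_ACCT(new) = %lu pages\n", VM_ACCT(newsize));	/* 5 */
	printf("delta charged = %lu pages\n", VM_ACCT(newsize) - VM_ACCT(oldsize));
	return 0;
}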
178
179/*
180 * ... whereas tmpfs objects are accounted incrementally as
 181 * pages are allocated, in order to allow large sparse files.
182 * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
183 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
184 */
 185static inline int shmem_acct_block(unsigned long flags, long pages)
 186{
187 if (!(flags & VM_NORESERVE))
188 return 0;
189
190 return security_vm_enough_memory_mm(current->mm,
191 pages * VM_ACCT(PAGE_SIZE));
192}
193
194static inline void shmem_unacct_blocks(unsigned long flags, long pages)
195{
 196 if (flags & VM_NORESERVE)
 197 vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
198}
199
200static inline bool shmem_inode_acct_block(struct inode *inode, long pages)
201{
202 struct shmem_inode_info *info = SHMEM_I(inode);
203 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
204
205 if (shmem_acct_block(info->flags, pages))
206 return false;
207
208 if (sbinfo->max_blocks) {
209 if (percpu_counter_compare(&sbinfo->used_blocks,
210 sbinfo->max_blocks - pages) > 0)
211 goto unacct;
212 percpu_counter_add(&sbinfo->used_blocks, pages);
213 }
214
215 return true;
216
217unacct:
218 shmem_unacct_blocks(info->flags, pages);
219 return false;
220}
221
222static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
223{
224 struct shmem_inode_info *info = SHMEM_I(inode);
225 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
226
227 if (sbinfo->max_blocks)
228 percpu_counter_sub(&sbinfo->used_blocks, pages);
229 shmem_unacct_blocks(info->flags, pages);
230}
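The check-then-add logic of shmem_inode_acct_block() can be modelled in user space as below; this is a simplified sketch: the real code uses a percpu_counter for used_blocks and security_vm_enough_memory_mm() for the per-size charge, both omitted here.

#include <stdbool.h>
#include <stdio.h>

static long used_blocks;	/* stands in for sbinfo->used_blocks */

/* Try to charge @pages against a mount limit of @max_blocks (0 = unlimited). */
static bool inode_acct_block(long max_blocks, long pages)
{
	if (max_blocks) {
		if (used_blocks > max_blocks - pages)
			return false;		/* would exceed the mount limit */
		used_blocks += pages;
	}
	return true;
}

static void inode_unacct_blocks(long pages)
{
	used_blocks -= pages;
}

int main(void)
{
	long max_blocks = 4;

	printf("charge 3: %s\n", inode_acct_block(max_blocks, 3) ? "ok" : "ENOSPC");
	printf("charge 3: %s\n", inode_acct_block(max_blocks, 3) ? "ok" : "ENOSPC");
	inode_unacct_blocks(3);		/* release what was successfully charged */
	return 0;
}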
231
 232static const struct super_operations shmem_ops;
 233const struct address_space_operations shmem_aops;
 234static const struct file_operations shmem_file_operations;
235static const struct inode_operations shmem_inode_operations;
236static const struct inode_operations shmem_dir_inode_operations;
237static const struct inode_operations shmem_special_inode_operations;
 238static const struct vm_operations_struct shmem_vm_ops;
 239static struct file_system_type shmem_fs_type;
 240
241bool vma_is_shmem(struct vm_area_struct *vma)
242{
243 return vma->vm_ops == &shmem_vm_ops;
244}
245
 246static LIST_HEAD(shmem_swaplist);
 247static DEFINE_MUTEX(shmem_swaplist_mutex);
 248
249/*
250 * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
251 * produces a novel ino for the newly allocated inode.
252 *
253 * It may also be called when making a hard link to permit the space needed by
254 * each dentry. However, in that case, no new inode number is needed since that
255 * internally draws from another pool of inode numbers (currently global
256 * get_next_ino()). This case is indicated by passing NULL as inop.
257 */
258#define SHMEM_INO_BATCH 1024
259static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
260{
261 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
262 ino_t ino;
263
264 if (!(sb->s_flags & SB_KERNMOUNT)) {
 265 raw_spin_lock(&sbinfo->stat_lock);
266 if (sbinfo->max_inodes) {
267 if (!sbinfo->free_inodes) {
 268 raw_spin_unlock(&sbinfo->stat_lock);
269 return -ENOSPC;
270 }
271 sbinfo->free_inodes--;
 272 }
273 if (inop) {
274 ino = sbinfo->next_ino++;
275 if (unlikely(is_zero_ino(ino)))
276 ino = sbinfo->next_ino++;
277 if (unlikely(!sbinfo->full_inums &&
278 ino > UINT_MAX)) {
279 /*
280 * Emulate get_next_ino uint wraparound for
281 * compatibility
282 */
283 if (IS_ENABLED(CONFIG_64BIT))
284 pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
285 __func__, MINOR(sb->s_dev));
286 sbinfo->next_ino = 1;
287 ino = sbinfo->next_ino++;
288 }
289 *inop = ino;
290 }
 291 raw_spin_unlock(&sbinfo->stat_lock);
292 } else if (inop) {
293 /*
294 * __shmem_file_setup, one of our callers, is lock-free: it
295 * doesn't hold stat_lock in shmem_reserve_inode since
296 * max_inodes is always 0, and is called from potentially
297 * unknown contexts. As such, use a per-cpu batched allocator
298 * which doesn't require the per-sb stat_lock unless we are at
299 * the batch boundary.
300 *
301 * We don't need to worry about inode{32,64} since SB_KERNMOUNT
302 * shmem mounts are not exposed to userspace, so we don't need
303 * to worry about things like glibc compatibility.
304 */
305 ino_t *next_ino;
 306
307 next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
308 ino = *next_ino;
309 if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
 310 raw_spin_lock(&sbinfo->stat_lock);
311 ino = sbinfo->next_ino;
312 sbinfo->next_ino += SHMEM_INO_BATCH;
 313 raw_spin_unlock(&sbinfo->stat_lock);
314 if (unlikely(is_zero_ino(ino)))
315 ino++;
316 }
317 *inop = ino;
318 *next_ino = ++ino;
319 put_cpu();
 320 }
 321
322 return 0;
323}
324
325static void shmem_free_inode(struct super_block *sb)
326{
327 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
328 if (sbinfo->max_inodes) {
 329 raw_spin_lock(&sbinfo->stat_lock);
 330 sbinfo->free_inodes++;
 331 raw_spin_unlock(&sbinfo->stat_lock);
332 }
333}
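For SB_KERNMOUNT mounts, the per-cpu batching above only touches stat_lock once every SHMEM_INO_BATCH allocations. A user-space model of the refill logic (single-threaded, so the per-cpu pointer and the lock are left out; the starting inode value is just a sample):

#include <stdio.h>

#define SHMEM_INO_BATCH	1024UL

static unsigned long sb_next_ino = 2;	/* global pool (sbinfo->next_ino) */
static unsigned long cpu_next_ino;	/* this cpu's cached next ino */

static unsigned long reserve_ino(void)
{
	unsigned long ino = cpu_next_ino;

	if (ino % SHMEM_INO_BATCH == 0) {
		/* batch exhausted (or first use): refill from the global pool */
		ino = sb_next_ino;
		sb_next_ino += SHMEM_INO_BATCH;
		if (ino == 0)
			ino++;			/* never hand out the invalid zero ino */
	}
	cpu_next_ino = ino + 1;
	return ino;
}

int main(void)
{
	for (int i = 0; i < 3; i++)
		printf("ino %lu\n", reserve_ino());	/* 2, 3, 4 */
	return 0;
}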
334
 335/**
 336 * shmem_recalc_inode - recalculate the block usage of an inode
337 * @inode: inode to recalc
338 *
339 * We have to calculate the free blocks since the mm can drop
340 * undirtied hole pages behind our back.
341 *
342 * But normally info->alloced == inode->i_mapping->nrpages + info->swapped
343 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
344 *
345 * It has to be called with the spinlock held.
346 */
347static void shmem_recalc_inode(struct inode *inode)
348{
349 struct shmem_inode_info *info = SHMEM_I(inode);
350 long freed;
351
352 freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
353 if (freed > 0) {
354 info->alloced -= freed;
 355 inode->i_blocks -= freed * BLOCKS_PER_PAGE;
 356 shmem_inode_unacct_blocks(inode, freed);
357 }
358}
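Plugging sample numbers into the calculation above (the values are chosen only for illustration):

#include <stdio.h>

int main(void)
{
	long alloced = 100, swapped = 20, nrpages = 70;
	long freed = alloced - swapped - nrpages;	/* pages the mm dropped behind our back */

	if (freed > 0)
		printf("uncharge %ld pages, new alloced = %ld\n", freed, alloced - freed);
	return 0;
}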
359
360bool shmem_charge(struct inode *inode, long pages)
361{
362 struct shmem_inode_info *info = SHMEM_I(inode);
 363 unsigned long flags;
 364
 365 if (!shmem_inode_acct_block(inode, pages))
 366 return false;
 367
368 /* nrpages adjustment first, then shmem_recalc_inode() when balanced */
369 inode->i_mapping->nrpages += pages;
370
 371 spin_lock_irqsave(&info->lock, flags);
372 info->alloced += pages;
373 inode->i_blocks += pages * BLOCKS_PER_PAGE;
374 shmem_recalc_inode(inode);
 375 spin_unlock_irqrestore(&info->lock, flags);
 376
377 return true;
378}
379
380void shmem_uncharge(struct inode *inode, long pages)
381{
382 struct shmem_inode_info *info = SHMEM_I(inode);
 383 unsigned long flags;
 384
 385 /* nrpages adjustment done by __filemap_remove_folio() or caller */
 386
 387 spin_lock_irqsave(&info->lock, flags);
388 info->alloced -= pages;
389 inode->i_blocks -= pages * BLOCKS_PER_PAGE;
390 shmem_recalc_inode(inode);
 391 spin_unlock_irqrestore(&info->lock, flags);
 392
 393 shmem_inode_unacct_blocks(inode, pages);
394}
395
 396/*
 397 * Replace item expected in xarray by a new item, while holding xa_lock.
 398 */
 399static int shmem_replace_entry(struct address_space *mapping,
400 pgoff_t index, void *expected, void *replacement)
401{
 402 XA_STATE(xas, &mapping->i_pages, index);
 403 void *item;
404
405 VM_BUG_ON(!expected);
 406 VM_BUG_ON(!replacement);
 407 item = xas_load(&xas);
408 if (item != expected)
409 return -ENOENT;
 410 xas_store(&xas, replacement);
411 return 0;
412}
413
414/*
415 * Sometimes, before we decide whether to proceed or to fail, we must check
416 * that an entry was not already brought back from swap by a racing thread.
417 *
418 * Checking page is not enough: by the time a SwapCache page is locked, it
419 * might be reused, and again be SwapCache, using the same swap as before.
420 */
421static bool shmem_confirm_swap(struct address_space *mapping,
422 pgoff_t index, swp_entry_t swap)
423{
 424 return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
425}
426
427/*
428 * Definitions for "huge tmpfs": tmpfs mounted with the huge= option
429 *
430 * SHMEM_HUGE_NEVER:
431 * disables huge pages for the mount;
432 * SHMEM_HUGE_ALWAYS:
433 * enables huge pages for the mount;
434 * SHMEM_HUGE_WITHIN_SIZE:
435 * only allocate huge pages if the page will be fully within i_size,
436 * also respect fadvise()/madvise() hints;
437 * SHMEM_HUGE_ADVISE:
438 * only allocate huge pages if requested with fadvise()/madvise();
439 */
440
441#define SHMEM_HUGE_NEVER 0
442#define SHMEM_HUGE_ALWAYS 1
443#define SHMEM_HUGE_WITHIN_SIZE 2
444#define SHMEM_HUGE_ADVISE 3
445
446/*
447 * Special values.
448 * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
449 *
450 * SHMEM_HUGE_DENY:
451 * disables huge on shm_mnt and all mounts, for emergency use;
452 * SHMEM_HUGE_FORCE:
453 * enables huge on shm_mnt and all mounts, w/o needing option, for testing;
454 *
455 */
456#define SHMEM_HUGE_DENY (-1)
457#define SHMEM_HUGE_FORCE (-2)
458
 459#ifdef CONFIG_TRANSPARENT_HUGEPAGE
460/* ifdef here to avoid bloating shmem.o when not necessary */
461
 462static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
 463
464bool shmem_is_huge(struct vm_area_struct *vma,
465 struct inode *inode, pgoff_t index)
 466{
 467 loff_t i_size;
 468
469 if (!S_ISREG(inode->i_mode))
470 return false;
471 if (shmem_huge == SHMEM_HUGE_DENY)
472 return false;
473 if (vma && ((vma->vm_flags & VM_NOHUGEPAGE) ||
474 test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)))
 475 return false;
476 if (shmem_huge == SHMEM_HUGE_FORCE)
477 return true;
478
479 switch (SHMEM_SB(inode->i_sb)->huge) {
480 case SHMEM_HUGE_ALWAYS:
481 return true;
482 case SHMEM_HUGE_WITHIN_SIZE:
 483 index = round_up(index + 1, HPAGE_PMD_NR);
 484 i_size = round_up(i_size_read(inode), PAGE_SIZE);
 485 if (i_size >> PAGE_SHIFT >= index)
486 return true;
487 fallthrough;
488 case SHMEM_HUGE_ADVISE:
489 if (vma && (vma->vm_flags & VM_HUGEPAGE))
490 return true;
491 fallthrough;
 492 default:
493 return false;
494 }
495}
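A concrete reading of the SHMEM_HUGE_WITHIN_SIZE branch, assuming 4 KiB pages and HPAGE_PMD_NR == 512 (the usual x86_64 values; both are assumptions in this sketch):

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_SHIFT	12
#define HPAGE_PMD_NR	512UL
#define round_up(x, y)	((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
	unsigned long index = 100;			/* faulting page offset */
	unsigned long long i_size = 3ULL << 20;		/* 3 MiB file */

	/* end of the huge page that would cover @index, in page units */
	unsigned long hpage_end = round_up(index + 1, HPAGE_PMD_NR);
	unsigned long long size_pages = round_up(i_size, PAGE_SIZE) >> PAGE_SHIFT;

	/* a huge page is allowed only if the file extends past that boundary */
	printf("%s\n", size_pages >= hpage_end ? "use huge page" : "fall back to small pages");
	return 0;
}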
 496
 497#if defined(CONFIG_SYSFS)
498static int shmem_parse_huge(const char *str)
499{
500 if (!strcmp(str, "never"))
501 return SHMEM_HUGE_NEVER;
502 if (!strcmp(str, "always"))
503 return SHMEM_HUGE_ALWAYS;
504 if (!strcmp(str, "within_size"))
505 return SHMEM_HUGE_WITHIN_SIZE;
506 if (!strcmp(str, "advise"))
507 return SHMEM_HUGE_ADVISE;
508 if (!strcmp(str, "deny"))
509 return SHMEM_HUGE_DENY;
510 if (!strcmp(str, "force"))
511 return SHMEM_HUGE_FORCE;
512 return -EINVAL;
513}
 514#endif
 515
 516#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
517static const char *shmem_format_huge(int huge)
518{
519 switch (huge) {
520 case SHMEM_HUGE_NEVER:
521 return "never";
522 case SHMEM_HUGE_ALWAYS:
523 return "always";
524 case SHMEM_HUGE_WITHIN_SIZE:
525 return "within_size";
526 case SHMEM_HUGE_ADVISE:
527 return "advise";
528 case SHMEM_HUGE_DENY:
529 return "deny";
530 case SHMEM_HUGE_FORCE:
531 return "force";
532 default:
533 VM_BUG_ON(1);
534 return "bad_val";
535 }
536}
 537#endif
 538
539static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
540 struct shrink_control *sc, unsigned long nr_to_split)
541{
542 LIST_HEAD(list), *pos, *next;
 543 LIST_HEAD(to_remove);
544 struct inode *inode;
545 struct shmem_inode_info *info;
 546 struct folio *folio;
 547 unsigned long batch = sc ? sc->nr_to_scan : 128;
 548 int split = 0;
549
550 if (list_empty(&sbinfo->shrinklist))
551 return SHRINK_STOP;
552
553 spin_lock(&sbinfo->shrinklist_lock);
554 list_for_each_safe(pos, next, &sbinfo->shrinklist) {
555 info = list_entry(pos, struct shmem_inode_info, shrinklist);
556
557 /* pin the inode */
558 inode = igrab(&info->vfs_inode);
559
560 /* inode is about to be evicted */
561 if (!inode) {
562 list_del_init(&info->shrinklist);
563 goto next;
564 }
565
566 /* Check if there's anything to gain */
567 if (round_up(inode->i_size, PAGE_SIZE) ==
568 round_up(inode->i_size, HPAGE_PMD_SIZE)) {
 569 list_move(&info->shrinklist, &to_remove);
570 goto next;
571 }
572
573 list_move(&info->shrinklist, &list);
574next:
 575 sbinfo->shrinklist_len--;
576 if (!--batch)
577 break;
578 }
579 spin_unlock(&sbinfo->shrinklist_lock);
580
581 list_for_each_safe(pos, next, &to_remove) {
582 info = list_entry(pos, struct shmem_inode_info, shrinklist);
583 inode = &info->vfs_inode;
584 list_del_init(&info->shrinklist);
585 iput(inode);
586 }
587
588 list_for_each_safe(pos, next, &list) {
589 int ret;
 590 pgoff_t index;
591
592 info = list_entry(pos, struct shmem_inode_info, shrinklist);
593 inode = &info->vfs_inode;
594
 595 if (nr_to_split && split >= nr_to_split)
 596 goto move_back;
 597
598 index = (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT;
599 folio = filemap_get_folio(inode->i_mapping, index);
600 if (!folio)
601 goto drop;
602
 603 /* No huge page at the end of the file: nothing to split */
604 if (!folio_test_large(folio)) {
605 folio_put(folio);
606 goto drop;
607 }
608
 609 /*
610 * Move the inode on the list back to shrinklist if we failed
611 * to lock the page at this time.
612 *
613 * Waiting for the lock may lead to deadlock in the
614 * reclaim path.
615 */
616 if (!folio_trylock(folio)) {
617 folio_put(folio);
 618 goto move_back;
619 }
620
 621 ret = split_folio(folio);
622 folio_unlock(folio);
623 folio_put(folio);
 624
 625 /* If split failed move the inode on the list back to shrinklist */
 626 if (ret)
 627 goto move_back;
628
629 split++;
630drop:
631 list_del_init(&info->shrinklist);
632 goto put;
633move_back:
634 /*
635 * Make sure the inode is either on the global list or deleted
636 * from any local list before iput() since it could be deleted
637 * in another thread once we put the inode (then the local list
638 * is corrupted).
639 */
640 spin_lock(&sbinfo->shrinklist_lock);
641 list_move(&info->shrinklist, &sbinfo->shrinklist);
642 sbinfo->shrinklist_len++;
643 spin_unlock(&sbinfo->shrinklist_lock);
644put:
645 iput(inode);
646 }
647
648 return split;
649}
650
651static long shmem_unused_huge_scan(struct super_block *sb,
652 struct shrink_control *sc)
653{
654 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
655
656 if (!READ_ONCE(sbinfo->shrinklist_len))
657 return SHRINK_STOP;
658
659 return shmem_unused_huge_shrink(sbinfo, sc, 0);
660}
661
662static long shmem_unused_huge_count(struct super_block *sb,
663 struct shrink_control *sc)
664{
665 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
666 return READ_ONCE(sbinfo->shrinklist_len);
667}
 668#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
669
670#define shmem_huge SHMEM_HUGE_DENY
671
672bool shmem_is_huge(struct vm_area_struct *vma,
673 struct inode *inode, pgoff_t index)
674{
675 return false;
676}
677
678static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
679 struct shrink_control *sc, unsigned long nr_to_split)
680{
681 return 0;
682}
 683#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 684
 685/*
 686 * Like filemap_add_folio, but error if expected item has gone.
 687 */
 688static int shmem_add_to_page_cache(struct folio *folio,
 689 struct address_space *mapping,
690 pgoff_t index, void *expected, gfp_t gfp,
691 struct mm_struct *charge_mm)
 692{
693 XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
694 long nr = folio_nr_pages(folio);
 695 int error;
 696
697 VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
698 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
699 VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);
700 VM_BUG_ON(expected && folio_test_large(folio));
 701
702 folio_ref_add(folio, nr);
703 folio->mapping = mapping;
704 folio->index = index;
 705
706 if (!folio_test_swapcache(folio)) {
707 error = mem_cgroup_charge(folio, charge_mm, gfp);
 708 if (error) {
 709 if (folio_test_pmd_mappable(folio)) {
710 count_vm_event(THP_FILE_FALLBACK);
711 count_vm_event(THP_FILE_FALLBACK_CHARGE);
712 }
713 goto error;
 714 }
 715 }
 716 folio_throttle_swaprate(folio, gfp);
 717
 718 do {
 719 xas_lock_irq(&xas);
720 if (expected != xas_find_conflict(&xas)) {
721 xas_set_err(&xas, -EEXIST);
722 goto unlock;
723 }
724 if (expected && xas_find_conflict(&xas)) {
 725 xas_set_err(&xas, -EEXIST);
 726 goto unlock;
 727 }
 728 xas_store(&xas, folio);
729 if (xas_error(&xas))
730 goto unlock;
 731 if (folio_test_pmd_mappable(folio)) {
 732 count_vm_event(THP_FILE_ALLOC);
 733 __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr);
 734 }
 735 mapping->nrpages += nr;
736 __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
737 __lruvec_stat_mod_folio(folio, NR_SHMEM, nr);
738unlock:
739 xas_unlock_irq(&xas);
740 } while (xas_nomem(&xas, gfp));
741
742 if (xas_error(&xas)) {
743 error = xas_error(&xas);
744 goto error;
 745 }
746
747 return 0;
 748error:
749 folio->mapping = NULL;
750 folio_ref_sub(folio, nr);
 751 return error;
752}
753
 754/*
 755 * Like delete_from_page_cache, but substitutes swap for @folio.
 756 */
 757static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
 758{
759 struct address_space *mapping = folio->mapping;
760 long nr = folio_nr_pages(folio);
761 int error;
762
 763 xa_lock_irq(&mapping->i_pages);
764 error = shmem_replace_entry(mapping, folio->index, folio, radswap);
765 folio->mapping = NULL;
766 mapping->nrpages -= nr;
767 __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
768 __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
 769 xa_unlock_irq(&mapping->i_pages);
 770 folio_put(folio);
771 BUG_ON(error);
772}
773
 774/*
 775 * Remove swap entry from page cache, free the swap and its page cache.
776 */
777static int shmem_free_swap(struct address_space *mapping,
778 pgoff_t index, void *radswap)
779{
 780 void *old;
 781
 782 old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
783 if (old != radswap)
784 return -ENOENT;
785 free_swap_and_cache(radix_to_swp_entry(radswap));
786 return 0;
787}
788
789/*
790 * Determine (in bytes) how many of the shmem object's pages mapped by the
 791 * given offsets are swapped out.
 792 *
 793 * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
794 * as long as the inode doesn't go away and racy results are not a problem.
795 */
796unsigned long shmem_partial_swap_usage(struct address_space *mapping,
797 pgoff_t start, pgoff_t end)
 798{
 799 XA_STATE(xas, &mapping->i_pages, start);
 800 struct page *page;
 801 unsigned long swapped = 0;
802
803 rcu_read_lock();
804 xas_for_each(&xas, page, end - 1) {
805 if (xas_retry(&xas, page))
 806 continue;
 807 if (xa_is_value(page))
808 swapped++;
809
810 if (need_resched()) {
 811 xas_pause(&xas);
 812 cond_resched_rcu();
813 }
814 }
815
816 rcu_read_unlock();
817
818 return swapped << PAGE_SHIFT;
819}
820
821/*
822 * Determine (in bytes) how many of the shmem object's pages mapped by the
823 * given vma is swapped out.
824 *
 825 * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
826 * as long as the inode doesn't go away and racy results are not a problem.
827 */
828unsigned long shmem_swap_usage(struct vm_area_struct *vma)
829{
830 struct inode *inode = file_inode(vma->vm_file);
831 struct shmem_inode_info *info = SHMEM_I(inode);
832 struct address_space *mapping = inode->i_mapping;
833 unsigned long swapped;
834
835 /* Be careful as we don't hold info->lock */
836 swapped = READ_ONCE(info->swapped);
837
838 /*
839 * The easier cases are when the shmem object has nothing in swap, or
840 * the vma maps it whole. Then we can simply use the stats that we
841 * already track.
842 */
843 if (!swapped)
844 return 0;
845
846 if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
847 return swapped << PAGE_SHIFT;
848
849 /* Here comes the more involved part */
850 return shmem_partial_swap_usage(mapping, vma->vm_pgoff,
851 vma->vm_pgoff + vma_pages(vma));
852}
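The return value is a byte count converted from a page count; for example (a 4 KiB page size is assumed):

#include <stdio.h>

#define PAGE_SHIFT	12

int main(void)
{
	unsigned long swapped = 300;	/* pages of this mapping currently in swap */

	printf("swap usage = %lu bytes\n", swapped << PAGE_SHIFT);	/* 1228800 */
	return 0;
}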
853
854/*
855 * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
856 */
857void shmem_unlock_mapping(struct address_space *mapping)
858{
 859 struct folio_batch fbatch;
860 pgoff_t index = 0;
861
 862 folio_batch_init(&fbatch);
863 /*
864 * Minor point, but we might as well stop if someone else SHM_LOCKs it.
865 */
866 while (!mapping_unevictable(mapping) &&
867 filemap_get_folios(mapping, &index, ~0UL, &fbatch)) {
868 check_move_unevictable_folios(&fbatch);
869 folio_batch_release(&fbatch);
870 cond_resched();
871 }
872}
873
 874static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
 875{
876 struct folio *folio;
877 struct page *page;
 878
879 /*
880 * At first avoid shmem_getpage(,,,SGP_READ): that fails
881 * beyond i_size, and reports fallocated pages as holes.
882 */
883 folio = __filemap_get_folio(inode->i_mapping, index,
884 FGP_ENTRY | FGP_LOCK, 0);
885 if (!xa_is_value(folio))
886 return folio;
887 /*
888 * But read a page back from swap if any of it is within i_size
889 * (although in some cases this is just a waste of time).
890 */
891 page = NULL;
892 shmem_getpage(inode, index, &page, SGP_READ);
893 return page ? page_folio(page) : NULL;
894}
895
 896/*
 897 * Remove range of pages and swap entries from page cache, and free them.
 898 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
 899 */
900static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
901 bool unfalloc)
 902{
 903 struct address_space *mapping = inode->i_mapping;
 904 struct shmem_inode_info *info = SHMEM_I(inode);
905 pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
906 pgoff_t end = (lend + 1) >> PAGE_SHIFT;
 907 struct folio_batch fbatch;
 908 pgoff_t indices[PAGEVEC_SIZE];
909 struct folio *folio;
910 bool same_folio;
 911 long nr_swaps_freed = 0;
 912 pgoff_t index;
913 int i;
914
915 if (lend == -1)
916 end = -1; /* unsigned, so actually very big */
 917
918 if (info->fallocend > start && info->fallocend <= end && !unfalloc)
919 info->fallocend = start;
920
 921 folio_batch_init(&fbatch);
 922 index = start;
 923 while (index < end && find_lock_entries(mapping, index, end - 1,
924 &fbatch, indices)) {
925 for (i = 0; i < folio_batch_count(&fbatch); i++) {
 926 folio = fbatch.folios[i];
 927
 928 index = indices[i];
 929
 930 if (xa_is_value(folio)) {
931 if (unfalloc)
932 continue;
 933 nr_swaps_freed += !shmem_free_swap(mapping,
 934 index, folio);
 935 continue;
 936 }
 937 index += folio_nr_pages(folio) - 1;
 938
 939 if (!unfalloc || !folio_test_uptodate(folio))
 940 truncate_inode_folio(mapping, folio);
 941 folio_unlock(folio);
 942 }
943 folio_batch_remove_exceptionals(&fbatch);
944 folio_batch_release(&fbatch);
945 cond_resched();
946 index++;
947 }
 948
949 same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
950 folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT);
951 if (folio) {
952 same_folio = lend < folio_pos(folio) + folio_size(folio);
953 folio_mark_dirty(folio);
954 if (!truncate_inode_partial_folio(folio, lstart, lend)) {
955 start = folio->index + folio_nr_pages(folio);
956 if (same_folio)
957 end = folio->index;
 958 }
959 folio_unlock(folio);
960 folio_put(folio);
961 folio = NULL;
 962 }
963
964 if (!same_folio)
965 folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT);
966 if (folio) {
967 folio_mark_dirty(folio);
968 if (!truncate_inode_partial_folio(folio, lstart, lend))
969 end = folio->index;
970 folio_unlock(folio);
971 folio_put(folio);
972 }
973
974 index = start;
 975 while (index < end) {
 976 cond_resched();
 977
 978 if (!find_get_entries(mapping, index, end - 1, &fbatch,
 979 indices)) {
980 /* If all gone or hole-punch or unfalloc, we're done */
981 if (index == start || end != -1)
 982 break;
 983 /* But if truncating, restart to make sure all gone */
984 index = start;
985 continue;
986 }
 987 for (i = 0; i < folio_batch_count(&fbatch); i++) {
 988 folio = fbatch.folios[i];
 989
 990 index = indices[i];
 991 if (xa_is_value(folio)) {
992 if (unfalloc)
993 continue;
 994 if (shmem_free_swap(mapping, index, folio)) {
995 /* Swap was replaced by page: retry */
996 index--;
997 break;
998 }
999 nr_swaps_freed++;
1000 continue;
1001 }
1002
 1003 folio_lock(folio);
 1004
 1005 if (!unfalloc || !folio_test_uptodate(folio)) {
 1006 if (folio_mapping(folio) != mapping) {
 1007 /* Page was replaced by swap: retry */
 1008 folio_unlock(folio);
1009 index--;
1010 break;
 1011 }
1012 VM_BUG_ON_FOLIO(folio_test_writeback(folio),
1013 folio);
 1014 truncate_inode_folio(mapping, folio);
 1015 }
 1016 index = folio->index + folio_nr_pages(folio) - 1;
 1017 folio_unlock(folio);
 1018 }
1019 folio_batch_remove_exceptionals(&fbatch);
1020 folio_batch_release(&fbatch);
1021 index++;
1022 }
 1023
 1024 spin_lock_irq(&info->lock);
 1025 info->swapped -= nr_swaps_freed;
 1026 shmem_recalc_inode(inode);
 1027 spin_unlock_irq(&info->lock);
 1028}
 1029
1030void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
1031{
1032 shmem_undo_range(inode, lstart, lend, false);
 1033 inode->i_ctime = inode->i_mtime = current_time(inode);
 1034}
 1035EXPORT_SYMBOL_GPL(shmem_truncate_range);
 1036
1037static int shmem_getattr(struct user_namespace *mnt_userns,
1038 const struct path *path, struct kstat *stat,
 1039 u32 request_mask, unsigned int query_flags)
 1040{
 1041 struct inode *inode = path->dentry->d_inode;
1042 struct shmem_inode_info *info = SHMEM_I(inode);
1043
 1044 if (info->alloced - info->swapped != inode->i_mapping->nrpages) {
 1045 spin_lock_irq(&info->lock);
 1046 shmem_recalc_inode(inode);
 1047 spin_unlock_irq(&info->lock);
 1048 }
1049 if (info->fsflags & FS_APPEND_FL)
1050 stat->attributes |= STATX_ATTR_APPEND;
1051 if (info->fsflags & FS_IMMUTABLE_FL)
1052 stat->attributes |= STATX_ATTR_IMMUTABLE;
1053 if (info->fsflags & FS_NODUMP_FL)
1054 stat->attributes |= STATX_ATTR_NODUMP;
1055 stat->attributes_mask |= (STATX_ATTR_APPEND |
1056 STATX_ATTR_IMMUTABLE |
1057 STATX_ATTR_NODUMP);
 1058 generic_fillattr(&init_user_ns, inode, stat);
 1059
 1060 if (shmem_is_huge(NULL, inode, 0))
1061 stat->blksize = HPAGE_PMD_SIZE;
1062
1063 if (request_mask & STATX_BTIME) {
1064 stat->result_mask |= STATX_BTIME;
1065 stat->btime.tv_sec = info->i_crtime.tv_sec;
1066 stat->btime.tv_nsec = info->i_crtime.tv_nsec;
1067 }
1068
1069 return 0;
1070}
1071
1072static int shmem_setattr(struct user_namespace *mnt_userns,
1073 struct dentry *dentry, struct iattr *attr)
 1074{
 1075 struct inode *inode = d_inode(dentry);
 1076 struct shmem_inode_info *info = SHMEM_I(inode);
1077 int error;
1078
 1079 error = setattr_prepare(&init_user_ns, dentry, attr);
1080 if (error)
1081 return error;
1082
1083 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
1084 loff_t oldsize = inode->i_size;
1085 loff_t newsize = attr->ia_size;
 1086
 1087 /* protected by i_rwsem */
1088 if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
1089 (newsize > oldsize && (info->seals & F_SEAL_GROW)))
1090 return -EPERM;
1091
 1092 if (newsize != oldsize) {
1093 error = shmem_reacct_size(SHMEM_I(inode)->flags,
1094 oldsize, newsize);
1095 if (error)
1096 return error;
 1097 i_size_write(inode, newsize);
 1098 inode->i_ctime = inode->i_mtime = current_time(inode);
 1099 }
 1100 if (newsize <= oldsize) {
 1101 loff_t holebegin = round_up(newsize, PAGE_SIZE);
1102 if (oldsize > holebegin)
1103 unmap_mapping_range(inode->i_mapping,
1104 holebegin, 0, 1);
1105 if (info->alloced)
1106 shmem_truncate_range(inode,
1107 newsize, (loff_t)-1);
 1108 /* unmap again to remove racily COWed private pages */
1109 if (oldsize > holebegin)
1110 unmap_mapping_range(inode->i_mapping,
1111 holebegin, 0, 1);
 1112 }
1113 }
1114
 1115 setattr_copy(&init_user_ns, inode, attr);
 1116 if (attr->ia_valid & ATTR_MODE)
 1117 error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
1118 return error;
1119}
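When a file is shrunk, the unmap/truncate above starts at the first page boundary at or after the new size; a quick check of that arithmetic (a 4 KiB page size is assumed):

#include <stdio.h>

#define PAGE_SIZE	4096ULL
#define round_up(x, y)	((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
	unsigned long long oldsize = 20000, newsize = 10000;
	unsigned long long holebegin = round_up(newsize, PAGE_SIZE);	/* 12288 */

	if (oldsize > holebegin)
		printf("unmap and truncate from byte %llu to EOF\n", holebegin);
	return 0;
}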
1120
 1121static void shmem_evict_inode(struct inode *inode)
 1122{
 1123 struct shmem_inode_info *info = SHMEM_I(inode);
 1124 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 1125
 1126 if (shmem_mapping(inode->i_mapping)) {
1127 shmem_unacct_size(info->flags, inode->i_size);
1128 inode->i_size = 0;
 1129 mapping_set_exiting(inode->i_mapping);
 1130 shmem_truncate_range(inode, 0, (loff_t)-1);
1131 if (!list_empty(&info->shrinklist)) {
1132 spin_lock(&sbinfo->shrinklist_lock);
1133 if (!list_empty(&info->shrinklist)) {
1134 list_del_init(&info->shrinklist);
1135 sbinfo->shrinklist_len--;
1136 }
1137 spin_unlock(&sbinfo->shrinklist_lock);
1138 }
1139 while (!list_empty(&info->swaplist)) {
1140 /* Wait while shmem_unuse() is scanning this inode... */
1141 wait_var_event(&info->stop_eviction,
1142 !atomic_read(&info->stop_eviction));
 1143 mutex_lock(&shmem_swaplist_mutex);
1144 /* ...but beware of the race if we peeked too early */
1145 if (!atomic_read(&info->stop_eviction))
1146 list_del_init(&info->swaplist);
 1147 mutex_unlock(&shmem_swaplist_mutex);
 1148 }
 1149 }
 1150
 1151 simple_xattrs_free(&info->xattrs);
 1152 WARN_ON(inode->i_blocks);
 1153 shmem_free_inode(inode->i_sb);
 1154 clear_inode(inode);
1155}
1156
 1157static int shmem_find_swap_entries(struct address_space *mapping,
1158 pgoff_t start, struct folio_batch *fbatch,
1159 pgoff_t *indices, unsigned int type)
 1160{
 1161 XA_STATE(xas, &mapping->i_pages, start);
 1162 struct folio *folio;
 1163 swp_entry_t entry;
1164
1165 rcu_read_lock();
1166 xas_for_each(&xas, folio, ULONG_MAX) {
1167 if (xas_retry(&xas, folio))
 1168 continue;
 1169
 1170 if (!xa_is_value(folio))
 1171 continue;
 1172
 1173 entry = radix_to_swp_entry(folio);
1174 /*
1175 * swapin error entries can be found in the mapping. But they're
1176 * deliberately ignored here as we've done everything we can do.
1177 */
1178 if (swp_type(entry) != type)
1179 continue;
 1180
 1181 indices[folio_batch_count(fbatch)] = xas.xa_index;
1182 if (!folio_batch_add(fbatch, folio))
1183 break;
1184
1185 if (need_resched()) {
1186 xas_pause(&xas);
1187 cond_resched_rcu();
1188 }
 1189 }
 1190 rcu_read_unlock();
 1191
 1192 return xas.xa_index;
1193}
1194
 1195/*
1196 * Move the swapped pages for an inode to page cache. Returns the count
1197 * of pages swapped in, or the error in case of failure.
 1198 */
1199static int shmem_unuse_swap_entries(struct inode *inode,
1200 struct folio_batch *fbatch, pgoff_t *indices)
 1201{
1202 int i = 0;
1203 int ret = 0;
 1204 int error = 0;
 1205 struct address_space *mapping = inode->i_mapping;
 1206
1207 for (i = 0; i < folio_batch_count(fbatch); i++) {
1208 struct folio *folio = fbatch->folios[i];
 1209
 1210 if (!xa_is_value(folio))
 1211 continue;
1212 error = shmem_swapin_folio(inode, indices[i],
1213 &folio, SGP_CACHE,
1214 mapping_gfp_mask(mapping),
1215 NULL, NULL);
1216 if (error == 0) {
1217 folio_unlock(folio);
1218 folio_put(folio);
1219 ret++;
1220 }
1221 if (error == -ENOMEM)
1222 break;
1223 error = 0;
 1224 }
1225 return error ? error : ret;
1226}
 1227
1228/*
1229 * If swap found in inode, free it and move page from swapcache to filecache.
1230 */
 1231static int shmem_unuse_inode(struct inode *inode, unsigned int type)
1232{
1233 struct address_space *mapping = inode->i_mapping;
1234 pgoff_t start = 0;
 1235 struct folio_batch fbatch;
 1236 pgoff_t indices[PAGEVEC_SIZE];
1237 int ret = 0;
1238
 1239 do {
1240 folio_batch_init(&fbatch);
1241 shmem_find_swap_entries(mapping, start, &fbatch, indices, type);
1242 if (folio_batch_count(&fbatch) == 0) {
1243 ret = 0;
1244 break;
46f65ec1 1245 }
 1246
 1247 ret = shmem_unuse_swap_entries(inode, &fbatch, indices);
1248 if (ret < 0)
1249 break;
1250
 1251 start = indices[folio_batch_count(&fbatch) - 1];
1252 } while (true);
1253
1254 return ret;
1255}
1256
1257/*
1258 * Read all the shared memory data that resides in the swap
1259 * device 'type' back into memory, so the swap device can be
1260 * unused.
 1261 */
 1262int shmem_unuse(unsigned int type)
 1263{
 1264 struct shmem_inode_info *info, *next;
1265 int error = 0;
1266
1267 if (list_empty(&shmem_swaplist))
1268 return 0;
1269
1270 mutex_lock(&shmem_swaplist_mutex);
1271 list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
1272 if (!info->swapped) {
 1273 list_del_init(&info->swaplist);
1274 continue;
1275 }
1276 /*
1277 * Drop the swaplist mutex while searching the inode for swap;
1278 * but before doing so, make sure shmem_evict_inode() will not
1279 * remove placeholder inode from swaplist, nor let it be freed
1280 * (igrab() would protect from unlink, but not from unmount).
1281 */
1282 atomic_inc(&info->stop_eviction);
 1283 mutex_unlock(&shmem_swaplist_mutex);
 1284
 1285 error = shmem_unuse_inode(&info->vfs_inode, type);
 1286 cond_resched();
1287
1288 mutex_lock(&shmem_swaplist_mutex);
1289 next = list_next_entry(info, swaplist);
1290 if (!info->swapped)
1291 list_del_init(&info->swaplist);
1292 if (atomic_dec_and_test(&info->stop_eviction))
1293 wake_up_var(&info->stop_eviction);
 1294 if (error)
 1295 break;
 1296 }
 1297 mutex_unlock(&shmem_swaplist_mutex);
 1298
 1299 return error;
1300}
1301
1302/*
1303 * Move the page from the page cache to the swap cache.
1304 */
1305static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1306{
 1307 struct folio *folio = page_folio(page);
 1308 struct shmem_inode_info *info;
 1309 struct address_space *mapping;
 1310 struct inode *inode;
1311 swp_entry_t swap;
1312 pgoff_t index;
 1313
1314 /*
1315 * If /sys/kernel/mm/transparent_hugepage/shmem_enabled is "always" or
1316 * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages,
1317 * and its shmem_writeback() needs them to be split when swapping.
1318 */
 1319 if (folio_test_large(folio)) {
 1320 /* Ensure the subpages are still dirty */
 1321 folio_test_set_dirty(folio);
1322 if (split_huge_page(page) < 0)
1323 goto redirty;
1324 folio = page_folio(page);
1325 folio_clear_dirty(folio);
1326 }
1327
1328 BUG_ON(!folio_test_locked(folio));
1329 mapping = folio->mapping;
1330 index = folio->index;
1331 inode = mapping->host;
1332 info = SHMEM_I(inode);
1333 if (info->flags & VM_LOCKED)
1334 goto redirty;
 1335 if (!total_swap_pages)
1336 goto redirty;
1337
 1338 /*
1339 * Our capabilities prevent regular writeback or sync from ever calling
1340 * shmem_writepage; but a stacking filesystem might use ->writepage of
1341 * its underlying filesystem, in which case tmpfs should write out to
1342 * swap only in response to memory pressure, and not for the writeback
1343 * threads or sync.
 1344 */
1345 if (!wbc->for_reclaim) {
1346 WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
1347 goto redirty;
1348 }
1349
1350 /*
1351 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
1352 * value into swapfile.c, the only way we can correctly account for a
 1353 * fallocated folio arriving here is now to initialize it and write it.
 1354 *
 1355 * That's okay for a folio already fallocated earlier, but if we have
 1356 * not yet completed the fallocation, then (a) we want to keep track
 1357 * of this folio in case we have to undo it, and (b) it may not be a
 1358 * good idea to continue anyway, once we're pushing into swap. So
 1359 * reactivate the folio, and let shmem_fallocate() quit when too many.
 1360 */
 1361 if (!folio_test_uptodate(folio)) {
1362 if (inode->i_private) {
1363 struct shmem_falloc *shmem_falloc;
1364 spin_lock(&inode->i_lock);
1365 shmem_falloc = inode->i_private;
1366 if (shmem_falloc &&
 1367 !shmem_falloc->waitq &&
1368 index >= shmem_falloc->start &&
1369 index < shmem_falloc->next)
1370 shmem_falloc->nr_unswapped++;
1371 else
1372 shmem_falloc = NULL;
1373 spin_unlock(&inode->i_lock);
1374 if (shmem_falloc)
1375 goto redirty;
1376 }
1377 folio_zero_range(folio, 0, folio_size(folio));
1378 flush_dcache_folio(folio);
1379 folio_mark_uptodate(folio);
1380 }
1381
 1382 swap = folio_alloc_swap(folio);
1383 if (!swap.val)
1384 goto redirty;
 1385
1386 /*
1387 * Add inode to shmem_unuse()'s list of swapped-out inodes,
 1388 * if it's not already there. Do it now before the folio is
 1389 * moved to swap cache, when its pagelock no longer protects
 1390 * the inode from eviction. But don't unlock the mutex until
1391 * we've incremented swapped, because shmem_unuse_inode() will
1392 * prune a !swapped inode from the swaplist under this mutex.
 1393 */
1394 mutex_lock(&shmem_swaplist_mutex);
1395 if (list_empty(&info->swaplist))
 1396 list_add(&info->swaplist, &shmem_swaplist);
 1397
 1398 if (add_to_swap_cache(folio, swap,
1399 __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
1400 NULL) == 0) {
 1401 spin_lock_irq(&info->lock);
 1402 shmem_recalc_inode(inode);
 1403 info->swapped++;
 1404 spin_unlock_irq(&info->lock);
 1405
 1406 swap_shmem_alloc(swap);
 1407 shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));
 1408
 1409 mutex_unlock(&shmem_swaplist_mutex);
1410 BUG_ON(folio_mapped(folio));
1411 swap_writepage(&folio->page, wbc);
1412 return 0;
1413 }
1414
 1415 mutex_unlock(&shmem_swaplist_mutex);
 1416 put_swap_folio(folio, swap);
 1417redirty:
 1418 folio_mark_dirty(folio);
 1419 if (wbc->for_reclaim)
1420 return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */
1421 folio_unlock(folio);
 1422 return 0;
1423}
1424
 1425#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
 1426static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
 1427{
 1428 char buffer[64];
 1429
 1430 if (!mpol || mpol->mode == MPOL_DEFAULT)
 1431 return; /* show nothing */
 1432
 1433 mpol_to_str(buffer, sizeof(buffer), mpol);
1434
1435 seq_printf(seq, ",mpol=%s", buffer);
 1436}
1437
1438static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1439{
1440 struct mempolicy *mpol = NULL;
1441 if (sbinfo->mpol) {
 1442 raw_spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */
1443 mpol = sbinfo->mpol;
1444 mpol_get(mpol);
 1445 raw_spin_unlock(&sbinfo->stat_lock);
1446 }
1447 return mpol;
1448}
1449#else /* !CONFIG_NUMA || !CONFIG_TMPFS */
1450static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
1451{
1452}
1453static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1454{
1455 return NULL;
1456}
1457#endif /* CONFIG_NUMA && CONFIG_TMPFS */
1458#ifndef CONFIG_NUMA
1459#define vm_policy vm_private_data
1460#endif
 1461
1462static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
1463 struct shmem_inode_info *info, pgoff_t index)
1464{
1465 /* Create a pseudo vma that just contains the policy */
 1466 vma_init(vma, NULL);
1467 /* Bias interleave by inode number to distribute better across nodes */
1468 vma->vm_pgoff = index + info->vfs_inode.i_ino;
1469 vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
1470}
1471
1472static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
1473{
1474 /* Drop reference taken by mpol_shared_policy_lookup() */
1475 mpol_cond_put(vma->vm_policy);
1476}
1477
 1478static struct folio *shmem_swapin(swp_entry_t swap, gfp_t gfp,
 1479 struct shmem_inode_info *info, pgoff_t index)
 1480{
 1481 struct vm_area_struct pvma;
 1482 struct page *page;
1483 struct vm_fault vmf = {
1484 .vma = &pvma,
1485 };
 1486
 1487 shmem_pseudo_vma_init(&pvma, info, index);
 1488 page = swap_cluster_readahead(swap, gfp, &vmf);
 1489 shmem_pseudo_vma_destroy(&pvma);
 1490
1491 if (!page)
1492 return NULL;
1493 return page_folio(page);
1494}
1495
1496/*
1497 * Make sure huge_gfp is always more limited than limit_gfp.
1498 * Some of the flags set permissions, while others set limitations.
1499 */
1500static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
1501{
1502 gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
1503 gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
1504 gfp_t zoneflags = limit_gfp & GFP_ZONEMASK;
1505 gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK);
1506
1507 /* Allow allocations only from the originally specified zones. */
1508 result |= zoneflags;
1509
1510 /*
1511 * Minimize the result gfp by taking the union with the deny flags,
1512 * and the intersection of the allow flags.
1513 */
1514 result |= (limit_gfp & denyflags);
1515 result |= (huge_gfp & limit_gfp) & allowflags;
1516
1517 return result;
1518}
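The effect of limit_gfp_mask() can be seen with plain bit arithmetic; the single-bit "flag" values below are invented placeholders, not the real GFP bits, and the zone-mask handling is left out of this sketch:

#include <stdio.h>

int main(void)
{
	/* Hypothetical flags standing in for __GFP_* constants. */
	unsigned int IO = 0x1, FS = 0x2, RECLAIM = 0x4;		/* allow-style flags */
	unsigned int NOWARN = 0x10, NORETRY = 0x20;		/* deny-style flags */

	unsigned int allowflags = IO | FS | RECLAIM;
	unsigned int denyflags = NOWARN | NORETRY;

	unsigned int huge_gfp = IO | FS | RECLAIM;		/* permissive huge-page mask */
	unsigned int limit_gfp = IO | NORETRY;			/* stricter caller mask */

	unsigned int result = huge_gfp & ~allowflags;		/* keep the non-permission bits */
	result |= (limit_gfp & denyflags);			/* inherit every restriction */
	result |= (huge_gfp & limit_gfp) & allowflags;		/* permissions need both to agree */

	printf("result = 0x%x\n", result);	/* IO | NORETRY = 0x21 */
	return 0;
}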
1519
 1520static struct folio *shmem_alloc_hugefolio(gfp_t gfp,
1521 struct shmem_inode_info *info, pgoff_t index)
1522{
1523 struct vm_area_struct pvma;
1524 struct address_space *mapping = info->vfs_inode.i_mapping;
1525 pgoff_t hindex;
 1526 struct folio *folio;
 1527
 1528 hindex = round_down(index, HPAGE_PMD_NR);
1529 if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1,
1530 XA_PRESENT))
 1531 return NULL;
 1532
 1533 shmem_pseudo_vma_init(&pvma, info, hindex);
 1534 folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, &pvma, 0, true);
 1535 shmem_pseudo_vma_destroy(&pvma);
 1536 if (!folio)
 1537 count_vm_event(THP_FILE_FALLBACK);
 1538 return folio;
1539}
1540
 1541static struct folio *shmem_alloc_folio(gfp_t gfp,
 1542 struct shmem_inode_info *info, pgoff_t index)
1543{
1544 struct vm_area_struct pvma;
 1545 struct folio *folio;
 1546
 1547 shmem_pseudo_vma_init(&pvma, info, index);
 1548 folio = vma_alloc_folio(gfp, 0, &pvma, 0, false);
1549 shmem_pseudo_vma_destroy(&pvma);
1550
1551 return folio;
1552}
1553
 1554static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode,
1555 pgoff_t index, bool huge)
1556{
 1557 struct shmem_inode_info *info = SHMEM_I(inode);
 1558 struct folio *folio;
1559 int nr;
1560 int err = -ENOSPC;
 1561
 1562 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
1563 huge = false;
1564 nr = huge ? HPAGE_PMD_NR : 1;
1565
 1566 if (!shmem_inode_acct_block(inode, nr))
 1567 goto failed;
1568
1569 if (huge)
 1570 folio = shmem_alloc_hugefolio(gfp, info, index);
 1571 else
1572 folio = shmem_alloc_folio(gfp, info, index);
1573 if (folio) {
1574 __folio_set_locked(folio);
1575 __folio_set_swapbacked(folio);
 1576 return folio;
 1577 }
 1578
 1579 err = -ENOMEM;
 1580 shmem_inode_unacct_blocks(inode, nr);
1581failed:
1582 return ERR_PTR(err);
 1583}
 1584
1585/*
1586 * When a page is moved from swapcache to shmem filecache (either by the
 1587 * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of
1588 * shmem_unuse_inode()), it may have been read in earlier from swap, in
1589 * ignorance of the mapping it belongs to. If that mapping has special
1590 * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
1591 * we may need to copy to a suitable page before moving to filecache.
1592 *
1593 * In a future release, this may well be extended to respect cpuset and
1594 * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
1595 * but for now it is a simple matter of zone.
1596 */
069d849c 1597static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp)
bde05d1c 1598{
069d849c 1599 return folio_zonenum(folio) > gfp_zone(gfp);
bde05d1c
HD
1600}
1601
0d698e25 1602static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
bde05d1c
HD
1603 struct shmem_inode_info *info, pgoff_t index)
1604{
d21bba2b 1605 struct folio *old, *new;
bde05d1c 1606 struct address_space *swap_mapping;
c1cb20d4 1607 swp_entry_t entry;
bde05d1c
HD
1608 pgoff_t swap_index;
1609 int error;
1610
0d698e25 1611 old = *foliop;
907ea17e 1612 entry = folio_swap_entry(old);
c1cb20d4 1613 swap_index = swp_offset(entry);
907ea17e 1614 swap_mapping = swap_address_space(entry);
bde05d1c
HD
1615
1616 /*
1617 * We have arrived here because our zones are constrained, so don't
1618 * limit chance of success by further cpuset and node constraints.
1619 */
1620 gfp &= ~GFP_CONSTRAINT_MASK;
1621 VM_BUG_ON_FOLIO(folio_test_large(old), old);
1622 new = shmem_alloc_folio(gfp, info, index);
1623 if (!new)
 1624 return -ENOMEM;
 1625
1626 folio_get(new);
1627 folio_copy(new, old);
1628 flush_dcache_folio(new);
 1629
1630 __folio_set_locked(new);
1631 __folio_set_swapbacked(new);
1632 folio_mark_uptodate(new);
1633 folio_set_swap_entry(new, entry);
1634 folio_set_swapcache(new);
1635
1636 /*
1637 * Our caller will very soon move newpage out of swapcache, but it's
1638 * a nice clean interface for us to replace oldpage by newpage there.
1639 */
 1640 xa_lock_irq(&swap_mapping->i_pages);
 1641 error = shmem_replace_entry(swap_mapping, swap_index, old, new);
 1642 if (!error) {
 1643 mem_cgroup_migrate(old, new);
1644 __lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1);
1645 __lruvec_stat_mod_folio(new, NR_SHMEM, 1);
1646 __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1);
1647 __lruvec_stat_mod_folio(old, NR_SHMEM, -1);
0142ef6c 1648 }
b93b0163 1649 xa_unlock_irq(&swap_mapping->i_pages);
bde05d1c 1650
0142ef6c
HD
1651 if (unlikely(error)) {
1652 /*
 1653 * Is this possible? I think not, now that our callers check
 1654 * both the swapcache flag and the swap entry after taking the folio lock;
 1655 * but be defensive: point old at the new folio, so it is cleared and freed.
1656 */
907ea17e 1657 old = new;
0142ef6c 1658 } else {
907ea17e 1659 folio_add_lru(new);
0d698e25 1660 *foliop = new;
0142ef6c 1661 }
bde05d1c 1662
907ea17e
MWO
1663 folio_clear_swapcache(old);
1664 old->private = NULL;
bde05d1c 1665
907ea17e
MWO
1666 folio_unlock(old);
1667 folio_put_refs(old, 2);
0142ef6c 1668 return error;
bde05d1c
HD
1669}
1670
6cec2b95
ML
1671static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
1672 struct folio *folio, swp_entry_t swap)
1673{
1674 struct address_space *mapping = inode->i_mapping;
1675 struct shmem_inode_info *info = SHMEM_I(inode);
1676 swp_entry_t swapin_error;
1677 void *old;
1678
1679 swapin_error = make_swapin_error_entry(&folio->page);
1680 old = xa_cmpxchg_irq(&mapping->i_pages, index,
1681 swp_to_radix_entry(swap),
1682 swp_to_radix_entry(swapin_error), 0);
1683 if (old != swp_to_radix_entry(swap))
1684 return;
1685
1686 folio_wait_writeback(folio);
75fa68a5 1687 delete_from_swap_cache(folio);
6cec2b95
ML
1688 spin_lock_irq(&info->lock);
1689 /*
 1690 * Don't treat a swapin error folio as alloced. Otherwise inode->i_blocks
 1691 * won't be 0 when the inode is released, which would trigger
 1692 * WARN_ON(inode->i_blocks) in shmem_evict_inode.
1693 */
1694 info->alloced--;
1695 info->swapped--;
1696 shmem_recalc_inode(inode);
1697 spin_unlock_irq(&info->lock);
1698 swap_free(swap);
1699}
1700
c5bf121e 1701/*
833de10f
ML
1702 * Swap in the folio pointed to by *foliop.
1703 * Caller has to make sure that *foliop contains a valid swapped folio.
 1704 * Returns 0 with the folio in *foliop on success; on failure, returns the
 1705 * error code and sets *foliop to NULL.
c5bf121e 1706 */
da08e9b7
MWO
1707static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
1708 struct folio **foliop, enum sgp_type sgp,
c5bf121e
VRP
1709 gfp_t gfp, struct vm_area_struct *vma,
1710 vm_fault_t *fault_type)
1711{
1712 struct address_space *mapping = inode->i_mapping;
1713 struct shmem_inode_info *info = SHMEM_I(inode);
04f94e3f 1714 struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL;
da08e9b7 1715 struct folio *folio = NULL;
c5bf121e
VRP
1716 swp_entry_t swap;
1717 int error;
1718
da08e9b7
MWO
1719 VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
1720 swap = radix_to_swp_entry(*foliop);
1721 *foliop = NULL;
c5bf121e 1722
6cec2b95
ML
1723 if (is_swapin_error_entry(swap))
1724 return -EIO;
1725
c5bf121e 1726 /* Look it up and read it in.. */
5739a81c
MWO
1727 folio = swap_cache_get_folio(swap, NULL, 0);
1728 if (!folio) {
c5bf121e
VRP
1729 /* Or update major stats only when swapin succeeds?? */
1730 if (fault_type) {
1731 *fault_type |= VM_FAULT_MAJOR;
1732 count_vm_event(PGMAJFAULT);
1733 count_memcg_event_mm(charge_mm, PGMAJFAULT);
1734 }
1735 /* Here we actually start the io */
5739a81c
MWO
1736 folio = shmem_swapin(swap, gfp, info, index);
1737 if (!folio) {
c5bf121e
VRP
1738 error = -ENOMEM;
1739 goto failed;
1740 }
1741 }
1742
833de10f 1743 /* We have to do this with folio locked to prevent races */
da08e9b7
MWO
1744 folio_lock(folio);
1745 if (!folio_test_swapcache(folio) ||
1746 folio_swap_entry(folio).val != swap.val ||
c5bf121e
VRP
1747 !shmem_confirm_swap(mapping, index, swap)) {
1748 error = -EEXIST;
1749 goto unlock;
1750 }
da08e9b7 1751 if (!folio_test_uptodate(folio)) {
c5bf121e
VRP
1752 error = -EIO;
1753 goto failed;
1754 }
da08e9b7 1755 folio_wait_writeback(folio);
c5bf121e 1756
8a84802e
SP
1757 /*
1758 * Some architectures may have to restore extra metadata to the
da08e9b7 1759 * folio after reading from swap.
8a84802e 1760 */
da08e9b7 1761 arch_swap_restore(swap, folio);
8a84802e 1762
069d849c 1763 if (shmem_should_replace_folio(folio, gfp)) {
0d698e25 1764 error = shmem_replace_folio(&folio, gfp, info, index);
c5bf121e
VRP
1765 if (error)
1766 goto failed;
1767 }
1768
b7dd44a1 1769 error = shmem_add_to_page_cache(folio, mapping, index,
3fea5a49
JW
1770 swp_to_radix_entry(swap), gfp,
1771 charge_mm);
1772 if (error)
14235ab3 1773 goto failed;
c5bf121e
VRP
1774
1775 spin_lock_irq(&info->lock);
1776 info->swapped--;
1777 shmem_recalc_inode(inode);
1778 spin_unlock_irq(&info->lock);
1779
1780 if (sgp == SGP_WRITE)
da08e9b7 1781 folio_mark_accessed(folio);
c5bf121e 1782
75fa68a5 1783 delete_from_swap_cache(folio);
da08e9b7 1784 folio_mark_dirty(folio);
c5bf121e
VRP
1785 swap_free(swap);
1786
da08e9b7 1787 *foliop = folio;
c5bf121e
VRP
1788 return 0;
1789failed:
1790 if (!shmem_confirm_swap(mapping, index, swap))
1791 error = -EEXIST;
6cec2b95
ML
1792 if (error == -EIO)
1793 shmem_set_folio_swapin_error(inode, index, folio, swap);
c5bf121e 1794unlock:
da08e9b7
MWO
1795 if (folio) {
1796 folio_unlock(folio);
1797 folio_put(folio);
c5bf121e
VRP
1798 }
1799
1800 return error;
1801}
1802
1da177e4 1803/*
fc26babb 1804 * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate
1da177e4
LT
1805 *
1806 * If we allocate a new one we do not mark it dirty. That's up to the
 1807 * vm. If we swap it in we mark it dirty, because we also free the swap
9e18eb29
ALC
 1808 * entry: a page cannot live in both the swap and page cache.
1809 *
c949b097 1810 * vma, vmf, and fault_type are only supplied by shmem_fault:
9e18eb29 1811 * otherwise they are NULL.
1da177e4 1812 */
fc26babb
MWO
1813static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
1814 struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
1815 struct vm_area_struct *vma, struct vm_fault *vmf,
1816 vm_fault_t *fault_type)
1da177e4
LT
1817{
1818 struct address_space *mapping = inode->i_mapping;
23f919d4 1819 struct shmem_inode_info *info = SHMEM_I(inode);
1da177e4 1820 struct shmem_sb_info *sbinfo;
9e18eb29 1821 struct mm_struct *charge_mm;
b7dd44a1 1822 struct folio *folio;
800d8c63 1823 pgoff_t hindex = index;
164cc4fe 1824 gfp_t huge_gfp;
1da177e4 1825 int error;
54af6042 1826 int once = 0;
1635f6a7 1827 int alloced = 0;
1da177e4 1828
09cbfeaf 1829 if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
1da177e4 1830 return -EFBIG;
1da177e4 1831repeat:
c5bf121e
VRP
1832 if (sgp <= SGP_CACHE &&
1833 ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
1834 return -EINVAL;
1835 }
1836
1837 sbinfo = SHMEM_SB(inode->i_sb);
04f94e3f 1838 charge_mm = vma ? vma->vm_mm : NULL;
c5bf121e 1839
b1d0ec3a
MWO
1840 folio = __filemap_get_folio(mapping, index, FGP_ENTRY | FGP_LOCK, 0);
1841 if (folio && vma && userfaultfd_minor(vma)) {
1842 if (!xa_is_value(folio)) {
1843 folio_unlock(folio);
1844 folio_put(folio);
c949b097
AR
1845 }
1846 *fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
1847 return 0;
1848 }
1849
b1d0ec3a 1850 if (xa_is_value(folio)) {
da08e9b7 1851 error = shmem_swapin_folio(inode, index, &folio,
c5bf121e
VRP
1852 sgp, gfp, vma, fault_type);
1853 if (error == -EEXIST)
1854 goto repeat;
54af6042 1855
fc26babb 1856 *foliop = folio;
c5bf121e 1857 return error;
54af6042
HD
1858 }
1859
b1d0ec3a
MWO
1860 if (folio) {
1861 hindex = folio->index;
acdd9f8e 1862 if (sgp == SGP_WRITE)
b1d0ec3a
MWO
1863 folio_mark_accessed(folio);
1864 if (folio_test_uptodate(folio))
acdd9f8e 1865 goto out;
fc26babb 1866 /* fallocated folio */
1635f6a7
HD
1867 if (sgp != SGP_READ)
1868 goto clear;
b1d0ec3a
MWO
1869 folio_unlock(folio);
1870 folio_put(folio);
1635f6a7 1871 }
27ab7006
HD
1872
1873 /*
fc26babb
MWO
1874 * SGP_READ: succeed on hole, with NULL folio, letting caller zero.
1875 * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail.
acdd9f8e 1876 */
fc26babb 1877 *foliop = NULL;
acdd9f8e
HD
1878 if (sgp == SGP_READ)
1879 return 0;
1880 if (sgp == SGP_NOALLOC)
1881 return -ENOENT;
1882
1883 /*
1884 * Fast cache lookup and swap lookup did not find it: allocate.
27ab7006 1885 */
54af6042 1886
c5bf121e
VRP
1887 if (vma && userfaultfd_missing(vma)) {
1888 *fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
1889 return 0;
1890 }
cfda0526 1891
5e6e5a12 1892 if (!shmem_is_huge(vma, inode, index))
c5bf121e 1893 goto alloc_nohuge;
1da177e4 1894
164cc4fe 1895 huge_gfp = vma_thp_gfp_mask(vma);
78cc8cdc 1896 huge_gfp = limit_gfp_mask(huge_gfp, gfp);
b1d0ec3a
MWO
1897 folio = shmem_alloc_and_acct_folio(huge_gfp, inode, index, true);
1898 if (IS_ERR(folio)) {
c5bf121e 1899alloc_nohuge:
b1d0ec3a 1900 folio = shmem_alloc_and_acct_folio(gfp, inode, index, false);
c5bf121e 1901 }
b1d0ec3a 1902 if (IS_ERR(folio)) {
c5bf121e 1903 int retry = 5;
800d8c63 1904
b1d0ec3a
MWO
1905 error = PTR_ERR(folio);
1906 folio = NULL;
c5bf121e
VRP
1907 if (error != -ENOSPC)
1908 goto unlock;
1909 /*
fc26babb 1910 * Try to reclaim some space by splitting a large folio
c5bf121e
VRP
1911 * beyond i_size on the filesystem.
1912 */
1913 while (retry--) {
1914 int ret;
66d2f4d2 1915
c5bf121e
VRP
1916 ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
1917 if (ret == SHRINK_STOP)
1918 break;
1919 if (ret)
1920 goto alloc_nohuge;
b065b432 1921 }
c5bf121e
VRP
1922 goto unlock;
1923 }
54af6042 1924
b1d0ec3a 1925 hindex = round_down(index, folio_nr_pages(folio));
54af6042 1926
c5bf121e 1927 if (sgp == SGP_WRITE)
b1d0ec3a 1928 __folio_set_referenced(folio);
c5bf121e 1929
b7dd44a1 1930 error = shmem_add_to_page_cache(folio, mapping, hindex,
3fea5a49
JW
1931 NULL, gfp & GFP_RECLAIM_MASK,
1932 charge_mm);
1933 if (error)
c5bf121e 1934 goto unacct;
b1d0ec3a 1935 folio_add_lru(folio);
779750d2 1936
c5bf121e 1937 spin_lock_irq(&info->lock);
b1d0ec3a 1938 info->alloced += folio_nr_pages(folio);
fa020a2b 1939 inode->i_blocks += (blkcnt_t)BLOCKS_PER_PAGE << folio_order(folio);
c5bf121e
VRP
1940 shmem_recalc_inode(inode);
1941 spin_unlock_irq(&info->lock);
1942 alloced = true;
1943
b1d0ec3a 1944 if (folio_test_pmd_mappable(folio) &&
c5bf121e 1945 DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
fc26babb 1946 folio_next_index(folio) - 1) {
ec9516fb 1947 /*
fc26babb 1948 * Part of the large folio is beyond i_size: subject
c5bf121e 1949 * to shrink under memory pressure.
1635f6a7 1950 */
c5bf121e 1951 spin_lock(&sbinfo->shrinklist_lock);
1635f6a7 1952 /*
c5bf121e
VRP
 1953 * list_empty_careful() defends against unlocked access to
 1954 * ->shrink_list in shmem_unused_huge_shrink().
ec9516fb 1955 */
c5bf121e
VRP
1956 if (list_empty_careful(&info->shrinklist)) {
1957 list_add_tail(&info->shrinklist,
1958 &sbinfo->shrinklist);
1959 sbinfo->shrinklist_len++;
1960 }
1961 spin_unlock(&sbinfo->shrinklist_lock);
1962 }
800d8c63 1963
c5bf121e 1964 /*
fc26babb 1965 * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio.
c5bf121e
VRP
1966 */
1967 if (sgp == SGP_FALLOC)
1968 sgp = SGP_WRITE;
1969clear:
1970 /*
fc26babb
MWO
1971 * Let SGP_WRITE caller clear ends if write does not fill folio;
1972 * but SGP_FALLOC on a folio fallocated earlier must initialize
c5bf121e
VRP
1973 * it now, lest undo on failure cancel our earlier guarantee.
1974 */
b1d0ec3a
MWO
1975 if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) {
1976 long i, n = folio_nr_pages(folio);
c5bf121e 1977
b1d0ec3a
MWO
1978 for (i = 0; i < n; i++)
1979 clear_highpage(folio_page(folio, i));
1980 flush_dcache_folio(folio);
1981 folio_mark_uptodate(folio);
1da177e4 1982 }
bde05d1c 1983
54af6042 1984 /* Perhaps the file has been truncated since we checked */
75edd345 1985 if (sgp <= SGP_CACHE &&
09cbfeaf 1986 ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
267a4c76 1987 if (alloced) {
b1d0ec3a
MWO
1988 folio_clear_dirty(folio);
1989 filemap_remove_folio(folio);
4595ef88 1990 spin_lock_irq(&info->lock);
267a4c76 1991 shmem_recalc_inode(inode);
4595ef88 1992 spin_unlock_irq(&info->lock);
267a4c76 1993 }
54af6042 1994 error = -EINVAL;
267a4c76 1995 goto unlock;
e83c32e8 1996 }
63ec1973 1997out:
fc26babb 1998 *foliop = folio;
54af6042 1999 return 0;
1da177e4 2000
59a16ead 2001 /*
54af6042 2002 * Error recovery.
59a16ead 2003 */
54af6042 2004unacct:
b1d0ec3a 2005 shmem_inode_unacct_blocks(inode, folio_nr_pages(folio));
800d8c63 2006
b1d0ec3a
MWO
2007 if (folio_test_large(folio)) {
2008 folio_unlock(folio);
2009 folio_put(folio);
800d8c63
KS
2010 goto alloc_nohuge;
2011 }
d1899228 2012unlock:
b1d0ec3a
MWO
2013 if (folio) {
2014 folio_unlock(folio);
2015 folio_put(folio);
54af6042
HD
2016 }
2017 if (error == -ENOSPC && !once++) {
4595ef88 2018 spin_lock_irq(&info->lock);
54af6042 2019 shmem_recalc_inode(inode);
4595ef88 2020 spin_unlock_irq(&info->lock);
27ab7006 2021 goto repeat;
ff36b801 2022 }
7f4446ee 2023 if (error == -EEXIST)
54af6042
HD
2024 goto repeat;
2025 return error;
1da177e4
LT
2026}
2027
fc26babb
MWO
2028static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
2029 struct page **pagep, enum sgp_type sgp,
2030 gfp_t gfp, struct vm_area_struct *vma,
2031 struct vm_fault *vmf, vm_fault_t *fault_type)
2032{
2033 struct folio *folio = NULL;
2034 int ret = shmem_get_folio_gfp(inode, index, &folio, sgp, gfp, vma,
2035 vmf, fault_type);
2036
2037 if (folio)
2038 *pagep = folio_file_page(folio, index);
2039 else
2040 *pagep = NULL;
2041 return ret;
2042}
2043
2044int shmem_getpage(struct inode *inode, pgoff_t index,
2045 struct page **pagep, enum sgp_type sgp)
2046{
2047 return shmem_getpage_gfp(inode, index, pagep, sgp,
2048 mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL);
2049}
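
/*
 * Illustrative sketch (not part of shmem.c): the contract an in-kernel user
 * of shmem_getpage() sees. The page comes back locked and uptodate; with
 * SGP_READ a hole legitimately comes back as a NULL page and should be read
 * as zeroes. This helper is hypothetical and error handling is minimal.
 */
static int example_peek_byte(struct inode *inode, pgoff_t index, u8 *out)
{
	struct page *page;
	u8 *kaddr;
	int err;

	err = shmem_getpage(inode, index, &page, SGP_READ);
	if (err)
		return err;
	if (!page) {			/* hole: nothing allocated, reads as zero */
		*out = 0;
		return 0;
	}
	kaddr = kmap_local_page(page);
	*out = kaddr[0];
	kunmap_local(kaddr);
	unlock_page(page);		/* shmem_getpage() returns the page locked */
	put_page(page);
	return 0;
}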
2050
10d20bd2
LT
2051/*
2052 * This is like autoremove_wake_function, but it removes the wait queue
2053 * entry unconditionally - even if something else had already woken the
2054 * target.
2055 */
ac6424b9 2056static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
10d20bd2
LT
2057{
2058 int ret = default_wake_function(wait, mode, sync, key);
2055da97 2059 list_del_init(&wait->entry);
10d20bd2
LT
2060 return ret;
2061}
2062
20acce67 2063static vm_fault_t shmem_fault(struct vm_fault *vmf)
1da177e4 2064{
11bac800 2065 struct vm_area_struct *vma = vmf->vma;
496ad9aa 2066 struct inode *inode = file_inode(vma->vm_file);
9e18eb29 2067 gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
68a54100 2068 struct folio *folio = NULL;
20acce67
SJ
2069 int err;
2070 vm_fault_t ret = VM_FAULT_LOCKED;
1da177e4 2071
f00cdc6d
HD
2072 /*
2073 * Trinity finds that probing a hole which tmpfs is punching can
2074 * prevent the hole-punch from ever completing: which in turn
9608703e 2075 * locks writers out with its hold on i_rwsem. So refrain from
8e205f77
HD
2076 * faulting pages into the hole while it's being punched. Although
2077 * shmem_undo_range() does remove the additions, it may be unable to
2078 * keep up, as each new page needs its own unmap_mapping_range() call,
2079 * and the i_mmap tree grows ever slower to scan if new vmas are added.
2080 *
2081 * It does not matter if we sometimes reach this check just before the
2082 * hole-punch begins, so that one fault then races with the punch:
2083 * we just need to make racing faults a rare case.
2084 *
2085 * The implementation below would be much simpler if we just used a
9608703e 2086 * standard mutex or completion: but we cannot take i_rwsem in fault,
8e205f77 2087 * and bloating every shmem inode for this unlikely case would be sad.
f00cdc6d
HD
2088 */
2089 if (unlikely(inode->i_private)) {
2090 struct shmem_falloc *shmem_falloc;
2091
2092 spin_lock(&inode->i_lock);
2093 shmem_falloc = inode->i_private;
8e205f77
HD
2094 if (shmem_falloc &&
2095 shmem_falloc->waitq &&
2096 vmf->pgoff >= shmem_falloc->start &&
2097 vmf->pgoff < shmem_falloc->next) {
8897c1b1 2098 struct file *fpin;
8e205f77 2099 wait_queue_head_t *shmem_falloc_waitq;
10d20bd2 2100 DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
8e205f77
HD
2101
2102 ret = VM_FAULT_NOPAGE;
8897c1b1
KS
2103 fpin = maybe_unlock_mmap_for_io(vmf, NULL);
2104 if (fpin)
8e205f77 2105 ret = VM_FAULT_RETRY;
8e205f77
HD
2106
2107 shmem_falloc_waitq = shmem_falloc->waitq;
2108 prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
2109 TASK_UNINTERRUPTIBLE);
2110 spin_unlock(&inode->i_lock);
2111 schedule();
2112
2113 /*
2114 * shmem_falloc_waitq points into the shmem_fallocate()
2115 * stack of the hole-punching task: shmem_falloc_waitq
2116 * is usually invalid by the time we reach here, but
2117 * finish_wait() does not dereference it in that case;
2118 * though i_lock needed lest racing with wake_up_all().
2119 */
2120 spin_lock(&inode->i_lock);
2121 finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
2122 spin_unlock(&inode->i_lock);
8897c1b1
KS
2123
2124 if (fpin)
2125 fput(fpin);
8e205f77 2126 return ret;
f00cdc6d 2127 }
8e205f77 2128 spin_unlock(&inode->i_lock);
f00cdc6d
HD
2129 }
2130
68a54100 2131 err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE,
cfda0526 2132 gfp, vma, vmf, &ret);
20acce67
SJ
2133 if (err)
2134 return vmf_error(err);
68a54100
MWO
2135 if (folio)
2136 vmf->page = folio_file_page(folio, vmf->pgoff);
68da9f05 2137 return ret;
1da177e4
LT
2138}
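
/*
 * Illustrative sketch (not part of shmem.c): shmem_fault() above is reached
 * through shmem_vm_ops when a mapping of a tmpfs or memfd file takes a page
 * fault. Minimal userspace trigger (glibc >= 2.27 for memfd_create());
 * error handling is omitted for brevity.
 */
#define _GNU_SOURCE
#include <sys/mman.h>
#include <unistd.h>

int example_fault_in(void)
{
	int fd = memfd_create("example", 0);
	char *p;

	ftruncate(fd, 4096);
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	p[0] = 1;	/* first touch enters shmem_fault() and allocates the page */
	munmap(p, 4096);
	close(fd);
	return 0;
}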
2139
c01d5b30
HD
2140unsigned long shmem_get_unmapped_area(struct file *file,
2141 unsigned long uaddr, unsigned long len,
2142 unsigned long pgoff, unsigned long flags)
2143{
2144 unsigned long (*get_area)(struct file *,
2145 unsigned long, unsigned long, unsigned long, unsigned long);
2146 unsigned long addr;
2147 unsigned long offset;
2148 unsigned long inflated_len;
2149 unsigned long inflated_addr;
2150 unsigned long inflated_offset;
2151
2152 if (len > TASK_SIZE)
2153 return -ENOMEM;
2154
2155 get_area = current->mm->get_unmapped_area;
2156 addr = get_area(file, uaddr, len, pgoff, flags);
2157
396bcc52 2158 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
c01d5b30
HD
2159 return addr;
2160 if (IS_ERR_VALUE(addr))
2161 return addr;
2162 if (addr & ~PAGE_MASK)
2163 return addr;
2164 if (addr > TASK_SIZE - len)
2165 return addr;
2166
2167 if (shmem_huge == SHMEM_HUGE_DENY)
2168 return addr;
2169 if (len < HPAGE_PMD_SIZE)
2170 return addr;
2171 if (flags & MAP_FIXED)
2172 return addr;
2173 /*
2174 * Our priority is to support MAP_SHARED mapped hugely;
2175 * and support MAP_PRIVATE mapped hugely too, until it is COWed.
99158997
KS
2176 * But if caller specified an address hint and we allocated area there
2177 * successfully, respect that as before.
c01d5b30 2178 */
99158997 2179 if (uaddr == addr)
c01d5b30
HD
2180 return addr;
2181
2182 if (shmem_huge != SHMEM_HUGE_FORCE) {
2183 struct super_block *sb;
2184
2185 if (file) {
2186 VM_BUG_ON(file->f_op != &shmem_file_operations);
2187 sb = file_inode(file)->i_sb;
2188 } else {
2189 /*
2190 * Called directly from mm/mmap.c, or drivers/char/mem.c
2191 * for "/dev/zero", to create a shared anonymous object.
2192 */
2193 if (IS_ERR(shm_mnt))
2194 return addr;
2195 sb = shm_mnt->mnt_sb;
2196 }
3089bf61 2197 if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER)
c01d5b30
HD
2198 return addr;
2199 }
2200
2201 offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1);
2202 if (offset && offset + len < 2 * HPAGE_PMD_SIZE)
2203 return addr;
2204 if ((addr & (HPAGE_PMD_SIZE-1)) == offset)
2205 return addr;
2206
2207 inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE;
2208 if (inflated_len > TASK_SIZE)
2209 return addr;
2210 if (inflated_len < len)
2211 return addr;
2212
99158997 2213 inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags);
c01d5b30
HD
2214 if (IS_ERR_VALUE(inflated_addr))
2215 return addr;
2216 if (inflated_addr & ~PAGE_MASK)
2217 return addr;
2218
2219 inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1);
2220 inflated_addr += offset - inflated_offset;
2221 if (inflated_offset > offset)
2222 inflated_addr += HPAGE_PMD_SIZE;
2223
2224 if (inflated_addr > TASK_SIZE - len)
2225 return addr;
2226 return inflated_addr;
2227}
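
/*
 * Illustrative sketch (not part of shmem.c): the address arithmetic above,
 * in isolation. Having asked for HPAGE_PMD_SIZE - PAGE_SIZE of slack, shift
 * the returned address so that its offset within a PMD-sized block matches
 * the file offset's, which lets the fault path install PMD mappings later.
 * Hypothetical helper, restating the rounding step on its own.
 */
static unsigned long example_align_for_pmd(unsigned long inflated_addr,
					   unsigned long pgoff)
{
	/* e.g. with 2MB HPAGE_PMD_SIZE, a pgoff 0x34000 bytes into a block */
	unsigned long offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE - 1);
	unsigned long inflated_offset = inflated_addr & (HPAGE_PMD_SIZE - 1);

	inflated_addr += offset - inflated_offset;
	if (inflated_offset > offset)	/* stepped backwards: move up one block */
		inflated_addr += HPAGE_PMD_SIZE;
	return inflated_addr;
}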
2228
1da177e4 2229#ifdef CONFIG_NUMA
41ffe5d5 2230static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
1da177e4 2231{
496ad9aa 2232 struct inode *inode = file_inode(vma->vm_file);
41ffe5d5 2233 return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
1da177e4
LT
2234}
2235
d8dc74f2
AB
2236static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
2237 unsigned long addr)
1da177e4 2238{
496ad9aa 2239 struct inode *inode = file_inode(vma->vm_file);
41ffe5d5 2240 pgoff_t index;
1da177e4 2241
41ffe5d5
HD
2242 index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2243 return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
1da177e4
LT
2244}
2245#endif
2246
d7c9e99a 2247int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
1da177e4 2248{
496ad9aa 2249 struct inode *inode = file_inode(file);
1da177e4
LT
2250 struct shmem_inode_info *info = SHMEM_I(inode);
2251 int retval = -ENOMEM;
2252
ea0dfeb4
HD
2253 /*
2254 * What serializes the accesses to info->flags?
2255 * ipc_lock_object() when called from shmctl_do_lock(),
2256 * no serialization needed when called from shm_destroy().
2257 */
1da177e4 2258 if (lock && !(info->flags & VM_LOCKED)) {
d7c9e99a 2259 if (!user_shm_lock(inode->i_size, ucounts))
1da177e4
LT
2260 goto out_nomem;
2261 info->flags |= VM_LOCKED;
89e004ea 2262 mapping_set_unevictable(file->f_mapping);
1da177e4 2263 }
d7c9e99a
AG
2264 if (!lock && (info->flags & VM_LOCKED) && ucounts) {
2265 user_shm_unlock(inode->i_size, ucounts);
1da177e4 2266 info->flags &= ~VM_LOCKED;
89e004ea 2267 mapping_clear_unevictable(file->f_mapping);
1da177e4
LT
2268 }
2269 retval = 0;
89e004ea 2270
1da177e4 2271out_nomem:
1da177e4
LT
2272 return retval;
2273}
2274
9b83a6a8 2275static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
1da177e4 2276{
ab3948f5 2277 struct shmem_inode_info *info = SHMEM_I(file_inode(file));
22247efd 2278 int ret;
ab3948f5 2279
22247efd
PX
2280 ret = seal_check_future_write(info->seals, vma);
2281 if (ret)
2282 return ret;
ab3948f5 2283
51b0bff2
CM
2284 /* arm64 - allow memory tagging on RAM-based files */
2285 vma->vm_flags |= VM_MTE_ALLOWED;
2286
1da177e4
LT
2287 file_accessed(file);
2288 vma->vm_ops = &shmem_vm_ops;
2289 return 0;
2290}
2291
cb241339
HD
2292#ifdef CONFIG_TMPFS_XATTR
2293static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
2294
2295/*
2296 * chattr's fsflags are unrelated to extended attributes,
2297 * but tmpfs has chosen to enable them under the same config option.
2298 */
2299static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
2300{
2301 unsigned int i_flags = 0;
2302
2303 if (fsflags & FS_NOATIME_FL)
2304 i_flags |= S_NOATIME;
2305 if (fsflags & FS_APPEND_FL)
2306 i_flags |= S_APPEND;
2307 if (fsflags & FS_IMMUTABLE_FL)
2308 i_flags |= S_IMMUTABLE;
2309 /*
2310 * But FS_NODUMP_FL does not require any action in i_flags.
2311 */
2312 inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE);
2313}
2314#else
2315static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
e408e695 2316{
e408e695 2317}
cb241339
HD
2318#define shmem_initxattrs NULL
2319#endif
e408e695
TT
2320
2321static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir,
09208d15 2322 umode_t mode, dev_t dev, unsigned long flags)
1da177e4
LT
2323{
2324 struct inode *inode;
2325 struct shmem_inode_info *info;
2326 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
e809d5f0 2327 ino_t ino;
1da177e4 2328
e809d5f0 2329 if (shmem_reserve_inode(sb, &ino))
5b04c689 2330 return NULL;
1da177e4
LT
2331
2332 inode = new_inode(sb);
2333 if (inode) {
e809d5f0 2334 inode->i_ino = ino;
21cb47be 2335 inode_init_owner(&init_user_ns, inode, dir, mode);
1da177e4 2336 inode->i_blocks = 0;
078cd827 2337 inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
46c9a946 2338 inode->i_generation = prandom_u32();
1da177e4
LT
2339 info = SHMEM_I(inode);
2340 memset(info, 0, (char *)inode - (char *)info);
2341 spin_lock_init(&info->lock);
af53d3e9 2342 atomic_set(&info->stop_eviction, 0);
40e041a2 2343 info->seals = F_SEAL_SEAL;
0b0a0806 2344 info->flags = flags & VM_NORESERVE;
f7cd16a5 2345 info->i_crtime = inode->i_mtime;
e408e695
TT
2346 info->fsflags = (dir == NULL) ? 0 :
2347 SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
cb241339
HD
2348 if (info->fsflags)
2349 shmem_set_inode_flags(inode, info->fsflags);
779750d2 2350 INIT_LIST_HEAD(&info->shrinklist);
1da177e4 2351 INIT_LIST_HEAD(&info->swaplist);
38f38657 2352 simple_xattrs_init(&info->xattrs);
72c04902 2353 cache_no_acl(inode);
ff36da69 2354 mapping_set_large_folios(inode->i_mapping);
1da177e4
LT
2355
2356 switch (mode & S_IFMT) {
2357 default:
39f0247d 2358 inode->i_op = &shmem_special_inode_operations;
1da177e4
LT
2359 init_special_inode(inode, mode, dev);
2360 break;
2361 case S_IFREG:
14fcc23f 2362 inode->i_mapping->a_ops = &shmem_aops;
1da177e4
LT
2363 inode->i_op = &shmem_inode_operations;
2364 inode->i_fop = &shmem_file_operations;
71fe804b
LS
2365 mpol_shared_policy_init(&info->policy,
2366 shmem_get_sbmpol(sbinfo));
1da177e4
LT
2367 break;
2368 case S_IFDIR:
d8c76e6f 2369 inc_nlink(inode);
1da177e4
LT
2370 /* Some things misbehave if size == 0 on a directory */
2371 inode->i_size = 2 * BOGO_DIRENT_SIZE;
2372 inode->i_op = &shmem_dir_inode_operations;
2373 inode->i_fop = &simple_dir_operations;
2374 break;
2375 case S_IFLNK:
2376 /*
2377 * Must not load anything in the rbtree,
2378 * mpol_free_shared_policy will not be called.
2379 */
71fe804b 2380 mpol_shared_policy_init(&info->policy, NULL);
1da177e4
LT
2381 break;
2382 }
b45d71fb
JFG
2383
2384 lockdep_annotate_inode_mutex_key(inode);
5b04c689
PE
2385 } else
2386 shmem_free_inode(sb);
1da177e4
LT
2387 return inode;
2388}
2389
3460f6e5
AR
2390#ifdef CONFIG_USERFAULTFD
2391int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
2392 pmd_t *dst_pmd,
2393 struct vm_area_struct *dst_vma,
2394 unsigned long dst_addr,
2395 unsigned long src_addr,
8ee79edf 2396 bool zeropage, bool wp_copy,
3460f6e5 2397 struct page **pagep)
4c27fe4c
MR
2398{
2399 struct inode *inode = file_inode(dst_vma->vm_file);
2400 struct shmem_inode_info *info = SHMEM_I(inode);
4c27fe4c
MR
2401 struct address_space *mapping = inode->i_mapping;
2402 gfp_t gfp = mapping_gfp_mask(mapping);
2403 pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
4c27fe4c 2404 void *page_kaddr;
b7dd44a1 2405 struct folio *folio;
4c27fe4c 2406 int ret;
3460f6e5 2407 pgoff_t max_off;
4c27fe4c 2408
7ed9d238
AR
2409 if (!shmem_inode_acct_block(inode, 1)) {
2410 /*
2411 * We may have got a page, returned -ENOENT triggering a retry,
2412 * and now we find ourselves with -ENOMEM. Release the page, to
2413 * avoid a BUG_ON in our caller.
2414 */
2415 if (unlikely(*pagep)) {
2416 put_page(*pagep);
2417 *pagep = NULL;
2418 }
7d64ae3a 2419 return -ENOMEM;
7ed9d238 2420 }
4c27fe4c 2421
cb658a45 2422 if (!*pagep) {
7d64ae3a 2423 ret = -ENOMEM;
7a7256d5
MWO
2424 folio = shmem_alloc_folio(gfp, info, pgoff);
2425 if (!folio)
0f079694 2426 goto out_unacct_blocks;
4c27fe4c 2427
3460f6e5 2428 if (!zeropage) { /* COPY */
7a7256d5 2429 page_kaddr = kmap_local_folio(folio, 0);
8d103963
MR
2430 ret = copy_from_user(page_kaddr,
2431 (const void __user *)src_addr,
2432 PAGE_SIZE);
7a7256d5 2433 kunmap_local(page_kaddr);
8d103963 2434
c1e8d7c6 2435 /* fallback to copy_from_user outside mmap_lock */
8d103963 2436 if (unlikely(ret)) {
7a7256d5 2437 *pagep = &folio->page;
7d64ae3a 2438 ret = -ENOENT;
8d103963 2439 /* don't free the page */
7d64ae3a 2440 goto out_unacct_blocks;
8d103963 2441 }
19b482c2 2442
7a7256d5 2443 flush_dcache_folio(folio);
3460f6e5 2444 } else { /* ZEROPAGE */
7a7256d5 2445 clear_user_highpage(&folio->page, dst_addr);
4c27fe4c
MR
2446 }
2447 } else {
7a7256d5
MWO
2448 folio = page_folio(*pagep);
2449 VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
4c27fe4c
MR
2450 *pagep = NULL;
2451 }
2452
7a7256d5
MWO
2453 VM_BUG_ON(folio_test_locked(folio));
2454 VM_BUG_ON(folio_test_swapbacked(folio));
2455 __folio_set_locked(folio);
2456 __folio_set_swapbacked(folio);
2457 __folio_mark_uptodate(folio);
9cc90c66 2458
e2a50c1f 2459 ret = -EFAULT;
e2a50c1f 2460 max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
3460f6e5 2461 if (unlikely(pgoff >= max_off))
e2a50c1f
AA
2462 goto out_release;
2463
b7dd44a1 2464 ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL,
3fea5a49 2465 gfp & GFP_RECLAIM_MASK, dst_mm);
4c27fe4c 2466 if (ret)
3fea5a49 2467 goto out_release;
4c27fe4c 2468
7d64ae3a 2469 ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
7a7256d5 2470 &folio->page, true, wp_copy);
7d64ae3a
AR
2471 if (ret)
2472 goto out_delete_from_cache;
4c27fe4c 2473
94b7cc01 2474 spin_lock_irq(&info->lock);
4c27fe4c
MR
2475 info->alloced++;
2476 inode->i_blocks += BLOCKS_PER_PAGE;
2477 shmem_recalc_inode(inode);
94b7cc01 2478 spin_unlock_irq(&info->lock);
4c27fe4c 2479
7a7256d5 2480 folio_unlock(folio);
7d64ae3a
AR
2481 return 0;
2482out_delete_from_cache:
7a7256d5 2483 filemap_remove_folio(folio);
4c27fe4c 2484out_release:
7a7256d5
MWO
2485 folio_unlock(folio);
2486 folio_put(folio);
4c27fe4c 2487out_unacct_blocks:
0f079694 2488 shmem_inode_unacct_blocks(inode, 1);
7d64ae3a 2489 return ret;
8d103963 2490}
3460f6e5 2491#endif /* CONFIG_USERFAULTFD */
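
/*
 * Illustrative sketch (not part of shmem.c): the retry protocol that
 * shmem_mfill_atomic_pte() expects from its caller. When copy_from_user()
 * faults while mmap_lock is held, the page is stashed in *pagep and -ENOENT
 * is returned; the caller drops mmap_lock, does the copy from a context that
 * may fault, then calls back in with *pagep still set. This is a simplified
 * paraphrase of the real caller in mm/userfaultfd.c, with VMA revalidation
 * and other locking details omitted.
 */
static int example_uffd_copy_retry(struct mm_struct *dst_mm, pmd_t *dst_pmd,
				   struct vm_area_struct *dst_vma,
				   unsigned long dst_addr,
				   unsigned long src_addr)
{
	struct page *page = NULL;
	void *kaddr;
	int err;

	mmap_read_lock(dst_mm);
	err = shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
				     src_addr, false, false, &page);
	mmap_read_unlock(dst_mm);

	if (err == -ENOENT) {
		/* Copy outside mmap_lock, into the page stashed in *pagep. */
		kaddr = kmap_local_page(page);
		err = copy_from_user(kaddr, (const void __user *)src_addr,
				     PAGE_SIZE) ? -EFAULT : 0;
		kunmap_local(kaddr);
		if (err) {
			put_page(page);
			return err;
		}
		flush_dcache_page(page);
		mmap_read_lock(dst_mm);
		/* Real callers re-look-up and revalidate dst_vma here. */
		err = shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
					     src_addr, false, false, &page);
		mmap_read_unlock(dst_mm);
	}
	return err;
}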
8d103963 2492
1da177e4 2493#ifdef CONFIG_TMPFS
92e1d5be 2494static const struct inode_operations shmem_symlink_inode_operations;
69f07ec9 2495static const struct inode_operations shmem_short_symlink_operations;
1da177e4 2496
1da177e4 2497static int
800d15a5 2498shmem_write_begin(struct file *file, struct address_space *mapping,
9d6b0cd7 2499 loff_t pos, unsigned len,
800d15a5 2500 struct page **pagep, void **fsdata)
1da177e4 2501{
800d15a5 2502 struct inode *inode = mapping->host;
40e041a2 2503 struct shmem_inode_info *info = SHMEM_I(inode);
09cbfeaf 2504 pgoff_t index = pos >> PAGE_SHIFT;
a7605426 2505 int ret = 0;
40e041a2 2506
9608703e 2507 /* i_rwsem is held by caller */
ab3948f5
JFG
2508 if (unlikely(info->seals & (F_SEAL_GROW |
2509 F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
2510 if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
40e041a2
DH
2511 return -EPERM;
2512 if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
2513 return -EPERM;
2514 }
2515
a7605426
YS
2516 ret = shmem_getpage(inode, index, pagep, SGP_WRITE);
2517
2518 if (ret)
2519 return ret;
2520
2521 if (PageHWPoison(*pagep)) {
2522 unlock_page(*pagep);
2523 put_page(*pagep);
2524 *pagep = NULL;
2525 return -EIO;
2526 }
2527
2528 return 0;
800d15a5
NP
2529}
2530
2531static int
2532shmem_write_end(struct file *file, struct address_space *mapping,
2533 loff_t pos, unsigned len, unsigned copied,
2534 struct page *page, void *fsdata)
2535{
2536 struct inode *inode = mapping->host;
2537
d3602444
HD
2538 if (pos + copied > inode->i_size)
2539 i_size_write(inode, pos + copied);
2540
ec9516fb 2541 if (!PageUptodate(page)) {
800d8c63
KS
2542 struct page *head = compound_head(page);
2543 if (PageTransCompound(page)) {
2544 int i;
2545
2546 for (i = 0; i < HPAGE_PMD_NR; i++) {
2547 if (head + i == page)
2548 continue;
2549 clear_highpage(head + i);
2550 flush_dcache_page(head + i);
2551 }
2552 }
09cbfeaf
KS
2553 if (copied < PAGE_SIZE) {
2554 unsigned from = pos & (PAGE_SIZE - 1);
ec9516fb 2555 zero_user_segments(page, 0, from,
09cbfeaf 2556 from + copied, PAGE_SIZE);
ec9516fb 2557 }
800d8c63 2558 SetPageUptodate(head);
ec9516fb 2559 }
800d15a5 2560 set_page_dirty(page);
6746aff7 2561 unlock_page(page);
09cbfeaf 2562 put_page(page);
800d15a5 2563
800d15a5 2564 return copied;
1da177e4
LT
2565}
2566
2ba5bbed 2567static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1da177e4 2568{
6e58e79d
AV
2569 struct file *file = iocb->ki_filp;
2570 struct inode *inode = file_inode(file);
1da177e4 2571 struct address_space *mapping = inode->i_mapping;
41ffe5d5
HD
2572 pgoff_t index;
2573 unsigned long offset;
f7c1d074 2574 int error = 0;
cb66a7a1 2575 ssize_t retval = 0;
6e58e79d 2576 loff_t *ppos = &iocb->ki_pos;
a0ee5ec5 2577
09cbfeaf
KS
2578 index = *ppos >> PAGE_SHIFT;
2579 offset = *ppos & ~PAGE_MASK;
1da177e4
LT
2580
2581 for (;;) {
2582 struct page *page = NULL;
41ffe5d5
HD
2583 pgoff_t end_index;
2584 unsigned long nr, ret;
1da177e4
LT
2585 loff_t i_size = i_size_read(inode);
2586
09cbfeaf 2587 end_index = i_size >> PAGE_SHIFT;
1da177e4
LT
2588 if (index > end_index)
2589 break;
2590 if (index == end_index) {
09cbfeaf 2591 nr = i_size & ~PAGE_MASK;
1da177e4
LT
2592 if (nr <= offset)
2593 break;
2594 }
2595
56a8c8eb 2596 error = shmem_getpage(inode, index, &page, SGP_READ);
6e58e79d
AV
2597 if (error) {
2598 if (error == -EINVAL)
2599 error = 0;
1da177e4
LT
2600 break;
2601 }
75edd345 2602 if (page) {
d3602444 2603 unlock_page(page);
a7605426
YS
2604
2605 if (PageHWPoison(page)) {
2606 put_page(page);
2607 error = -EIO;
2608 break;
2609 }
75edd345 2610 }
1da177e4
LT
2611
2612 /*
 2613 * We must re-evaluate i_size after getting the page, since reads
9608703e 2614 * (unlike writes) are called without i_rwsem protection against truncate.
1da177e4 2615 */
09cbfeaf 2616 nr = PAGE_SIZE;
1da177e4 2617 i_size = i_size_read(inode);
09cbfeaf 2618 end_index = i_size >> PAGE_SHIFT;
1da177e4 2619 if (index == end_index) {
09cbfeaf 2620 nr = i_size & ~PAGE_MASK;
1da177e4
LT
2621 if (nr <= offset) {
2622 if (page)
09cbfeaf 2623 put_page(page);
1da177e4
LT
2624 break;
2625 }
2626 }
2627 nr -= offset;
2628
2629 if (page) {
2630 /*
2631 * If users can be writing to this page using arbitrary
2632 * virtual addresses, take care about potential aliasing
2633 * before reading the page on the kernel side.
2634 */
2635 if (mapping_writably_mapped(mapping))
2636 flush_dcache_page(page);
2637 /*
2638 * Mark the page accessed if we read the beginning.
2639 */
2640 if (!offset)
2641 mark_page_accessed(page);
1bdec44b
HD
2642 /*
2643 * Ok, we have the page, and it's up-to-date, so
2644 * now we can copy it to user space...
2645 */
2646 ret = copy_page_to_iter(page, offset, nr, to);
2647 put_page(page);
2648
fcb14cb1 2649 } else if (user_backed_iter(to)) {
1bdec44b
HD
2650 /*
 2651 * Copy-to-user tends to be well optimized, but
 2652 * clear_user() not so much: it is noticeably
 2653 * faster to copy the zero page instead of clearing.
2654 */
2655 ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to);
b5810039 2656 } else {
1bdec44b
HD
2657 /*
2658 * But submitting the same page twice in a row to
2659 * splice() - or others? - can result in confusion:
2660 * so don't attempt that optimization on pipes etc.
2661 */
2662 ret = iov_iter_zero(nr, to);
b5810039 2663 }
1da177e4 2664
6e58e79d 2665 retval += ret;
1da177e4 2666 offset += ret;
09cbfeaf
KS
2667 index += offset >> PAGE_SHIFT;
2668 offset &= ~PAGE_MASK;
1da177e4 2669
2ba5bbed 2670 if (!iov_iter_count(to))
1da177e4 2671 break;
6e58e79d
AV
2672 if (ret < nr) {
2673 error = -EFAULT;
2674 break;
2675 }
1da177e4
LT
2676 cond_resched();
2677 }
2678
09cbfeaf 2679 *ppos = ((loff_t) index << PAGE_SHIFT) + offset;
6e58e79d
AV
2680 file_accessed(file);
2681 return retval ? retval : error;
1da177e4
LT
2682}
2683
965c8e59 2684static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
220f2ac9
HD
2685{
2686 struct address_space *mapping = file->f_mapping;
2687 struct inode *inode = mapping->host;
220f2ac9 2688
965c8e59
AM
2689 if (whence != SEEK_DATA && whence != SEEK_HOLE)
2690 return generic_file_llseek_size(file, offset, whence,
220f2ac9 2691 MAX_LFS_FILESIZE, i_size_read(inode));
41139aa4
MWO
2692 if (offset < 0)
2693 return -ENXIO;
2694
5955102c 2695 inode_lock(inode);
9608703e 2696 /* We're holding i_rwsem so we can access i_size directly */
41139aa4 2697 offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence);
387aae6f
HD
2698 if (offset >= 0)
2699 offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
5955102c 2700 inode_unlock(inode);
220f2ac9
HD
2701 return offset;
2702}
2703
83e4fa9c
HD
2704static long shmem_fallocate(struct file *file, int mode, loff_t offset,
2705 loff_t len)
2706{
496ad9aa 2707 struct inode *inode = file_inode(file);
e2d12e22 2708 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
40e041a2 2709 struct shmem_inode_info *info = SHMEM_I(inode);
1aac1400 2710 struct shmem_falloc shmem_falloc;
d144bf62 2711 pgoff_t start, index, end, undo_fallocend;
e2d12e22 2712 int error;
83e4fa9c 2713
13ace4d0
HD
2714 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
2715 return -EOPNOTSUPP;
2716
5955102c 2717 inode_lock(inode);
83e4fa9c
HD
2718
2719 if (mode & FALLOC_FL_PUNCH_HOLE) {
2720 struct address_space *mapping = file->f_mapping;
2721 loff_t unmap_start = round_up(offset, PAGE_SIZE);
2722 loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
8e205f77 2723 DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
83e4fa9c 2724
9608703e 2725 /* protected by i_rwsem */
ab3948f5 2726 if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
40e041a2
DH
2727 error = -EPERM;
2728 goto out;
2729 }
2730
8e205f77 2731 shmem_falloc.waitq = &shmem_falloc_waitq;
aa71ecd8 2732 shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
f00cdc6d
HD
2733 shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
2734 spin_lock(&inode->i_lock);
2735 inode->i_private = &shmem_falloc;
2736 spin_unlock(&inode->i_lock);
2737
83e4fa9c
HD
2738 if ((u64)unmap_end > (u64)unmap_start)
2739 unmap_mapping_range(mapping, unmap_start,
2740 1 + unmap_end - unmap_start, 0);
2741 shmem_truncate_range(inode, offset, offset + len - 1);
2742 /* No need to unmap again: hole-punching leaves COWed pages */
8e205f77
HD
2743
2744 spin_lock(&inode->i_lock);
2745 inode->i_private = NULL;
2746 wake_up_all(&shmem_falloc_waitq);
2055da97 2747 WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head));
8e205f77 2748 spin_unlock(&inode->i_lock);
83e4fa9c 2749 error = 0;
8e205f77 2750 goto out;
e2d12e22
HD
2751 }
2752
2753 /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
2754 error = inode_newsize_ok(inode, offset + len);
2755 if (error)
2756 goto out;
2757
40e041a2
DH
2758 if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
2759 error = -EPERM;
2760 goto out;
2761 }
2762
09cbfeaf
KS
2763 start = offset >> PAGE_SHIFT;
2764 end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
e2d12e22
HD
2765 /* Try to avoid a swapstorm if len is impossible to satisfy */
2766 if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
2767 error = -ENOSPC;
2768 goto out;
83e4fa9c
HD
2769 }
2770
8e205f77 2771 shmem_falloc.waitq = NULL;
1aac1400
HD
2772 shmem_falloc.start = start;
2773 shmem_falloc.next = start;
2774 shmem_falloc.nr_falloced = 0;
2775 shmem_falloc.nr_unswapped = 0;
2776 spin_lock(&inode->i_lock);
2777 inode->i_private = &shmem_falloc;
2778 spin_unlock(&inode->i_lock);
2779
d144bf62
HD
2780 /*
2781 * info->fallocend is only relevant when huge pages might be
2782 * involved: to prevent split_huge_page() freeing fallocated
2783 * pages when FALLOC_FL_KEEP_SIZE committed beyond i_size.
2784 */
2785 undo_fallocend = info->fallocend;
2786 if (info->fallocend < end)
2787 info->fallocend = end;
2788
050dcb5c 2789 for (index = start; index < end; ) {
e2d12e22
HD
2790 struct page *page;
2791
2792 /*
2793 * Good, the fallocate(2) manpage permits EINTR: we may have
2794 * been interrupted because we are using up too much memory.
2795 */
2796 if (signal_pending(current))
2797 error = -EINTR;
1aac1400
HD
2798 else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
2799 error = -ENOMEM;
e2d12e22 2800 else
9e18eb29 2801 error = shmem_getpage(inode, index, &page, SGP_FALLOC);
e2d12e22 2802 if (error) {
d144bf62 2803 info->fallocend = undo_fallocend;
1635f6a7 2804 /* Remove the !PageUptodate pages we added */
7f556567
HD
2805 if (index > start) {
2806 shmem_undo_range(inode,
2807 (loff_t)start << PAGE_SHIFT,
2808 ((loff_t)index << PAGE_SHIFT) - 1, true);
2809 }
1aac1400 2810 goto undone;
e2d12e22
HD
2811 }
2812
050dcb5c
HD
2813 index++;
2814 /*
2815 * Here is a more important optimization than it appears:
2816 * a second SGP_FALLOC on the same huge page will clear it,
2817 * making it PageUptodate and un-undoable if we fail later.
2818 */
2819 if (PageTransCompound(page)) {
2820 index = round_up(index, HPAGE_PMD_NR);
2821 /* Beware 32-bit wraparound */
2822 if (!index)
2823 index--;
2824 }
2825
1aac1400
HD
2826 /*
2827 * Inform shmem_writepage() how far we have reached.
2828 * No need for lock or barrier: we have the page lock.
2829 */
1aac1400 2830 if (!PageUptodate(page))
050dcb5c
HD
2831 shmem_falloc.nr_falloced += index - shmem_falloc.next;
2832 shmem_falloc.next = index;
1aac1400 2833
e2d12e22 2834 /*
1635f6a7
HD
2835 * If !PageUptodate, leave it that way so that freeable pages
 2836 * can be recognized if we need to roll back on error later.
2837 * But set_page_dirty so that memory pressure will swap rather
e2d12e22
HD
2838 * than free the pages we are allocating (and SGP_CACHE pages
2839 * might still be clean: we now need to mark those dirty too).
2840 */
2841 set_page_dirty(page);
2842 unlock_page(page);
09cbfeaf 2843 put_page(page);
e2d12e22
HD
2844 cond_resched();
2845 }
2846
2847 if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
2848 i_size_write(inode, offset + len);
1aac1400
HD
2849undone:
2850 spin_lock(&inode->i_lock);
2851 inode->i_private = NULL;
2852 spin_unlock(&inode->i_lock);
e2d12e22 2853out:
15f242bb
HD
2854 if (!error)
2855 file_modified(file);
5955102c 2856 inode_unlock(inode);
83e4fa9c
HD
2857 return error;
2858}
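
/*
 * Illustrative sketch (not part of shmem.c): the userspace view of
 * shmem_fallocate() above, using a memfd as a convenient tmpfs file.
 * Preallocation uses mode 0; punching a hole requires FALLOC_FL_PUNCH_HOLE
 * together with FALLOC_FL_KEEP_SIZE. Error handling is omitted for brevity.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int example_prealloc_and_punch(void)
{
	int fd = memfd_create("example", 0);

	fallocate(fd, 0, 0, 8 << 20);		/* allocate 8MB of pages      */
	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		  1 << 20, 2 << 20);		/* free the 2MB at offset 1MB */
	close(fd);
	return 0;
}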
2859
726c3342 2860static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1da177e4 2861{
726c3342 2862 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
1da177e4
LT
2863
2864 buf->f_type = TMPFS_MAGIC;
09cbfeaf 2865 buf->f_bsize = PAGE_SIZE;
1da177e4 2866 buf->f_namelen = NAME_MAX;
0edd73b3 2867 if (sbinfo->max_blocks) {
1da177e4 2868 buf->f_blocks = sbinfo->max_blocks;
41ffe5d5
HD
2869 buf->f_bavail =
2870 buf->f_bfree = sbinfo->max_blocks -
2871 percpu_counter_sum(&sbinfo->used_blocks);
0edd73b3
HD
2872 }
2873 if (sbinfo->max_inodes) {
1da177e4
LT
2874 buf->f_files = sbinfo->max_inodes;
2875 buf->f_ffree = sbinfo->free_inodes;
1da177e4
LT
2876 }
2877 /* else leave those fields 0 like simple_statfs */
59cda49e
AG
2878
2879 buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b);
2880
1da177e4
LT
2881 return 0;
2882}
2883
2884/*
2885 * File creation. Allocate an inode, and we're done..
2886 */
2887static int
549c7297
CB
2888shmem_mknod(struct user_namespace *mnt_userns, struct inode *dir,
2889 struct dentry *dentry, umode_t mode, dev_t dev)
1da177e4 2890{
0b0a0806 2891 struct inode *inode;
1da177e4
LT
2892 int error = -ENOSPC;
2893
454abafe 2894 inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
1da177e4 2895 if (inode) {
feda821e
CH
2896 error = simple_acl_create(dir, inode);
2897 if (error)
2898 goto out_iput;
2a7dba39 2899 error = security_inode_init_security(inode, dir,
9d8f13ba 2900 &dentry->d_name,
6d9d88d0 2901 shmem_initxattrs, NULL);
feda821e
CH
2902 if (error && error != -EOPNOTSUPP)
2903 goto out_iput;
37ec43cd 2904
718deb6b 2905 error = 0;
1da177e4 2906 dir->i_size += BOGO_DIRENT_SIZE;
078cd827 2907 dir->i_ctime = dir->i_mtime = current_time(dir);
1da177e4
LT
2908 d_instantiate(dentry, inode);
2909 dget(dentry); /* Extra count - pin the dentry in core */
1da177e4
LT
2910 }
2911 return error;
feda821e
CH
2912out_iput:
2913 iput(inode);
2914 return error;
1da177e4
LT
2915}
2916
60545d0d 2917static int
549c7297
CB
2918shmem_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
2919 struct dentry *dentry, umode_t mode)
60545d0d
AV
2920{
2921 struct inode *inode;
2922 int error = -ENOSPC;
2923
2924 inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE);
2925 if (inode) {
2926 error = security_inode_init_security(inode, dir,
2927 NULL,
2928 shmem_initxattrs, NULL);
feda821e
CH
2929 if (error && error != -EOPNOTSUPP)
2930 goto out_iput;
2931 error = simple_acl_create(dir, inode);
2932 if (error)
2933 goto out_iput;
60545d0d
AV
2934 d_tmpfile(dentry, inode);
2935 }
2936 return error;
feda821e
CH
2937out_iput:
2938 iput(inode);
2939 return error;
60545d0d
AV
2940}
2941
549c7297
CB
2942static int shmem_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
2943 struct dentry *dentry, umode_t mode)
1da177e4
LT
2944{
2945 int error;
2946
549c7297
CB
2947 if ((error = shmem_mknod(&init_user_ns, dir, dentry,
2948 mode | S_IFDIR, 0)))
1da177e4 2949 return error;
d8c76e6f 2950 inc_nlink(dir);
1da177e4
LT
2951 return 0;
2952}
2953
549c7297
CB
2954static int shmem_create(struct user_namespace *mnt_userns, struct inode *dir,
2955 struct dentry *dentry, umode_t mode, bool excl)
1da177e4 2956{
549c7297 2957 return shmem_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0);
1da177e4
LT
2958}
2959
2960/*
2961 * Link a file..
2962 */
2963static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
2964{
75c3cfa8 2965 struct inode *inode = d_inode(old_dentry);
29b00e60 2966 int ret = 0;
1da177e4
LT
2967
2968 /*
2969 * No ordinary (disk based) filesystem counts links as inodes;
2970 * but each new link needs a new dentry, pinning lowmem, and
2971 * tmpfs dentries cannot be pruned until they are unlinked.
1062af92
DW
2972 * But if an O_TMPFILE file is linked into the tmpfs, the
2973 * first link must skip that, to get the accounting right.
1da177e4 2974 */
1062af92 2975 if (inode->i_nlink) {
e809d5f0 2976 ret = shmem_reserve_inode(inode->i_sb, NULL);
1062af92
DW
2977 if (ret)
2978 goto out;
2979 }
1da177e4
LT
2980
2981 dir->i_size += BOGO_DIRENT_SIZE;
078cd827 2982 inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
d8c76e6f 2983 inc_nlink(inode);
7de9c6ee 2984 ihold(inode); /* New dentry reference */
1da177e4
LT
2985 dget(dentry); /* Extra pinning count for the created dentry */
2986 d_instantiate(dentry, inode);
5b04c689
PE
2987out:
2988 return ret;
1da177e4
LT
2989}
2990
2991static int shmem_unlink(struct inode *dir, struct dentry *dentry)
2992{
75c3cfa8 2993 struct inode *inode = d_inode(dentry);
1da177e4 2994
5b04c689
PE
2995 if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
2996 shmem_free_inode(inode->i_sb);
1da177e4
LT
2997
2998 dir->i_size -= BOGO_DIRENT_SIZE;
078cd827 2999 inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
9a53c3a7 3000 drop_nlink(inode);
1da177e4
LT
3001 dput(dentry); /* Undo the count from "create" - this does all the work */
3002 return 0;
3003}
3004
3005static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
3006{
3007 if (!simple_empty(dentry))
3008 return -ENOTEMPTY;
3009
75c3cfa8 3010 drop_nlink(d_inode(dentry));
9a53c3a7 3011 drop_nlink(dir);
1da177e4
LT
3012 return shmem_unlink(dir, dentry);
3013}
3014
549c7297
CB
3015static int shmem_whiteout(struct user_namespace *mnt_userns,
3016 struct inode *old_dir, struct dentry *old_dentry)
46fdb794
MS
3017{
3018 struct dentry *whiteout;
3019 int error;
3020
3021 whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name);
3022 if (!whiteout)
3023 return -ENOMEM;
3024
549c7297 3025 error = shmem_mknod(&init_user_ns, old_dir, whiteout,
46fdb794
MS
3026 S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
3027 dput(whiteout);
3028 if (error)
3029 return error;
3030
3031 /*
3032 * Cheat and hash the whiteout while the old dentry is still in
3033 * place, instead of playing games with FS_RENAME_DOES_D_MOVE.
3034 *
3035 * d_lookup() will consistently find one of them at this point,
3036 * not sure which one, but that isn't even important.
3037 */
3038 d_rehash(whiteout);
3039 return 0;
3040}
3041
1da177e4
LT
3042/*
3043 * The VFS layer already does all the dentry stuff for rename,
3044 * we just have to decrement the usage count for the target if
 3045 * it exists so that the VFS layer correctly frees it when it
3046 * gets overwritten.
3047 */
549c7297
CB
3048static int shmem_rename2(struct user_namespace *mnt_userns,
3049 struct inode *old_dir, struct dentry *old_dentry,
3050 struct inode *new_dir, struct dentry *new_dentry,
3051 unsigned int flags)
1da177e4 3052{
75c3cfa8 3053 struct inode *inode = d_inode(old_dentry);
1da177e4
LT
3054 int they_are_dirs = S_ISDIR(inode->i_mode);
3055
46fdb794 3056 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
3b69ff51
MS
3057 return -EINVAL;
3058
37456771 3059 if (flags & RENAME_EXCHANGE)
6429e463 3060 return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
37456771 3061
1da177e4
LT
3062 if (!simple_empty(new_dentry))
3063 return -ENOTEMPTY;
3064
46fdb794
MS
3065 if (flags & RENAME_WHITEOUT) {
3066 int error;
3067
549c7297 3068 error = shmem_whiteout(&init_user_ns, old_dir, old_dentry);
46fdb794
MS
3069 if (error)
3070 return error;
3071 }
3072
75c3cfa8 3073 if (d_really_is_positive(new_dentry)) {
1da177e4 3074 (void) shmem_unlink(new_dir, new_dentry);
b928095b 3075 if (they_are_dirs) {
75c3cfa8 3076 drop_nlink(d_inode(new_dentry));
9a53c3a7 3077 drop_nlink(old_dir);
b928095b 3078 }
1da177e4 3079 } else if (they_are_dirs) {
9a53c3a7 3080 drop_nlink(old_dir);
d8c76e6f 3081 inc_nlink(new_dir);
1da177e4
LT
3082 }
3083
3084 old_dir->i_size -= BOGO_DIRENT_SIZE;
3085 new_dir->i_size += BOGO_DIRENT_SIZE;
3086 old_dir->i_ctime = old_dir->i_mtime =
3087 new_dir->i_ctime = new_dir->i_mtime =
078cd827 3088 inode->i_ctime = current_time(old_dir);
1da177e4
LT
3089 return 0;
3090}
3091
549c7297
CB
3092static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir,
3093 struct dentry *dentry, const char *symname)
1da177e4
LT
3094{
3095 int error;
3096 int len;
3097 struct inode *inode;
9276aad6 3098 struct page *page;
1da177e4
LT
3099
3100 len = strlen(symname) + 1;
09cbfeaf 3101 if (len > PAGE_SIZE)
1da177e4
LT
3102 return -ENAMETOOLONG;
3103
0825a6f9
JP
3104 inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK | 0777, 0,
3105 VM_NORESERVE);
1da177e4
LT
3106 if (!inode)
3107 return -ENOSPC;
3108
9d8f13ba 3109 error = security_inode_init_security(inode, dir, &dentry->d_name,
6d9d88d0 3110 shmem_initxattrs, NULL);
343c3d7f
MN
3111 if (error && error != -EOPNOTSUPP) {
3112 iput(inode);
3113 return error;
570bc1c2
SS
3114 }
3115
1da177e4 3116 inode->i_size = len-1;
69f07ec9 3117 if (len <= SHORT_SYMLINK_LEN) {
3ed47db3
AV
3118 inode->i_link = kmemdup(symname, len, GFP_KERNEL);
3119 if (!inode->i_link) {
69f07ec9
HD
3120 iput(inode);
3121 return -ENOMEM;
3122 }
3123 inode->i_op = &shmem_short_symlink_operations;
1da177e4 3124 } else {
e8ecde25 3125 inode_nohighmem(inode);
9e18eb29 3126 error = shmem_getpage(inode, 0, &page, SGP_WRITE);
1da177e4
LT
3127 if (error) {
3128 iput(inode);
3129 return error;
3130 }
14fcc23f 3131 inode->i_mapping->a_ops = &shmem_aops;
1da177e4 3132 inode->i_op = &shmem_symlink_inode_operations;
21fc61c7 3133 memcpy(page_address(page), symname, len);
ec9516fb 3134 SetPageUptodate(page);
1da177e4 3135 set_page_dirty(page);
6746aff7 3136 unlock_page(page);
09cbfeaf 3137 put_page(page);
1da177e4 3138 }
1da177e4 3139 dir->i_size += BOGO_DIRENT_SIZE;
078cd827 3140 dir->i_ctime = dir->i_mtime = current_time(dir);
1da177e4
LT
3141 d_instantiate(dentry, inode);
3142 dget(dentry);
3143 return 0;
3144}
3145
fceef393 3146static void shmem_put_link(void *arg)
1da177e4 3147{
fceef393
AV
3148 mark_page_accessed(arg);
3149 put_page(arg);
1da177e4
LT
3150}
3151
6b255391 3152static const char *shmem_get_link(struct dentry *dentry,
fceef393
AV
3153 struct inode *inode,
3154 struct delayed_call *done)
1da177e4 3155{
1da177e4 3156 struct page *page = NULL;
6b255391 3157 int error;
6a6c9904
AV
3158 if (!dentry) {
3159 page = find_get_page(inode->i_mapping, 0);
3160 if (!page)
3161 return ERR_PTR(-ECHILD);
a7605426
YS
3162 if (PageHWPoison(page) ||
3163 !PageUptodate(page)) {
6a6c9904
AV
3164 put_page(page);
3165 return ERR_PTR(-ECHILD);
3166 }
3167 } else {
9e18eb29 3168 error = shmem_getpage(inode, 0, &page, SGP_READ);
6a6c9904
AV
3169 if (error)
3170 return ERR_PTR(error);
a7605426
YS
3171 if (!page)
3172 return ERR_PTR(-ECHILD);
3173 if (PageHWPoison(page)) {
3174 unlock_page(page);
3175 put_page(page);
3176 return ERR_PTR(-ECHILD);
3177 }
6a6c9904
AV
3178 unlock_page(page);
3179 }
fceef393 3180 set_delayed_call(done, shmem_put_link, page);
21fc61c7 3181 return page_address(page);
1da177e4
LT
3182}
3183
b09e0fa4 3184#ifdef CONFIG_TMPFS_XATTR
e408e695
TT
3185
3186static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa)
3187{
3188 struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
3189
3190 fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE);
3191
3192 return 0;
3193}
3194
3195static int shmem_fileattr_set(struct user_namespace *mnt_userns,
3196 struct dentry *dentry, struct fileattr *fa)
3197{
3198 struct inode *inode = d_inode(dentry);
3199 struct shmem_inode_info *info = SHMEM_I(inode);
3200
3201 if (fileattr_has_fsx(fa))
3202 return -EOPNOTSUPP;
cb241339
HD
3203 if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE)
3204 return -EOPNOTSUPP;
e408e695
TT
3205
3206 info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) |
3207 (fa->flags & SHMEM_FL_USER_MODIFIABLE);
3208
cb241339 3209 shmem_set_inode_flags(inode, info->fsflags);
e408e695
TT
3210 inode->i_ctime = current_time(inode);
3211 return 0;
3212}
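
/*
 * Illustrative sketch (not part of shmem.c): the fsflags handled by
 * shmem_fileattr_get()/shmem_fileattr_set() above are driven from userspace
 * through the FS_IOC_GETFLAGS/FS_IOC_SETFLAGS ioctls, which is what
 * chattr(1) does. The particular flag chosen here is just an example.
 */
#include <sys/ioctl.h>
#include <linux/fs.h>

int example_set_noatime(int fd)
{
	unsigned int flags;

	if (ioctl(fd, FS_IOC_GETFLAGS, &flags))
		return -1;
	flags |= FS_NOATIME_FL;	/* mapped to S_NOATIME by shmem_set_inode_flags() */
	return ioctl(fd, FS_IOC_SETFLAGS, &flags);
}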
3213
46711810 3214/*
b09e0fa4
EP
3215 * Superblocks without xattr inode operations may get some security.* xattr
3216 * support from the LSM "for free". As soon as we have any other xattrs
39f0247d
AG
3217 * like ACLs, we also need to implement the security.* handlers at
3218 * filesystem level, though.
3219 */
3220
6d9d88d0
JS
3221/*
3222 * Callback for security_inode_init_security() for acquiring xattrs.
3223 */
3224static int shmem_initxattrs(struct inode *inode,
3225 const struct xattr *xattr_array,
3226 void *fs_info)
3227{
3228 struct shmem_inode_info *info = SHMEM_I(inode);
3229 const struct xattr *xattr;
38f38657 3230 struct simple_xattr *new_xattr;
6d9d88d0
JS
3231 size_t len;
3232
3233 for (xattr = xattr_array; xattr->name != NULL; xattr++) {
38f38657 3234 new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
6d9d88d0
JS
3235 if (!new_xattr)
3236 return -ENOMEM;
3237
3238 len = strlen(xattr->name) + 1;
3239 new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
3240 GFP_KERNEL);
3241 if (!new_xattr->name) {
3bef735a 3242 kvfree(new_xattr);
6d9d88d0
JS
3243 return -ENOMEM;
3244 }
3245
3246 memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
3247 XATTR_SECURITY_PREFIX_LEN);
3248 memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
3249 xattr->name, len);
3250
38f38657 3251 simple_xattr_list_add(&info->xattrs, new_xattr);
6d9d88d0
JS
3252 }
3253
3254 return 0;
3255}
3256
aa7c5241 3257static int shmem_xattr_handler_get(const struct xattr_handler *handler,
b296821a
AV
3258 struct dentry *unused, struct inode *inode,
3259 const char *name, void *buffer, size_t size)
b09e0fa4 3260{
b296821a 3261 struct shmem_inode_info *info = SHMEM_I(inode);
b09e0fa4 3262
aa7c5241 3263 name = xattr_full_name(handler, name);
38f38657 3264 return simple_xattr_get(&info->xattrs, name, buffer, size);
b09e0fa4
EP
3265}
3266
aa7c5241 3267static int shmem_xattr_handler_set(const struct xattr_handler *handler,
e65ce2a5 3268 struct user_namespace *mnt_userns,
59301226
AV
3269 struct dentry *unused, struct inode *inode,
3270 const char *name, const void *value,
3271 size_t size, int flags)
b09e0fa4 3272{
59301226 3273 struct shmem_inode_info *info = SHMEM_I(inode);
b09e0fa4 3274
aa7c5241 3275 name = xattr_full_name(handler, name);
a46a2295 3276 return simple_xattr_set(&info->xattrs, name, value, size, flags, NULL);
b09e0fa4
EP
3277}
3278
aa7c5241
AG
3279static const struct xattr_handler shmem_security_xattr_handler = {
3280 .prefix = XATTR_SECURITY_PREFIX,
3281 .get = shmem_xattr_handler_get,
3282 .set = shmem_xattr_handler_set,
3283};
b09e0fa4 3284
aa7c5241
AG
3285static const struct xattr_handler shmem_trusted_xattr_handler = {
3286 .prefix = XATTR_TRUSTED_PREFIX,
3287 .get = shmem_xattr_handler_get,
3288 .set = shmem_xattr_handler_set,
3289};
b09e0fa4 3290
aa7c5241
AG
3291static const struct xattr_handler *shmem_xattr_handlers[] = {
3292#ifdef CONFIG_TMPFS_POSIX_ACL
3293 &posix_acl_access_xattr_handler,
3294 &posix_acl_default_xattr_handler,
3295#endif
3296 &shmem_security_xattr_handler,
3297 &shmem_trusted_xattr_handler,
3298 NULL
3299};
b09e0fa4
EP
3300
3301static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
3302{
75c3cfa8 3303 struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
786534b9 3304 return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size);
b09e0fa4
EP
3305}
3306#endif /* CONFIG_TMPFS_XATTR */
3307
69f07ec9 3308static const struct inode_operations shmem_short_symlink_operations = {
f7cd16a5 3309 .getattr = shmem_getattr,
6b255391 3310 .get_link = simple_get_link,
b09e0fa4 3311#ifdef CONFIG_TMPFS_XATTR
b09e0fa4 3312 .listxattr = shmem_listxattr,
b09e0fa4
EP
3313#endif
3314};
3315
3316static const struct inode_operations shmem_symlink_inode_operations = {
f7cd16a5 3317 .getattr = shmem_getattr,
6b255391 3318 .get_link = shmem_get_link,
b09e0fa4 3319#ifdef CONFIG_TMPFS_XATTR
b09e0fa4 3320 .listxattr = shmem_listxattr,
39f0247d 3321#endif
b09e0fa4 3322};
39f0247d 3323
91828a40
DG
3324static struct dentry *shmem_get_parent(struct dentry *child)
3325{
3326 return ERR_PTR(-ESTALE);
3327}
3328
3329static int shmem_match(struct inode *ino, void *vfh)
3330{
3331 __u32 *fh = vfh;
3332 __u64 inum = fh[2];
3333 inum = (inum << 32) | fh[1];
3334 return ino->i_ino == inum && fh[0] == ino->i_generation;
3335}
3336
12ba780d
AG
3337/* Find any alias of inode, but prefer a hashed alias */
3338static struct dentry *shmem_find_alias(struct inode *inode)
3339{
3340 struct dentry *alias = d_find_alias(inode);
3341
3342 return alias ?: d_find_any_alias(inode);
3343}
3344
3345
480b116c
CH
3346static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
3347 struct fid *fid, int fh_len, int fh_type)
91828a40 3348{
91828a40 3349 struct inode *inode;
480b116c 3350 struct dentry *dentry = NULL;
35c2a7f4 3351 u64 inum;
480b116c
CH
3352
3353 if (fh_len < 3)
3354 return NULL;
91828a40 3355
35c2a7f4
HD
3356 inum = fid->raw[2];
3357 inum = (inum << 32) | fid->raw[1];
3358
480b116c
CH
3359 inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
3360 shmem_match, fid->raw);
91828a40 3361 if (inode) {
12ba780d 3362 dentry = shmem_find_alias(inode);
91828a40
DG
3363 iput(inode);
3364 }
3365
480b116c 3366 return dentry;
91828a40
DG
3367}
3368
b0b0382b
AV
3369static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
3370 struct inode *parent)
91828a40 3371{
5fe0c237
AK
3372 if (*len < 3) {
3373 *len = 3;
94e07a75 3374 return FILEID_INVALID;
5fe0c237 3375 }
91828a40 3376
1d3382cb 3377 if (inode_unhashed(inode)) {
91828a40
DG
3378 /* Unfortunately insert_inode_hash is not idempotent,
3379 * so, as we hash inodes here rather than at creation
3380 * time, we need a lock to ensure we only try
3381 * to do it once.
3382 */
3383 static DEFINE_SPINLOCK(lock);
3384 spin_lock(&lock);
1d3382cb 3385 if (inode_unhashed(inode))
91828a40
DG
3386 __insert_inode_hash(inode,
3387 inode->i_ino + inode->i_generation);
3388 spin_unlock(&lock);
3389 }
3390
3391 fh[0] = inode->i_generation;
3392 fh[1] = inode->i_ino;
3393 fh[2] = ((__u64)inode->i_ino) >> 32;
3394
3395 *len = 3;
3396 return 1;
3397}
3398
39655164 3399static const struct export_operations shmem_export_ops = {
91828a40 3400 .get_parent = shmem_get_parent,
91828a40 3401 .encode_fh = shmem_encode_fh,
480b116c 3402 .fh_to_dentry = shmem_fh_to_dentry,
91828a40
DG
3403};
3404
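/*
 * Hedged userspace sketch (build separately): the export ops above give a
 * tmpfs inode a three-word NFS file handle, with fh[0] = i_generation and
 * fh[1]/fh[2] = the low/high halves of i_ino, as encoded by shmem_encode_fh().
 * name_to_handle_at() exposes that encoding; the path is just an example.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	struct file_handle *fh;
	unsigned int i;
	int mount_id;

	fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
	if (!fh)
		return 1;
	fh->handle_bytes = MAX_HANDLE_SZ;

	if (name_to_handle_at(AT_FDCWD, "/dev/shm", fh, &mount_id, 0) != 0) {
		perror("name_to_handle_at");
		free(fh);
		return 1;
	}

	printf("type %d, %u bytes:", fh->handle_type, fh->handle_bytes);
	for (i = 0; i < fh->handle_bytes; i++)
		printf(" %02x", fh->f_handle[i]);
	printf("\n");
	free(fh);
	return 0;
}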
626c3920
AV
3405enum shmem_param {
3406 Opt_gid,
3407 Opt_huge,
3408 Opt_mode,
3409 Opt_mpol,
3410 Opt_nr_blocks,
3411 Opt_nr_inodes,
3412 Opt_size,
3413 Opt_uid,
ea3271f7
CD
3414 Opt_inode32,
3415 Opt_inode64,
626c3920
AV
3416};
3417
5eede625 3418static const struct constant_table shmem_param_enums_huge[] = {
2710c957
AV
3419 {"never", SHMEM_HUGE_NEVER },
3420 {"always", SHMEM_HUGE_ALWAYS },
3421 {"within_size", SHMEM_HUGE_WITHIN_SIZE },
3422 {"advise", SHMEM_HUGE_ADVISE },
2710c957
AV
3423 {}
3424};
3425
d7167b14 3426const struct fs_parameter_spec shmem_fs_parameters[] = {
626c3920 3427 fsparam_u32 ("gid", Opt_gid),
2710c957 3428 fsparam_enum ("huge", Opt_huge, shmem_param_enums_huge),
626c3920
AV
3429 fsparam_u32oct("mode", Opt_mode),
3430 fsparam_string("mpol", Opt_mpol),
3431 fsparam_string("nr_blocks", Opt_nr_blocks),
3432 fsparam_string("nr_inodes", Opt_nr_inodes),
3433 fsparam_string("size", Opt_size),
3434 fsparam_u32 ("uid", Opt_uid),
ea3271f7
CD
3435 fsparam_flag ("inode32", Opt_inode32),
3436 fsparam_flag ("inode64", Opt_inode64),
626c3920
AV
3437 {}
3438};
3439
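/*
 * Hedged userspace sketch (build separately): the parameter table above is
 * what a mount such as
 *	mount -t tmpfs -o size=64m,nr_inodes=10k,mode=1777 tmpfs /mnt/tmp
 * is parsed against; "size" and "nr_inodes" go through memparse(), so k/m/g
 * suffixes (and a trailing '%' of RAM for size) work.  The mount point below
 * is an assumption and the call needs CAP_SYS_ADMIN.
 */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	if (mount("tmpfs", "/mnt/tmp", "tmpfs", 0,
		  "size=64m,nr_inodes=10k,mode=1777") != 0) {
		perror("mount");
		return 1;
	}
	return 0;
}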
f3235626 3440static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
1da177e4 3441{
f3235626 3442 struct shmem_options *ctx = fc->fs_private;
626c3920
AV
3443 struct fs_parse_result result;
3444 unsigned long long size;
e04dc423 3445 char *rest;
626c3920
AV
3446 int opt;
3447
d7167b14 3448 opt = fs_parse(fc, shmem_fs_parameters, param, &result);
f3235626 3449 if (opt < 0)
626c3920 3450 return opt;
1da177e4 3451
626c3920
AV
3452 switch (opt) {
3453 case Opt_size:
3454 size = memparse(param->string, &rest);
e04dc423
AV
3455 if (*rest == '%') {
3456 size <<= PAGE_SHIFT;
3457 size *= totalram_pages();
3458 do_div(size, 100);
3459 rest++;
3460 }
3461 if (*rest)
626c3920 3462 goto bad_value;
e04dc423
AV
3463 ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE);
3464 ctx->seen |= SHMEM_SEEN_BLOCKS;
626c3920
AV
3465 break;
3466 case Opt_nr_blocks:
3467 ctx->blocks = memparse(param->string, &rest);
0c98c8e1 3468 if (*rest || ctx->blocks > S64_MAX)
626c3920 3469 goto bad_value;
e04dc423 3470 ctx->seen |= SHMEM_SEEN_BLOCKS;
626c3920
AV
3471 break;
3472 case Opt_nr_inodes:
3473 ctx->inodes = memparse(param->string, &rest);
e04dc423 3474 if (*rest)
626c3920 3475 goto bad_value;
e04dc423 3476 ctx->seen |= SHMEM_SEEN_INODES;
626c3920
AV
3477 break;
3478 case Opt_mode:
3479 ctx->mode = result.uint_32 & 07777;
3480 break;
3481 case Opt_uid:
3482 ctx->uid = make_kuid(current_user_ns(), result.uint_32);
e04dc423 3483 if (!uid_valid(ctx->uid))
626c3920
AV
3484 goto bad_value;
3485 break;
3486 case Opt_gid:
3487 ctx->gid = make_kgid(current_user_ns(), result.uint_32);
e04dc423 3488 if (!gid_valid(ctx->gid))
626c3920
AV
3489 goto bad_value;
3490 break;
3491 case Opt_huge:
3492 ctx->huge = result.uint_32;
3493 if (ctx->huge != SHMEM_HUGE_NEVER &&
396bcc52 3494 !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
626c3920
AV
3495 has_transparent_hugepage()))
3496 goto unsupported_parameter;
e04dc423 3497 ctx->seen |= SHMEM_SEEN_HUGE;
626c3920
AV
3498 break;
3499 case Opt_mpol:
3500 if (IS_ENABLED(CONFIG_NUMA)) {
3501 mpol_put(ctx->mpol);
3502 ctx->mpol = NULL;
3503 if (mpol_parse_str(param->string, &ctx->mpol))
3504 goto bad_value;
3505 break;
3506 }
3507 goto unsupported_parameter;
ea3271f7
CD
3508 case Opt_inode32:
3509 ctx->full_inums = false;
3510 ctx->seen |= SHMEM_SEEN_INUMS;
3511 break;
3512 case Opt_inode64:
3513 if (sizeof(ino_t) < 8) {
3514 return invalfc(fc,
3515 "Cannot use inode64 with <64bit inums in kernel\n");
3516 }
3517 ctx->full_inums = true;
3518 ctx->seen |= SHMEM_SEEN_INUMS;
3519 break;
e04dc423
AV
3520 }
3521 return 0;
3522
626c3920 3523unsupported_parameter:
f35aa2bc 3524 return invalfc(fc, "Unsupported parameter '%s'", param->key);
626c3920 3525bad_value:
f35aa2bc 3526 return invalfc(fc, "Bad value for '%s'", param->key);
e04dc423
AV
3527}
3528
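/*
 * Hedged userspace sketch (build separately) of the "size=N%" arithmetic in
 * shmem_parse_one() above: the percentage is scaled by total RAM and then
 * rounded up to PAGE_SIZE blocks.  sysconf() stands in for totalram_pages();
 * the 50% figure is only an example.
 */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	unsigned long long pct = 50;				/* "size=50%" */
	unsigned long long pages = sysconf(_SC_PHYS_PAGES);	/* ~totalram_pages() */
	unsigned long long page_size = sysconf(_SC_PAGE_SIZE);
	unsigned long long size, blocks;

	size = pct * page_size * pages / 100;		/* bytes, as computed above */
	blocks = (size + page_size - 1) / page_size;	/* DIV_ROUND_UP(size, PAGE_SIZE) */

	printf("size=%llu%% of RAM -> %llu blocks (%llu MiB)\n",
	       pct, blocks, size >> 20);
	return 0;
}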
f3235626 3529static int shmem_parse_options(struct fs_context *fc, void *data)
e04dc423 3530{
f3235626
DH
3531 char *options = data;
3532
33f37c64
AV
3533 if (options) {
3534 int err = security_sb_eat_lsm_opts(options, &fc->security);
3535 if (err)
3536 return err;
3537 }
3538
b00dc3ad 3539 while (options != NULL) {
626c3920 3540 char *this_char = options;
b00dc3ad
HD
3541 for (;;) {
3542 /*
3543 * NUL-terminate this option: unfortunately,
3544 * mount options form a comma-separated list,
3545 * but mpol's nodelist may also contain commas.
3546 */
3547 options = strchr(options, ',');
3548 if (options == NULL)
3549 break;
3550 options++;
3551 if (!isdigit(*options)) {
3552 options[-1] = '\0';
3553 break;
3554 }
3555 }
626c3920 3556 if (*this_char) {
68d68ff6 3557 char *value = strchr(this_char, '=');
f3235626 3558 size_t len = 0;
626c3920
AV
3559 int err;
3560
3561 if (value) {
3562 *value++ = '\0';
f3235626 3563 len = strlen(value);
626c3920 3564 }
f3235626
DH
3565 err = vfs_parse_fs_string(fc, this_char, value, len);
3566 if (err < 0)
3567 return err;
1da177e4 3568 }
1da177e4
LT
3569 }
3570 return 0;
1da177e4
LT
3571}
3572
f3235626
DH
3573/*
3574 * Reconfigure a shmem filesystem.
3575 *
3576 * Note that we disallow change from limited->unlimited blocks/inodes while any
3577 * are in use; but we must separately disallow unlimited->limited, because in
3578 * that case we have no record of how much is already in use.
3579 */
3580static int shmem_reconfigure(struct fs_context *fc)
1da177e4 3581{
f3235626
DH
3582 struct shmem_options *ctx = fc->fs_private;
3583 struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
0edd73b3 3584 unsigned long inodes;
bf11b9a8 3585 struct mempolicy *mpol = NULL;
f3235626 3586 const char *err;
1da177e4 3587
bf11b9a8 3588 raw_spin_lock(&sbinfo->stat_lock);
0edd73b3 3589 inodes = sbinfo->max_inodes - sbinfo->free_inodes;
0c98c8e1 3590
f3235626
DH
3591 if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
3592 if (!sbinfo->max_blocks) {
3593 err = "Cannot retroactively limit size";
0b5071dd 3594 goto out;
f3235626 3595 }
0b5071dd 3596 if (percpu_counter_compare(&sbinfo->used_blocks,
f3235626
DH
3597 ctx->blocks) > 0) {
3598 err = "Too small a size for current use";
0b5071dd 3599 goto out;
f3235626 3600 }
0b5071dd 3601 }
f3235626
DH
3602 if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) {
3603 if (!sbinfo->max_inodes) {
3604 err = "Cannot retroactively limit inodes";
0b5071dd 3605 goto out;
f3235626
DH
3606 }
3607 if (ctx->inodes < inodes) {
3608 err = "Too few inodes for current use";
0b5071dd 3609 goto out;
f3235626 3610 }
0b5071dd 3611 }
0edd73b3 3612
ea3271f7
CD
3613 if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums &&
3614 sbinfo->next_ino > UINT_MAX) {
3615 err = "Current inum too high to switch to 32-bit inums";
3616 goto out;
3617 }
3618
f3235626
DH
3619 if (ctx->seen & SHMEM_SEEN_HUGE)
3620 sbinfo->huge = ctx->huge;
ea3271f7
CD
3621 if (ctx->seen & SHMEM_SEEN_INUMS)
3622 sbinfo->full_inums = ctx->full_inums;
f3235626
DH
3623 if (ctx->seen & SHMEM_SEEN_BLOCKS)
3624 sbinfo->max_blocks = ctx->blocks;
3625 if (ctx->seen & SHMEM_SEEN_INODES) {
3626 sbinfo->max_inodes = ctx->inodes;
3627 sbinfo->free_inodes = ctx->inodes - inodes;
0b5071dd 3628 }
71fe804b 3629
5f00110f
GT
3630 /*
3631 * Preserve previous mempolicy unless mpol remount option was specified.
3632 */
f3235626 3633 if (ctx->mpol) {
bf11b9a8 3634 mpol = sbinfo->mpol;
f3235626
DH
3635 sbinfo->mpol = ctx->mpol; /* transfers initial ref */
3636 ctx->mpol = NULL;
5f00110f 3637 }
bf11b9a8
SAS
3638 raw_spin_unlock(&sbinfo->stat_lock);
3639 mpol_put(mpol);
f3235626 3640 return 0;
0edd73b3 3641out:
bf11b9a8 3642 raw_spin_unlock(&sbinfo->stat_lock);
f35aa2bc 3643 return invalfc(fc, "%s", err);
1da177e4 3644}
680d794b 3645
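/*
 * Hedged userspace sketch (build separately): shmem_reconfigure() above is
 * what services a tmpfs remount, e.g. growing "size" at run time.  Per the
 * checks above, an instance mounted without a block or inode limit cannot
 * later be given one ("Cannot retroactively limit ...").  Assumes /mnt/tmp
 * is already a tmpfs mount and the caller has CAP_SYS_ADMIN.
 */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	if (mount("tmpfs", "/mnt/tmp", "tmpfs", MS_REMOUNT, "size=128m") != 0) {
		perror("remount");
		return 1;
	}
	return 0;
}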
34c80b1d 3646static int shmem_show_options(struct seq_file *seq, struct dentry *root)
680d794b 3647{
34c80b1d 3648 struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);
680d794b 3649
3650 if (sbinfo->max_blocks != shmem_default_max_blocks())
3651 seq_printf(seq, ",size=%luk",
09cbfeaf 3652 sbinfo->max_blocks << (PAGE_SHIFT - 10));
680d794b 3653 if (sbinfo->max_inodes != shmem_default_max_inodes())
3654 seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
0825a6f9 3655 if (sbinfo->mode != (0777 | S_ISVTX))
09208d15 3656 seq_printf(seq, ",mode=%03ho", sbinfo->mode);
8751e039
EB
3657 if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
3658 seq_printf(seq, ",uid=%u",
3659 from_kuid_munged(&init_user_ns, sbinfo->uid));
3660 if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
3661 seq_printf(seq, ",gid=%u",
3662 from_kgid_munged(&init_user_ns, sbinfo->gid));
ea3271f7
CD
3663
3664 /*
3665 * Showing inode{64,32} might be useful even if it's the system default,
3666 * since then people don't have to resort to checking both here and
3667 * /proc/config.gz (which may not even exist if IKCONFIG_PROC isn't
3668 * enabled) to confirm 64-bit inums were successfully applied.
3669 *
3670 * We hide it when inode64 isn't the default and we are using 32-bit
3671 * inodes, since that probably just means the feature isn't even under
3672 * consideration.
3673 *
3674 * As such:
3675 *
3676 * +-----------------+-----------------+
3677 * | TMPFS_INODE64=y | TMPFS_INODE64=n |
3678 * +------------------+-----------------+-----------------+
3679 * | full_inums=true | show | show |
3680 * | full_inums=false | show | hide |
3681 * +------------------+-----------------+-----------------+
3682 *
3683 */
3684 if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums)
3685 seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32));
396bcc52 3686#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5a6e75f8
KS
3687 /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
3688 if (sbinfo->huge)
3689 seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
3690#endif
71fe804b 3691 shmem_show_mpol(seq, sbinfo->mpol);
680d794b 3692 return 0;
3693}
9183df25 3694
680d794b 3695#endif /* CONFIG_TMPFS */
1da177e4
LT
3696
3697static void shmem_put_super(struct super_block *sb)
3698{
602586a8
HD
3699 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
3700
e809d5f0 3701 free_percpu(sbinfo->ino_batch);
602586a8 3702 percpu_counter_destroy(&sbinfo->used_blocks);
49cd0a5c 3703 mpol_put(sbinfo->mpol);
602586a8 3704 kfree(sbinfo);
1da177e4
LT
3705 sb->s_fs_info = NULL;
3706}
3707
f3235626 3708static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
1da177e4 3709{
f3235626 3710 struct shmem_options *ctx = fc->fs_private;
1da177e4 3711 struct inode *inode;
0edd73b3 3712 struct shmem_sb_info *sbinfo;
680d794b 3713
3714 /* Round up to L1_CACHE_BYTES to resist false sharing */
425fbf04 3715 sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
680d794b 3716 L1_CACHE_BYTES), GFP_KERNEL);
3717 if (!sbinfo)
3718 return -ENOMEM;
3719
680d794b 3720 sb->s_fs_info = sbinfo;
1da177e4 3721
0edd73b3 3722#ifdef CONFIG_TMPFS
1da177e4
LT
3723 /*
3724 * By default we only allow half of the physical RAM per
3725 * tmpfs instance, limiting inodes to one per page of lowmem;
3726 * but the internal instance is left unlimited.
3727 */
1751e8a6 3728 if (!(sb->s_flags & SB_KERNMOUNT)) {
f3235626
DH
3729 if (!(ctx->seen & SHMEM_SEEN_BLOCKS))
3730 ctx->blocks = shmem_default_max_blocks();
3731 if (!(ctx->seen & SHMEM_SEEN_INODES))
3732 ctx->inodes = shmem_default_max_inodes();
ea3271f7
CD
3733 if (!(ctx->seen & SHMEM_SEEN_INUMS))
3734 ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
ca4e0519 3735 } else {
1751e8a6 3736 sb->s_flags |= SB_NOUSER;
1da177e4 3737 }
91828a40 3738 sb->s_export_op = &shmem_export_ops;
1751e8a6 3739 sb->s_flags |= SB_NOSEC;
1da177e4 3740#else
1751e8a6 3741 sb->s_flags |= SB_NOUSER;
1da177e4 3742#endif
f3235626
DH
3743 sbinfo->max_blocks = ctx->blocks;
3744 sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes;
e809d5f0
CD
3745 if (sb->s_flags & SB_KERNMOUNT) {
3746 sbinfo->ino_batch = alloc_percpu(ino_t);
3747 if (!sbinfo->ino_batch)
3748 goto failed;
3749 }
f3235626
DH
3750 sbinfo->uid = ctx->uid;
3751 sbinfo->gid = ctx->gid;
ea3271f7 3752 sbinfo->full_inums = ctx->full_inums;
f3235626
DH
3753 sbinfo->mode = ctx->mode;
3754 sbinfo->huge = ctx->huge;
3755 sbinfo->mpol = ctx->mpol;
3756 ctx->mpol = NULL;
1da177e4 3757
bf11b9a8 3758 raw_spin_lock_init(&sbinfo->stat_lock);
908c7f19 3759 if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
602586a8 3760 goto failed;
779750d2
KS
3761 spin_lock_init(&sbinfo->shrinklist_lock);
3762 INIT_LIST_HEAD(&sbinfo->shrinklist);
0edd73b3 3763
285b2c4f 3764 sb->s_maxbytes = MAX_LFS_FILESIZE;
09cbfeaf
KS
3765 sb->s_blocksize = PAGE_SIZE;
3766 sb->s_blocksize_bits = PAGE_SHIFT;
1da177e4
LT
3767 sb->s_magic = TMPFS_MAGIC;
3768 sb->s_op = &shmem_ops;
cfd95a9c 3769 sb->s_time_gran = 1;
b09e0fa4 3770#ifdef CONFIG_TMPFS_XATTR
39f0247d 3771 sb->s_xattr = shmem_xattr_handlers;
b09e0fa4
EP
3772#endif
3773#ifdef CONFIG_TMPFS_POSIX_ACL
1751e8a6 3774 sb->s_flags |= SB_POSIXACL;
39f0247d 3775#endif
2b4db796 3776 uuid_gen(&sb->s_uuid);
0edd73b3 3777
454abafe 3778 inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
1da177e4
LT
3779 if (!inode)
3780 goto failed;
680d794b 3781 inode->i_uid = sbinfo->uid;
3782 inode->i_gid = sbinfo->gid;
318ceed0
AV
3783 sb->s_root = d_make_root(inode);
3784 if (!sb->s_root)
48fde701 3785 goto failed;
1da177e4
LT
3786 return 0;
3787
1da177e4
LT
3788failed:
3789 shmem_put_super(sb);
f2b346e4 3790 return -ENOMEM;
1da177e4
LT
3791}
3792
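/*
 * Hedged userspace sketch (build separately): the defaults applied above for
 * a plain "mount -t tmpfs tmpfs /mnt" (roughly half of physical RAM worth of
 * blocks, one inode per page of lowmem) can be observed with statvfs() on
 * any tmpfs mount point; /dev/shm is used here as an example.
 */
#include <stdio.h>
#include <sys/statvfs.h>

int main(void)
{
	struct statvfs sv;

	if (statvfs("/dev/shm", &sv) != 0) {
		perror("statvfs");
		return 1;
	}
	printf("bsize %lu, %llu blocks (%llu MiB), %llu inodes\n",
	       sv.f_bsize,
	       (unsigned long long)sv.f_blocks,
	       (unsigned long long)sv.f_blocks * sv.f_bsize >> 20,
	       (unsigned long long)sv.f_files);
	return 0;
}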
f3235626
DH
3793static int shmem_get_tree(struct fs_context *fc)
3794{
3795 return get_tree_nodev(fc, shmem_fill_super);
3796}
3797
3798static void shmem_free_fc(struct fs_context *fc)
3799{
3800 struct shmem_options *ctx = fc->fs_private;
3801
3802 if (ctx) {
3803 mpol_put(ctx->mpol);
3804 kfree(ctx);
3805 }
3806}
3807
3808static const struct fs_context_operations shmem_fs_context_ops = {
3809 .free = shmem_free_fc,
3810 .get_tree = shmem_get_tree,
3811#ifdef CONFIG_TMPFS
3812 .parse_monolithic = shmem_parse_options,
3813 .parse_param = shmem_parse_one,
3814 .reconfigure = shmem_reconfigure,
3815#endif
3816};
3817
fcc234f8 3818static struct kmem_cache *shmem_inode_cachep;
1da177e4
LT
3819
3820static struct inode *shmem_alloc_inode(struct super_block *sb)
3821{
41ffe5d5 3822 struct shmem_inode_info *info;
fd60b288 3823 info = alloc_inode_sb(sb, shmem_inode_cachep, GFP_KERNEL);
41ffe5d5 3824 if (!info)
1da177e4 3825 return NULL;
41ffe5d5 3826 return &info->vfs_inode;
1da177e4
LT
3827}
3828
74b1da56 3829static void shmem_free_in_core_inode(struct inode *inode)
fa0d7e3d 3830{
84e710da
AV
3831 if (S_ISLNK(inode->i_mode))
3832 kfree(inode->i_link);
fa0d7e3d
NP
3833 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
3834}
3835
1da177e4
LT
3836static void shmem_destroy_inode(struct inode *inode)
3837{
09208d15 3838 if (S_ISREG(inode->i_mode))
1da177e4 3839 mpol_free_shared_policy(&SHMEM_I(inode)->policy);
1da177e4
LT
3840}
3841
41ffe5d5 3842static void shmem_init_inode(void *foo)
1da177e4 3843{
41ffe5d5
HD
3844 struct shmem_inode_info *info = foo;
3845 inode_init_once(&info->vfs_inode);
1da177e4
LT
3846}
3847
9a8ec03e 3848static void shmem_init_inodecache(void)
1da177e4
LT
3849{
3850 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
3851 sizeof(struct shmem_inode_info),
5d097056 3852 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
1da177e4
LT
3853}
3854
41ffe5d5 3855static void shmem_destroy_inodecache(void)
1da177e4 3856{
1a1d92c1 3857 kmem_cache_destroy(shmem_inode_cachep);
1da177e4
LT
3858}
3859
a7605426
YS
3860/* Keep the page in page cache instead of truncating it */
3861static int shmem_error_remove_page(struct address_space *mapping,
3862 struct page *page)
3863{
3864 return 0;
3865}
3866
30e6a51d 3867const struct address_space_operations shmem_aops = {
1da177e4 3868 .writepage = shmem_writepage,
46de8b97 3869 .dirty_folio = noop_dirty_folio,
1da177e4 3870#ifdef CONFIG_TMPFS
800d15a5
NP
3871 .write_begin = shmem_write_begin,
3872 .write_end = shmem_write_end,
1da177e4 3873#endif
1c93923c 3874#ifdef CONFIG_MIGRATION
54184650 3875 .migrate_folio = migrate_folio,
1c93923c 3876#endif
a7605426 3877 .error_remove_page = shmem_error_remove_page,
1da177e4 3878};
30e6a51d 3879EXPORT_SYMBOL(shmem_aops);
1da177e4 3880
15ad7cdc 3881static const struct file_operations shmem_file_operations = {
1da177e4 3882 .mmap = shmem_mmap,
c01d5b30 3883 .get_unmapped_area = shmem_get_unmapped_area,
1da177e4 3884#ifdef CONFIG_TMPFS
220f2ac9 3885 .llseek = shmem_file_llseek,
2ba5bbed 3886 .read_iter = shmem_file_read_iter,
8174202b 3887 .write_iter = generic_file_write_iter,
1b061d92 3888 .fsync = noop_fsync,
82c156f8 3889 .splice_read = generic_file_splice_read,
f6cb85d0 3890 .splice_write = iter_file_splice_write,
83e4fa9c 3891 .fallocate = shmem_fallocate,
1da177e4
LT
3892#endif
3893};
3894
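/*
 * Hedged userspace sketch (build separately): memfd_create() returns a file
 * on the internal tmpfs mount, so the file_operations above (llseek with
 * SEEK_DATA/SEEK_HOLE, fallocate, splice, ...) can be exercised directly.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = memfd_create("demo", MFD_CLOEXEC);
	off_t data;

	if (fd < 0)
		return 1;

	/* preallocate 1 MiB, then punch a hole over the first 64 KiB */
	if (fallocate(fd, 0, 0, 1 << 20) != 0 ||
	    fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      0, 64 << 10) != 0)
		perror("fallocate");

	data = lseek(fd, 0, SEEK_DATA);		/* served by shmem_file_llseek() */
	printf("first data at offset %lld\n", (long long)data);
	close(fd);
	return 0;
}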
92e1d5be 3895static const struct inode_operations shmem_inode_operations = {
44a30220 3896 .getattr = shmem_getattr,
94c1e62d 3897 .setattr = shmem_setattr,
b09e0fa4 3898#ifdef CONFIG_TMPFS_XATTR
b09e0fa4 3899 .listxattr = shmem_listxattr,
feda821e 3900 .set_acl = simple_set_acl,
e408e695
TT
3901 .fileattr_get = shmem_fileattr_get,
3902 .fileattr_set = shmem_fileattr_set,
b09e0fa4 3903#endif
1da177e4
LT
3904};
3905
92e1d5be 3906static const struct inode_operations shmem_dir_inode_operations = {
1da177e4 3907#ifdef CONFIG_TMPFS
f7cd16a5 3908 .getattr = shmem_getattr,
1da177e4
LT
3909 .create = shmem_create,
3910 .lookup = simple_lookup,
3911 .link = shmem_link,
3912 .unlink = shmem_unlink,
3913 .symlink = shmem_symlink,
3914 .mkdir = shmem_mkdir,
3915 .rmdir = shmem_rmdir,
3916 .mknod = shmem_mknod,
2773bf00 3917 .rename = shmem_rename2,
60545d0d 3918 .tmpfile = shmem_tmpfile,
1da177e4 3919#endif
b09e0fa4 3920#ifdef CONFIG_TMPFS_XATTR
b09e0fa4 3921 .listxattr = shmem_listxattr,
e408e695
TT
3922 .fileattr_get = shmem_fileattr_get,
3923 .fileattr_set = shmem_fileattr_set,
b09e0fa4 3924#endif
39f0247d 3925#ifdef CONFIG_TMPFS_POSIX_ACL
94c1e62d 3926 .setattr = shmem_setattr,
feda821e 3927 .set_acl = simple_set_acl,
39f0247d
AG
3928#endif
3929};
3930
92e1d5be 3931static const struct inode_operations shmem_special_inode_operations = {
f7cd16a5 3932 .getattr = shmem_getattr,
b09e0fa4 3933#ifdef CONFIG_TMPFS_XATTR
b09e0fa4 3934 .listxattr = shmem_listxattr,
b09e0fa4 3935#endif
39f0247d 3936#ifdef CONFIG_TMPFS_POSIX_ACL
94c1e62d 3937 .setattr = shmem_setattr,
feda821e 3938 .set_acl = simple_set_acl,
39f0247d 3939#endif
1da177e4
LT
3940};
3941
759b9775 3942static const struct super_operations shmem_ops = {
1da177e4 3943 .alloc_inode = shmem_alloc_inode,
74b1da56 3944 .free_inode = shmem_free_in_core_inode,
1da177e4
LT
3945 .destroy_inode = shmem_destroy_inode,
3946#ifdef CONFIG_TMPFS
3947 .statfs = shmem_statfs,
680d794b 3948 .show_options = shmem_show_options,
1da177e4 3949#endif
1f895f75 3950 .evict_inode = shmem_evict_inode,
1da177e4
LT
3951 .drop_inode = generic_delete_inode,
3952 .put_super = shmem_put_super,
396bcc52 3953#ifdef CONFIG_TRANSPARENT_HUGEPAGE
779750d2
KS
3954 .nr_cached_objects = shmem_unused_huge_count,
3955 .free_cached_objects = shmem_unused_huge_scan,
3956#endif
1da177e4
LT
3957};
3958
f0f37e2f 3959static const struct vm_operations_struct shmem_vm_ops = {
54cb8821 3960 .fault = shmem_fault,
d7c17551 3961 .map_pages = filemap_map_pages,
1da177e4
LT
3962#ifdef CONFIG_NUMA
3963 .set_policy = shmem_set_policy,
3964 .get_policy = shmem_get_policy,
3965#endif
3966};
3967
f3235626 3968int shmem_init_fs_context(struct fs_context *fc)
1da177e4 3969{
f3235626
DH
3970 struct shmem_options *ctx;
3971
3972 ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL);
3973 if (!ctx)
3974 return -ENOMEM;
3975
3976 ctx->mode = 0777 | S_ISVTX;
3977 ctx->uid = current_fsuid();
3978 ctx->gid = current_fsgid();
3979
3980 fc->fs_private = ctx;
3981 fc->ops = &shmem_fs_context_ops;
3982 return 0;
1da177e4
LT
3983}
3984
41ffe5d5 3985static struct file_system_type shmem_fs_type = {
1da177e4
LT
3986 .owner = THIS_MODULE,
3987 .name = "tmpfs",
f3235626
DH
3988 .init_fs_context = shmem_init_fs_context,
3989#ifdef CONFIG_TMPFS
d7167b14 3990 .parameters = shmem_fs_parameters,
f3235626 3991#endif
1da177e4 3992 .kill_sb = kill_litter_super,
ff36da69 3993 .fs_flags = FS_USERNS_MOUNT,
1da177e4 3994};
1da177e4 3995
9096bbe9 3996void __init shmem_init(void)
1da177e4
LT
3997{
3998 int error;
3999
9a8ec03e 4000 shmem_init_inodecache();
1da177e4 4001
41ffe5d5 4002 error = register_filesystem(&shmem_fs_type);
1da177e4 4003 if (error) {
1170532b 4004 pr_err("Could not register tmpfs\n");
1da177e4
LT
4005 goto out2;
4006 }
95dc112a 4007
ca4e0519 4008 shm_mnt = kern_mount(&shmem_fs_type);
1da177e4
LT
4009 if (IS_ERR(shm_mnt)) {
4010 error = PTR_ERR(shm_mnt);
1170532b 4011 pr_err("Could not kern_mount tmpfs\n");
1da177e4
LT
4012 goto out1;
4013 }
5a6e75f8 4014
396bcc52 4015#ifdef CONFIG_TRANSPARENT_HUGEPAGE
435c0b87 4016 if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
5a6e75f8
KS
4017 SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
4018 else
5e6e5a12 4019 shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */
5a6e75f8 4020#endif
9096bbe9 4021 return;
1da177e4
LT
4022
4023out1:
41ffe5d5 4024 unregister_filesystem(&shmem_fs_type);
1da177e4 4025out2:
41ffe5d5 4026 shmem_destroy_inodecache();
1da177e4 4027 shm_mnt = ERR_PTR(error);
1da177e4 4028}
853ac43a 4029
396bcc52 4030#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
5a6e75f8 4031static ssize_t shmem_enabled_show(struct kobject *kobj,
79d4d38a 4032 struct kobj_attribute *attr, char *buf)
5a6e75f8 4033{
26083eb6 4034 static const int values[] = {
5a6e75f8
KS
4035 SHMEM_HUGE_ALWAYS,
4036 SHMEM_HUGE_WITHIN_SIZE,
4037 SHMEM_HUGE_ADVISE,
4038 SHMEM_HUGE_NEVER,
4039 SHMEM_HUGE_DENY,
4040 SHMEM_HUGE_FORCE,
4041 };
79d4d38a
JP
4042 int len = 0;
4043 int i;
5a6e75f8 4044
79d4d38a
JP
4045 for (i = 0; i < ARRAY_SIZE(values); i++) {
4046 len += sysfs_emit_at(buf, len,
4047 shmem_huge == values[i] ? "%s[%s]" : "%s%s",
4048 i ? " " : "",
4049 shmem_format_huge(values[i]));
5a6e75f8 4050 }
79d4d38a
JP
4051
4052 len += sysfs_emit_at(buf, len, "\n");
4053
4054 return len;
5a6e75f8
KS
4055}
4056
4057static ssize_t shmem_enabled_store(struct kobject *kobj,
4058 struct kobj_attribute *attr, const char *buf, size_t count)
4059{
4060 char tmp[16];
4061 int huge;
4062
4063 if (count + 1 > sizeof(tmp))
4064 return -EINVAL;
4065 memcpy(tmp, buf, count);
4066 tmp[count] = '\0';
4067 if (count && tmp[count - 1] == '\n')
4068 tmp[count - 1] = '\0';
4069
4070 huge = shmem_parse_huge(tmp);
4071 if (huge == -EINVAL)
4072 return -EINVAL;
4073 if (!has_transparent_hugepage() &&
4074 huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
4075 return -EINVAL;
4076
4077 shmem_huge = huge;
435c0b87 4078 if (shmem_huge > SHMEM_HUGE_DENY)
5a6e75f8
KS
4079 SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
4080 return count;
4081}
4082
4bfa8ada 4083struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled);
396bcc52 4084#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
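/*
 * Hedged userspace sketch (build separately): shmem_enabled_show()/store()
 * above implement /sys/kernel/mm/transparent_hugepage/shmem_enabled.
 * Reading lists every mode with the active one in brackets; writing one of
 * the names selects it (root only).
 */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/kernel/mm/transparent_hugepage/shmem_enabled";
	char line[128];
	FILE *f;

	f = fopen(path, "r");
	if (f) {
		if (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
	}

	f = fopen(path, "w");
	if (f) {
		fputs("within_size\n", f);	/* one of the modes listed above */
		fclose(f);
	}
	return 0;
}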
f3f0e1d2 4085
853ac43a
MM
4086#else /* !CONFIG_SHMEM */
4087
4088/*
4089 * tiny-shmem: simple shmemfs and tmpfs using ramfs code
4090 *
4091 * This is intended for small systems where the benefits of the full
4092 * shmem code (swap-backed and resource-limited) are outweighed by
4093 * their complexity. On systems without swap this code should be
4094 * effectively equivalent, but much lighter weight.
4095 */
4096
41ffe5d5 4097static struct file_system_type shmem_fs_type = {
853ac43a 4098 .name = "tmpfs",
f3235626 4099 .init_fs_context = ramfs_init_fs_context,
d7167b14 4100 .parameters = ramfs_fs_parameters,
853ac43a 4101 .kill_sb = kill_litter_super,
2b8576cb 4102 .fs_flags = FS_USERNS_MOUNT,
853ac43a
MM
4103};
4104
9096bbe9 4105void __init shmem_init(void)
853ac43a 4106{
41ffe5d5 4107 BUG_ON(register_filesystem(&shmem_fs_type) != 0);
853ac43a 4108
41ffe5d5 4109 shm_mnt = kern_mount(&shmem_fs_type);
853ac43a 4110 BUG_ON(IS_ERR(shm_mnt));
853ac43a
MM
4111}
4112
10a9c496 4113int shmem_unuse(unsigned int type)
853ac43a
MM
4114{
4115 return 0;
4116}
4117
d7c9e99a 4118int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
3f96b79a
HD
4119{
4120 return 0;
4121}
4122
24513264
HD
4123void shmem_unlock_mapping(struct address_space *mapping)
4124{
4125}
4126
c01d5b30
HD
4127#ifdef CONFIG_MMU
4128unsigned long shmem_get_unmapped_area(struct file *file,
4129 unsigned long addr, unsigned long len,
4130 unsigned long pgoff, unsigned long flags)
4131{
4132 return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
4133}
4134#endif
4135
41ffe5d5 4136void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
94c1e62d 4137{
41ffe5d5 4138 truncate_inode_pages_range(inode->i_mapping, lstart, lend);
94c1e62d
HD
4139}
4140EXPORT_SYMBOL_GPL(shmem_truncate_range);
4141
0b0a0806
HD
4142#define shmem_vm_ops generic_file_vm_ops
4143#define shmem_file_operations ramfs_file_operations
454abafe 4144#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
0b0a0806
HD
4145#define shmem_acct_size(flags, size) 0
4146#define shmem_unacct_size(flags, size) do {} while (0)
853ac43a
MM
4147
4148#endif /* CONFIG_SHMEM */
4149
4150/* common code */
1da177e4 4151
703321b6 4152static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size,
c7277090 4153 unsigned long flags, unsigned int i_flags)
1da177e4 4154{
1da177e4 4155 struct inode *inode;
93dec2da 4156 struct file *res;
1da177e4 4157
703321b6
MA
4158 if (IS_ERR(mnt))
4159 return ERR_CAST(mnt);
1da177e4 4160
285b2c4f 4161 if (size < 0 || size > MAX_LFS_FILESIZE)
1da177e4
LT
4162 return ERR_PTR(-EINVAL);
4163
4164 if (shmem_acct_size(flags, size))
4165 return ERR_PTR(-ENOMEM);
4166
93dec2da
AV
4167 inode = shmem_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0,
4168 flags);
dac2d1f6
AV
4169 if (unlikely(!inode)) {
4170 shmem_unacct_size(flags, size);
4171 return ERR_PTR(-ENOSPC);
4172 }
c7277090 4173 inode->i_flags |= i_flags;
1da177e4 4174 inode->i_size = size;
6d6b77f1 4175 clear_nlink(inode); /* It is unlinked */
26567cdb 4176 res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
93dec2da
AV
4177 if (!IS_ERR(res))
4178 res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
4179 &shmem_file_operations);
26567cdb 4180 if (IS_ERR(res))
93dec2da 4181 iput(inode);
6b4d0b27 4182 return res;
1da177e4 4183}
c7277090
EP
4184
4185/**
4186 * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
4187 * kernel internal. There will be NO LSM permission checks against the
4188 * underlying inode. So users of this interface must do LSM checks at a
e1832f29
SS
4189 * higher layer. The users are the big_key and shm implementations. LSM
4190 * checks are provided at the key or shm level rather than at the inode.
c7277090
EP
4191 * @name: name for dentry (to be seen in /proc/<pid>/maps)
4192 * @size: size to be set for the file
4193 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
4194 */
4195struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
4196{
703321b6 4197 return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
c7277090
EP
4198}
4199
4200/**
4201 * shmem_file_setup - get an unlinked file living in tmpfs
4202 * @name: name for dentry (to be seen in /proc/<pid>/maps)
4203 * @size: size to be set for the file
4204 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
4205 */
4206struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
4207{
703321b6 4208 return __shmem_file_setup(shm_mnt, name, size, flags, 0);
c7277090 4209}
395e0ddc 4210EXPORT_SYMBOL_GPL(shmem_file_setup);
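/*
 * Hedged in-kernel sketch (hypothetical caller): GPU drivers and shm use
 * shmem_file_setup() to get an unlinked, swappable backing file.  The name
 * only shows up in /proc/<pid>/maps; the reference is dropped with fput()
 * when the object is destroyed.
 */
static struct file *example_alloc_backing(loff_t size)
{
	struct file *filp;

	filp = shmem_file_setup("example-backing", size, VM_NORESERVE);
	if (IS_ERR(filp))
		return filp;	/* -EINVAL, -ENOMEM or -ENOSPC, as above */

	/* ... populate via filp->f_mapping, e.g. shmem_read_mapping_page_gfp() ... */
	return filp;
}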
1da177e4 4211
703321b6
MA
4212/**
4213 * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
4214 * @mnt: the tmpfs mount where the file will be created
4215 * @name: name for dentry (to be seen in /proc/<pid>/maps)
4216 * @size: size to be set for the file
4217 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
4218 */
4219struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name,
4220 loff_t size, unsigned long flags)
4221{
4222 return __shmem_file_setup(mnt, name, size, flags, 0);
4223}
4224EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
4225
46711810 4226/**
1da177e4 4227 * shmem_zero_setup - setup a shared anonymous mapping
45e55300 4228 * @vma: the vma to be mmapped, as prepared by do_mmap
1da177e4
LT
4229 */
4230int shmem_zero_setup(struct vm_area_struct *vma)
4231{
4232 struct file *file;
4233 loff_t size = vma->vm_end - vma->vm_start;
4234
66fc1303 4235 /*
c1e8d7c6 4236 * Cloning a new file under mmap_lock leads to a lock ordering conflict
66fc1303
HD
4237 * between XFS directory reading and selinux: since this file is only
4238 * accessible to the user through its mapping, use S_PRIVATE flag to
4239 * bypass file security, in the same way as shmem_kernel_file_setup().
4240 */
703321b6 4241 file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags);
1da177e4
LT
4242 if (IS_ERR(file))
4243 return PTR_ERR(file);
4244
4245 if (vma->vm_file)
4246 fput(vma->vm_file);
4247 vma->vm_file = file;
4248 vma->vm_ops = &shmem_vm_ops;
f3f0e1d2 4249
1da177e4
LT
4250 return 0;
4251}
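/*
 * Hedged userspace sketch (build separately): a MAP_SHARED | MAP_ANONYMOUS
 * mapping is what ends up in shmem_zero_setup() above; the kernel quietly
 * backs it with an unlinked "dev/zero" tmpfs file so parent and child share
 * the same pages.
 */
#include <stdio.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	int *shared = mmap(NULL, sizeof(*shared), PROT_READ | PROT_WRITE,
			   MAP_SHARED | MAP_ANONYMOUS, -1, 0);

	if (shared == MAP_FAILED)
		return 1;

	*shared = 0;
	if (fork() == 0) {		/* child writes through the shared page */
		*shared = 42;
		_exit(0);
	}
	wait(NULL);
	printf("parent sees %d\n", *shared);	/* prints 42 */
	munmap(shared, sizeof(*shared));
	return 0;
}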
d9d90e5e
HD
4252
4253/**
4254 * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
4255 * @mapping: the page's address_space
4256 * @index: the page index
4257 * @gfp: the page allocator flags to use if allocating
4258 *
4259 * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
4260 * with any new page allocations done using the specified allocation flags.
7e0a1265 4261 * But read_cache_page_gfp() uses the ->read_folio() method, which does not
d9d90e5e
HD
4262 * suit tmpfs, since it may have pages in swapcache, and needs to find those
4263 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
4264 *
68da9f05
HD
4265 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
4266 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
d9d90e5e
HD
4267 */
4268struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
4269 pgoff_t index, gfp_t gfp)
4270{
68da9f05
HD
4271#ifdef CONFIG_SHMEM
4272 struct inode *inode = mapping->host;
a3a9c397 4273 struct folio *folio;
9276aad6 4274 struct page *page;
68da9f05
HD
4275 int error;
4276
30e6a51d 4277 BUG_ON(!shmem_mapping(mapping));
a3a9c397 4278 error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE,
cfda0526 4279 gfp, NULL, NULL, NULL);
68da9f05 4280 if (error)
a7605426
YS
4281 return ERR_PTR(error);
4282
a3a9c397
MWO
4283 folio_unlock(folio);
4284 page = folio_file_page(folio, index);
a7605426 4285 if (PageHWPoison(page)) {
a3a9c397 4286 folio_put(folio);
a7605426
YS
4287 return ERR_PTR(-EIO);
4288 }
4289
68da9f05
HD
4290 return page;
4291#else
4292 /*
4293 * The tiny !SHMEM case uses ramfs without swap
4294 */
d9d90e5e 4295 return read_cache_page_gfp(mapping, index, gfp);
68da9f05 4296#endif
d9d90e5e
HD
4297}
4298EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
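/*
 * Hedged in-kernel sketch (hypothetical caller), in the spirit of the GPU
 * drivers mentioned above: read every page of a shmem-backed file, mixing
 * __GFP_NORETRY | __GFP_NOWARN into the mapping's gfp mask so a failed
 * allocation is reported instead of triggering the OOM killer.
 */
static int example_pin_pages(struct file *filp, struct page **pages,
			     pgoff_t nr_pages)
{
	struct address_space *mapping = filp->f_mapping;
	gfp_t gfp = mapping_gfp_mask(mapping) | __GFP_NORETRY | __GFP_NOWARN;
	pgoff_t i;

	for (i = 0; i < nr_pages; i++) {
		struct page *page;

		page = shmem_read_mapping_page_gfp(mapping, i, gfp);
		if (IS_ERR(page)) {
			while (i--)
				put_page(pages[i]);
			return PTR_ERR(page);
		}
		pages[i] = page;
	}
	return 0;
}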