/*
 * fs/dax.c - Direct Access filesystem code
 * Copyright (c) 2013-2014 Intel Corporation
 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pagevec.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
#include <linux/pfn_t.h>
#include <linux/sizes.h>
#include <linux/mmu_notifier.h>
#include <linux/iomap.h>
#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/fs_dax.h>

/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)

/* The 'colour' (i.e. low bits) within a PMD of a page offset. */
#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
#define PG_PMD_NR	(PMD_SIZE >> PAGE_SHIFT)

static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];

static int __init init_dax_wait_table(void)
{
	int i;

	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
		init_waitqueue_head(wait_table + i);
	return 0;
}
fs_initcall(init_dax_wait_table);

/*
 * We use the lowest available bit in an exceptional entry for locking, one
 * bit for the entry size (PMD) and two more to tell us if the entry is a
 * zero page or an empty entry that is just used for locking.  In total four
 * special bits.
 *
 * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
 * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
 * block allocation.
 */
#define RADIX_DAX_SHIFT		(RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
#define RADIX_DAX_ENTRY_LOCK	(1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
#define RADIX_DAX_PMD		(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
#define RADIX_DAX_ZERO_PAGE	(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
#define RADIX_DAX_EMPTY		(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))

static unsigned long dax_radix_pfn(void *entry)
{
	return (unsigned long)entry >> RADIX_DAX_SHIFT;
}

static void *dax_radix_locked_entry(unsigned long pfn, unsigned long flags)
{
	return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
			(pfn << RADIX_DAX_SHIFT) | RADIX_DAX_ENTRY_LOCK);
}

static unsigned int dax_radix_order(void *entry)
{
	if ((unsigned long)entry & RADIX_DAX_PMD)
		return PMD_SHIFT - PAGE_SHIFT;
	return 0;
}

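/*
 * Illustrative example (assuming RADIX_TREE_EXCEPTIONAL_SHIFT == 2, its
 * value in the generic radix tree code): dax_radix_locked_entry(pfn,
 * RADIX_DAX_PMD) packs the pfn into bits RADIX_DAX_SHIFT and above and sets
 * the PMD size bit, the exceptional-entry marker and the lock bit, so that
 * dax_radix_pfn() recovers the original pfn, dax_is_pmd_entry() below is
 * true and dax_is_empty_entry() is false for the resulting entry.
 */
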
static int dax_is_pmd_entry(void *entry)
{
	return (unsigned long)entry & RADIX_DAX_PMD;
}

static int dax_is_pte_entry(void *entry)
{
	return !((unsigned long)entry & RADIX_DAX_PMD);
}

static int dax_is_zero_entry(void *entry)
{
	return (unsigned long)entry & RADIX_DAX_ZERO_PAGE;
}

static int dax_is_empty_entry(void *entry)
{
	return (unsigned long)entry & RADIX_DAX_EMPTY;
}

/*
 * DAX radix tree locking
 */
struct exceptional_entry_key {
	struct address_space *mapping;
	pgoff_t entry_start;
};

struct wait_exceptional_entry_queue {
	wait_queue_entry_t wait;
	struct exceptional_entry_key key;
};

static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
		pgoff_t index, void *entry, struct exceptional_entry_key *key)
{
	unsigned long hash;

	/*
	 * If 'entry' is a PMD, align the 'index' that we use for the wait
	 * queue to the start of that PMD.  This ensures that all offsets in
	 * the range covered by the PMD map to the same bit lock.
	 */
	if (dax_is_pmd_entry(entry))
		index &= ~PG_PMD_COLOUR;

	key->mapping = mapping;
	key->entry_start = index;

	hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS);
	return wait_table + hash;
}

static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode,
				       int sync, void *keyp)
{
	struct exceptional_entry_key *key = keyp;
	struct wait_exceptional_entry_queue *ewait =
		container_of(wait, struct wait_exceptional_entry_queue, wait);

	if (key->mapping != ewait->key.mapping ||
	    key->entry_start != ewait->key.entry_start)
		return 0;
	return autoremove_wake_function(wait, mode, sync, NULL);
}

/*
 * We do not necessarily hold the mapping->tree_lock when we call this
 * function so it is possible that 'entry' is no longer a valid item in the
 * radix tree.  This is okay because all we really need to do is to find the
 * correct waitqueue where tasks might be waiting for that old 'entry' and
 * wake them.
 */
static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
		pgoff_t index, void *entry, bool wake_all)
{
	struct exceptional_entry_key key;
	wait_queue_head_t *wq;

	wq = dax_entry_waitqueue(mapping, index, entry, &key);

	/*
	 * Checking for locked entry and prepare_to_wait_exclusive() happens
	 * under mapping->tree_lock, ditto for entry handling in our callers.
	 * So at this point all tasks that could have seen our entry locked
	 * must be in the waitqueue and the following check will see them.
	 */
	if (waitqueue_active(wq))
		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
}

/*
 * Check whether the given slot is locked.  The function must be called with
 * mapping->tree_lock held.
 */
static inline int slot_locked(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
	return entry & RADIX_DAX_ENTRY_LOCK;
}

/*
 * Mark the given slot as locked.  The function must be called with
 * mapping->tree_lock held.
 */
static inline void *lock_slot(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);

	entry |= RADIX_DAX_ENTRY_LOCK;
	radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
	return (void *)entry;
}

/*
 * Mark the given slot as unlocked.  The function must be called with
 * mapping->tree_lock held.
 */
static inline void *unlock_slot(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);

	entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
	radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
	return (void *)entry;
}

/*
 * Look up an entry in the radix tree, wait for it to become unlocked if it
 * is an exceptional entry, and return it.  The caller must call
 * put_unlocked_mapping_entry() if it decides not to lock the entry, or
 * put_locked_mapping_entry() once it has locked the entry and wants to
 * unlock it again.
 *
 * The function must be called with mapping->tree_lock held.
 */
static void *get_unlocked_mapping_entry(struct address_space *mapping,
					pgoff_t index, void ***slotp)
{
	void *entry, **slot;
	struct wait_exceptional_entry_queue ewait;
	wait_queue_head_t *wq;

	init_wait(&ewait.wait);
	ewait.wait.func = wake_exceptional_entry_func;

	for (;;) {
		entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
					  &slot);
		if (!entry ||
		    WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) ||
		    !slot_locked(mapping, slot)) {
			if (slotp)
				*slotp = slot;
			return entry;
		}

		wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key);
		prepare_to_wait_exclusive(wq, &ewait.wait,
					  TASK_UNINTERRUPTIBLE);
		spin_unlock_irq(&mapping->tree_lock);
		schedule();
		finish_wait(wq, &ewait.wait);
		spin_lock_irq(&mapping->tree_lock);
	}
}

static void dax_unlock_mapping_entry(struct address_space *mapping,
				     pgoff_t index)
{
	void *entry, **slot;

	spin_lock_irq(&mapping->tree_lock);
	entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
			 !slot_locked(mapping, slot))) {
		spin_unlock_irq(&mapping->tree_lock);
		return;
	}
	unlock_slot(mapping, slot);
	spin_unlock_irq(&mapping->tree_lock);
	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
}

static void put_locked_mapping_entry(struct address_space *mapping,
		pgoff_t index)
{
	dax_unlock_mapping_entry(mapping, index);
}

/*
 * Called when we are done with a radix tree entry we looked up via
 * get_unlocked_mapping_entry() and which we didn't lock in the end.
 */
static void put_unlocked_mapping_entry(struct address_space *mapping,
				       pgoff_t index, void *entry)
{
	if (!entry)
		return;

	/* We have to wake up the next waiter for the radix tree entry lock */
	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
}

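/*
 * To summarize the locking protocol above: a caller takes
 * mapping->tree_lock, calls get_unlocked_mapping_entry() to wait out any
 * other locker, and then either locks the slot with lock_slot() (later
 * dropping the lock bit via put_locked_mapping_entry()) or backs off with
 * put_unlocked_mapping_entry() so the next waiter is woken.  The helpers
 * below (grab_mapping_entry(), __dax_invalidate_mapping_entry() and
 * dax_writeback_one()) all follow this pattern.
 */
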
static unsigned long dax_entry_size(void *entry)
{
	if (dax_is_zero_entry(entry))
		return 0;
	else if (dax_is_empty_entry(entry))
		return 0;
	else if (dax_is_pmd_entry(entry))
		return PMD_SIZE;
	else
		return PAGE_SIZE;
}

static unsigned long dax_radix_end_pfn(void *entry)
{
	return dax_radix_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
}

/*
 * Iterate through all mapped pfns represented by an entry, i.e. skip
 * 'empty' and 'zero' entries.
 */
#define for_each_mapped_pfn(entry, pfn) \
	for (pfn = dax_radix_pfn(entry); \
			pfn < dax_radix_end_pfn(entry); pfn++)

static void dax_associate_entry(void *entry, struct address_space *mapping)
{
	unsigned long pfn;

	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
		return;

	for_each_mapped_pfn(entry, pfn) {
		struct page *page = pfn_to_page(pfn);

		WARN_ON_ONCE(page->mapping);
		page->mapping = mapping;
	}
}

static void dax_disassociate_entry(void *entry, struct address_space *mapping,
		bool trunc)
{
	unsigned long pfn;

	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
		return;

	for_each_mapped_pfn(entry, pfn) {
		struct page *page = pfn_to_page(pfn);

		WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
		WARN_ON_ONCE(page->mapping && page->mapping != mapping);
		page->mapping = NULL;
	}
}

/*
 * Find the radix tree entry at the given index.  If it points to an
 * exceptional entry, return it with the radix tree entry locked.  If the
 * radix tree doesn't contain the given index, create an empty exceptional
 * entry for the index and return it locked.
 *
 * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
 * either return that locked entry or will return an error.  This error will
 * happen if there are any 4k entries within the 2MiB range that we are
 * requesting.
 *
 * We always favor 4k entries over 2MiB entries.  There isn't a flow where we
 * evict 4k entries in order to 'upgrade' them to a 2MiB entry.  A 2MiB
 * insertion will fail if it finds any 4k entries already in the tree, and a
 * 4k insertion will cause an existing 2MiB entry to be unmapped and
 * downgraded to 4k entries.  This happens for both 2MiB huge zero pages as
 * well as 2MiB empty entries.
 *
 * The exception to this downgrade path is for 2MiB DAX PMD entries that have
 * real storage backing them.  We will leave these real 2MiB DAX entries in
 * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry.
 *
 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags.  For
 * persistent memory the benefit is doubtful.  We can add that later if we can
 * show it helps.
 */
static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
		unsigned long size_flag)
{
	bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */
	void *entry, **slot;

restart:
	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, &slot);

	if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) {
		entry = ERR_PTR(-EIO);
		goto out_unlock;
	}

	if (entry) {
		if (size_flag & RADIX_DAX_PMD) {
			if (dax_is_pte_entry(entry)) {
				put_unlocked_mapping_entry(mapping, index,
						entry);
				entry = ERR_PTR(-EEXIST);
				goto out_unlock;
			}
		} else { /* trying to grab a PTE entry */
			if (dax_is_pmd_entry(entry) &&
			    (dax_is_zero_entry(entry) ||
			     dax_is_empty_entry(entry))) {
				pmd_downgrade = true;
			}
		}
	}

	/* No entry for given index? Make sure radix tree is big enough. */
	if (!entry || pmd_downgrade) {
		int err;

		if (pmd_downgrade) {
			/*
			 * Make sure 'entry' remains valid while we drop
			 * mapping->tree_lock.
			 */
			entry = lock_slot(mapping, slot);
		}

		spin_unlock_irq(&mapping->tree_lock);
		/*
		 * Besides huge zero pages the only other thing that gets
		 * downgraded are empty entries which don't need to be
		 * unmapped.
		 */
		if (pmd_downgrade && dax_is_zero_entry(entry))
			unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
							PG_PMD_NR, false);

		err = radix_tree_preload(
				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
		if (err) {
			if (pmd_downgrade)
				put_locked_mapping_entry(mapping, index);
			return ERR_PTR(err);
		}
		spin_lock_irq(&mapping->tree_lock);

		if (!entry) {
			/*
			 * We needed to drop the page_tree lock while calling
			 * radix_tree_preload() and we didn't have an entry to
			 * lock.  See if another thread inserted an entry at
			 * our index during this time.
			 */
			entry = __radix_tree_lookup(&mapping->page_tree, index,
					NULL, &slot);
			if (entry) {
				radix_tree_preload_end();
				spin_unlock_irq(&mapping->tree_lock);
				goto restart;
			}
		}

		if (pmd_downgrade) {
			dax_disassociate_entry(entry, mapping, false);
			radix_tree_delete(&mapping->page_tree, index);
			mapping->nrexceptional--;
			dax_wake_mapping_entry_waiter(mapping, index, entry,
					true);
		}

		entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY);

		err = __radix_tree_insert(&mapping->page_tree, index,
				dax_radix_order(entry), entry);
		radix_tree_preload_end();
		if (err) {
			spin_unlock_irq(&mapping->tree_lock);
			/*
			 * Our insertion of a DAX entry failed, most likely
			 * because we were inserting a PMD entry and it
			 * collided with a PTE sized entry at a different
			 * index in the PMD range.  We haven't inserted
			 * anything into the radix tree and have no waiters to
			 * wake.
			 */
			return ERR_PTR(err);
		}
		/* Good, we have inserted an empty locked entry into the tree. */
		mapping->nrexceptional++;
		spin_unlock_irq(&mapping->tree_lock);
		return entry;
	}
	entry = lock_slot(mapping, slot);
 out_unlock:
	spin_unlock_irq(&mapping->tree_lock);
	return entry;
}

static int __dax_invalidate_mapping_entry(struct address_space *mapping,
					  pgoff_t index, bool trunc)
{
	int ret = 0;
	void *entry;
	struct radix_tree_root *page_tree = &mapping->page_tree;

	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, NULL);
	if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)))
		goto out;
	if (!trunc &&
	    (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
	     radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)))
		goto out;
	dax_disassociate_entry(entry, mapping, trunc);
	radix_tree_delete(page_tree, index);
	mapping->nrexceptional--;
	ret = 1;
out:
	put_unlocked_mapping_entry(mapping, index, entry);
	spin_unlock_irq(&mapping->tree_lock);
	return ret;
}

/*
 * Delete the exceptional DAX entry at @index from @mapping.  Wait for the
 * radix tree entry to get unlocked before deleting it.
 */
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	int ret = __dax_invalidate_mapping_entry(mapping, index, true);

	/*
	 * This gets called from the truncate / punch_hole path.  As such, the
	 * caller must hold locks protecting against concurrent modifications
	 * of the radix tree (usually fs-private i_mmap_sem for writing).
	 * Since the caller has seen an exceptional entry for this index, we
	 * better find it at that index as well...
	 */
	WARN_ON_ONCE(!ret);
	return ret;
}

/*
 * Invalidate the exceptional DAX entry if it is clean.
 */
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
				      pgoff_t index)
{
	return __dax_invalidate_mapping_entry(mapping, index, false);
}

static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
		sector_t sector, size_t size, struct page *to,
		unsigned long vaddr)
{
	void *vto, *kaddr;
	pgoff_t pgoff;
	pfn_t pfn;
	long rc;
	int id;

	rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
	if (rc)
		return rc;

	id = dax_read_lock();
	rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
	if (rc < 0) {
		dax_read_unlock(id);
		return rc;
	}
	vto = kmap_atomic(to);
	copy_user_page(vto, (void __force *)kaddr, vaddr, to);
	kunmap_atomic(vto);
	dax_read_unlock(id);
	return 0;
}

/*
 * By this point grab_mapping_entry() has ensured that we have a locked entry
 * of the appropriate size so we don't have to worry about downgrading PMDs to
 * PTEs.  If we happen to be trying to insert a PTE and there is a PMD
 * already in the tree, we will skip the insertion and just dirty the PMD as
 * appropriate.
 */
static void *dax_insert_mapping_entry(struct address_space *mapping,
				      struct vm_fault *vmf,
				      void *entry, pfn_t pfn_t,
				      unsigned long flags, bool dirty)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	unsigned long pfn = pfn_t_to_pfn(pfn_t);
	pgoff_t index = vmf->pgoff;
	void *new_entry;

	if (dirty)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

	if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
		/* we are replacing a zero page with block mapping */
		if (dax_is_pmd_entry(entry))
			unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
					PG_PMD_NR, false);
		else /* pte entry */
			unmap_mapping_pages(mapping, vmf->pgoff, 1, false);
	}

	spin_lock_irq(&mapping->tree_lock);
	new_entry = dax_radix_locked_entry(pfn, flags);
	if (dax_entry_size(entry) != dax_entry_size(new_entry)) {
		dax_disassociate_entry(entry, mapping, false);
		dax_associate_entry(new_entry, mapping);
	}

	if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
		/*
		 * Only swap our new entry into the radix tree if the current
		 * entry is a zero page or an empty entry.  If a normal PTE or
		 * PMD entry is already in the tree, we leave it alone.  This
		 * means that if we are trying to insert a PTE and the
		 * existing entry is a PMD, we will just leave the PMD in the
		 * tree and dirty it if necessary.
		 */
		struct radix_tree_node *node;
		void **slot;
		void *ret;

		ret = __radix_tree_lookup(page_tree, index, &node, &slot);
		WARN_ON_ONCE(ret != entry);
		__radix_tree_replace(page_tree, node, slot,
				     new_entry, NULL);
		entry = new_entry;
	}

	if (dirty)
		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);

	spin_unlock_irq(&mapping->tree_lock);
	return entry;
}

static inline unsigned long
pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
{
	unsigned long address;

	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
	VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
	return address;
}

/* Walk all mappings of a given index of a file and writeprotect them */
static void dax_mapping_entry_mkclean(struct address_space *mapping,
				      pgoff_t index, unsigned long pfn)
{
	struct vm_area_struct *vma;
	pte_t pte, *ptep = NULL;
	pmd_t *pmdp = NULL;
	spinlock_t *ptl;

	i_mmap_lock_read(mapping);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
		unsigned long address, start, end;

		cond_resched();

		if (!(vma->vm_flags & VM_SHARED))
			continue;

		address = pgoff_address(index, vma);

		/*
		 * Note because we provide start/end to follow_pte_pmd it will
		 * call mmu_notifier_invalidate_range_start() on our behalf
		 * before taking any lock.
		 */
		if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl))
			continue;

		/*
		 * No need to call mmu_notifier_invalidate_range() as we are
		 * downgrading page table protection not changing it to point
		 * to a new page.
		 *
		 * See Documentation/vm/mmu_notifier.txt
		 */
		if (pmdp) {
#ifdef CONFIG_FS_DAX_PMD
			pmd_t pmd;

			if (pfn != pmd_pfn(*pmdp))
				goto unlock_pmd;
			if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
				goto unlock_pmd;

			flush_cache_page(vma, address, pfn);
			pmd = pmdp_huge_clear_flush(vma, address, pmdp);
			pmd = pmd_wrprotect(pmd);
			pmd = pmd_mkclean(pmd);
			set_pmd_at(vma->vm_mm, address, pmdp, pmd);
unlock_pmd:
#endif
			spin_unlock(ptl);
		} else {
			if (pfn != pte_pfn(*ptep))
				goto unlock_pte;
			if (!pte_dirty(*ptep) && !pte_write(*ptep))
				goto unlock_pte;

			flush_cache_page(vma, address, pfn);
			pte = ptep_clear_flush(vma, address, ptep);
			pte = pte_wrprotect(pte);
			pte = pte_mkclean(pte);
			set_pte_at(vma->vm_mm, address, ptep, pte);
unlock_pte:
			pte_unmap_unlock(ptep, ptl);
		}

		mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
	}
	i_mmap_unlock_read(mapping);
}

static int dax_writeback_one(struct dax_device *dax_dev,
		struct address_space *mapping, pgoff_t index, void *entry)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	void *entry2, **slot;
	unsigned long pfn;
	long ret = 0;
	size_t size;

	/*
	 * A page got tagged dirty in DAX mapping? Something is seriously
	 * wrong.
	 */
	if (WARN_ON(!radix_tree_exceptional_entry(entry)))
		return -EIO;

	spin_lock_irq(&mapping->tree_lock);
	entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
	/* Entry got punched out / reallocated? */
	if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2)))
		goto put_unlocked;
	/*
	 * Entry got reallocated elsewhere? No need to writeback.  We have to
	 * compare pfns as we must not bail out due to difference in lockbit
	 * or entry type.
	 */
	if (dax_radix_pfn(entry2) != dax_radix_pfn(entry))
		goto put_unlocked;
	if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
				dax_is_zero_entry(entry))) {
		ret = -EIO;
		goto put_unlocked;
	}

	/* Another fsync thread may have already written back this entry */
	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
		goto put_unlocked;
	/* Lock the entry to serialize with page faults */
	entry = lock_slot(mapping, slot);
	/*
	 * We can clear the tag now but we have to be careful so that concurrent
	 * dax_writeback_one() calls for the same index cannot finish before we
	 * actually flush the caches.  This is achieved as the calls will look
	 * at the entry only under tree_lock and once they do that they will
	 * see the entry locked and wait for it to unlock.
	 */
	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
	spin_unlock_irq(&mapping->tree_lock);

	/*
	 * Even if dax_writeback_mapping_range() was given a wbc->range_start
	 * in the middle of a PMD, the 'index' we are given will be aligned to
	 * the start index of the PMD, as will the pfn we pull from 'entry'.
	 * This allows us to flush for PMD_SIZE and not have to worry about
	 * partial PMD writebacks.
	 */
	pfn = dax_radix_pfn(entry);
	size = PAGE_SIZE << dax_radix_order(entry);

	dax_mapping_entry_mkclean(mapping, index, pfn);
	dax_flush(dax_dev, page_address(pfn_to_page(pfn)), size);
	/*
	 * After we have flushed the cache, we can clear the dirty tag.  There
	 * cannot be new dirty data in the pfn after the flush has completed as
	 * the pfn mappings are writeprotected and fault waits for mapping
	 * entry lock.
	 */
	spin_lock_irq(&mapping->tree_lock);
	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
	spin_unlock_irq(&mapping->tree_lock);
	trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);
	put_locked_mapping_entry(mapping, index);
	return ret;

 put_unlocked:
	put_unlocked_mapping_entry(mapping, index, entry2);
	spin_unlock_irq(&mapping->tree_lock);
	return ret;
}

/*
 * Flush the mapping to the persistent domain within the byte range of [start,
 * end].  This is required by data integrity operations to ensure file data is
 * on persistent storage prior to completion of the operation.
 */
int dax_writeback_mapping_range(struct address_space *mapping,
		struct block_device *bdev, struct writeback_control *wbc)
{
	struct inode *inode = mapping->host;
	pgoff_t start_index, end_index;
	pgoff_t indices[PAGEVEC_SIZE];
	struct dax_device *dax_dev;
	struct pagevec pvec;
	bool done = false;
	int i, ret = 0;

	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
		return -EIO;

	if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
		return 0;

	dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
	if (!dax_dev)
		return -EIO;

	start_index = wbc->range_start >> PAGE_SHIFT;
	end_index = wbc->range_end >> PAGE_SHIFT;

	trace_dax_writeback_range(inode, start_index, end_index);

	tag_pages_for_writeback(mapping, start_index, end_index);

	pagevec_init(&pvec);
	while (!done) {
		pvec.nr = find_get_entries_tag(mapping, start_index,
				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
				pvec.pages, indices);

		if (pvec.nr == 0)
			break;

		for (i = 0; i < pvec.nr; i++) {
			if (indices[i] > end_index) {
				done = true;
				break;
			}

			ret = dax_writeback_one(dax_dev, mapping, indices[i],
					pvec.pages[i]);
			if (ret < 0) {
				mapping_set_error(mapping, ret);
				goto out;
			}
		}
		start_index = indices[pvec.nr - 1] + 1;
	}
out:
	put_dax(dax_dev);
	trace_dax_writeback_range_done(inode, start_index, end_index);
	return (ret < 0 ? ret : 0);
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);

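/*
 * Illustrative usage (assumed, based on typical callers rather than taken
 * from this file): a DAX-aware filesystem calls the helper above from its
 * ->writepages() method, roughly
 *
 *	return dax_writeback_mapping_range(mapping,
 *			inode->i_sb->s_bdev, wbc);
 *
 * so that fsync()/msync() end up flushing CPU caches for all dirty DAX
 * entries in the requested range.
 */
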
static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
{
	return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
}

static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size,
			 pfn_t *pfnp)
{
	const sector_t sector = dax_iomap_sector(iomap, pos);
	pgoff_t pgoff;
	void *kaddr;
	int id, rc;
	long length;

	rc = bdev_dax_pgoff(iomap->bdev, sector, size, &pgoff);
	if (rc)
		return rc;
	id = dax_read_lock();
	length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
				   &kaddr, pfnp);
	if (length < 0) {
		rc = length;
		goto out;
	}
	rc = -EINVAL;
	if (PFN_PHYS(length) < size)
		goto out;
	if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
		goto out;
	/* For larger pages we need devmap */
	if (length > 1 && !pfn_t_devmap(*pfnp))
		goto out;
	rc = 0;
out:
	dax_read_unlock(id);
	return rc;
}

/*
 * The user has performed a load from a hole in the file.  Allocating a new
 * page in the file would cause excessive storage usage for workloads with
 * sparse files.  Instead we insert a read-only mapping of the 4k zero page.
 * If this page is ever written to we will re-fault and change the mapping to
 * point to real DAX storage instead.
 */
static int dax_load_hole(struct address_space *mapping, void *entry,
			 struct vm_fault *vmf)
{
	struct inode *inode = mapping->host;
	unsigned long vaddr = vmf->address;
	int ret = VM_FAULT_NOPAGE;
	struct page *zero_page;
	void *entry2;
	pfn_t pfn;

	zero_page = ZERO_PAGE(0);
	if (unlikely(!zero_page)) {
		ret = VM_FAULT_OOM;
		goto out;
	}

	pfn = page_to_pfn_t(zero_page);
	entry2 = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
			RADIX_DAX_ZERO_PAGE, false);
	if (IS_ERR(entry2)) {
		ret = VM_FAULT_SIGBUS;
		goto out;
	}

	vm_insert_mixed(vmf->vma, vaddr, pfn);
out:
	trace_dax_load_hole(inode, vmf, ret);
	return ret;
}

static bool dax_range_is_aligned(struct block_device *bdev,
				 unsigned int offset, unsigned int length)
{
	unsigned short sector_size = bdev_logical_block_size(bdev);

	if (!IS_ALIGNED(offset, sector_size))
		return false;
	if (!IS_ALIGNED(length, sector_size))
		return false;

	return true;
}

int __dax_zero_page_range(struct block_device *bdev,
		struct dax_device *dax_dev, sector_t sector,
		unsigned int offset, unsigned int size)
{
	if (dax_range_is_aligned(bdev, offset, size)) {
		sector_t start_sector = sector + (offset >> 9);

		return blkdev_issue_zeroout(bdev, start_sector,
				size >> 9, GFP_NOFS, 0);
	} else {
		pgoff_t pgoff;
		long rc, id;
		void *kaddr;
		pfn_t pfn;

		rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
		if (rc)
			return rc;

		id = dax_read_lock();
		rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr,
				&pfn);
		if (rc < 0) {
			dax_read_unlock(id);
			return rc;
		}
		memset(kaddr + offset, 0, size);
		dax_flush(dax_dev, kaddr + offset, size);
		dax_read_unlock(id);
	}
	return 0;
}
EXPORT_SYMBOL_GPL(__dax_zero_page_range);

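/*
 * Illustrative note (assumed, based on typical callers rather than taken
 * from this file): filesystems usually reach __dax_zero_page_range()
 * through the iomap zeroing path (e.g. iomap_zero_range()) when partially
 * zeroing a block on a DAX inode, for instance while truncating to an
 * offset that is not block aligned.
 */
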
static loff_t
dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
		struct iomap *iomap)
{
	struct block_device *bdev = iomap->bdev;
	struct dax_device *dax_dev = iomap->dax_dev;
	struct iov_iter *iter = data;
	loff_t end = pos + length, done = 0;
	ssize_t ret = 0;
	int id;

	if (iov_iter_rw(iter) == READ) {
		end = min(end, i_size_read(inode));
		if (pos >= end)
			return 0;

		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
			return iov_iter_zero(min(length, end - pos), iter);
	}

	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
		return -EIO;

	/*
	 * Write can allocate a block for an area which has a hole page mapped
	 * into page tables.  We have to tear down these mappings so that data
	 * written by write(2) is visible in mmap.
	 */
	if (iomap->flags & IOMAP_F_NEW) {
		invalidate_inode_pages2_range(inode->i_mapping,
					      pos >> PAGE_SHIFT,
					      (end - 1) >> PAGE_SHIFT);
	}

	id = dax_read_lock();
	while (pos < end) {
		unsigned offset = pos & (PAGE_SIZE - 1);
		const size_t size = ALIGN(length + offset, PAGE_SIZE);
		const sector_t sector = dax_iomap_sector(iomap, pos);
		ssize_t map_len;
		pgoff_t pgoff;
		void *kaddr;
		pfn_t pfn;

		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
		if (ret)
			break;

		map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
				&kaddr, &pfn);
		if (map_len < 0) {
			ret = map_len;
			break;
		}

		map_len = PFN_PHYS(map_len);
		kaddr += offset;
		map_len -= offset;
		if (map_len > end - pos)
			map_len = end - pos;

		/*
		 * The userspace address for the memory copy has already been
		 * validated via access_ok() in either vfs_read() or
		 * vfs_write(), depending on which operation we are doing.
		 */
		if (iov_iter_rw(iter) == WRITE)
			map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr,
					map_len, iter);
		else
			map_len = copy_to_iter(kaddr, map_len, iter);
		if (map_len <= 0) {
			ret = map_len ? map_len : -EFAULT;
			break;
		}

		pos += map_len;
		length -= map_len;
		done += map_len;
	}
	dax_read_unlock(id);

	return done ? done : ret;
}

/**
 * dax_iomap_rw - Perform I/O to a DAX file
 * @iocb: The control block for this I/O
 * @iter: The addresses to do I/O from or to
 * @ops: iomap ops passed from the file system
 *
 * This function performs read and write operations to directly mapped
 * persistent memory.  The caller needs to take care of read/write exclusion
 * and evicting any page cache pages in the region under I/O.
 */
ssize_t
dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
		const struct iomap_ops *ops)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	struct inode *inode = mapping->host;
	loff_t pos = iocb->ki_pos, ret = 0, done = 0;
	unsigned flags = 0;

	if (iov_iter_rw(iter) == WRITE) {
		lockdep_assert_held_exclusive(&inode->i_rwsem);
		flags |= IOMAP_WRITE;
	} else {
		lockdep_assert_held(&inode->i_rwsem);
	}

	while (iov_iter_count(iter)) {
		ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
				iter, dax_iomap_actor);
		if (ret <= 0)
			break;
		pos += ret;
		done += ret;
	}

	iocb->ki_pos += done;
	return done ? done : ret;
}
EXPORT_SYMBOL_GPL(dax_iomap_rw);

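/*
 * Illustrative usage (assumed, based on typical callers rather than taken
 * from this file): a filesystem's read/write iterators call dax_iomap_rw()
 * under the inode lock with their own iomap_ops, roughly
 *
 *	inode_lock_shared(inode);
 *	ret = dax_iomap_rw(iocb, to, &ext4_iomap_ops);
 *	inode_unlock_shared(inode);
 *
 * where ext4_iomap_ops stands in for the filesystem's own iomap_ops.
 */
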
static int dax_fault_return(int error)
{
	if (error == 0)
		return VM_FAULT_NOPAGE;
	if (error == -ENOMEM)
		return VM_FAULT_OOM;
	return VM_FAULT_SIGBUS;
}

/*
 * MAP_SYNC on a dax mapping guarantees dirty metadata is
 * flushed on write-faults (non-cow), but not read-faults.
 */
static bool dax_fault_is_synchronous(unsigned long flags,
		struct vm_area_struct *vma, struct iomap *iomap)
{
	return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC)
		&& (iomap->flags & IOMAP_F_DIRTY);
}

static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
			       int *iomap_errp, const struct iomap_ops *ops)
{
	struct vm_area_struct *vma = vmf->vma;
	struct address_space *mapping = vma->vm_file->f_mapping;
	struct inode *inode = mapping->host;
	unsigned long vaddr = vmf->address;
	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
	struct iomap iomap = { 0 };
	unsigned flags = IOMAP_FAULT;
	int error, major = 0;
	bool write = vmf->flags & FAULT_FLAG_WRITE;
	bool sync;
	int vmf_ret = 0;
	void *entry;
	pfn_t pfn;

	trace_dax_pte_fault(inode, vmf, vmf_ret);
	/*
	 * Check whether offset isn't beyond end of file now. Caller is supposed
	 * to hold locks serializing us with truncate / punch hole so this is
	 * a reliable test.
	 */
	if (pos >= i_size_read(inode)) {
		vmf_ret = VM_FAULT_SIGBUS;
		goto out;
	}

	if (write && !vmf->cow_page)
		flags |= IOMAP_WRITE;

	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
	if (IS_ERR(entry)) {
		vmf_ret = dax_fault_return(PTR_ERR(entry));
		goto out;
	}

	/*
	 * It is possible, particularly with mixed reads & writes to private
	 * mappings, that we have raced with a PMD fault that overlaps with
	 * the PTE we need to set up.  If so just return and the fault will be
	 * retried.
	 */
	if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) {
		vmf_ret = VM_FAULT_NOPAGE;
		goto unlock_entry;
	}

	/*
	 * Note that we don't bother to use iomap_apply here: DAX requires
	 * the file system block size to be equal to the page size, which means
	 * that we never have to deal with more than a single extent here.
	 */
	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
	if (iomap_errp)
		*iomap_errp = error;
	if (error) {
		vmf_ret = dax_fault_return(error);
		goto unlock_entry;
	}
	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
		error = -EIO;	/* fs corruption? */
		goto error_finish_iomap;
	}

	if (vmf->cow_page) {
		sector_t sector = dax_iomap_sector(&iomap, pos);

		switch (iomap.type) {
		case IOMAP_HOLE:
		case IOMAP_UNWRITTEN:
			clear_user_highpage(vmf->cow_page, vaddr);
			break;
		case IOMAP_MAPPED:
			error = copy_user_dax(iomap.bdev, iomap.dax_dev,
					sector, PAGE_SIZE, vmf->cow_page, vaddr);
			break;
		default:
			WARN_ON_ONCE(1);
			error = -EIO;
			break;
		}

		if (error)
			goto error_finish_iomap;

		__SetPageUptodate(vmf->cow_page);
		vmf_ret = finish_fault(vmf);
		if (!vmf_ret)
			vmf_ret = VM_FAULT_DONE_COW;
		goto finish_iomap;
	}

	sync = dax_fault_is_synchronous(flags, vma, &iomap);

	switch (iomap.type) {
	case IOMAP_MAPPED:
		if (iomap.flags & IOMAP_F_NEW) {
			count_vm_event(PGMAJFAULT);
			count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
			major = VM_FAULT_MAJOR;
		}
		error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn);
		if (error < 0)
			goto error_finish_iomap;

		entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
						 0, write && !sync);
		if (IS_ERR(entry)) {
			error = PTR_ERR(entry);
			goto error_finish_iomap;
		}

		/*
		 * If we are doing synchronous page fault and inode needs fsync,
		 * we can insert PTE into page tables only after that happens.
		 * Skip insertion for now and return the pfn so that caller can
		 * insert it after fsync is done.
		 */
		if (sync) {
			if (WARN_ON_ONCE(!pfnp)) {
				error = -EIO;
				goto error_finish_iomap;
			}
			*pfnp = pfn;
			vmf_ret = VM_FAULT_NEEDDSYNC | major;
			goto finish_iomap;
		}
		trace_dax_insert_mapping(inode, vmf, entry);
		if (write)
			error = vm_insert_mixed_mkwrite(vma, vaddr, pfn);
		else
			error = vm_insert_mixed(vma, vaddr, pfn);

		/* -EBUSY is fine, somebody else faulted on the same PTE */
		if (error == -EBUSY)
			error = 0;
		break;
	case IOMAP_UNWRITTEN:
	case IOMAP_HOLE:
		if (!write) {
			vmf_ret = dax_load_hole(mapping, entry, vmf);
			goto finish_iomap;
		}
		/*FALLTHRU*/
	default:
		WARN_ON_ONCE(1);
		error = -EIO;
		break;
	}

 error_finish_iomap:
	vmf_ret = dax_fault_return(error) | major;
 finish_iomap:
	if (ops->iomap_end) {
		int copied = PAGE_SIZE;

		if (vmf_ret & VM_FAULT_ERROR)
			copied = 0;
		/*
		 * The fault is done by now and there's no way back (other
		 * thread may be already happily using PTE we have installed).
		 * Just ignore error from ->iomap_end since we cannot do much
		 * with it.
		 */
		ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
	}
 unlock_entry:
	put_locked_mapping_entry(mapping, vmf->pgoff);
 out:
	trace_dax_pte_fault_done(inode, vmf, vmf_ret);
	return vmf_ret;
}

#ifdef CONFIG_FS_DAX_PMD
static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
		void *entry)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	unsigned long pmd_addr = vmf->address & PMD_MASK;
	struct inode *inode = mapping->host;
	struct page *zero_page;
	void *ret = NULL;
	spinlock_t *ptl;
	pmd_t pmd_entry;
	pfn_t pfn;

	zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);

	if (unlikely(!zero_page))
		goto fallback;

	pfn = page_to_pfn_t(zero_page);
	ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
			RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
	if (IS_ERR(ret))
		goto fallback;

	ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
	if (!pmd_none(*(vmf->pmd))) {
		spin_unlock(ptl);
		goto fallback;
	}

	pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
	pmd_entry = pmd_mkhuge(pmd_entry);
	set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
	spin_unlock(ptl);
	trace_dax_pmd_load_hole(inode, vmf, zero_page, ret);
	return VM_FAULT_NOPAGE;

fallback:
	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret);
	return VM_FAULT_FALLBACK;
}

static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
			       const struct iomap_ops *ops)
{
	struct vm_area_struct *vma = vmf->vma;
	struct address_space *mapping = vma->vm_file->f_mapping;
	unsigned long pmd_addr = vmf->address & PMD_MASK;
	bool write = vmf->flags & FAULT_FLAG_WRITE;
	bool sync;
	unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
	struct inode *inode = mapping->host;
	int result = VM_FAULT_FALLBACK;
	struct iomap iomap = { 0 };
	pgoff_t max_pgoff, pgoff;
	void *entry;
	loff_t pos;
	int error;
	pfn_t pfn;

	/*
	 * Check whether offset isn't beyond end of file now. Caller is
	 * supposed to hold locks serializing us with truncate / punch hole so
	 * this is a reliable test.
	 */
	pgoff = linear_page_index(vma, pmd_addr);
	max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);

	trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);

	/*
	 * Make sure that the faulting address's PMD offset (color) matches
	 * the PMD offset from the start of the file.  This is necessary so
	 * that a PMD range in the page table overlaps exactly with a PMD
	 * range in the radix tree.
	 */
	if ((vmf->pgoff & PG_PMD_COLOUR) !=
	    ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
		goto fallback;

	/* Fall back to PTEs if we're going to COW */
	if (write && !(vma->vm_flags & VM_SHARED))
		goto fallback;

	/* If the PMD would extend outside the VMA */
	if (pmd_addr < vma->vm_start)
		goto fallback;
	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
		goto fallback;

	if (pgoff >= max_pgoff) {
		result = VM_FAULT_SIGBUS;
		goto out;
	}

	/* If the PMD would extend beyond the file size */
	if ((pgoff | PG_PMD_COLOUR) >= max_pgoff)
		goto fallback;

	/*
	 * grab_mapping_entry() will make sure we get a 2MiB empty entry, a
	 * 2MiB zero page entry or a DAX PMD.  If it can't (because a 4k page
	 * is already in the tree, for instance), it will return -EEXIST and
	 * we just fall back to 4k entries.
	 */
	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
	if (IS_ERR(entry))
		goto fallback;

	/*
	 * It is possible, particularly with mixed reads & writes to private
	 * mappings, that we have raced with a PTE fault that overlaps with
	 * the PMD we need to set up.  If so just return and the fault will be
	 * retried.
	 */
	if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) &&
			!pmd_devmap(*vmf->pmd)) {
		result = 0;
		goto unlock_entry;
	}

	/*
	 * Note that we don't use iomap_apply here.  We aren't doing I/O, only
	 * setting up a mapping, so really we're using iomap_begin() as a way
	 * to look up our filesystem block.
	 */
	pos = (loff_t)pgoff << PAGE_SHIFT;
	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
	if (error)
		goto unlock_entry;

	if (iomap.offset + iomap.length < pos + PMD_SIZE)
		goto finish_iomap;

	sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap);

	switch (iomap.type) {
	case IOMAP_MAPPED:
		error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn);
		if (error < 0)
			goto finish_iomap;

		entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
						RADIX_DAX_PMD, write && !sync);
		if (IS_ERR(entry))
			goto finish_iomap;

		/*
		 * If we are doing synchronous page fault and inode needs fsync,
		 * we can insert PMD into page tables only after that happens.
		 * Skip insertion for now and return the pfn so that caller can
		 * insert it after fsync is done.
		 */
		if (sync) {
			if (WARN_ON_ONCE(!pfnp))
				goto finish_iomap;
			*pfnp = pfn;
			result = VM_FAULT_NEEDDSYNC;
			goto finish_iomap;
		}

		trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
		result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
					    write);
		break;
	case IOMAP_UNWRITTEN:
	case IOMAP_HOLE:
		if (WARN_ON_ONCE(write))
			break;
		result = dax_pmd_load_hole(vmf, &iomap, entry);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}

 finish_iomap:
	if (ops->iomap_end) {
		int copied = PMD_SIZE;

		if (result == VM_FAULT_FALLBACK)
			copied = 0;
		/*
		 * The fault is done by now and there's no way back (other
		 * thread may be already happily using PMD we have installed).
		 * Just ignore error from ->iomap_end since we cannot do much
		 * with it.
		 */
		ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
				&iomap);
	}
 unlock_entry:
	put_locked_mapping_entry(mapping, pgoff);
 fallback:
	if (result == VM_FAULT_FALLBACK) {
		split_huge_pmd(vma, vmf->pmd, vmf->address);
		count_vm_event(THP_FAULT_FALLBACK);
	}
out:
	trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result);
	return result;
}
#else
static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
			       const struct iomap_ops *ops)
{
	return VM_FAULT_FALLBACK;
}
#endif /* CONFIG_FS_DAX_PMD */

/**
 * dax_iomap_fault - handle a page fault on a DAX file
 * @vmf: The description of the fault
 * @pe_size: Size of the page to fault in
 * @pfnp: PFN to insert for synchronous faults if fsync is required
 * @iomap_errp: Storage for detailed error code in case of error
 * @ops: Iomap ops passed from the file system
 *
 * When a page fault occurs, filesystems may call this helper in
 * their fault handler for DAX files. dax_iomap_fault() assumes the caller
 * has done all the necessary locking for the page fault to proceed
 * successfully.
 */
int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
		    pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops)
{
	switch (pe_size) {
	case PE_SIZE_PTE:
		return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops);
	case PE_SIZE_PMD:
		return dax_iomap_pmd_fault(vmf, pfnp, ops);
	default:
		return VM_FAULT_FALLBACK;
	}
}
EXPORT_SYMBOL_GPL(dax_iomap_fault);

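/*
 * Illustrative usage (assumed, based on typical callers rather than taken
 * from this file): a filesystem's ->fault and ->huge_fault handlers call
 * dax_iomap_fault() with the fault size they were given, e.g.
 *
 *	ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &xfs_iomap_ops);
 *
 * where xfs_iomap_ops stands in for the filesystem's own iomap_ops, and the
 * returned pfn is later handed to dax_finish_sync_fault() below when
 * VM_FAULT_NEEDDSYNC is set.
 */
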
/**
 * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
 * @vmf: The description of the fault
 * @pe_size: Size of entry to be inserted
 * @pfn: PFN to insert
 *
 * This function inserts a writeable PTE or PMD entry into the page tables
 * for an mmaped DAX file.  It also takes care of marking the corresponding
 * radix tree entry as dirty.
 */
static int dax_insert_pfn_mkwrite(struct vm_fault *vmf,
				  enum page_entry_size pe_size,
				  pfn_t pfn)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	void *entry, **slot;
	pgoff_t index = vmf->pgoff;
	int vmf_ret, error;

	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, &slot);
	/* Did we race with someone splitting entry or so? */
	if (!entry ||
	    (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) ||
	    (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) {
		put_unlocked_mapping_entry(mapping, index, entry);
		spin_unlock_irq(&mapping->tree_lock);
		trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
						      VM_FAULT_NOPAGE);
		return VM_FAULT_NOPAGE;
	}
	radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
	entry = lock_slot(mapping, slot);
	spin_unlock_irq(&mapping->tree_lock);
	switch (pe_size) {
	case PE_SIZE_PTE:
		error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
		vmf_ret = dax_fault_return(error);
		break;
#ifdef CONFIG_FS_DAX_PMD
	case PE_SIZE_PMD:
		vmf_ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
			pfn, true);
		break;
#endif
	default:
		vmf_ret = VM_FAULT_FALLBACK;
	}
	put_locked_mapping_entry(mapping, index);
	trace_dax_insert_pfn_mkwrite(mapping->host, vmf, vmf_ret);
	return vmf_ret;
}

/**
 * dax_finish_sync_fault - finish synchronous page fault
 * @vmf: The description of the fault
 * @pe_size: Size of entry to be inserted
 * @pfn: PFN to insert
 *
 * This function ensures that the file range touched by the page fault is
 * stored persistently on the media and handles inserting of appropriate page
 * table entry.
 */
int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
			  pfn_t pfn)
{
	int err;
	loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
	size_t len = 0;

	if (pe_size == PE_SIZE_PTE)
		len = PAGE_SIZE;
	else if (pe_size == PE_SIZE_PMD)
		len = PMD_SIZE;
	else
		WARN_ON_ONCE(1);
	err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
	if (err)
		return VM_FAULT_SIGBUS;
	return dax_insert_pfn_mkwrite(vmf, pe_size, pfn);
}
EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
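
/*
 * To summarize the synchronous (MAP_SYNC) fault flow implemented above:
 * when dax_iomap_fault() returns VM_FAULT_NEEDDSYNC it has left the pfn in
 * *pfnp without touching the page tables; the filesystem then calls
 * dax_finish_sync_fault(vmf, pe_size, pfn), which fsyncs the faulted range
 * via vfs_fsync_range() and only then installs the writeable PTE or PMD
 * through dax_insert_pfn_mkwrite().
 */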