diff --git a/mm/filemap.c b/mm/filemap.c
index 52517f28e6f4a69020cfc60867f5a57461c89fb8..218d0b2ec82d1534dcb66b4744f886d7d0262d55 100644
@@ -36,6 +36,8 @@
 #include <linux/cleancache.h>
 #include <linux/shmem_fs.h>
 #include <linux/rmap.h>
+#include <linux/delayacct.h>
+#include <linux/psi.h>
 #include "internal.h"
 
 #define CREATE_TRACE_POINTS
  *   ->tasklist_lock            (memory_failure, collect_procs_ao)
  */
 
-static int page_cache_tree_insert(struct address_space *mapping,
-                                 struct page *page, void **shadowp)
-{
-       struct radix_tree_node *node;
-       void **slot;
-       int error;
-
-       error = __radix_tree_create(&mapping->i_pages, page->index, 0,
-                                   &node, &slot);
-       if (error)
-               return error;
-       if (*slot) {
-               void *p;
-
-               p = radix_tree_deref_slot_protected(slot,
-                                                   &mapping->i_pages.xa_lock);
-               if (!radix_tree_exceptional_entry(p))
-                       return -EEXIST;
-
-               mapping->nrexceptional--;
-               if (shadowp)
-                       *shadowp = p;
-       }
-       __radix_tree_replace(&mapping->i_pages, node, slot, page,
-                            workingset_lookup_update(mapping));
-       mapping->nrpages++;
-       return 0;
-}
-
-static void page_cache_tree_delete(struct address_space *mapping,
+static void page_cache_delete(struct address_space *mapping,
                                   struct page *page, void *shadow)
 {
-       int i, nr;
+       XA_STATE(xas, &mapping->i_pages, page->index);
+       unsigned int nr = 1;
 
-       /* hugetlb pages are represented by one entry in the radix tree */
-       nr = PageHuge(page) ? 1 : hpage_nr_pages(page);
+       mapping_set_update(&xas, mapping);
+
+       /* hugetlb pages are represented by a single entry in the xarray */
+       if (!PageHuge(page)) {
+               xas_set_order(&xas, page->index, compound_order(page));
+               nr = 1U << compound_order(page);
+       }
 
        VM_BUG_ON_PAGE(!PageLocked(page), page);
        VM_BUG_ON_PAGE(PageTail(page), page);
        VM_BUG_ON_PAGE(nr != 1 && shadow, page);
 
-       for (i = 0; i < nr; i++) {
-               struct radix_tree_node *node;
-               void **slot;
-
-               __radix_tree_lookup(&mapping->i_pages, page->index + i,
-                                   &node, &slot);
-
-               VM_BUG_ON_PAGE(!node && nr != 1, page);
-
-               radix_tree_clear_tags(&mapping->i_pages, node, slot);
-               __radix_tree_replace(&mapping->i_pages, node, slot, shadow,
-                               workingset_lookup_update(mapping));
-       }
+       xas_store(&xas, shadow);
+       xas_init_marks(&xas);
 
        page->mapping = NULL;
        /* Leave page->index set: truncation lookup relies upon it */
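
Not part of the patch: a minimal sketch of the XArray idiom the hunk above relies on.  With xas_set_order() the xa_state addresses all 2^order slots of a compound page, so a single xas_store() replaces the old per-slot loop, and xas_init_marks() drops any marks (the old radix-tree tags).  The helper name is hypothetical; it assumes the caller holds the xa_lock and that an entry of that order already exists, so the store cannot need to allocate.

static void store_shadow_over_range(struct xarray *xa, unsigned long index,
                                    unsigned int order, void *shadow)
{
        XA_STATE(xas, xa, index);

        /* Caller holds the xa_lock; an entry of this order already exists. */
        xas_set_order(&xas, index, order);      /* cover 2^order slots at once */
        xas_store(&xas, shadow);                /* one store for the whole range */
        xas_init_marks(&xas);                   /* clear any marks on the entry */
}
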
@@ -263,7 +231,7 @@ void __delete_from_page_cache(struct page *page, void *shadow)
        trace_mm_filemap_delete_from_page_cache(page);
 
        unaccount_page_cache_page(mapping, page);
-       page_cache_tree_delete(mapping, page, shadow);
+       page_cache_delete(mapping, page, shadow);
 }
 
 static void page_cache_free_page(struct address_space *mapping,
@@ -306,7 +274,7 @@ void delete_from_page_cache(struct page *page)
 EXPORT_SYMBOL(delete_from_page_cache);
 
 /*
- * page_cache_tree_delete_batch - delete several pages from page cache
+ * page_cache_delete_batch - delete several pages from page cache
  * @mapping: the mapping to which pages belong
  * @pvec: pagevec with pages to delete
  *
@@ -319,24 +287,19 @@ EXPORT_SYMBOL(delete_from_page_cache);
  *
  * The function expects the i_pages lock to be held.
  */
-static void
-page_cache_tree_delete_batch(struct address_space *mapping,
+static void page_cache_delete_batch(struct address_space *mapping,
                             struct pagevec *pvec)
 {
-       struct radix_tree_iter iter;
-       void **slot;
+       XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index);
        int total_pages = 0;
        int i = 0, tail_pages = 0;
        struct page *page;
-       pgoff_t start;
 
-       start = pvec->pages[0]->index;
-       radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
+       mapping_set_update(&xas, mapping);
+       xas_for_each(&xas, page, ULONG_MAX) {
                if (i >= pagevec_count(pvec) && !tail_pages)
                        break;
-               page = radix_tree_deref_slot_protected(slot,
-                                                      &mapping->i_pages.xa_lock);
-               if (radix_tree_exceptional_entry(page))
+               if (xa_is_value(page))
                        continue;
                if (!tail_pages) {
                        /*
@@ -344,8 +307,11 @@ page_cache_tree_delete_batch(struct address_space *mapping,
                         * have our pages locked so they are protected from
                         * being removed.
                         */
-                       if (page != pvec->pages[i])
+                       if (page != pvec->pages[i]) {
+                               VM_BUG_ON_PAGE(page->index >
+                                               pvec->pages[i]->index, page);
                                continue;
+                       }
                        WARN_ON_ONCE(!PageLocked(page));
                        if (PageTransHuge(page) && !PageHuge(page))
                                tail_pages = HPAGE_PMD_NR - 1;
@@ -356,11 +322,11 @@ page_cache_tree_delete_batch(struct address_space *mapping,
                         */
                        i++;
                } else {
+                       VM_BUG_ON_PAGE(page->index + HPAGE_PMD_NR - tail_pages
+                                       != pvec->pages[i]->index, page);
                        tail_pages--;
                }
-               radix_tree_clear_tags(&mapping->i_pages, iter.node, slot);
-               __radix_tree_replace(&mapping->i_pages, iter.node, slot, NULL,
-                               workingset_lookup_update(mapping));
+               xas_store(&xas, NULL);
                total_pages++;
        }
        mapping->nrpages -= total_pages;
@@ -381,7 +347,7 @@ void delete_from_page_cache_batch(struct address_space *mapping,
 
                unaccount_page_cache_page(mapping, pvec->pages[i]);
        }
-       page_cache_tree_delete_batch(mapping, pvec);
+       page_cache_delete_batch(mapping, pvec);
        xa_unlock_irqrestore(&mapping->i_pages, flags);
 
        for (i = 0; i < pagevec_count(pvec); i++)
@@ -491,20 +457,31 @@ EXPORT_SYMBOL(filemap_flush);
 bool filemap_range_has_page(struct address_space *mapping,
                           loff_t start_byte, loff_t end_byte)
 {
-       pgoff_t index = start_byte >> PAGE_SHIFT;
-       pgoff_t end = end_byte >> PAGE_SHIFT;
        struct page *page;
+       XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
+       pgoff_t max = end_byte >> PAGE_SHIFT;
 
        if (end_byte < start_byte)
                return false;
 
-       if (mapping->nrpages == 0)
-               return false;
+       rcu_read_lock();
+       for (;;) {
+               page = xas_find(&xas, max);
+               if (xas_retry(&xas, page))
+                       continue;
+               /* Shadow entries don't count */
+               if (xa_is_value(page))
+                       continue;
+               /*
+                * We don't need to try to pin this page; we're about to
+                * release the RCU lock anyway.  It is enough to know that
+                * there was a page here recently.
+                */
+               break;
+       }
+       rcu_read_unlock();
 
-       if (!find_get_pages_range(mapping, &index, end, 1, &page))
-               return false;
-       put_page(page);
-       return true;
+       return page != NULL;
 }
 EXPORT_SYMBOL(filemap_range_has_page);
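
Not part of the patch: the loop above is the generic "is anything really present in this range" idiom for the XArray, shown stand-alone below with a hypothetical helper name.  xas_find() walks forward to @max under rcu_read_lock(), xas_retry() restarts the walk if it raced with a node being freed, and value (shadow/swap) entries are skipped, exactly as filemap_range_has_page() now does.

static bool range_has_present_entry(struct xarray *xa, unsigned long first,
                                    unsigned long max)
{
        XA_STATE(xas, xa, first);
        void *entry;

        rcu_read_lock();
        for (;;) {
                entry = xas_find(&xas, max);
                if (xas_retry(&xas, entry))
                        continue;       /* raced with a node being freed */
                if (xa_is_value(entry))
                        continue;       /* shadow/swap entry: keep looking */
                break;                  /* NULL (nothing found) or a real page */
        }
        rcu_read_unlock();

        return entry != NULL;
}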
 
@@ -775,51 +752,44 @@ EXPORT_SYMBOL(file_write_and_wait_range);
  * locked.  This function does not add the new page to the LRU, the
  * caller must do that.
  *
- * The remove + add is atomic.  The only way this function can fail is
- * memory allocation failure.
+ * The remove + add is atomic.  This function cannot fail.
  */
 int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
 {
-       int error;
+       struct address_space *mapping = old->mapping;
+       void (*freepage)(struct page *) = mapping->a_ops->freepage;
+       pgoff_t offset = old->index;
+       XA_STATE(xas, &mapping->i_pages, offset);
+       unsigned long flags;
 
        VM_BUG_ON_PAGE(!PageLocked(old), old);
        VM_BUG_ON_PAGE(!PageLocked(new), new);
        VM_BUG_ON_PAGE(new->mapping, new);
 
-       error = radix_tree_preload(gfp_mask & GFP_RECLAIM_MASK);
-       if (!error) {
-               struct address_space *mapping = old->mapping;
-               void (*freepage)(struct page *);
-               unsigned long flags;
-
-               pgoff_t offset = old->index;
-               freepage = mapping->a_ops->freepage;
+       get_page(new);
+       new->mapping = mapping;
+       new->index = offset;
 
-               get_page(new);
-               new->mapping = mapping;
-               new->index = offset;
+       xas_lock_irqsave(&xas, flags);
+       xas_store(&xas, new);
 
-               xa_lock_irqsave(&mapping->i_pages, flags);
-               __delete_from_page_cache(old, NULL);
-               error = page_cache_tree_insert(mapping, new, NULL);
-               BUG_ON(error);
-
-               /*
-                * hugetlb pages do not participate in page cache accounting.
-                */
-               if (!PageHuge(new))
-                       __inc_node_page_state(new, NR_FILE_PAGES);
-               if (PageSwapBacked(new))
-                       __inc_node_page_state(new, NR_SHMEM);
-               xa_unlock_irqrestore(&mapping->i_pages, flags);
-               mem_cgroup_migrate(old, new);
-               radix_tree_preload_end();
-               if (freepage)
-                       freepage(old);
-               put_page(old);
-       }
+       old->mapping = NULL;
+       /* hugetlb pages do not participate in page cache accounting. */
+       if (!PageHuge(old))
+               __dec_node_page_state(new, NR_FILE_PAGES);
+       if (!PageHuge(new))
+               __inc_node_page_state(new, NR_FILE_PAGES);
+       if (PageSwapBacked(old))
+               __dec_node_page_state(new, NR_SHMEM);
+       if (PageSwapBacked(new))
+               __inc_node_page_state(new, NR_SHMEM);
+       xas_unlock_irqrestore(&xas, flags);
+       mem_cgroup_migrate(old, new);
+       if (freepage)
+               freepage(old);
+       put_page(old);
 
-       return error;
+       return 0;
 }
 EXPORT_SYMBOL_GPL(replace_page_cache_page);
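
Not part of the patch: the reason the preload and the error path could be deleted is that storing over a slot which already holds an entry never allocates, so the whole swap fits under the array lock.  Reduced to a hedged sketch with a hypothetical helper name:

static void *replace_entry_locked(struct xarray *xa, unsigned long index,
                                  void *new)
{
        XA_STATE(xas, xa, index);
        unsigned long flags;
        void *old;

        xas_lock_irqsave(&xas, flags);
        old = xas_load(&xas);           /* the entry being replaced */
        xas_store(&xas, new);           /* in place, no allocation required */
        xas_unlock_irqrestore(&xas, flags);

        return old;
}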
 
@@ -828,12 +798,15 @@ static int __add_to_page_cache_locked(struct page *page,
                                      pgoff_t offset, gfp_t gfp_mask,
                                      void **shadowp)
 {
+       XA_STATE(xas, &mapping->i_pages, offset);
        int huge = PageHuge(page);
        struct mem_cgroup *memcg;
        int error;
+       void *old;
 
        VM_BUG_ON_PAGE(!PageLocked(page), page);
        VM_BUG_ON_PAGE(PageSwapBacked(page), page);
+       mapping_set_update(&xas, mapping);
 
        if (!huge) {
                error = mem_cgroup_try_charge(page, current->mm,
@@ -842,39 +815,47 @@ static int __add_to_page_cache_locked(struct page *page,
                        return error;
        }
 
-       error = radix_tree_maybe_preload(gfp_mask & GFP_RECLAIM_MASK);
-       if (error) {
-               if (!huge)
-                       mem_cgroup_cancel_charge(page, memcg, false);
-               return error;
-       }
-
        get_page(page);
        page->mapping = mapping;
        page->index = offset;
 
-       xa_lock_irq(&mapping->i_pages);
-       error = page_cache_tree_insert(mapping, page, shadowp);
-       radix_tree_preload_end();
-       if (unlikely(error))
-               goto err_insert;
+       do {
+               xas_lock_irq(&xas);
+               old = xas_load(&xas);
+               if (old && !xa_is_value(old))
+                       xas_set_err(&xas, -EEXIST);
+               xas_store(&xas, page);
+               if (xas_error(&xas))
+                       goto unlock;
+
+               if (xa_is_value(old)) {
+                       mapping->nrexceptional--;
+                       if (shadowp)
+                               *shadowp = old;
+               }
+               mapping->nrpages++;
+
+               /* hugetlb pages do not participate in page cache accounting */
+               if (!huge)
+                       __inc_node_page_state(page, NR_FILE_PAGES);
+unlock:
+               xas_unlock_irq(&xas);
+       } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK));
+
+       if (xas_error(&xas))
+               goto error;
 
-       /* hugetlb pages do not participate in page cache accounting. */
-       if (!huge)
-               __inc_node_page_state(page, NR_FILE_PAGES);
-       xa_unlock_irq(&mapping->i_pages);
        if (!huge)
                mem_cgroup_commit_charge(page, memcg, false, false);
        trace_mm_filemap_add_to_page_cache(page);
        return 0;
-err_insert:
+error:
        page->mapping = NULL;
        /* Leave page->index set: truncation relies upon it */
-       xa_unlock_irq(&mapping->i_pages);
        if (!huge)
                mem_cgroup_cancel_charge(page, memcg, false);
        put_page(page);
-       return error;
+       return xas_error(&xas);
 }
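
Not part of the patch: the do { } while (xas_nomem()) loop above is the XArray replacement for radix_tree_maybe_preload().  If xas_store() cannot get a node it records -ENOMEM in the xa_state, xas_nomem() allocates one with @gfp outside the lock, and the locked section runs again.  A self-contained sketch of the same shape, using a hypothetical helper that only inserts into an empty slot:

static int insert_if_empty(struct xarray *xa, unsigned long index,
                           void *item, gfp_t gfp)
{
        XA_STATE(xas, xa, index);

        do {
                xas_lock_irq(&xas);
                if (xas_load(&xas))
                        xas_set_err(&xas, -EEXIST);     /* slot already in use */
                else
                        xas_store(&xas, item);          /* may fail with -ENOMEM */
                xas_unlock_irq(&xas);
        } while (xas_nomem(&xas, gfp));                 /* allocate and retry on -ENOMEM */

        return xas_error(&xas);
}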
 
 /**
@@ -915,12 +896,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
                 * data from the working set, only to cache data that will
                 * get overwritten with something else, is a waste of memory.
                 */
-               if (!(gfp_mask & __GFP_WRITE) &&
-                   shadow && workingset_refault(shadow)) {
-                       SetPageActive(page);
-                       workingset_activation(page);
-               } else
-                       ClearPageActive(page);
+               WARN_ON_ONCE(PageActive(page));
+               if (!(gfp_mask & __GFP_WRITE) && shadow)
+                       workingset_refault(page, shadow);
                lru_cache_add(page);
        }
        return ret;
@@ -1076,8 +1054,18 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
 {
        struct wait_page_queue wait_page;
        wait_queue_entry_t *wait = &wait_page.wait;
+       bool thrashing = false;
+       unsigned long pflags;
        int ret = 0;
 
+       if (bit_nr == PG_locked &&
+           !PageUptodate(page) && PageWorkingset(page)) {
+               if (!PageSwapBacked(page))
+                       delayacct_thrashing_start();
+               psi_memstall_enter(&pflags);
+               thrashing = true;
+       }
+
        init_wait(wait);
        wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0;
        wait->func = wake_page_function;
@@ -1116,6 +1104,12 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
 
        finish_wait(q, wait);
 
+       if (thrashing) {
+               if (!PageSwapBacked(page))
+                       delayacct_thrashing_end();
+               psi_memstall_leave(&pflags);
+       }
+
        /*
         * A signal could leave PageWaiters set. Clearing it here if
         * !waitqueue_active would be possible (by open-coding finish_wait),
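
Not part of the patch: the thrashing accounting added in the two hunks above simply brackets the sleep.  Pulled out on its own (hypothetical helper; the delayacct/psi calls are the ones used above and need <linux/delayacct.h> and <linux/psi.h>):

static void account_thrashing_wait(bool swap_backed)
{
        unsigned long pflags;

        if (!swap_backed)
                delayacct_thrashing_start();    /* per-task delay accounting */
        psi_memstall_enter(&pflags);            /* task now counts toward memory pressure */

        /* ... the actual blocking wait on the page bit happens here ... */

        if (!swap_backed)
                delayacct_thrashing_end();
        psi_memstall_leave(&pflags);            /* close the stall window */
}
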
@@ -1326,86 +1320,76 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
 }
 
 /**
- * page_cache_next_hole - find the next hole (not-present entry)
- * @mapping: mapping
- * @index: index
- * @max_scan: maximum range to search
- *
- * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the
- * lowest indexed hole.
- *
- * Returns: the index of the hole if found, otherwise returns an index
- * outside of the set specified (in which case 'return - index >=
- * max_scan' will be true). In rare cases of index wrap-around, 0 will
- * be returned.
- *
- * page_cache_next_hole may be called under rcu_read_lock. However,
- * like radix_tree_gang_lookup, this will not atomically search a
- * snapshot of the tree at a single point in time. For example, if a
- * hole is created at index 5, then subsequently a hole is created at
- * index 10, page_cache_next_hole covering both indexes may return 10
- * if called under rcu_read_lock.
+ * page_cache_next_miss() - Find the next gap in the page cache.
+ * @mapping: Mapping.
+ * @index: Index.
+ * @max_scan: Maximum range to search.
+ *
+ * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the
+ * gap with the lowest index.
+ *
+ * This function may be called under the rcu_read_lock.  However, this will
+ * not atomically search a snapshot of the cache at a single point in time.
+ * For example, if a gap is created at index 5, then subsequently a gap is
+ * created at index 10, page_cache_next_miss covering both indices may
+ * return 10 if called under the rcu_read_lock.
+ *
+ * Return: The index of the gap if found, otherwise an index outside the
+ * range specified (in which case 'return - index >= max_scan' will be true).
+ * In the rare case of index wrap-around, 0 will be returned.
  */
-pgoff_t page_cache_next_hole(struct address_space *mapping,
+pgoff_t page_cache_next_miss(struct address_space *mapping,
                             pgoff_t index, unsigned long max_scan)
 {
-       unsigned long i;
-
-       for (i = 0; i < max_scan; i++) {
-               struct page *page;
+       XA_STATE(xas, &mapping->i_pages, index);
 
-               page = radix_tree_lookup(&mapping->i_pages, index);
-               if (!page || radix_tree_exceptional_entry(page))
+       while (max_scan--) {
+               void *entry = xas_next(&xas);
+               if (!entry || xa_is_value(entry))
                        break;
-               index++;
-               if (index == 0)
+               if (xas.xa_index == 0)
                        break;
        }
 
-       return index;
+       return xas.xa_index;
 }
-EXPORT_SYMBOL(page_cache_next_hole);
+EXPORT_SYMBOL(page_cache_next_miss);
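
Not part of the patch: the return convention documented above ("return - index >= max_scan" means no gap was found; page_cache_prev_miss() below mirrors it) is easiest to read from a caller.  A hypothetical check, ignoring the rare wrap-around case:

static bool next_pages_all_present(struct address_space *mapping,
                                   pgoff_t index, unsigned long count)
{
        pgoff_t gap = page_cache_next_miss(mapping, index, count);

        /* A gap inside [index, index + count - 1] would return its index. */
        return gap - index >= count;
}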
 
 /**
- * page_cache_prev_hole - find the prev hole (not-present entry)
- * @mapping: mapping
- * @index: index
- * @max_scan: maximum range to search
- *
- * Search backwards in the range [max(index-max_scan+1, 0), index] for
- * the first hole.
- *
- * Returns: the index of the hole if found, otherwise returns an index
- * outside of the set specified (in which case 'index - return >=
- * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX
- * will be returned.
- *
- * page_cache_prev_hole may be called under rcu_read_lock. However,
- * like radix_tree_gang_lookup, this will not atomically search a
- * snapshot of the tree at a single point in time. For example, if a
- * hole is created at index 10, then subsequently a hole is created at
- * index 5, page_cache_prev_hole covering both indexes may return 5 if
- * called under rcu_read_lock.
+ * page_cache_prev_miss() - Find the previous gap in the page cache.
+ * @mapping: Mapping.
+ * @index: Index.
+ * @max_scan: Maximum range to search.
+ *
+ * Search the range [max(index - max_scan + 1, 0), index] for the
+ * gap with the highest index.
+ *
+ * This function may be called under the rcu_read_lock.  However, this will
+ * not atomically search a snapshot of the cache at a single point in time.
+ * For example, if a gap is created at index 10, then subsequently a gap is
+ * created at index 5, page_cache_prev_miss() covering both indices may
+ * return 5 if called under the rcu_read_lock.
+ *
+ * Return: The index of the gap if found, otherwise an index outside the
+ * range specified (in which case 'index - return >= max_scan' will be true).
+ * In the rare case of wrap-around, ULONG_MAX will be returned.
  */
-pgoff_t page_cache_prev_hole(struct address_space *mapping,
+pgoff_t page_cache_prev_miss(struct address_space *mapping,
                             pgoff_t index, unsigned long max_scan)
 {
-       unsigned long i;
+       XA_STATE(xas, &mapping->i_pages, index);
 
-       for (i = 0; i < max_scan; i++) {
-               struct page *page;
-
-               page = radix_tree_lookup(&mapping->i_pages, index);
-               if (!page || radix_tree_exceptional_entry(page))
+       while (max_scan--) {
+               void *entry = xas_prev(&xas);
+               if (!entry || xa_is_value(entry))
                        break;
-               index--;
-               if (index == ULONG_MAX)
+               if (xas.xa_index == ULONG_MAX)
                        break;
        }
 
-       return index;
+       return xas.xa_index;
 }
-EXPORT_SYMBOL(page_cache_prev_hole);
+EXPORT_SYMBOL(page_cache_prev_miss);
 
 /**
  * find_get_entry - find and get a page cache entry
@@ -1422,47 +1406,40 @@ EXPORT_SYMBOL(page_cache_prev_hole);
  */
 struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
 {
-       void **pagep;
+       XA_STATE(xas, &mapping->i_pages, offset);
        struct page *head, *page;
 
        rcu_read_lock();
 repeat:
-       page = NULL;
-       pagep = radix_tree_lookup_slot(&mapping->i_pages, offset);
-       if (pagep) {
-               page = radix_tree_deref_slot(pagep);
-               if (unlikely(!page))
-                       goto out;
-               if (radix_tree_exception(page)) {
-                       if (radix_tree_deref_retry(page))
-                               goto repeat;
-                       /*
-                        * A shadow entry of a recently evicted page,
-                        * or a swap entry from shmem/tmpfs.  Return
-                        * it without attempting to raise page count.
-                        */
-                       goto out;
-               }
+       xas_reset(&xas);
+       page = xas_load(&xas);
+       if (xas_retry(&xas, page))
+               goto repeat;
+       /*
+        * A shadow entry of a recently evicted page, or a swap entry from
+        * shmem/tmpfs.  Return it without attempting to raise page count.
+        */
+       if (!page || xa_is_value(page))
+               goto out;
 
-               head = compound_head(page);
-               if (!page_cache_get_speculative(head))
-                       goto repeat;
+       head = compound_head(page);
+       if (!page_cache_get_speculative(head))
+               goto repeat;
 
-               /* The page was split under us? */
-               if (compound_head(page) != head) {
-                       put_page(head);
-                       goto repeat;
-               }
+       /* The page was split under us? */
+       if (compound_head(page) != head) {
+               put_page(head);
+               goto repeat;
+       }
 
-               /*
-                * Has the page moved?
-                * This is part of the lockless pagecache protocol. See
-                * include/linux/pagemap.h for details.
-                */
-               if (unlikely(page != *pagep)) {
-                       put_page(head);
-                       goto repeat;
-               }
+       /*
+        * Has the page moved?
+        * This is part of the lockless pagecache protocol. See
+        * include/linux/pagemap.h for details.
+        */
+       if (unlikely(page != xas_reload(&xas))) {
+               put_page(head);
+               goto repeat;
        }
 out:
        rcu_read_unlock();
@@ -1493,7 +1470,7 @@ struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
 
 repeat:
        page = find_get_entry(mapping, offset);
-       if (page && !radix_tree_exception(page)) {
+       if (page && !xa_is_value(page)) {
                lock_page(page);
                /* Has the page been truncated? */
                if (unlikely(page_mapping(page) != mapping)) {
@@ -1539,7 +1516,7 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
 
 repeat:
        page = find_get_entry(mapping, offset);
-       if (radix_tree_exceptional_entry(page))
+       if (xa_is_value(page))
                page = NULL;
        if (!page)
                goto no_page;
@@ -1625,53 +1602,48 @@ unsigned find_get_entries(struct address_space *mapping,
                          pgoff_t start, unsigned int nr_entries,
                          struct page **entries, pgoff_t *indices)
 {
-       void **slot;
+       XA_STATE(xas, &mapping->i_pages, start);
+       struct page *page;
        unsigned int ret = 0;
-       struct radix_tree_iter iter;
 
        if (!nr_entries)
                return 0;
 
        rcu_read_lock();
-       radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
-               struct page *head, *page;
-repeat:
-               page = radix_tree_deref_slot(slot);
-               if (unlikely(!page))
+       xas_for_each(&xas, page, ULONG_MAX) {
+               struct page *head;
+               if (xas_retry(&xas, page))
                        continue;
-               if (radix_tree_exception(page)) {
-                       if (radix_tree_deref_retry(page)) {
-                               slot = radix_tree_iter_retry(&iter);
-                               continue;
-                       }
-                       /*
-                        * A shadow entry of a recently evicted page, a swap
-                        * entry from shmem/tmpfs or a DAX entry.  Return it
-                        * without attempting to raise page count.
-                        */
+               /*
+                * A shadow entry of a recently evicted page, a swap
+                * entry from shmem/tmpfs or a DAX entry.  Return it
+                * without attempting to raise page count.
+                */
+               if (xa_is_value(page))
                        goto export;
-               }
 
                head = compound_head(page);
                if (!page_cache_get_speculative(head))
-                       goto repeat;
+                       goto retry;
 
                /* The page was split under us? */
-               if (compound_head(page) != head) {
-                       put_page(head);
-                       goto repeat;
-               }
+               if (compound_head(page) != head)
+                       goto put_page;
 
                /* Has the page moved? */
-               if (unlikely(page != *slot)) {
-                       put_page(head);
-                       goto repeat;
-               }
+               if (unlikely(page != xas_reload(&xas)))
+                       goto put_page;
+
 export:
-               indices[ret] = iter.index;
+               indices[ret] = xas.xa_index;
                entries[ret] = page;
                if (++ret == nr_entries)
                        break;
+               continue;
+put_page:
+               put_page(head);
+retry:
+               xas_reset(&xas);
        }
        rcu_read_unlock();
        return ret;
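
Not part of the patch: the converted gang lookups above and below all share one shape -- skip value entries, take a speculative reference on the head page, then xas_reload() to confirm the slot still holds the same page; on failure drop the reference and xas_reset() so the next iteration re-reads the same index (that is what the new put_page:/retry: labels do).  Stripped of the page-reference dance, the skeleton is the hypothetical helper below.

static unsigned int collect_present_entries(struct xarray *xa,
                                            unsigned long start,
                                            void **batch, unsigned int nr)
{
        XA_STATE(xas, xa, start);
        unsigned int ret = 0;
        void *entry;

        rcu_read_lock();
        xas_for_each(&xas, entry, ULONG_MAX) {
                if (xas_retry(&xas, entry))
                        continue;       /* walker raced; the slot is reloaded */
                if (xa_is_value(entry))
                        continue;       /* shadow, swap or DAX entry: skip */
                batch[ret] = entry;
                if (++ret == nr)
                        break;
        }
        rcu_read_unlock();

        return ret;
}
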
@@ -1702,64 +1674,50 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
                              pgoff_t end, unsigned int nr_pages,
                              struct page **pages)
 {
-       struct radix_tree_iter iter;
-       void **slot;
+       XA_STATE(xas, &mapping->i_pages, *start);
+       struct page *page;
        unsigned ret = 0;
 
        if (unlikely(!nr_pages))
                return 0;
 
        rcu_read_lock();
-       radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, *start) {
-               struct page *head, *page;
-
-               if (iter.index > end)
-                       break;
-repeat:
-               page = radix_tree_deref_slot(slot);
-               if (unlikely(!page))
+       xas_for_each(&xas, page, end) {
+               struct page *head;
+               if (xas_retry(&xas, page))
                        continue;
-
-               if (radix_tree_exception(page)) {
-                       if (radix_tree_deref_retry(page)) {
-                               slot = radix_tree_iter_retry(&iter);
-                               continue;
-                       }
-                       /*
-                        * A shadow entry of a recently evicted page,
-                        * or a swap entry from shmem/tmpfs.  Skip
-                        * over it.
-                        */
+               /* Skip over shadow, swap and DAX entries */
+               if (xa_is_value(page))
                        continue;
-               }
 
                head = compound_head(page);
                if (!page_cache_get_speculative(head))
-                       goto repeat;
+                       goto retry;
 
                /* The page was split under us? */
-               if (compound_head(page) != head) {
-                       put_page(head);
-                       goto repeat;
-               }
+               if (compound_head(page) != head)
+                       goto put_page;
 
                /* Has the page moved? */
-               if (unlikely(page != *slot)) {
-                       put_page(head);
-                       goto repeat;
-               }
+               if (unlikely(page != xas_reload(&xas)))
+                       goto put_page;
 
                pages[ret] = page;
                if (++ret == nr_pages) {
-                       *start = pages[ret - 1]->index + 1;
+                       *start = page->index + 1;
                        goto out;
                }
+               continue;
+put_page:
+               put_page(head);
+retry:
+               xas_reset(&xas);
        }
 
        /*
         * We come here when there is no page beyond @end. We take care to not
         * overflow the index @start as it confuses some of the callers. This
-        * breaks the iteration when there is page at index -1 but that is
+        * breaks the iteration when there is a page at index -1 but that is
         * already broken anyway.
         */
        if (end == (pgoff_t)-1)
@@ -1787,57 +1745,43 @@ out:
 unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
                               unsigned int nr_pages, struct page **pages)
 {
-       struct radix_tree_iter iter;
-       void **slot;
+       XA_STATE(xas, &mapping->i_pages, index);
+       struct page *page;
        unsigned int ret = 0;
 
        if (unlikely(!nr_pages))
                return 0;
 
        rcu_read_lock();
-       radix_tree_for_each_contig(slot, &mapping->i_pages, &iter, index) {
-               struct page *head, *page;
-repeat:
-               page = radix_tree_deref_slot(slot);
-               /* The hole, there no reason to continue */
-               if (unlikely(!page))
-                       break;
-
-               if (radix_tree_exception(page)) {
-                       if (radix_tree_deref_retry(page)) {
-                               slot = radix_tree_iter_retry(&iter);
-                               continue;
-                       }
-                       /*
-                        * A shadow entry of a recently evicted page,
-                        * or a swap entry from shmem/tmpfs.  Stop
-                        * looking for contiguous pages.
-                        */
+       for (page = xas_load(&xas); page; page = xas_next(&xas)) {
+               struct page *head;
+               if (xas_retry(&xas, page))
+                       continue;
+               /*
+                * If the entry has been swapped out, we can stop looking.
+                * No current caller is looking for DAX entries.
+                */
+               if (xa_is_value(page))
                        break;
-               }
 
                head = compound_head(page);
                if (!page_cache_get_speculative(head))
-                       goto repeat;
+                       goto retry;
 
                /* The page was split under us? */
-               if (compound_head(page) != head) {
-                       put_page(head);
-                       goto repeat;
-               }
+               if (compound_head(page) != head)
+                       goto put_page;
 
                /* Has the page moved? */
-               if (unlikely(page != *slot)) {
-                       put_page(head);
-                       goto repeat;
-               }
+               if (unlikely(page != xas_reload(&xas)))
+                       goto put_page;
 
                /*
                 * must check mapping and index after taking the ref.
                 * otherwise we can get both false positives and false
                 * negatives, which is just confusing to the caller.
                 */
-               if (page->mapping == NULL || page_to_pgoff(page) != iter.index) {
+               if (!page->mapping || page_to_pgoff(page) != xas.xa_index) {
                        put_page(page);
                        break;
                }
@@ -1845,6 +1789,11 @@ repeat:
                pages[ret] = page;
                if (++ret == nr_pages)
                        break;
+               continue;
+put_page:
+               put_page(head);
+retry:
+               xas_reset(&xas);
        }
        rcu_read_unlock();
        return ret;
@@ -1864,74 +1813,58 @@ EXPORT_SYMBOL(find_get_pages_contig);
  * @tag.   We update @index to index the next page for the traversal.
  */
 unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
-                       pgoff_t end, int tag, unsigned int nr_pages,
+                       pgoff_t end, xa_mark_t tag, unsigned int nr_pages,
                        struct page **pages)
 {
-       struct radix_tree_iter iter;
-       void **slot;
+       XA_STATE(xas, &mapping->i_pages, *index);
+       struct page *page;
        unsigned ret = 0;
 
        if (unlikely(!nr_pages))
                return 0;
 
        rcu_read_lock();
-       radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, *index, tag) {
-               struct page *head, *page;
-
-               if (iter.index > end)
-                       break;
-repeat:
-               page = radix_tree_deref_slot(slot);
-               if (unlikely(!page))
+       xas_for_each_marked(&xas, page, end, tag) {
+               struct page *head;
+               if (xas_retry(&xas, page))
                        continue;
-
-               if (radix_tree_exception(page)) {
-                       if (radix_tree_deref_retry(page)) {
-                               slot = radix_tree_iter_retry(&iter);
-                               continue;
-                       }
-                       /*
-                        * A shadow entry of a recently evicted page.
-                        *
-                        * Those entries should never be tagged, but
-                        * this tree walk is lockless and the tags are
-                        * looked up in bulk, one radix tree node at a
-                        * time, so there is a sizable window for page
-                        * reclaim to evict a page we saw tagged.
-                        *
-                        * Skip over it.
-                        */
+               /*
+                * Shadow entries should never be tagged, but this iteration
+                * is lockless so there is a window for page reclaim to evict
+                * a page we saw tagged.  Skip over it.
+                */
+               if (xa_is_value(page))
                        continue;
-               }
 
                head = compound_head(page);
                if (!page_cache_get_speculative(head))
-                       goto repeat;
+                       goto retry;
 
                /* The page was split under us? */
-               if (compound_head(page) != head) {
-                       put_page(head);
-                       goto repeat;
-               }
+               if (compound_head(page) != head)
+                       goto put_page;
 
                /* Has the page moved? */
-               if (unlikely(page != *slot)) {
-                       put_page(head);
-                       goto repeat;
-               }
+               if (unlikely(page != xas_reload(&xas)))
+                       goto put_page;
 
                pages[ret] = page;
                if (++ret == nr_pages) {
-                       *index = pages[ret - 1]->index + 1;
+                       *index = page->index + 1;
                        goto out;
                }
+               continue;
+put_page:
+               put_page(head);
+retry:
+               xas_reset(&xas);
        }
 
        /*
-        * We come here when we got at @end. We take care to not overflow the
+        * We come here when we got to @end. We take care to not overflow the
         * index @index as it confuses some of the callers. This breaks the
-        * iteration when there is page at index -1 but that is already broken
-        * anyway.
+        * iteration when there is a page at index -1 but that is already
+        * broken anyway.
         */
        if (end == (pgoff_t)-1)
                *index = (pgoff_t)-1;
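
Not part of the patch: the int -> xa_mark_t change in the signature above reflects that the old radix-tree tags are now XArray marks, iterated with xas_for_each_marked().  A hedged sketch, assuming PAGECACHE_TAG_DIRTY is (as the signature change implies) an xa_mark_t:

static unsigned long count_dirty_entries(struct address_space *mapping,
                                         pgoff_t start, pgoff_t end)
{
        XA_STATE(xas, &mapping->i_pages, start);
        unsigned long count = 0;
        void *entry;

        rcu_read_lock();
        xas_for_each_marked(&xas, entry, end, PAGECACHE_TAG_DIRTY) {
                if (xas_retry(&xas, entry))
                        continue;
                if (xa_is_value(entry))
                        continue;       /* shadow entries should not remain marked */
                count++;
        }
        rcu_read_unlock();

        return count;
}
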
@@ -1957,57 +1890,51 @@ EXPORT_SYMBOL(find_get_pages_range_tag);
  * @tag.
  */
 unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
-                       int tag, unsigned int nr_entries,
+                       xa_mark_t tag, unsigned int nr_entries,
                        struct page **entries, pgoff_t *indices)
 {
-       void **slot;
+       XA_STATE(xas, &mapping->i_pages, start);
+       struct page *page;
        unsigned int ret = 0;
-       struct radix_tree_iter iter;
 
        if (!nr_entries)
                return 0;
 
        rcu_read_lock();
-       radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, tag) {
-               struct page *head, *page;
-repeat:
-               page = radix_tree_deref_slot(slot);
-               if (unlikely(!page))
+       xas_for_each_marked(&xas, page, ULONG_MAX, tag) {
+               struct page *head;
+               if (xas_retry(&xas, page))
                        continue;
-               if (radix_tree_exception(page)) {
-                       if (radix_tree_deref_retry(page)) {
-                               slot = radix_tree_iter_retry(&iter);
-                               continue;
-                       }
-
-                       /*
-                        * A shadow entry of a recently evicted page, a swap
-                        * entry from shmem/tmpfs or a DAX entry.  Return it
-                        * without attempting to raise page count.
-                        */
+               /*
+                * A shadow entry of a recently evicted page, a swap
+                * entry from shmem/tmpfs or a DAX entry.  Return it
+                * without attempting to raise page count.
+                */
+               if (xa_is_value(page))
                        goto export;
-               }
 
                head = compound_head(page);
                if (!page_cache_get_speculative(head))
-                       goto repeat;
+                       goto retry;
 
                /* The page was split under us? */
-               if (compound_head(page) != head) {
-                       put_page(head);
-                       goto repeat;
-               }
+               if (compound_head(page) != head)
+                       goto put_page;
 
                /* Has the page moved? */
-               if (unlikely(page != *slot)) {
-                       put_page(head);
-                       goto repeat;
-               }
+               if (unlikely(page != xas_reload(&xas)))
+                       goto put_page;
+
 export:
-               indices[ret] = iter.index;
+               indices[ret] = xas.xa_index;
                entries[ret] = page;
                if (++ret == nr_entries)
                        break;
+               continue;
+put_page:
+               put_page(head);
+retry:
+               xas_reset(&xas);
        }
        rcu_read_unlock();
        return ret;
@@ -2581,9 +2508,7 @@ no_cached_page:
         * system is low on memory, or a problem occurs while trying
         * to schedule I/O.
         */
-       if (error == -ENOMEM)
-               return VM_FAULT_OOM;
-       return VM_FAULT_SIGBUS;
+       return vmf_error(error);
 
 page_not_uptodate:
        /*
@@ -2613,45 +2538,31 @@ EXPORT_SYMBOL(filemap_fault);
 void filemap_map_pages(struct vm_fault *vmf,
                pgoff_t start_pgoff, pgoff_t end_pgoff)
 {
-       struct radix_tree_iter iter;
-       void **slot;
        struct file *file = vmf->vma->vm_file;
        struct address_space *mapping = file->f_mapping;
        pgoff_t last_pgoff = start_pgoff;
        unsigned long max_idx;
+       XA_STATE(xas, &mapping->i_pages, start_pgoff);
        struct page *head, *page;
 
        rcu_read_lock();
-       radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start_pgoff) {
-               if (iter.index > end_pgoff)
-                       break;
-repeat:
-               page = radix_tree_deref_slot(slot);
-               if (unlikely(!page))
-                       goto next;
-               if (radix_tree_exception(page)) {
-                       if (radix_tree_deref_retry(page)) {
-                               slot = radix_tree_iter_retry(&iter);
-                               continue;
-                       }
+       xas_for_each(&xas, page, end_pgoff) {
+               if (xas_retry(&xas, page))
+                       continue;
+               if (xa_is_value(page))
                        goto next;
-               }
 
                head = compound_head(page);
                if (!page_cache_get_speculative(head))
-                       goto repeat;
+                       goto next;
 
                /* The page was split under us? */
-               if (compound_head(page) != head) {
-                       put_page(head);
-                       goto repeat;
-               }
+               if (compound_head(page) != head)
+                       goto skip;
 
                /* Has the page moved? */
-               if (unlikely(page != *slot)) {
-                       put_page(head);
-                       goto repeat;
-               }
+               if (unlikely(page != xas_reload(&xas)))
+                       goto skip;
 
                if (!PageUptodate(page) ||
                                PageReadahead(page) ||
@@ -2670,10 +2581,10 @@ repeat:
                if (file->f_ra.mmap_miss > 0)
                        file->f_ra.mmap_miss--;
 
-               vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT;
+               vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
                if (vmf->pte)
-                       vmf->pte += iter.index - last_pgoff;
-               last_pgoff = iter.index;
+                       vmf->pte += xas.xa_index - last_pgoff;
+               last_pgoff = xas.xa_index;
                if (alloc_set_pte(vmf, NULL, page))
                        goto unlock;
                unlock_page(page);
@@ -2686,8 +2597,6 @@ next:
                /* Huge page is mapped? No need to proceed. */
                if (pmd_trans_huge(*vmf->pmd))
                        break;
-               if (iter.index == end_pgoff)
-                       break;
        }
        rcu_read_unlock();
 }
@@ -2748,9 +2657,9 @@ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
        return generic_file_mmap(file, vma);
 }
 #else
-int filemap_page_mkwrite(struct vm_fault *vmf)
+vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
 {
-       return -ENOSYS;
+       return VM_FAULT_SIGBUS;
 }
 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
 {
@@ -2797,7 +2706,7 @@ repeat:
                        put_page(page);
                        if (err == -EEXIST)
                                goto repeat;
-                       /* Presumably ENOMEM for radix tree node */
+                       /* Presumably ENOMEM for xarray node */
                        return ERR_PTR(err);
                }
 
@@ -3012,7 +2921,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
        if (iocb->ki_flags & IOCB_NOWAIT) {
                /* If there are pages to writeback, return */
                if (filemap_range_has_page(inode->i_mapping, pos,
-                                          pos + iov_iter_count(from)))
+                                          pos + write_len))
                        return -EAGAIN;
        } else {
                written = filemap_write_and_wait_range(mapping, pos,