Merge branch 'hugepage-fallbacks' (hugepage patches from David Rientjes)
author Linus Torvalds <torvalds@linux-foundation.org>
Sat, 28 Sep 2019 21:26:47 +0000 (14:26 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 28 Sep 2019 21:26:47 +0000 (14:26 -0700)
Merge hugepage allocation updates from David Rientjes:
 "We (mostly Linus, Andrea, and myself) have been discussing offlist how
  to implement a sane default allocation strategy for hugepages on NUMA
  platforms.

  With these reverts in place, the page allocator will happily allocate
  a remote hugepage immediately rather than try to make a local hugepage
  available. This incurs a substantial performance degradation when
  memory compaction would have otherwise made a local hugepage
  available.

  This series reverts those reverts and attempts to propose a more sane
  default allocation strategy specifically for hugepages. Andrea
  acknowledges this is likely to fix the swap storms that he originally
  reported that resulted in the patches that removed __GFP_THISNODE from
  hugepage allocations.

  The immediate goal is to return 5.3 to the behavior the kernel has
  implemented over the past several years so that remote hugepages are
  not immediately allocated when local hugepages could have been made
  available because the increased access latency is untenable.

  The next goal is to introduce a sane default allocation strategy for
  hugepage allocations in general regardless of the configuration of
  the system so that we prevent thrashing of local memory when
  compaction is unlikely to succeed and can prefer remote hugepages over
  remote native pages when the local node is low on memory."

Note on timing: this reverts the hugepage VM behavior changes that got
introduced fairly late in the 5.3 cycle, and that fixed a huge
performance regression for certain loads that had been around since
4.18.

Andrea had this note:

 "The regression of 4.18 was that it was taking hours to start a VM
  where 3.10 was only taking a few seconds, I reported all the details
  on lkml when it was finally tracked down in August 2018.

     https://lore.kernel.org/linux-mm/20180820032640.9896-2-aarcange@redhat.com/

  __GFP_THISNODE in MADV_HUGEPAGE made the above enterprise vfio
  workload degrade like in the "current upstream" above. And it still
  would have been that bad as above until 5.3-rc5"

where the bad behavior ends up happening as you fill up a local node,
and without that change, you'd get into the nasty swap storm behavior
due to reclaim and compaction working overtime to make room for more
local hugepages.

As a result 5.3 got the two performance fix reverts in rc5.

However, David Rientjes then noted that those performance fixes in turn
regressed performance for other loads - although not quite to the same
degree.  He suggested reverting the reverts and instead replacing them
with two small changes to how hugepage allocations are done (patch
descriptions rephrased by me; see the sketch after this list):

 - "avoid expensive reclaim when compaction may not succeed": just admit
   that the allocation failed when you're trying to allocate a huge-page
   and compaction wasn't successful.

 - "allow hugepage fallback to remote nodes when madvised": when that
   node-local huge-page allocation failed, retry without forcing the
   local node.

but by then I judged it too late to replace the fixes for a 5.3 release.
So 5.3 was released with behavior that harked back to the pre-4.18 logic.
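
In rough C, the combined effect of those two changes looks something
like the sketch below.  This is a hedged, simplified userspace model
(stub allocator, made-up helper names such as alloc_on_node()), not the
kernel code itself; the real logic lives in the mm/page_alloc.c and
mm/mempolicy.c hunks further down in this diff.

  /*
   * Hypothetical model of the hugepage allocation policy described
   * above -- not actual kernel code.
   */
  #include <stdbool.h>
  #include <stddef.h>
  #include <stdio.h>

  enum compact_result { COMPACT_SKIPPED, COMPACT_DEFERRED, COMPACT_SUCCESS };

  /* Stub allocator: pretend the preferred node is already full. */
  static void *alloc_on_node(int nid, bool this_node_only)
  {
          (void)nid;
          (void)this_node_only;
          return NULL;
  }

  static void *alloc_hugepage(int local_nid, bool madvised,
                              enum compact_result compact_result)
  {
          void *page = NULL;

          /*
           * "avoid expensive reclaim when compaction may not succeed":
           * if compaction was skipped (not enough order-0 pages) or was
           * recently deferred, don't thrash local memory with reclaim --
           * just treat the node-local attempt as failed.
           */
          if (compact_result != COMPACT_SKIPPED &&
              compact_result != COMPACT_DEFERRED)
                  page = alloc_on_node(local_nid, true);  /* __GFP_THISNODE */

          /*
           * "allow hugepage fallback to remote nodes when madvised":
           * in the kernel this is keyed off the gfp mask allowing direct
           * reclaim (MADV_HUGEPAGE or defrag=always), modelled here as a
           * plain flag.  Otherwise the caller just falls back to local
           * base pages.
           */
          if (!page && madvised)
                  page = alloc_on_node(local_nid, false); /* any node */

          return page;    /* NULL => caller falls back to base pages */
  }

  int main(void)
  {
          printf("%p\n", alloc_hugepage(0, true, COMPACT_SKIPPED));
          return 0;
  }

In that model a non-madvised mapping that can't get a cheap local
hugepage simply gets base pages, while a madvised one prefers a remote
hugepage over remote base pages.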

But now we're in the merge window for 5.4, and we can see if this
alternate model fixes not just the horrendous swap storm behavior, but
also the performance regression that the late reverts caused.

Fingers crossed.

* emailed patches from David Rientjes <rientjes@google.com>:
  mm, page_alloc: allow hugepage fallback to remote nodes when madvised
  mm, page_alloc: avoid expensive reclaim when compaction may not succeed
  Revert "Revert "Revert "mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask""
  Revert "Revert "mm, thp: restore node-local hugepage allocations""

mm/huge_memory.c
mm/mempolicy.c
mm/page_alloc.c
mm/shmem.c

diff --combined mm/huge_memory.c
index 73fc517c08d222723b3e2a987775bd83dc6a4697,aec462cc5d4632e01301f55eb7687a1419fea351..c5cb6dcd6c69664c4e9c71d02c20ecae53362e9b
@@@ -496,25 -496,11 +496,25 @@@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, stru
        return pmd;
  }
  
 -static inline struct list_head *page_deferred_list(struct page *page)
 +#ifdef CONFIG_MEMCG
 +static inline struct deferred_split *get_deferred_split_queue(struct page *page)
  {
 -      /* ->lru in the tail pages is occupied by compound_head. */
 -      return &page[2].deferred_list;
 +      struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
 +      struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
 +
 +      if (memcg)
 +              return &memcg->deferred_split_queue;
 +      else
 +              return &pgdat->deferred_split_queue;
 +}
 +#else
 +static inline struct deferred_split *get_deferred_split_queue(struct page *page)
 +{
 +      struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
 +
 +      return &pgdat->deferred_split_queue;
  }
 +#endif
  
  void prep_transhuge_page(struct page *page)
  {
@@@ -659,40 -645,30 +659,30 @@@ release
   *        available
   * never: never stall for any thp allocation
   */
- static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr)
+ static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
  {
        const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
-       gfp_t this_node = 0;
- #ifdef CONFIG_NUMA
-       struct mempolicy *pol;
-       /*
-        * __GFP_THISNODE is used only when __GFP_DIRECT_RECLAIM is not
-        * specified, to express a general desire to stay on the current
-        * node for optimistic allocation attempts. If the defrag mode
-        * and/or madvise hint requires the direct reclaim then we prefer
-        * to fallback to other node rather than node reclaim because that
-        * can lead to excessive reclaim even though there is free memory
-        * on other nodes. We expect that NUMA preferences are specified
-        * by memory policies.
-        */
-       pol = get_vma_policy(vma, addr);
-       if (pol->mode != MPOL_BIND)
-               this_node = __GFP_THISNODE;
-       mpol_cond_put(pol);
- #endif
  
+       /* Always do synchronous compaction */
        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
                return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
+       /* Kick kcompactd and fail quickly */
        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
-               return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node;
+               return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+       /* Synchronous compaction if madvised, otherwise kick kcompactd */
        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
-               return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
-                                                            __GFP_KSWAPD_RECLAIM | this_node);
+               return GFP_TRANSHUGE_LIGHT |
+                       (vma_madvised ? __GFP_DIRECT_RECLAIM :
+                                       __GFP_KSWAPD_RECLAIM);
+       /* Only do synchronous compaction if madvised */
        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
-               return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
-                                                            this_node);
-       return GFP_TRANSHUGE_LIGHT | this_node;
+               return GFP_TRANSHUGE_LIGHT |
+                      (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
+       return GFP_TRANSHUGE_LIGHT;
  }
  
  /* Caller must hold page table lock. */
@@@ -764,8 -740,8 +754,8 @@@ vm_fault_t do_huge_pmd_anonymous_page(s
                        pte_free(vma->vm_mm, pgtable);
                return ret;
        }
-       gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
-       page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id());
+       gfp = alloc_hugepage_direct_gfpmask(vma);
+       page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
        if (unlikely(!page)) {
                count_vm_event(THP_FAULT_FALLBACK);
                return VM_FAULT_FALLBACK;
@@@ -1372,9 -1348,8 +1362,8 @@@ vm_fault_t do_huge_pmd_wp_page(struct v
  alloc:
        if (__transparent_hugepage_enabled(vma) &&
            !transparent_hugepage_debug_cow()) {
-               huge_gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
-               new_page = alloc_pages_vma(huge_gfp, HPAGE_PMD_ORDER, vma,
-                               haddr, numa_node_id());
+               huge_gfp = alloc_hugepage_direct_gfpmask(vma);
+               new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
        } else
                new_page = NULL;
  
@@@ -2511,8 -2486,6 +2500,8 @@@ static void __split_huge_page(struct pa
        struct page *head = compound_head(page);
        pg_data_t *pgdat = page_pgdat(head);
        struct lruvec *lruvec;
 +      struct address_space *swap_cache = NULL;
 +      unsigned long offset = 0;
        int i;
  
        lruvec = mem_cgroup_page_lruvec(head, pgdat);
        /* complete memcg works before add pages to LRU */
        mem_cgroup_split_huge_fixup(head);
  
 +      if (PageAnon(head) && PageSwapCache(head)) {
 +              swp_entry_t entry = { .val = page_private(head) };
 +
 +              offset = swp_offset(entry);
 +              swap_cache = swap_address_space(entry);
 +              xa_lock(&swap_cache->i_pages);
 +      }
 +
        for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
                __split_huge_page_tail(head, i, lruvec, list);
                /* Some pages can be beyond i_size: drop them from page cache */
                        if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
                                shmem_uncharge(head->mapping->host, 1);
                        put_page(head + i);
 +              } else if (!PageAnon(page)) {
 +                      __xa_store(&head->mapping->i_pages, head[i].index,
 +                                      head + i, 0);
 +              } else if (swap_cache) {
 +                      __xa_store(&swap_cache->i_pages, offset + i,
 +                                      head + i, 0);
                }
        }
  
        /* See comment in __split_huge_page_tail() */
        if (PageAnon(head)) {
                /* Additional pin to swap cache */
 -              if (PageSwapCache(head))
 +              if (PageSwapCache(head)) {
                        page_ref_add(head, 2);
 -              else
 +                      xa_unlock(&swap_cache->i_pages);
 +              } else {
                        page_ref_inc(head);
 +              }
        } else {
                /* Additional pin to page cache */
                page_ref_add(head, 2);
@@@ -2705,7 -2662,6 +2694,7 @@@ int split_huge_page_to_list(struct pag
  {
        struct page *head = compound_head(page);
        struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
 +      struct deferred_split *ds_queue = get_deferred_split_queue(page);
        struct anon_vma *anon_vma = NULL;
        struct address_space *mapping = NULL;
        int count, mapcount, extra_pins, ret;
        }
  
        /* Prevent deferred_split_scan() touching ->_refcount */
 -      spin_lock(&pgdata->split_queue_lock);
 +      spin_lock(&ds_queue->split_queue_lock);
        count = page_count(head);
        mapcount = total_mapcount(head);
        if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
                if (!list_empty(page_deferred_list(head))) {
 -                      pgdata->split_queue_len--;
 +                      ds_queue->split_queue_len--;
                        list_del(page_deferred_list(head));
                }
                if (mapping)
                        __dec_node_page_state(page, NR_SHMEM_THPS);
 -              spin_unlock(&pgdata->split_queue_lock);
 +              spin_unlock(&ds_queue->split_queue_lock);
                __split_huge_page(page, list, end, flags);
                if (PageSwapCache(head)) {
                        swp_entry_t entry = { .val = page_private(head) };
                        dump_page(page, "total_mapcount(head) > 0");
                        BUG();
                }
 -              spin_unlock(&pgdata->split_queue_lock);
 +              spin_unlock(&ds_queue->split_queue_lock);
  fail:         if (mapping)
                        xa_unlock(&mapping->i_pages);
                spin_unlock_irqrestore(&pgdata->lru_lock, flags);
  
  void free_transhuge_page(struct page *page)
  {
 -      struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
 +      struct deferred_split *ds_queue = get_deferred_split_queue(page);
        unsigned long flags;
  
 -      spin_lock_irqsave(&pgdata->split_queue_lock, flags);
 +      spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
        if (!list_empty(page_deferred_list(page))) {
 -              pgdata->split_queue_len--;
 +              ds_queue->split_queue_len--;
                list_del(page_deferred_list(page));
        }
 -      spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
 +      spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
        free_compound_page(page);
  }
  
  void deferred_split_huge_page(struct page *page)
  {
 -      struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
 +      struct deferred_split *ds_queue = get_deferred_split_queue(page);
 +#ifdef CONFIG_MEMCG
 +      struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
 +#endif
        unsigned long flags;
  
        VM_BUG_ON_PAGE(!PageTransHuge(page), page);
  
 -      spin_lock_irqsave(&pgdata->split_queue_lock, flags);
 +      /*
 +       * The try_to_unmap() in page reclaim path might reach here too,
 +       * this may cause a race condition to corrupt deferred split queue.
 +       * And, if page reclaim is already handling the same page, it is
 +       * unnecessary to handle it again in shrinker.
 +       *
 +       * Check PageSwapCache to determine if the page is being
 +       * handled by page reclaim since THP swap would add the page into
 +       * swap cache before calling try_to_unmap().
 +       */
 +      if (PageSwapCache(page))
 +              return;
 +
 +      spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
        if (list_empty(page_deferred_list(page))) {
                count_vm_event(THP_DEFERRED_SPLIT_PAGE);
 -              list_add_tail(page_deferred_list(page), &pgdata->split_queue);
 -              pgdata->split_queue_len++;
 +              list_add_tail(page_deferred_list(page), &ds_queue->split_queue);
 +              ds_queue->split_queue_len++;
 +#ifdef CONFIG_MEMCG
 +              if (memcg)
 +                      memcg_set_shrinker_bit(memcg, page_to_nid(page),
 +                                             deferred_split_shrinker.id);
 +#endif
        }
 -      spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
 +      spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
  }
  
  static unsigned long deferred_split_count(struct shrinker *shrink,
                struct shrink_control *sc)
  {
        struct pglist_data *pgdata = NODE_DATA(sc->nid);
 -      return READ_ONCE(pgdata->split_queue_len);
 +      struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
 +
 +#ifdef CONFIG_MEMCG
 +      if (sc->memcg)
 +              ds_queue = &sc->memcg->deferred_split_queue;
 +#endif
 +      return READ_ONCE(ds_queue->split_queue_len);
  }
  
  static unsigned long deferred_split_scan(struct shrinker *shrink,
                struct shrink_control *sc)
  {
        struct pglist_data *pgdata = NODE_DATA(sc->nid);
 +      struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
        unsigned long flags;
        LIST_HEAD(list), *pos, *next;
        struct page *page;
        int split = 0;
  
 -      spin_lock_irqsave(&pgdata->split_queue_lock, flags);
 +#ifdef CONFIG_MEMCG
 +      if (sc->memcg)
 +              ds_queue = &sc->memcg->deferred_split_queue;
 +#endif
 +
 +      spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
        /* Take pin on all head pages to avoid freeing them under us */
 -      list_for_each_safe(pos, next, &pgdata->split_queue) {
 +      list_for_each_safe(pos, next, &ds_queue->split_queue) {
                page = list_entry((void *)pos, struct page, mapping);
                page = compound_head(page);
                if (get_page_unless_zero(page)) {
                } else {
                        /* We lost race with put_compound_page() */
                        list_del_init(page_deferred_list(page));
 -                      pgdata->split_queue_len--;
 +                      ds_queue->split_queue_len--;
                }
                if (!--sc->nr_to_scan)
                        break;
        }
 -      spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
 +      spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
  
        list_for_each_safe(pos, next, &list) {
                page = list_entry((void *)pos, struct page, mapping);
@@@ -2947,15 -2870,15 +2936,15 @@@ next
                put_page(page);
        }
  
 -      spin_lock_irqsave(&pgdata->split_queue_lock, flags);
 -      list_splice_tail(&list, &pgdata->split_queue);
 -      spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
 +      spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
 +      list_splice_tail(&list, &ds_queue->split_queue);
 +      spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
  
        /*
         * Stop shrinker if we didn't split any page, but the queue is empty.
         * This can happen if pages were freed under us.
         */
 -      if (!split && list_empty(&pgdata->split_queue))
 +      if (!split && list_empty(&ds_queue->split_queue))
                return SHRINK_STOP;
        return split;
  }
@@@ -2964,8 -2887,7 +2953,8 @@@ static struct shrinker deferred_split_s
        .count_objects = deferred_split_count,
        .scan_objects = deferred_split_scan,
        .seeks = DEFAULT_SEEKS,
 -      .flags = SHRINKER_NUMA_AWARE,
 +      .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE |
 +               SHRINKER_NONSLAB,
  };
  
  #ifdef CONFIG_DEBUG_FS
diff --combined mm/mempolicy.c
index de27d08b1ff8d286ce0eb463041eefc8a746d869,8caab1f81a52efc3b4d5fe222433b0754dfd511f..4ae967bcf95481bc5082904e92426320fa67d64b
@@@ -68,7 -68,7 +68,7 @@@
  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  
  #include <linux/mempolicy.h>
 -#include <linux/mm.h>
 +#include <linux/pagewalk.h>
  #include <linux/highmem.h>
  #include <linux/hugetlb.h>
  #include <linux/kernel.h>
@@@ -655,12 -655,6 +655,12 @@@ static int queue_pages_test_walk(unsign
        return 1;
  }
  
 +static const struct mm_walk_ops queue_pages_walk_ops = {
 +      .hugetlb_entry          = queue_pages_hugetlb,
 +      .pmd_entry              = queue_pages_pte_range,
 +      .test_walk              = queue_pages_test_walk,
 +};
 +
  /*
   * Walk through page tables and collect pages to be migrated.
   *
@@@ -685,8 -679,15 +685,8 @@@ queue_pages_range(struct mm_struct *mm
                .nmask = nodes,
                .prev = NULL,
        };
 -      struct mm_walk queue_pages_walk = {
 -              .hugetlb_entry = queue_pages_hugetlb,
 -              .pmd_entry = queue_pages_pte_range,
 -              .test_walk = queue_pages_test_walk,
 -              .mm = mm,
 -              .private = &qp,
 -      };
  
 -      return walk_page_range(start, end, &queue_pages_walk);
 +      return walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
  }
  
  /*
@@@ -1179,8 -1180,8 +1179,8 @@@ static struct page *new_page(struct pag
        } else if (PageTransHuge(page)) {
                struct page *thp;
  
-               thp = alloc_pages_vma(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma,
-                               address, numa_node_id());
+               thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
+                                        HPAGE_PMD_ORDER);
                if (!thp)
                        return NULL;
                prep_transhuge_page(thp);
@@@ -1405,7 -1406,6 +1405,7 @@@ static long kernel_mbind(unsigned long 
        int err;
        unsigned short mode_flags;
  
 +      start = untagged_addr(start);
        mode_flags = mode & MPOL_MODE_FLAGS;
        mode &= ~MPOL_MODE_FLAGS;
        if (mode >= MPOL_MAX)
@@@ -1513,6 -1513,10 +1513,6 @@@ static int kernel_migrate_pages(pid_t p
        if (nodes_empty(*new))
                goto out_put;
  
 -      nodes_and(*new, *new, node_states[N_MEMORY]);
 -      if (nodes_empty(*new))
 -              goto out_put;
 -
        err = security_task_movememory(task);
        if (err)
                goto out_put;
@@@ -1559,8 -1563,6 +1559,8 @@@ static int kernel_get_mempolicy(int __u
        int uninitialized_var(pval);
        nodemask_t nodes;
  
 +      addr = untagged_addr(addr);
 +
        if (nmask != NULL && maxnode < nr_node_ids)
                return -EINVAL;
  
@@@ -1732,7 -1734,7 +1732,7 @@@ struct mempolicy *__get_vma_policy(stru
   * freeing by another task.  It is the caller's responsibility to free the
   * extra reference for shared policies.
   */
- struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
+ static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
                                                unsigned long addr)
  {
        struct mempolicy *pol = __get_vma_policy(vma, addr);
@@@ -2081,6 -2083,7 +2081,7 @@@ static struct page *alloc_page_interlea
   *    @vma:  Pointer to VMA or NULL if not available.
   *    @addr: Virtual Address of the allocation. Must be inside the VMA.
   *    @node: Which node to prefer for allocation (modulo policy).
+  *    @hugepage: for hugepages try only the preferred node if possible
   *
   *    This function allocates a page from the kernel page pool and applies
   *    a NUMA policy associated with the VMA or the current process.
   */
  struct page *
  alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
-               unsigned long addr, int node)
+               unsigned long addr, int node, bool hugepage)
  {
        struct mempolicy *pol;
        struct page *page;
                mpol_cond_put(pol);
                page = alloc_page_interleave(gfp, order, nid);
                goto out;
+       }
+       if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
+               int hpage_node = node;
+               /*
+                * For hugepage allocation and non-interleave policy which
+                * allows the current node (or other explicitly preferred
+                * node) we only try to allocate from the current/preferred
+                * node and don't fall back to other nodes, as the cost of
+                * remote accesses would likely offset THP benefits.
+                *
+                * If the policy is interleave, or does not allow the current
+                * node in its nodemask, we allocate the standard way.
+                */
+               if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
+                       hpage_node = pol->v.preferred_node;
+               nmask = policy_nodemask(gfp, pol);
+               if (!nmask || node_isset(hpage_node, *nmask)) {
+                       mpol_cond_put(pol);
+                       page = __alloc_pages_node(hpage_node,
+                                               gfp | __GFP_THISNODE, order);
+                       /*
+                        * If hugepage allocations are configured to always
+                        * synchronous compact or the vma has been madvised
+                        * to prefer hugepage backing, retry allowing remote
+                        * memory as well.
+                        */
+                       if (!page && (gfp & __GFP_DIRECT_RECLAIM))
+                               page = __alloc_pages_node(hpage_node,
+                                               gfp | __GFP_NORETRY, order);
+                       goto out;
+               }
        }
  
        nmask = policy_nodemask(gfp, pol);
diff --combined mm/page_alloc.c
index 3334a769eb91e1c1cc374560125c8c64e32da979,87cbd92065e53cd45d1412b200f96e8769896aaf..15c2050c629b1d8aacb2f36aac7ac09c54c95449
@@@ -670,7 -670,6 +670,7 @@@ out
  
  void free_compound_page(struct page *page)
  {
 +      mem_cgroup_uncharge(page);
        __free_pages_ok(page, compound_order(page));
  }
  
@@@ -3512,7 -3511,7 +3512,7 @@@ bool zone_watermark_ok_safe(struct zon
  static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
  {
        return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
 -                              RECLAIM_DISTANCE;
 +                              node_reclaim_distance;
  }
  #else /* CONFIG_NUMA */
  static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
@@@ -3955,23 -3954,15 +3955,23 @@@ should_compact_retry(struct alloc_conte
        if (compaction_failed(compact_result))
                goto check_priority;
  
 +      /*
 +       * compaction was skipped because there are not enough order-0 pages
 +       * to work with, so we retry only if it looks like reclaim can help.
 +       */
 +      if (compaction_needs_reclaim(compact_result)) {
 +              ret = compaction_zonelist_suitable(ac, order, alloc_flags);
 +              goto out;
 +      }
 +
        /*
         * make sure the compaction wasn't deferred or didn't bail out early
         * due to locks contention before we declare that we should give up.
 -       * But do not retry if the given zonelist is not suitable for
 -       * compaction.
 +       * But the next retry should use a higher priority if allowed, so
 +       * we don't just keep bailing out endlessly.
         */
        if (compaction_withdrawn(compact_result)) {
 -              ret = compaction_zonelist_suitable(ac, order, alloc_flags);
 -              goto out;
 +              goto check_priority;
        }
  
        /*
@@@ -4467,6 -4458,28 +4467,28 @@@ retry_cpuset
                if (page)
                        goto got_pg;
  
+                if (order >= pageblock_order && (gfp_mask & __GFP_IO)) {
+                       /*
+                        * If allocating entire pageblock(s) and compaction
+                        * failed because all zones are below low watermarks
+                        * or is prohibited because it recently failed at this
+                        * order, fail immediately.
+                        *
+                        * Reclaim is
+                        *  - potentially very expensive because zones are far
+                        *    below their low watermarks or this is part of very
+                        *    bursty high order allocations,
+                        *  - not guaranteed to help because isolate_freepages()
+                        *    may not iterate over freed pages as part of its
+                        *    linear scan, and
+                        *  - unlikely to make entire pageblocks free on its
+                        *    own.
+                        */
+                       if (compact_result == COMPACT_SKIPPED ||
+                           compact_result == COMPACT_DEFERRED)
+                               goto nopage;
+               }
                /*
                 * Checks for costly allocations with __GFP_NORETRY, which
                 * includes THP page fault allocations
@@@ -5980,7 -5993,7 +6002,7 @@@ void __ref memmap_init_zone_device(stru
                }
        }
  
 -      pr_info("%s initialised, %lu pages in %ums\n", dev_name(pgmap->dev),
 +      pr_info("%s initialised %lu pages in %ums\n", __func__,
                size, jiffies_to_msecs(jiffies - start));
  }
  
@@@ -6647,11 -6660,9 +6669,11 @@@ static unsigned long __init calc_memmap
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  static void pgdat_init_split_queue(struct pglist_data *pgdat)
  {
 -      spin_lock_init(&pgdat->split_queue_lock);
 -      INIT_LIST_HEAD(&pgdat->split_queue);
 -      pgdat->split_queue_len = 0;
 +      struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
 +
 +      spin_lock_init(&ds_queue->split_queue_lock);
 +      INIT_LIST_HEAD(&ds_queue->split_queue);
 +      ds_queue->split_queue_len = 0;
  }
  #else
  static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
@@@ -8207,7 -8218,7 +8229,7 @@@ bool has_unmovable_pages(struct zone *z
                        if (!hugepage_migration_supported(page_hstate(head)))
                                goto unmovable;
  
 -                      skip_pages = (1 << compound_order(head)) - (page - head);
 +                      skip_pages = compound_nr(head) - (page - head);
                        iter += skip_pages - 1;
                        continue;
                }
diff --combined mm/shmem.c
index 30ce722c23fa976cbc79ed6fdadd73d556b92881,626d8c74b973f173d3062ee118580b649d35073a..cd570cc79c76ab9873ce123dd7fd1d0e4412c0bc
@@@ -37,7 -37,6 +37,7 @@@
  #include <linux/khugepaged.h>
  #include <linux/hugetlb.h>
  #include <linux/frontswap.h>
 +#include <linux/fs_parser.h>
  
  #include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
  
@@@ -108,20 -107,6 +108,20 @@@ struct shmem_falloc 
        pgoff_t nr_unswapped;   /* how often writepage refused to swap out */
  };
  
 +struct shmem_options {
 +      unsigned long long blocks;
 +      unsigned long long inodes;
 +      struct mempolicy *mpol;
 +      kuid_t uid;
 +      kgid_t gid;
 +      umode_t mode;
 +      int huge;
 +      int seen;
 +#define SHMEM_SEEN_BLOCKS 1
 +#define SHMEM_SEEN_INODES 2
 +#define SHMEM_SEEN_HUGE 4
 +};
 +
  #ifdef CONFIG_TMPFS
  static unsigned long shmem_default_max_blocks(void)
  {
@@@ -609,7 -594,7 +609,7 @@@ static int shmem_add_to_page_cache(stru
  {
        XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
        unsigned long i = 0;
 -      unsigned long nr = 1UL << compound_order(page);
 +      unsigned long nr = compound_nr(page);
  
        VM_BUG_ON_PAGE(PageTail(page), page);
        VM_BUG_ON_PAGE(index != round_down(index, nr), page);
                if (xas_error(&xas))
                        goto unlock;
  next:
 -              xas_store(&xas, page + i);
 +              xas_store(&xas, page);
                if (++i < nr) {
                        xas_next(&xas);
                        goto next;
@@@ -1481,7 -1466,7 +1481,7 @@@ static struct page *shmem_alloc_hugepag
  
        shmem_pseudo_vma_init(&pvma, info, hindex);
        page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
-                       HPAGE_PMD_ORDER, &pvma, 0, numa_node_id());
+                       HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
        shmem_pseudo_vma_destroy(&pvma);
        if (page)
                prep_transhuge_page(page);
@@@ -1734,7 -1719,7 +1734,7 @@@ unlock
   * vm. If we swap it in we mark it dirty since we also free the swap
   * entry since a page cannot live in both the swap and page cache.
   *
 - * fault_mm and fault_type are only supplied by shmem_fault:
 + * vmf and fault_type are only supplied by shmem_fault:
   * otherwise they are NULL.
   */
  static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
@@@ -1884,7 -1869,7 +1884,7 @@@ alloc_nohuge
        lru_cache_add_anon(page);
  
        spin_lock_irq(&info->lock);
 -      info->alloced += 1 << compound_order(page);
 +      info->alloced += compound_nr(page);
        inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
        shmem_recalc_inode(inode);
        spin_unlock_irq(&info->lock);
@@@ -1925,7 -1910,7 +1925,7 @@@ clear
                struct page *head = compound_head(page);
                int i;
  
 -              for (i = 0; i < (1 << compound_order(head)); i++) {
 +              for (i = 0; i < compound_nr(head); i++) {
                        clear_highpage(head + i);
                        flush_dcache_page(head + i);
                }
         * Error recovery.
         */
  unacct:
 -      shmem_inode_unacct_blocks(inode, 1 << compound_order(page));
 +      shmem_inode_unacct_blocks(inode, compound_nr(page));
  
        if (PageTransHuge(page)) {
                unlock_page(page);
@@@ -3364,126 -3349,16 +3364,126 @@@ static const struct export_operations s
        .fh_to_dentry   = shmem_fh_to_dentry,
  };
  
 -static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
 -                             bool remount)
 +enum shmem_param {
 +      Opt_gid,
 +      Opt_huge,
 +      Opt_mode,
 +      Opt_mpol,
 +      Opt_nr_blocks,
 +      Opt_nr_inodes,
 +      Opt_size,
 +      Opt_uid,
 +};
 +
 +static const struct fs_parameter_spec shmem_param_specs[] = {
 +      fsparam_u32   ("gid",           Opt_gid),
 +      fsparam_enum  ("huge",          Opt_huge),
 +      fsparam_u32oct("mode",          Opt_mode),
 +      fsparam_string("mpol",          Opt_mpol),
 +      fsparam_string("nr_blocks",     Opt_nr_blocks),
 +      fsparam_string("nr_inodes",     Opt_nr_inodes),
 +      fsparam_string("size",          Opt_size),
 +      fsparam_u32   ("uid",           Opt_uid),
 +      {}
 +};
 +
 +static const struct fs_parameter_enum shmem_param_enums[] = {
 +      { Opt_huge,     "never",        SHMEM_HUGE_NEVER },
 +      { Opt_huge,     "always",       SHMEM_HUGE_ALWAYS },
 +      { Opt_huge,     "within_size",  SHMEM_HUGE_WITHIN_SIZE },
 +      { Opt_huge,     "advise",       SHMEM_HUGE_ADVISE },
 +      {}
 +};
 +
 +const struct fs_parameter_description shmem_fs_parameters = {
 +      .name           = "tmpfs",
 +      .specs          = shmem_param_specs,
 +      .enums          = shmem_param_enums,
 +};
 +
 +static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
 +{
 +      struct shmem_options *ctx = fc->fs_private;
 +      struct fs_parse_result result;
 +      unsigned long long size;
 +      char *rest;
 +      int opt;
 +
 +      opt = fs_parse(fc, &shmem_fs_parameters, param, &result);
 +      if (opt < 0)
 +              return opt;
 +
 +      switch (opt) {
 +      case Opt_size:
 +              size = memparse(param->string, &rest);
 +              if (*rest == '%') {
 +                      size <<= PAGE_SHIFT;
 +                      size *= totalram_pages();
 +                      do_div(size, 100);
 +                      rest++;
 +              }
 +              if (*rest)
 +                      goto bad_value;
 +              ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE);
 +              ctx->seen |= SHMEM_SEEN_BLOCKS;
 +              break;
 +      case Opt_nr_blocks:
 +              ctx->blocks = memparse(param->string, &rest);
 +              if (*rest)
 +                      goto bad_value;
 +              ctx->seen |= SHMEM_SEEN_BLOCKS;
 +              break;
 +      case Opt_nr_inodes:
 +              ctx->inodes = memparse(param->string, &rest);
 +              if (*rest)
 +                      goto bad_value;
 +              ctx->seen |= SHMEM_SEEN_INODES;
 +              break;
 +      case Opt_mode:
 +              ctx->mode = result.uint_32 & 07777;
 +              break;
 +      case Opt_uid:
 +              ctx->uid = make_kuid(current_user_ns(), result.uint_32);
 +              if (!uid_valid(ctx->uid))
 +                      goto bad_value;
 +              break;
 +      case Opt_gid:
 +              ctx->gid = make_kgid(current_user_ns(), result.uint_32);
 +              if (!gid_valid(ctx->gid))
 +                      goto bad_value;
 +              break;
 +      case Opt_huge:
 +              ctx->huge = result.uint_32;
 +              if (ctx->huge != SHMEM_HUGE_NEVER &&
 +                  !(IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
 +                    has_transparent_hugepage()))
 +                      goto unsupported_parameter;
 +              ctx->seen |= SHMEM_SEEN_HUGE;
 +              break;
 +      case Opt_mpol:
 +              if (IS_ENABLED(CONFIG_NUMA)) {
 +                      mpol_put(ctx->mpol);
 +                      ctx->mpol = NULL;
 +                      if (mpol_parse_str(param->string, &ctx->mpol))
 +                              goto bad_value;
 +                      break;
 +              }
 +              goto unsupported_parameter;
 +      }
 +      return 0;
 +
 +unsupported_parameter:
 +      return invalf(fc, "tmpfs: Unsupported parameter '%s'", param->key);
 +bad_value:
 +      return invalf(fc, "tmpfs: Bad value for '%s'", param->key);
 +}
 +
 +static int shmem_parse_options(struct fs_context *fc, void *data)
  {
 -      char *this_char, *value, *rest;
 -      struct mempolicy *mpol = NULL;
 -      uid_t uid;
 -      gid_t gid;
 +      char *options = data;
  
        while (options != NULL) {
 -              this_char = options;
 +              char *this_char = options;
                for (;;) {
                        /*
                         * NUL-terminate this option: unfortunately,
                                break;
                        }
                }
 -              if (!*this_char)
 -                      continue;
 -              if ((value = strchr(this_char,'=')) != NULL) {
 -                      *value++ = 0;
 -              } else {
 -                      pr_err("tmpfs: No value for mount option '%s'\n",
 -                             this_char);
 -                      goto error;
 -              }
 -
 -              if (!strcmp(this_char,"size")) {
 -                      unsigned long long size;
 -                      size = memparse(value,&rest);
 -                      if (*rest == '%') {
 -                              size <<= PAGE_SHIFT;
 -                              size *= totalram_pages();
 -                              do_div(size, 100);
 -                              rest++;
 +              if (*this_char) {
 +                      char *value = strchr(this_char,'=');
 +                      size_t len = 0;
 +                      int err;
 +
 +                      if (value) {
 +                              *value++ = '\0';
 +                              len = strlen(value);
                        }
 -                      if (*rest)
 -                              goto bad_val;
 -                      sbinfo->max_blocks =
 -                              DIV_ROUND_UP(size, PAGE_SIZE);
 -              } else if (!strcmp(this_char,"nr_blocks")) {
 -                      sbinfo->max_blocks = memparse(value, &rest);
 -                      if (*rest)
 -                              goto bad_val;
 -              } else if (!strcmp(this_char,"nr_inodes")) {
 -                      sbinfo->max_inodes = memparse(value, &rest);
 -                      if (*rest)
 -                              goto bad_val;
 -              } else if (!strcmp(this_char,"mode")) {
 -                      if (remount)
 -                              continue;
 -                      sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777;
 -                      if (*rest)
 -                              goto bad_val;
 -              } else if (!strcmp(this_char,"uid")) {
 -                      if (remount)
 -                              continue;
 -                      uid = simple_strtoul(value, &rest, 0);
 -                      if (*rest)
 -                              goto bad_val;
 -                      sbinfo->uid = make_kuid(current_user_ns(), uid);
 -                      if (!uid_valid(sbinfo->uid))
 -                              goto bad_val;
 -              } else if (!strcmp(this_char,"gid")) {
 -                      if (remount)
 -                              continue;
 -                      gid = simple_strtoul(value, &rest, 0);
 -                      if (*rest)
 -                              goto bad_val;
 -                      sbinfo->gid = make_kgid(current_user_ns(), gid);
 -                      if (!gid_valid(sbinfo->gid))
 -                              goto bad_val;
 -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
 -              } else if (!strcmp(this_char, "huge")) {
 -                      int huge;
 -                      huge = shmem_parse_huge(value);
 -                      if (huge < 0)
 -                              goto bad_val;
 -                      if (!has_transparent_hugepage() &&
 -                                      huge != SHMEM_HUGE_NEVER)
 -                              goto bad_val;
 -                      sbinfo->huge = huge;
 -#endif
 -#ifdef CONFIG_NUMA
 -              } else if (!strcmp(this_char,"mpol")) {
 -                      mpol_put(mpol);
 -                      mpol = NULL;
 -                      if (mpol_parse_str(value, &mpol))
 -                              goto bad_val;
 -#endif
 -              } else {
 -                      pr_err("tmpfs: Bad mount option %s\n", this_char);
 -                      goto error;
 +                      err = vfs_parse_fs_string(fc, this_char, value, len);
 +                      if (err < 0)
 +                              return err;
                }
        }
 -      sbinfo->mpol = mpol;
        return 0;
 -
 -bad_val:
 -      pr_err("tmpfs: Bad value '%s' for mount option '%s'\n",
 -             value, this_char);
 -error:
 -      mpol_put(mpol);
 -      return 1;
 -
  }
  
 -static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
 +/*
 + * Reconfigure a shmem filesystem.
 + *
 + * Note that we disallow change from limited->unlimited blocks/inodes while any
 + * are in use; but we must separately disallow unlimited->limited, because in
 + * that case we have no record of how much is already in use.
 + */
 +static int shmem_reconfigure(struct fs_context *fc)
  {
 -      struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 -      struct shmem_sb_info config = *sbinfo;
 +      struct shmem_options *ctx = fc->fs_private;
 +      struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
        unsigned long inodes;
 -      int error = -EINVAL;
 -
 -      config.mpol = NULL;
 -      if (shmem_parse_options(data, &config, true))
 -              return error;
 +      const char *err;
  
        spin_lock(&sbinfo->stat_lock);
        inodes = sbinfo->max_inodes - sbinfo->free_inodes;
 -      if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0)
 -              goto out;
 -      if (config.max_inodes < inodes)
 -              goto out;
 -      /*
 -       * Those tests disallow limited->unlimited while any are in use;
 -       * but we must separately disallow unlimited->limited, because
 -       * in that case we have no record of how much is already in use.
 -       */
 -      if (config.max_blocks && !sbinfo->max_blocks)
 -              goto out;
 -      if (config.max_inodes && !sbinfo->max_inodes)
 -              goto out;
 +      if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
 +              if (!sbinfo->max_blocks) {
 +                      err = "Cannot retroactively limit size";
 +                      goto out;
 +              }
 +              if (percpu_counter_compare(&sbinfo->used_blocks,
 +                                         ctx->blocks) > 0) {
 +                      err = "Too small a size for current use";
 +                      goto out;
 +              }
 +      }
 +      if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) {
 +              if (!sbinfo->max_inodes) {
 +                      err = "Cannot retroactively limit inodes";
 +                      goto out;
 +              }
 +              if (ctx->inodes < inodes) {
 +                      err = "Too few inodes for current use";
 +                      goto out;
 +              }
 +      }
  
 -      error = 0;
 -      sbinfo->huge = config.huge;
 -      sbinfo->max_blocks  = config.max_blocks;
 -      sbinfo->max_inodes  = config.max_inodes;
 -      sbinfo->free_inodes = config.max_inodes - inodes;
 +      if (ctx->seen & SHMEM_SEEN_HUGE)
 +              sbinfo->huge = ctx->huge;
 +      if (ctx->seen & SHMEM_SEEN_BLOCKS)
 +              sbinfo->max_blocks  = ctx->blocks;
 +      if (ctx->seen & SHMEM_SEEN_INODES) {
 +              sbinfo->max_inodes  = ctx->inodes;
 +              sbinfo->free_inodes = ctx->inodes - inodes;
 +      }
  
        /*
         * Preserve previous mempolicy unless mpol remount option was specified.
         */
 -      if (config.mpol) {
 +      if (ctx->mpol) {
                mpol_put(sbinfo->mpol);
 -              sbinfo->mpol = config.mpol;     /* transfers initial ref */
 +              sbinfo->mpol = ctx->mpol;       /* transfers initial ref */
 +              ctx->mpol = NULL;
        }
 +      spin_unlock(&sbinfo->stat_lock);
 +      return 0;
  out:
        spin_unlock(&sbinfo->stat_lock);
 -      return error;
 +      return invalf(fc, "tmpfs: %s", err);
  }
  
  static int shmem_show_options(struct seq_file *seq, struct dentry *root)
@@@ -3616,9 -3547,8 +3616,9 @@@ static void shmem_put_super(struct supe
        sb->s_fs_info = NULL;
  }
  
 -int shmem_fill_super(struct super_block *sb, void *data, int silent)
 +static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
  {
 +      struct shmem_options *ctx = fc->fs_private;
        struct inode *inode;
        struct shmem_sb_info *sbinfo;
        int err = -ENOMEM;
        if (!sbinfo)
                return -ENOMEM;
  
 -      sbinfo->mode = 0777 | S_ISVTX;
 -      sbinfo->uid = current_fsuid();
 -      sbinfo->gid = current_fsgid();
        sb->s_fs_info = sbinfo;
  
  #ifdef CONFIG_TMPFS
         * but the internal instance is left unlimited.
         */
        if (!(sb->s_flags & SB_KERNMOUNT)) {
 -              sbinfo->max_blocks = shmem_default_max_blocks();
 -              sbinfo->max_inodes = shmem_default_max_inodes();
 -              if (shmem_parse_options(data, sbinfo, false)) {
 -                      err = -EINVAL;
 -                      goto failed;
 -              }
 +              if (!(ctx->seen & SHMEM_SEEN_BLOCKS))
 +                      ctx->blocks = shmem_default_max_blocks();
 +              if (!(ctx->seen & SHMEM_SEEN_INODES))
 +                      ctx->inodes = shmem_default_max_inodes();
        } else {
                sb->s_flags |= SB_NOUSER;
        }
  #else
        sb->s_flags |= SB_NOUSER;
  #endif
 +      sbinfo->max_blocks = ctx->blocks;
 +      sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes;
 +      sbinfo->uid = ctx->uid;
 +      sbinfo->gid = ctx->gid;
 +      sbinfo->mode = ctx->mode;
 +      sbinfo->huge = ctx->huge;
 +      sbinfo->mpol = ctx->mpol;
 +      ctx->mpol = NULL;
  
        spin_lock_init(&sbinfo->stat_lock);
        if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
                goto failed;
 -      sbinfo->free_inodes = sbinfo->max_inodes;
        spin_lock_init(&sbinfo->shrinklist_lock);
        INIT_LIST_HEAD(&sbinfo->shrinklist);
  
@@@ -3694,31 -3622,6 +3694,31 @@@ failed
        return err;
  }
  
 +static int shmem_get_tree(struct fs_context *fc)
 +{
 +      return get_tree_nodev(fc, shmem_fill_super);
 +}
 +
 +static void shmem_free_fc(struct fs_context *fc)
 +{
 +      struct shmem_options *ctx = fc->fs_private;
 +
 +      if (ctx) {
 +              mpol_put(ctx->mpol);
 +              kfree(ctx);
 +      }
 +}
 +
 +static const struct fs_context_operations shmem_fs_context_ops = {
 +      .free                   = shmem_free_fc,
 +      .get_tree               = shmem_get_tree,
 +#ifdef CONFIG_TMPFS
 +      .parse_monolithic       = shmem_parse_options,
 +      .parse_param            = shmem_parse_one,
 +      .reconfigure            = shmem_reconfigure,
 +#endif
 +};
 +
  static struct kmem_cache *shmem_inode_cachep;
  
  static struct inode *shmem_alloc_inode(struct super_block *sb)
@@@ -3835,6 -3738,7 +3835,6 @@@ static const struct super_operations sh
        .destroy_inode  = shmem_destroy_inode,
  #ifdef CONFIG_TMPFS
        .statfs         = shmem_statfs,
 -      .remount_fs     = shmem_remount_fs,
        .show_options   = shmem_show_options,
  #endif
        .evict_inode    = shmem_evict_inode,
@@@ -3855,30 -3759,16 +3855,30 @@@ static const struct vm_operations_struc
  #endif
  };
  
 -static struct dentry *shmem_mount(struct file_system_type *fs_type,
 -      int flags, const char *dev_name, void *data)
 +int shmem_init_fs_context(struct fs_context *fc)
  {
 -      return mount_nodev(fs_type, flags, data, shmem_fill_super);
 +      struct shmem_options *ctx;
 +
 +      ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL);
 +      if (!ctx)
 +              return -ENOMEM;
 +
 +      ctx->mode = 0777 | S_ISVTX;
 +      ctx->uid = current_fsuid();
 +      ctx->gid = current_fsgid();
 +
 +      fc->fs_private = ctx;
 +      fc->ops = &shmem_fs_context_ops;
 +      return 0;
  }
  
  static struct file_system_type shmem_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "tmpfs",
 -      .mount          = shmem_mount,
 +      .init_fs_context = shmem_init_fs_context,
 +#ifdef CONFIG_TMPFS
 +      .parameters     = &shmem_fs_parameters,
 +#endif
        .kill_sb        = kill_litter_super,
        .fs_flags       = FS_USERNS_MOUNT,
  };
@@@ -4022,8 -3912,7 +4022,8 @@@ bool shmem_huge_enabled(struct vm_area_
  
  static struct file_system_type shmem_fs_type = {
        .name           = "tmpfs",
 -      .mount          = ramfs_mount,
 +      .init_fs_context = ramfs_init_fs_context,
 +      .parameters     = &ramfs_fs_parameters,
        .kill_sb        = kill_litter_super,
        .fs_flags       = FS_USERNS_MOUNT,
  };